diff options
Diffstat (limited to 'net/ipv4')
35 files changed, 679 insertions, 462 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f3dad1661343..58925b6597de 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1043,7 +1043,7 @@ static struct inet_protosw inetsw_array[] = .type = SOCK_DGRAM, .protocol = IPPROTO_ICMP, .prot = &ping_prot, - .ops = &inet_dgram_ops, + .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, }, diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 22377c8ff14b..e8f862358518 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -220,7 +220,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags + sglists); - skb_to_sgvec_nomark(skb, sg, 0, skb->len); + err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); + if (unlikely(err < 0)) + goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ @@ -393,7 +395,9 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, ihl); sg_init_table(sg, nfrags + sglists); - skb_to_sgvec_nomark(skb, sg, 0, skb->len); + err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); + if (unlikely(err < 0)) + goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 0937b34c27ca..a651c53260ec 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -641,6 +641,32 @@ void arp_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(arp_xmit); +static bool arp_is_garp(struct net *net, struct net_device *dev, + int *addr_type, __be16 ar_op, + __be32 sip, __be32 tip, + unsigned char *sha, unsigned char *tha) +{ + bool is_garp = tip == sip; + + /* Gratuitous ARP _replies_ also require target hwaddr to be + * the same as source. + */ + if (is_garp && ar_op == htons(ARPOP_REPLY)) + is_garp = + /* IPv4 over IEEE 1394 doesn't provide target + * hardware address field in its ARP payload. + */ + tha && + !memcmp(tha, sha, dev->addr_len); + + if (is_garp) { + *addr_type = inet_addr_type_dev_table(net, dev, sip); + if (*addr_type != RTN_UNICAST) + is_garp = false; + } + return is_garp; +} + /* * Process an arp request. */ @@ -653,6 +679,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha; + unsigned char *tha = NULL; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; @@ -724,6 +751,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) break; #endif default: + tha = arp_ptr; arp_ptr += dev->addr_len; } memcpy(&tip, arp_ptr, 4); @@ -835,19 +863,25 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); - if (IN_DEV_ARP_ACCEPT(in_dev)) { - unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip); + addr_type = -1; + if (n || IN_DEV_ARP_ACCEPT(in_dev)) { + is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, + sip, tip, sha, tha); + } + if (IN_DEV_ARP_ACCEPT(in_dev)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ - is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && - addr_type == RTN_UNICAST; - if (!n && - ((arp->ar_op == htons(ARPOP_REPLY) && - addr_type == RTN_UNICAST) || is_garp)) + (is_garp || + (arp->ar_op == htons(ARPOP_REPLY) && + (addr_type == RTN_UNICAST || + (addr_type < 0 && + /* postpone calculation to as late as possible */ + inet_addr_type_dev_table(net, dev, sip) == + RTN_UNICAST))))) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } @@ -1079,13 +1113,17 @@ static int arp_invalidate(struct net_device *dev, __be32 ip) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; + struct neigh_table *tbl = &arp_tbl; if (neigh) { if (neigh->nud_state & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); + write_lock_bh(&tbl->lock); neigh_release(neigh); + neigh_remove_one(neigh, tbl); + write_unlock_bh(&tbl->lock); } return err; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 65cc02bd82bc..d815d1755473 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -248,6 +248,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * u8 *tail; u8 *vaddr; int nfrags; + int esph_offset; struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; @@ -313,11 +314,13 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } cow: + esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb); + nfrags = skb_cow_data(skb, tailen, &trailer); if (nfrags < 0) goto out; tail = skb_tail_pointer(trailer); - esp->esph = ip_esp_hdr(skb); + esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset); skip_cow: esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); @@ -374,9 +377,11 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * esp->esph = esph; sg_init_table(sg, esp->nfrags); - skb_to_sgvec(skb, sg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + esp->clen + alen); + err = skb_to_sgvec(skb, sg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + esp->clen + alen); + if (unlikely(err < 0)) + goto error; if (!esp->inplace) { int allocsize; @@ -400,9 +405,11 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * spin_unlock_bh(&x->lock); sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); - skb_to_sgvec(skb, dsg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + esp->clen + alen); + err = skb_to_sgvec(skb, dsg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + esp->clen + alen); + if (unlikely(err < 0)) + goto error; } if ((x->props.flags & XFRM_STATE_ESN)) @@ -687,7 +694,9 @@ skip_cow: esp_input_set_header(skb, seqhi); sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + err = skb_to_sgvec(skb, sg, 0, skb->len); + if (unlikely(err < 0)) + goto out; skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 39bd1edee676..4e678fa892dd 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -588,13 +588,15 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) - err = fib_table_delete(net, tb, &cfg); + err = fib_table_delete(net, tb, &cfg, + NULL); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) - err = fib_table_insert(net, tb, &cfg); + err = fib_table_insert(net, tb, + &cfg, NULL); else err = -ENOBUFS; } @@ -626,14 +628,15 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, - struct nlmsghdr *nlh, struct fib_config *cfg) + struct nlmsghdr *nlh, struct fib_config *cfg, + struct netlink_ext_ack *extack) { struct nlattr *attr; int err, remaining; struct rtmsg *rtm; err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, - NULL); + extack); if (err < 0) goto errout; @@ -654,6 +657,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, cfg->fc_nlinfo.nl_net = net; if (cfg->fc_type > RTN_MAX) { + NL_SET_ERR_MSG(extack, "Invalid route type"); err = -EINVAL; goto errout; } @@ -681,7 +685,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, break; case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), - nla_len(attr)); + nla_len(attr), + extack); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); @@ -698,7 +703,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); - err = lwtunnel_valid_encap_type(cfg->fc_encap_type); + err = lwtunnel_valid_encap_type(cfg->fc_encap_type, + extack); if (err < 0) goto errout; break; @@ -718,17 +724,18 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_table *tb; int err; - err = rtm_to_fib_config(net, skb, nlh, &cfg); + err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; tb = fib_get_table(net, cfg.fc_table); if (!tb) { + NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; goto errout; } - err = fib_table_delete(net, tb, &cfg); + err = fib_table_delete(net, tb, &cfg, extack); errout: return err; } @@ -741,7 +748,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_table *tb; int err; - err = rtm_to_fib_config(net, skb, nlh, &cfg); + err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; @@ -751,7 +758,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = fib_table_insert(net, tb, &cfg); + err = fib_table_insert(net, tb, &cfg, extack); errout: return err; } @@ -763,7 +770,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; - int dumped = 0; + int dumped = 0, err; if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) @@ -783,20 +790,27 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); - if (fib_table_dump(tb, skb, cb) < 0) - goto out; + err = fib_table_dump(tb, skb, cb); + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } dumped = 1; next: e++; } } out: + err = skb->len; +out_err: rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; - return skb->len; + return err; } /* Prepare and feed intra-kernel routing request. @@ -838,9 +852,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) - fib_table_insert(net, tb, &cfg); + fib_table_insert(net, tb, &cfg, NULL); else - fib_table_delete(net, tb, &cfg); + fib_table_delete(net, tb, &cfg, NULL); } void fib_add_ifaddr(struct in_ifaddr *ifa) diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 9c02920725db..769ab87ebc4b 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -28,8 +28,10 @@ static inline void fib_alias_accessed(struct fib_alias *fa) /* Exported by fib_semantics.c */ void fib_release_info(struct fib_info *); -struct fib_info *fib_create_info(struct fib_config *cfg); -int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); +struct fib_info *fib_create_info(struct fib_config *cfg, + struct netlink_ext_ack *extack); +int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, + struct netlink_ext_ack *extack); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index da449ddb8cc1..2157dc08c407 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -32,6 +32,7 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/netlink.h> #include <net/arp.h> #include <net/ip.h> @@ -203,6 +204,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); + struct dst_metrics *m; change_nexthops(fi) { if (nexthop_nh->nh_dev) @@ -213,8 +215,9 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - if (fi->fib_metrics != (u32 *) dst_default_metrics) - kfree(fi->fib_metrics); + m = fi->fib_metrics; + if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) + kfree(m); kfree(fi); } @@ -454,7 +457,8 @@ static int fib_detect_death(struct fib_info *fi, int order, #ifdef CONFIG_IP_ROUTE_MULTIPATH -static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) +static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, + struct netlink_ext_ack *extack) { int nhs = 0; @@ -464,22 +468,35 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) } /* leftover implies invalid nexthop configuration, discard it */ - return remaining > 0 ? 0 : nhs; + if (remaining > 0) { + NL_SET_ERR_MSG(extack, + "Invalid nexthop configuration - extra data after nexthops"); + nhs = 0; + } + + return nhs; } static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, - int remaining, struct fib_config *cfg) + int remaining, struct fib_config *cfg, + struct netlink_ext_ack *extack) { int ret; change_nexthops(fi) { int attrlen; - if (!rtnh_ok(rtnh, remaining)) + if (!rtnh_ok(rtnh, remaining)) { + NL_SET_ERR_MSG(extack, + "Invalid nexthop configuration - extra data after nexthop"); return -EINVAL; + } - if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { + NL_SET_ERR_MSG(extack, + "Invalid flags for nexthop - can not contain DEAD or LINKDOWN"); return -EINVAL; + } nexthop_nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; @@ -505,13 +522,17 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); - if (!nla_entype) + if (!nla_entype) { + NL_SET_BAD_ATTR(extack, nla); + NL_SET_ERR_MSG(extack, + "Encap type is missing"); goto err_inval; + } ret = lwtunnel_build_state(nla_get_u16( nla_entype), nla, AF_INET, cfg, - &lwtstate); + &lwtstate, extack); if (ret) goto errout; nexthop_nh->nh_lwtstate = @@ -593,7 +614,8 @@ static inline void fib_add_weight(struct fib_info *fi, static int fib_encap_match(u16 encap_type, struct nlattr *encap, const struct fib_nh *nh, - const struct fib_config *cfg) + const struct fib_config *cfg, + struct netlink_ext_ack *extack) { struct lwtunnel_state *lwtstate; int ret, result = 0; @@ -601,8 +623,8 @@ static int fib_encap_match(u16 encap_type, if (encap_type == LWTUNNEL_ENCAP_NONE) return 0; - ret = lwtunnel_build_state(encap_type, encap, - AF_INET, cfg, &lwtstate); + ret = lwtunnel_build_state(encap_type, encap, AF_INET, + cfg, &lwtstate, extack); if (!ret) { result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); lwtstate_free(lwtstate); @@ -611,7 +633,8 @@ static int fib_encap_match(u16 encap_type, return result; } -int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) +int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, + struct netlink_ext_ack *extack) { #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; @@ -623,9 +646,9 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) if (cfg->fc_oif || cfg->fc_gw) { if (cfg->fc_encap) { - if (fib_encap_match(cfg->fc_encap_type, - cfg->fc_encap, fi->fib_nh, cfg)) - return 1; + if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, + fi->fib_nh, cfg, extack)) + return 1; } if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) @@ -714,7 +737,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) * |-> {local prefix} (terminal node) */ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, - struct fib_nh *nh) + struct fib_nh *nh, struct netlink_ext_ack *extack) { int err = 0; struct net *net; @@ -727,16 +750,25 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (nh->nh_flags & RTNH_F_ONLINK) { unsigned int addr_type; - if (cfg->fc_scope >= RT_SCOPE_LINK) + if (cfg->fc_scope >= RT_SCOPE_LINK) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid scope"); return -EINVAL; + } dev = __dev_get_by_index(net, nh->nh_oif); if (!dev) return -ENODEV; - if (!(dev->flags & IFF_UP)) + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, + "Nexthop device is not up"); return -ENETDOWN; + } addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw); - if (addr_type != RTN_UNICAST) + if (addr_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid gateway"); return -EINVAL; + } if (!netif_carrier_ok(dev)) nh->nh_flags |= RTNH_F_LINKDOWN; nh->nh_dev = dev; @@ -776,18 +808,25 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } if (err) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid gateway"); rcu_read_unlock(); return err; } } err = -EINVAL; - if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) + if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; + } nh->nh_scope = res.scope; nh->nh_oif = FIB_RES_OIF(res); nh->nh_dev = dev = FIB_RES_DEV(res); - if (!dev) + if (!dev) { + NL_SET_ERR_MSG(extack, + "No egress device for nexthop gateway"); goto out; + } dev_hold(dev); if (!netif_carrier_ok(dev)) nh->nh_flags |= RTNH_F_LINKDOWN; @@ -795,17 +834,21 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } else { struct in_device *in_dev; - if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) + if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { + NL_SET_ERR_MSG(extack, + "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); return -EINVAL; - + } rcu_read_lock(); err = -ENODEV; in_dev = inetdev_by_index(net, nh->nh_oif); if (!in_dev) goto out; err = -ENETDOWN; - if (!(in_dev->dev->flags & IFF_UP)) + if (!(in_dev->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); goto out; + } nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; @@ -971,16 +1014,17 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) return -EINVAL; - fi->fib_metrics[type - 1] = val; + fi->fib_metrics->metrics[type - 1] = val; } if (ecn_ca) - fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; return 0; } -struct fib_info *fib_create_info(struct fib_config *cfg) +struct fib_info *fib_create_info(struct fib_config *cfg, + struct netlink_ext_ack *extack) { int err; struct fib_info *fi = NULL; @@ -992,15 +1036,20 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto err_inval; /* Fast check to catch the most weird cases */ - if (fib_props[cfg->fc_type].scope > cfg->fc_scope) + if (fib_props[cfg->fc_type].scope > cfg->fc_scope) { + NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; + } - if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { + NL_SET_ERR_MSG(extack, + "Invalid rtm_flags - can not contain DEAD or LINKDOWN"); goto err_inval; + } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { - nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len); + nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack); if (nhs == 0) goto err_inval; } @@ -1033,11 +1082,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; fib_info_cnt++; if (cfg->fc_mx) { - fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); if (!fi->fib_metrics) goto failure; + atomic_set(&fi->fib_metrics->refcnt, 1); } else - fi->fib_metrics = (u32 *) dst_default_metrics; + fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; @@ -1062,18 +1112,29 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg); + err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); if (err != 0) goto failure; - if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) + if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) { + NL_SET_ERR_MSG(extack, + "Nexthop device index does not match RTA_OIF"); goto err_inval; - if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) + } + if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) { + NL_SET_ERR_MSG(extack, + "Nexthop gateway does not match RTA_GATEWAY"); goto err_inval; + } #ifdef CONFIG_IP_ROUTE_CLASSID - if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) + if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { + NL_SET_ERR_MSG(extack, + "Nexthop class id does not match RTA_FLOW"); goto err_inval; + } #endif #else + NL_SET_ERR_MSG(extack, + "Multipath support not enabled in kernel"); goto err_inval; #endif } else { @@ -1082,11 +1143,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (cfg->fc_encap) { struct lwtunnel_state *lwtstate; - if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) + if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) { + NL_SET_ERR_MSG(extack, + "LWT encap type not specified"); goto err_inval; + } err = lwtunnel_build_state(cfg->fc_encap_type, cfg->fc_encap, AF_INET, cfg, - &lwtstate); + &lwtstate, extack); if (err) goto failure; @@ -1106,8 +1170,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } if (fib_props[cfg->fc_type].error) { - if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) + if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) { + NL_SET_ERR_MSG(extack, + "Gateway, device and multipath can not be specified for this route type"); goto err_inval; + } goto link_it; } else { switch (cfg->fc_type) { @@ -1118,19 +1185,30 @@ struct fib_info *fib_create_info(struct fib_config *cfg) case RTN_MULTICAST: break; default: + NL_SET_ERR_MSG(extack, "Invalid route type"); goto err_inval; } } - if (cfg->fc_scope > RT_SCOPE_HOST) + if (cfg->fc_scope > RT_SCOPE_HOST) { + NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; + } if (cfg->fc_scope == RT_SCOPE_HOST) { struct fib_nh *nh = fi->fib_nh; /* Local address is added. */ - if (nhs != 1 || nh->nh_gw) + if (nhs != 1) { + NL_SET_ERR_MSG(extack, + "Route with host scope can not have multiple nexthops"); + goto err_inval; + } + if (nh->nh_gw) { + NL_SET_ERR_MSG(extack, + "Route with host scope can not have a gateway"); goto err_inval; + } nh->nh_scope = RT_SCOPE_NOWHERE; nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif); err = -ENODEV; @@ -1140,7 +1218,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, fi, nexthop_nh); + err = fib_check_nh(cfg, fi, nexthop_nh, extack); if (err != 0) goto failure; if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) @@ -1150,8 +1228,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_flags |= RTNH_F_LINKDOWN; } - if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) + if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) { + NL_SET_ERR_MSG(extack, "Invalid prefsrc address"); goto err_inval; + } change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); @@ -1238,7 +1318,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) goto nla_put_failure; - if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) + if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) goto nla_put_failure; if (fi->fib_prefsrc && diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1201409ba1dc..d56659e97a6e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1099,9 +1099,25 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } +static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) +{ + if (plen > KEYLENGTH) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + return false; + } + + if ((plen < KEYLENGTH) && (key << plen)) { + NL_SET_ERR_MSG(extack, + "Invalid prefix for given prefix length"); + return false; + } + + return true; +} + /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, - struct fib_config *cfg) + struct fib_config *cfg, struct netlink_ext_ack *extack) { enum fib_event_type event = FIB_EVENT_ENTRY_ADD; struct trie *t = (struct trie *)tb->tb_data; @@ -1115,17 +1131,14 @@ int fib_table_insert(struct net *net, struct fib_table *tb, u32 key; int err; - if (plen > KEYLENGTH) - return -EINVAL; - key = ntohl(cfg->fc_dst); - pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); - - if ((plen < KEYLENGTH) && (key << plen)) + if (!fib_valid_key_len(key, plen, extack)) return -EINVAL; - fi = fib_create_info(cfg); + pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); + + fi = fib_create_info(cfg, extack); if (IS_ERR(fi)) { err = PTR_ERR(fi); goto err; @@ -1452,6 +1465,7 @@ found: if (!(fib_flags & FIB_LOOKUP_NOREF)) atomic_inc(&fi->fib_clntref); + res->prefix = htonl(n->key); res->prefixlen = KEYLENGTH - fa->fa_slen; res->nh_sel = nhsel; res->type = fa->fa_type; @@ -1507,7 +1521,7 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, - struct fib_config *cfg) + struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *fa_to_delete; @@ -1517,12 +1531,9 @@ int fib_table_delete(struct net *net, struct fib_table *tb, u8 tos = cfg->fc_tos; u32 key; - if (plen > KEYLENGTH) - return -EINVAL; - key = ntohl(cfg->fc_dst); - if ((plen < KEYLENGTH) && (key << plen)) + if (!fib_valid_key_len(key, plen, extack)) return -EINVAL; l = fib_find_node(t, &tp, key); @@ -1551,7 +1562,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && - fib_nh_match(cfg, fi) == 0) { + fib_nh_match(cfg, fi, extack) == 0) { fa_to_delete = fa; break; } @@ -1983,6 +1994,8 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + int err; + if (i < s_i) { i++; continue; @@ -1993,17 +2006,14 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, continue; } - if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, - fa->fa_type, - xkey, - KEYLENGTH - fa->fa_slen, - fa->fa_tos, - fa->fa_info, NLM_F_MULTI) < 0) { + err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, + tb->tb_id, fa->fa_type, + xkey, KEYLENGTH - fa->fa_slen, + fa->fa_tos, fa->fa_info, NLM_F_MULTI); + if (err < 0) { cb->args[4] = i; - return -1; + return err; } i++; } @@ -2025,10 +2035,13 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, t_key key = cb->args[3]; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { + int err; + + err = fn_trie_dump_leaf(l, tb, skb, cb); + if (err < 0) { cb->args[3] = key; cb->args[2] = count; - return -1; + return err; } ++count; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 805f6607f8d9..8e0257d01200 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <net/genetlink.h> #include <net/gue.h> +#include <net/fou.h> #include <net/ip.h> #include <net/protocol.h> #include <net/udp.h> @@ -859,25 +860,6 @@ size_t gue_encap_hlen(struct ip_tunnel_encap *e) } EXPORT_SYMBOL(gue_encap_hlen); -static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, - struct flowi4 *fl4, u8 *protocol, __be16 sport) -{ - struct udphdr *uh; - - skb_push(skb, sizeof(struct udphdr)); - skb_reset_transport_header(skb); - - uh = udp_hdr(skb); - - uh->dest = e->dport; - uh->source = sport; - uh->len = htons(skb->len); - udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, - fl4->saddr, fl4->daddr, skb->len); - - *protocol = IPPROTO_UDP; -} - int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type) { @@ -894,24 +876,6 @@ int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, } EXPORT_SYMBOL(__fou_build_header); -int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - u8 *protocol, struct flowi4 *fl4) -{ - int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : - SKB_GSO_UDP_TUNNEL; - __be16 sport; - int err; - - err = __fou_build_header(skb, e, protocol, &sport, type); - if (err) - return err; - - fou_build_udp(skb, e, fl4, protocol, sport); - - return 0; -} -EXPORT_SYMBOL(fou_build_header); - int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type) { @@ -985,8 +949,46 @@ int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, } EXPORT_SYMBOL(__gue_build_header); -int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - u8 *protocol, struct flowi4 *fl4) +#ifdef CONFIG_NET_FOU_IP_TUNNELS + +static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, + struct flowi4 *fl4, u8 *protocol, __be16 sport) +{ + struct udphdr *uh; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + uh = udp_hdr(skb); + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, + fl4->saddr, fl4->daddr, skb->len); + + *protocol = IPPROTO_UDP; +} + +static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) +{ + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; + __be16 sport; + int err; + + err = __fou_build_header(skb, e, protocol, &sport, type); + if (err) + return err; + + fou_build_udp(skb, e, fl4, protocol, sport); + + return 0; +} + +static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) { int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; @@ -1001,9 +1003,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } -EXPORT_SYMBOL(gue_build_header); -#ifdef CONFIG_NET_FOU_IP_TUNNELS static const struct ip_tunnel_encap_ops fou_iptun_ops = { .encap_hlen = fou_encap_hlen, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 43318b5f5647..5610971bf859 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -489,7 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key_hash(net, fl4, skb_in); + rt = ip_route_output_key_hash(net, fl4, skb_in); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1054d330bf9d..a3fa1a5b6d98 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -25,6 +25,7 @@ #include <net/xfrm.h> #include <net/tcp.h> #include <net/sock_reuseport.h> +#include <net/addrconf.h> #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -789,7 +790,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); - newsk->sk_write_space = sk_stream_write_space; /* listeners have SOCK_RCU_FREE, not the children */ sock_reset_flag(newsk, SOCK_RCU_FREE); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index baf196eaf1d8..90e11479c725 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -228,14 +228,16 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { static int ip_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, - struct lwtunnel_state **ts) + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) { struct ip_tunnel_info *tun_info; struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; int err; - err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, NULL); + err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, + extack); if (err < 0) return err; @@ -325,7 +327,8 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { static int ip6_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, - struct lwtunnel_state **ts) + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) { struct ip_tunnel_info *tun_info; struct lwtunnel_state *new_state; @@ -333,7 +336,7 @@ static int ip6_tun_build_state(struct nlattr *attr, int err; err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, - NULL); + extack); if (err < 0) return err; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3a02d52ed50e..551de4d023a8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1980,6 +1980,20 @@ int ip_mr_input(struct sk_buff *skb) struct net *net = dev_net(skb->dev); int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; struct mr_table *mrt; + struct net_device *dev; + + /* skb->dev passed in is the loX master dev for vrfs. + * As there are no vifs associated with loopback devices, + * get the proper interface that does have a vif associated with it. + */ + dev = skb->dev; + if (netif_is_l3_master(skb->dev)) { + dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); + if (!dev) { + kfree_skb(skb); + return -ENODEV; + } + } /* Packet is looped back after forward, it should not be * forwarded second time, but still can be delivered locally. @@ -2017,7 +2031,7 @@ int ip_mr_input(struct sk_buff *skb) /* already under rcu_read_lock() */ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); if (!cache) { - int vif = ipmr_find_vif(mrt, skb->dev); + int vif = ipmr_find_vif(mrt, dev); if (vif >= 0) cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, @@ -2037,7 +2051,7 @@ int ip_mr_input(struct sk_buff *skb) } read_lock(&mrt_lock); - vif = ipmr_find_vif(mrt, skb->dev); + vif = ipmr_find_vif(mrt, dev); if (vif >= 0) { int err2 = ipmr_cache_unresolved(mrt, vif, skb); read_unlock(&mrt_lock); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 7cd8d0d918f8..6f8d9e5e062b 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -172,7 +172,7 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) struct iphdr *iph = ip_hdr(skb_in); u8 proto; - if (skb_in->csum_bad || iph->frag_off & htons(IP_OFFSET)) + if (iph->frag_off & htons(IP_OFFSET)) return; if (skb_csum_unnecessary(skb_in)) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 655d9eebe43e..9b38cf18144e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -114,6 +114,8 @@ #include <net/ip_tunnels.h> #include <net/l3mdev.h> +#include "fib_lookup.h" + #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -1385,8 +1387,12 @@ static void rt_add_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *) dst; + if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt)) + kfree(p); + if (!list_empty(&rt->rt_uncached)) { struct uncached_list *ul = rt->rt_uncached_list; @@ -1438,7 +1444,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; rt->rt_uses_gateway = 1; } - dst_init_metrics(&rt->dst, fi->fib_metrics, true); + dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); + if (fi->fib_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + atomic_inc(&fi->fib_metrics->refcnt); + } #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif @@ -1852,9 +1862,9 @@ static int ip_mkroute_input(struct sk_buff *skb, */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) + u8 tos, struct net_device *dev, + struct fib_result *res) { - struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); struct ip_tunnel_info *tun_info; struct flowi4 fl4; @@ -1884,8 +1894,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; - res.fi = NULL; - res.table = NULL; + res->fi = NULL; + res->table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; @@ -1921,17 +1931,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); - err = fib_lookup(net, &fl4, &res, 0); + err = fib_lookup(net, &fl4, res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; goto no_route; } - if (res.type == RTN_BROADCAST) + if (res->type == RTN_BROADCAST) goto brd_input; - if (res.type == RTN_LOCAL) { + if (res->type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &itag); if (err < 0) @@ -1943,10 +1953,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = -EHOSTUNREACH; goto no_route; } - if (res.type != RTN_UNICAST) + if (res->type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos); + err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos); out: return err; brd_input: @@ -1960,14 +1970,14 @@ brd_input: goto martian_source; } flags |= RTCF_BROADCAST; - res.type = RTN_BROADCAST; + res->type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: do_cache = false; - if (res.fi) { + if (res->fi) { if (!itag) { - rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); err = 0; @@ -1978,7 +1988,7 @@ local_input: } rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev, - flags | RTCF_LOCAL, res.type, + flags | RTCF_LOCAL, res->type, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; @@ -1988,18 +1998,18 @@ local_input: rth->dst.tclassid = itag; #endif rth->rt_is_input = 1; - if (res.table) - rth->rt_table_id = res.table->tb_id; + if (res->table) + rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); - if (res.type == RTN_UNREACHABLE) { + if (res->type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } if (do_cache) { - struct fib_nh *nh = &FIB_RES_NH(res); + struct fib_nh *nh = &FIB_RES_NH(*res); rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { @@ -2019,9 +2029,9 @@ local_input: no_route: RT_CACHE_STAT_INC(in_no_route); - res.type = RTN_UNREACHABLE; - res.fi = NULL; - res.table = NULL; + res->type = RTN_UNREACHABLE; + res->fi = NULL; + res->table = NULL; goto local_input; /* @@ -2051,11 +2061,22 @@ martian_source: int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { - int res; + struct fib_result res; + int err; tos &= IPTOS_RT_MASK; rcu_read_lock(); + err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(ip_route_input_noref); + +/* called with rcu_read_lock held */ +int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, struct fib_result *res) +{ /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting @@ -2070,6 +2091,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_multicast(daddr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); int our = 0; + int err = -EINVAL; if (in_dev) our = ip_check_mc_rcu(in_dev, daddr, saddr, @@ -2085,7 +2107,6 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol); } - res = -EINVAL; if (our #ifdef CONFIG_IP_MROUTE || @@ -2093,17 +2114,14 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, IN_DEV_MFORWARD(in_dev)) #endif ) { - res = ip_route_input_mc(skb, daddr, saddr, + err = ip_route_input_mc(skb, daddr, saddr, tos, dev, our); } - rcu_read_unlock(); - return res; + return err; } - res = ip_route_input_slow(skb, daddr, saddr, tos, dev); - rcu_read_unlock(); - return res; + + return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); } -EXPORT_SYMBOL(ip_route_input_noref); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, @@ -2246,29 +2264,40 @@ add: * Major route resolver routine. */ -struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, - const struct sk_buff *skb) +struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, + const struct sk_buff *skb) { - struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); - unsigned int flags = 0; struct fib_result res; struct rtable *rth; - int orig_oif; - int err = -ENETUNREACH; res.tclassid = 0; res.fi = NULL; res.table = NULL; - orig_oif = fl4->flowi4_oif; - fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos = tos & IPTOS_RT_MASK; fl4->flowi4_scope = ((tos & RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); rcu_read_lock(); + rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); + rcu_read_unlock(); + + return rth; +} +EXPORT_SYMBOL_GPL(ip_route_output_key_hash); + +struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, + struct fib_result *res, + const struct sk_buff *skb) +{ + struct net_device *dev_out = NULL; + int orig_oif = fl4->flowi4_oif; + unsigned int flags = 0; + struct rtable *rth; + int err = -ENETUNREACH; + if (fl4->saddr) { rth = ERR_PTR(-EINVAL); if (ipv4_is_multicast(fl4->saddr) || @@ -2354,15 +2383,15 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; fl4->flowi4_oif = LOOPBACK_IFINDEX; - res.type = RTN_LOCAL; + res->type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - err = fib_lookup(net, fl4, &res, 0); + err = fib_lookup(net, fl4, res, 0); if (err) { - res.fi = NULL; - res.table = NULL; + res->fi = NULL; + res->table = NULL; if (fl4->flowi4_oif && (ipv4_is_multicast(fl4->daddr) || !netif_index_is_l3_master(net, fl4->flowi4_oif))) { @@ -2387,43 +2416,41 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, if (fl4->saddr == 0) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); - res.type = RTN_UNICAST; + res->type = RTN_UNICAST; goto make_route; } rth = ERR_PTR(err); goto out; } - if (res.type == RTN_LOCAL) { + if (res->type == RTN_LOCAL) { if (!fl4->saddr) { - if (res.fi->fib_prefsrc) - fl4->saddr = res.fi->fib_prefsrc; + if (res->fi->fib_prefsrc) + fl4->saddr = res->fi->fib_prefsrc; else fl4->saddr = fl4->daddr; } /* L3 master device is the loopback for that domain */ - dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(res)) ? : + dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } - fib_select_path(net, &res, fl4, skb); + fib_select_path(net, res, fl4, skb); - dev_out = FIB_RES_DEV(res); + dev_out = FIB_RES_DEV(*res); fl4->flowi4_oif = dev_out->ifindex; make_route: - rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); + rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); out: - rcu_read_unlock(); return rth; } -EXPORT_SYMBOL_GPL(__ip_route_output_key_hash); static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) { @@ -2517,9 +2544,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); +/* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, - u32 seq, int event) + u32 seq) { struct rtable *rt = skb_rtable(skb); struct rtmsg *r; @@ -2528,7 +2556,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, u32 error; u32 metrics[RTAX_MAX]; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0); + nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0); if (!nlh) return -EMSGSIZE; @@ -2636,6 +2664,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; + struct fib_result res = {}; struct rtable *rt = NULL; struct flowi4 fl4; __be32 dst = 0; @@ -2692,10 +2721,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; + rcu_read_lock(); + if (iif) { struct net_device *dev; - dev = __dev_get_by_index(net, iif); + dev = dev_get_by_index_rcu(net, iif); if (!dev) { err = -ENODEV; goto errout_free; @@ -2704,14 +2735,14 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb->protocol = htons(ETH_P_IP); skb->dev = dev; skb->mark = mark; - err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); + err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, + dev, &res); rt = skb_rtable(skb); if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { - rt = ip_route_output_key(net, &fl4); - + rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); @@ -2727,17 +2758,25 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = rt->rt_table_id; - err = rt_fill_info(net, dst, src, table_id, &fl4, skb, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - RTM_NEWROUTE); + if (rtm->rtm_flags & RTM_F_FIB_MATCH) + err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, RTM_NEWROUTE, table_id, + rt->rt_type, res.prefix, res.prefixlen, + fl4.flowi4_tos, res.fi, 0); + else + err = rt_fill_info(net, dst, src, table_id, &fl4, skb, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); if (err < 0) goto errout_free; + rcu_read_unlock(); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; errout_free: + rcu_read_unlock(); kfree_skb(skb); goto errout; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 0257d965f111..6426250a58ea 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -66,10 +66,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. */ -__u32 cookie_init_timestamp(struct request_sock *req) +u64 cookie_init_timestamp(struct request_sock *req) { struct inet_request_sock *ireq; - u32 ts, ts_now = tcp_time_stamp; + u32 ts, ts_now = tcp_time_stamp_raw(); u32 options = 0; ireq = inet_rsk(req); @@ -88,7 +88,7 @@ __u32 cookie_init_timestamp(struct request_sock *req) ts <<= TSBITS; ts |= options; } - return ts; + return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ); } @@ -343,7 +343,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack.v64 = 0; + treq->snt_synack = 0; treq->tfo_listener = false; ireq->ir_iif = inet_request_bound_dev_if(sk, skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1e4c76d2b827..87981fcdfcf2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -386,7 +386,7 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U); + minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -1084,9 +1084,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); + struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) + if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && + uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ @@ -1108,7 +1111,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, } } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; - err = __inet_stream_connect(sk->sk_socket, msg->msg_name, + err = __inet_stream_connect(sk->sk_socket, uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst @@ -2183,7 +2186,7 @@ adjudge_to_death: /* Now socket is owned by kernel and we acquire BH lock - to finish close. No need to check for user refs. + * to finish close. No need to check for user refs. */ local_bh_disable(); bh_lock_sock(sk); @@ -2320,6 +2323,10 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 + * issue in __tcp_select_window() + */ + icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); @@ -2374,9 +2381,10 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l return 0; } -static int tcp_repair_options_est(struct tcp_sock *tp, +static int tcp_repair_options_est(struct sock *sk, struct tcp_repair_opt __user *optbuf, unsigned int len) { + struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; while (len >= sizeof(opt)) { @@ -2389,6 +2397,7 @@ static int tcp_repair_options_est(struct tcp_sock *tp, switch (opt.opt_code) { case TCPOPT_MSS: tp->rx_opt.mss_clamp = opt.opt_val; + tcp_mtup_init(sk); break; case TCPOPT_WINDOW: { @@ -2471,7 +2480,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_MAXSEG: /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet - * know which interface is going to be used */ + * know which interface is going to be used + */ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) { err = -EINVAL; break; @@ -2548,7 +2558,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED) - err = tcp_repair_options_est(tp, + err = tcp_repair_options_est(sk, (struct tcp_repair_opt __user *)optval, optlen); else @@ -2706,7 +2716,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (!tp->repair) err = -EPERM; else - tp->tsoffset = val - tcp_time_stamp; + tp->tsoffset = val - tcp_time_stamp_raw(); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); @@ -2757,7 +2767,7 @@ static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { stats[i] = tp->chrono_stat[i - 1]; if (i == tp->chrono_type) - stats[i] += tcp_time_stamp - tp->chrono_start; + stats[i] += tcp_jiffies32 - tp->chrono_start; stats[i] *= USEC_PER_SEC / HZ; total += stats[i]; } @@ -2841,7 +2851,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_retrans = tp->retrans_out; info->tcpi_fackets = tp->fackets_out; - now = tcp_time_stamp; + now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); @@ -3072,7 +3082,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_TIMESTAMP: - val = tcp_time_stamp + tp->tsoffset; + val = tcp_time_stamp_raw() + tp->tsoffset; break; case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 92b045c72163..dbcc9352a48f 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -91,7 +91,7 @@ struct bbr { struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ u32 rtt_cnt; /* count of packet-timed rounds elapsed */ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ - struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ + u64 cycle_mstamp; /* time of this cycle phase start */ u32 mode:3, /* current bbr_mode in state machine */ prev_ca_state:3, /* CA state on previous ACK */ packet_conservation:1, /* use packet conservation? */ @@ -411,7 +411,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk, struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); bool is_full_length = - skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > + tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > bbr->min_rtt_us; u32 inflight, bw; @@ -497,7 +497,7 @@ static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; + bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); bbr->lt_last_delivered = tp->delivered; bbr->lt_last_lost = tp->lost; bbr->lt_rtt_cnt = 0; @@ -551,7 +551,7 @@ static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) struct bbr *bbr = inet_csk_ca(sk); u32 lost, delivered; u64 bw; - s32 t; + u32 t; if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ if (bbr->mode == BBR_PROBE_BW && bbr->round_start && @@ -603,15 +603,15 @@ static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) return; /* Find average delivery rate in this sampling interval. */ - t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); - if (t < 1) - return; /* interval is less than one jiffy, so wait */ - t = jiffies_to_usecs(t); - /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */ - if (t < 1) { + t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; + if ((s32)t < 1) + return; /* interval is less than one ms, so wait */ + /* Check if can multiply without overflow */ + if (t >= ~0U / USEC_PER_MSEC) { bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ return; } + t *= USEC_PER_MSEC; bw = (u64)delivered * BW_UNIT; do_div(bw, t); bbr_lt_bw_interval_done(sk, bw); @@ -730,12 +730,12 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) bool filter_expired; /* Track min RTT seen in the min_rtt_win_sec filter window: */ - filter_expired = after(tcp_time_stamp, + filter_expired = after(tcp_jiffies32, bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); if (rs->rtt_us >= 0 && (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { bbr->min_rtt_us = rs->rtt_us; - bbr->min_rtt_stamp = tcp_time_stamp; + bbr->min_rtt_stamp = tcp_jiffies32; } if (bbr_probe_rtt_mode_ms > 0 && filter_expired && @@ -754,7 +754,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) /* Maintain min packets in flight for max(200 ms, 1 round). */ if (!bbr->probe_rtt_done_stamp && tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { - bbr->probe_rtt_done_stamp = tcp_time_stamp + + bbr->probe_rtt_done_stamp = tcp_jiffies32 + msecs_to_jiffies(bbr_probe_rtt_mode_ms); bbr->probe_rtt_round_done = 0; bbr->next_rtt_delivered = tp->delivered; @@ -762,8 +762,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) if (bbr->round_start) bbr->probe_rtt_round_done = 1; if (bbr->probe_rtt_round_done && - after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { - bbr->min_rtt_stamp = tcp_time_stamp; + after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) { + bbr->min_rtt_stamp = tcp_jiffies32; bbr->restore_cwnd = 1; /* snap to prior_cwnd */ bbr_reset_mode(sk); } @@ -810,7 +810,7 @@ static void bbr_init(struct sock *sk) bbr->probe_rtt_done_stamp = 0; bbr->probe_rtt_round_done = 0; bbr->min_rtt_us = tcp_min_rtt(tp); - bbr->min_rtt_stamp = tcp_time_stamp; + bbr->min_rtt_stamp = tcp_jiffies32; minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ @@ -825,7 +825,7 @@ static void bbr_init(struct sock *sk) bbr->idle_restart = 0; bbr->full_bw = 0; bbr->full_bw_cnt = 0; - bbr->cycle_mstamp.v64 = 0; + bbr->cycle_mstamp = 0; bbr->cycle_idx = 0; bbr_reset_lt_bw_sampling(sk); bbr_reset_startup_mode(sk); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 36087bca9f48..609965f0e298 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -84,14 +84,14 @@ static void bictcp_init(struct sock *sk) static inline void bictcp_update(struct bictcp *ca, u32 cwnd) { if (ca->last_cwnd == cwnd && - (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) return; ca->last_cwnd = cwnd; - ca->last_time = tcp_time_stamp; + ca->last_time = tcp_jiffies32; if (ca->epoch_start == 0) /* record the beginning of an epoch */ - ca->epoch_start = tcp_time_stamp; + ca->epoch_start = tcp_jiffies32; /* start off normal */ if (cwnd <= low_window) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 6e3c512054a6..324c9bcc5456 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -180,6 +180,7 @@ void tcp_init_congestion_control(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 0683ba447d77..57ae5b5ae643 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -155,7 +155,7 @@ static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { struct bictcp *ca = inet_csk_ca(sk); - u32 now = tcp_time_stamp; + u32 now = tcp_jiffies32; s32 delta; delta = now - tcp_sk(sk)->lsndtime; @@ -231,21 +231,21 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) ca->ack_cnt += acked; /* count the number of ACKed packets */ if (ca->last_cwnd == cwnd && - (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) return; /* The CUBIC function can update ca->cnt at most once per jiffy. * On all cwnd reduction events, ca->epoch_start is set to 0, * which will force a recalculation of ca->cnt. */ - if (ca->epoch_start && tcp_time_stamp == ca->last_time) + if (ca->epoch_start && tcp_jiffies32 == ca->last_time) goto tcp_friendliness; ca->last_cwnd = cwnd; - ca->last_time = tcp_time_stamp; + ca->last_time = tcp_jiffies32; if (ca->epoch_start == 0) { - ca->epoch_start = tcp_time_stamp; /* record beginning */ + ca->epoch_start = tcp_jiffies32; /* record beginning */ ca->ack_cnt = acked; /* start counting */ ca->tcp_cwnd = cwnd; /* syn with cubic */ @@ -276,7 +276,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) * if the cwnd < 1 million packets !!! */ - t = (s32)(tcp_time_stamp - ca->epoch_start); + t = (s32)(tcp_jiffies32 - ca->epoch_start); t += msecs_to_jiffies(ca->delay_min >> 3); /* change the unit from HZ to bictcp_HZ */ t <<= BICTCP_HZ; @@ -448,7 +448,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) return; /* Discard delay samples right after fast recovery */ - if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ) + if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) return; delay = (sample->rtt_us << 3) / USEC_PER_MSEC; diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 4a4d8e76738f..3eb78cde6ff0 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -104,7 +104,7 @@ static void measure_achieved_throughput(struct sock *sk, const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - u32 now = tcp_time_stamp; + u32 now = tcp_jiffies32; if (icsk->icsk_ca_state == TCP_CA_Open) ca->pkts_acked = sample->pkts_acked; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 06e2dbc2b4a2..4ea8ec5c7bb4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -112,6 +112,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ +#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -441,7 +442,7 @@ void tcp_init_buffer_space(struct sock *sk) tcp_sndbuf_expand(sk); tp->rcvq_space.space = tp->rcv_wnd; - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; @@ -463,7 +464,7 @@ void tcp_init_buffer_space(struct sock *sk) tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } /* 5. Recalculate window clamp after socket hit its memory bounds. */ @@ -555,11 +556,11 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) { u32 delta_us; - if (tp->rcv_rtt_est.time.v64 == 0) + if (tp->rcv_rtt_est.time == 0) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time); + delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: @@ -571,13 +572,15 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + if (tp->rx_opt.rcv_tsecr && (TCP_SKB_CB(skb)->end_seq - - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) - tcp_rcv_rtt_update(tp, - jiffies_to_usecs(tcp_time_stamp - - tp->rx_opt.rcv_tsecr), - 0); + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { + u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; + u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + + tcp_rcv_rtt_update(tp, delta_us, 0); + } } /* @@ -590,7 +593,7 @@ void tcp_rcv_space_adjust(struct sock *sk) int time; int copied; - time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time); + time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; @@ -672,7 +675,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_rcv_rtt_measure(tp); - now = tcp_time_stamp; + now = tcp_jiffies32; if (!icsk->icsk_ack.ato) { /* The _first_ data packet received, initialize @@ -885,6 +888,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, struct tcp_sock *tp = tcp_sk(sk); int mib_idx; + if (WARN_ON_ONCE(metric < 0)) + return; + if (metric > tp->reordering) { tp->reordering = min(sysctl_tcp_max_reordering, metric); @@ -1134,8 +1140,8 @@ struct tcp_sacktag_state { * that was SACKed. RTO needs the earliest RTT to stay conservative, * but congestion control should still get an accurate delay signal. */ - struct skb_mstamp first_sackt; - struct skb_mstamp last_sackt; + u64 first_sackt; + u64 last_sackt; struct rate_sample *rate; int flag; }; @@ -1200,7 +1206,7 @@ static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, int dup_sack, int pcount, - const struct skb_mstamp *xmit_time) + u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); int fack_count = state->fack_count; @@ -1242,9 +1248,9 @@ static u8 tcp_sacktag_one(struct sock *sk, state->reord); if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; - if (state->first_sackt.v64 == 0) - state->first_sackt = *xmit_time; - state->last_sackt = *xmit_time; + if (state->first_sackt == 0) + state->first_sackt = xmit_time; + state->last_sackt = xmit_time; } if (sacked & TCPCB_LOST) { @@ -1304,7 +1310,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, - &skb->skb_mstamp); + skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); if (skb == tp->lost_skb_hint) @@ -1356,8 +1362,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_advance_highest_sack(sk, skb); tcp_skb_collapse_tstamp(prev, skb); - if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64)) - TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0; + if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) + TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); @@ -1587,7 +1593,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), - &skb->skb_mstamp); + skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); if (!before(TCP_SKB_CB(skb)->seq, @@ -1954,7 +1960,7 @@ void tcp_enter_loss(struct sock *sk) } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; tp->retrans_out = 0; tp->lost_out = 0; @@ -2383,7 +2389,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) tcp_ecn_withdraw_cwr(tp); } } - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; tp->undo_marker = 0; } @@ -2520,7 +2526,7 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { tp->snd_cwnd = tp->snd_ssthresh; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } @@ -2590,7 +2596,7 @@ static void tcp_mtup_probe_success(struct sock *sk) tcp_mss_to_mtu(sk, tp->mss_cache) / icsk->icsk_mtup.probe_size; tp->snd_cwnd_cnt = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_ssthresh = tcp_current_ssthresh(sk); icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; @@ -2911,13 +2917,13 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) struct tcp_sock *tp = tcp_sk(sk); u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; - minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp, + minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? : jiffies_to_usecs(1)); } -static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, - long seq_rtt_us, long sack_rtt_us, - long ca_rtt_us) +static bool tcp_ack_update_rtt(struct sock *sk, const int flag, + long seq_rtt_us, long sack_rtt_us, + long ca_rtt_us, struct rate_sample *rs) { const struct tcp_sock *tp = tcp_sk(sk); @@ -2936,9 +2942,13 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * See draft-ietf-tcplw-high-performance-00, section 3.3. */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && - flag & FLAG_ACKED) - seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp - - tp->rx_opt.rcv_tsecr); + flag & FLAG_ACKED) { + u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; + u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + + seq_rtt_us = ca_rtt_us = delta_us; + } + rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) return false; @@ -2958,16 +2968,13 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { + struct rate_sample rs; long rtt_us = -1L; - if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) { - struct skb_mstamp now; - - skb_mstamp_get(&now); - rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); - } + if (req && !req->num_retrans && tcp_rsk(req)->snt_synack) + rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack); - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us); + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs); } @@ -2976,7 +2983,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) const struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); - tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; + tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32; } /* Restart timer after forward progress on connection. @@ -3001,14 +3008,14 @@ void tcp_rearm_rto(struct sock *sk) if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); - const u32 rto_time_stamp = - tcp_skb_timestamp(skb) + rto; - s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); - /* delta may not be positive if the socket is locked + u64 rto_time_stamp = skb->skb_mstamp + + jiffies_to_usecs(rto); + s64 delta_us = rto_time_stamp - tp->tcp_mstamp; + /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ - if (delta > 0) - rto = delta; + if (delta_us > 0) + rto = usecs_to_jiffies(delta_us); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX); @@ -3060,9 +3067,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); - struct skb_mstamp first_ackt, last_ackt; + u64 first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); - struct skb_mstamp *now = &tp->tcp_mstamp; u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; bool fully_acked = true; @@ -3075,7 +3081,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, bool rtt_update; int flag = 0; - first_ackt.v64 = 0; + first_ackt = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); @@ -3106,8 +3112,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_RETRANS_DATA_ACKED; } else if (!(sacked & TCPCB_SACKED_ACKED)) { last_ackt = skb->skb_mstamp; - WARN_ON_ONCE(last_ackt.v64 == 0); - if (!first_ackt.v64) + WARN_ON_ONCE(last_ackt == 0); + if (!first_ackt) first_ackt = last_ackt; last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; @@ -3122,7 +3128,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, - &skb->skb_mstamp); + skb->skb_mstamp); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3165,17 +3171,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { - seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt); - ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt); + if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { + seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); + ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); } - if (sack->first_sackt.v64) { - sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt); - ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt); + if (sack->first_sackt) { + sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); + ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt); } - sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, - ca_rtt_us); + ca_rtt_us, sack->rate); if (flag & FLAG_ACKED) { tcp_rearm_rto(sk); @@ -3190,7 +3195,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? That's reordering */ - if (reord < prior_fackets) + if (reord < prior_fackets && reord <= tp->fackets_out) tcp_update_reordering(sk, tp->fackets_out - reord, 0); delta = tcp_is_fack(tp) ? pkts_acked : @@ -3201,7 +3206,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); } else if (skb && rtt_update && sack_rtt_us >= 0 && - sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) { + sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3211,7 +3216,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = ca_rtt_us, + .rtt_us = sack->rate->rtt_us, .in_flight = last_in_flight }; icsk->icsk_ca_ops->pkts_acked(sk, &sample); @@ -3390,7 +3395,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, u32 *last_oow_ack_time) { if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { NET_INC_STATS(net, mib_idx); @@ -3398,7 +3403,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, } } - *last_oow_ack_time = tcp_time_stamp; + *last_oow_ack_time = tcp_jiffies32; return false; /* not rate-limited: go ahead, send dupack now! */ } @@ -3553,7 +3558,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ - sack_state.first_sackt.v64 = 0; + sack_state.first_sackt = 0; sack_state.rate = &rs; /* We very likely will need to access write queue head. */ @@ -3565,7 +3570,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) { /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - tp->max_window)) { - tcp_send_challenge_ack(sk, skb); + if (!(flag & FLAG_NO_CHALLENGE_ACK)) + tcp_send_challenge_ack(sk, skb); return -1; } goto old_ack; @@ -3636,7 +3642,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) */ sk->sk_err_soft = 0; icsk->icsk_probes_out = 0; - tp->rcv_tstamp = tcp_time_stamp; + tp->rcv_tstamp = tcp_jiffies32; if (!prior_packets) goto no_queue; @@ -5019,7 +5025,7 @@ static void tcp_new_space(struct sock *sk) if (tcp_should_expand_sndbuf(sk)) { tcp_sndbuf_expand(sk); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } sk->sk_write_space(sk); @@ -5356,7 +5362,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* @@ -5554,7 +5560,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) struct inet_connection_sock *icsk = inet_csk(sk); tcp_set_state(sk, TCP_ESTABLISHED); - icsk->icsk_ack.lrcvtime = tcp_time_stamp; + icsk->icsk_ack.lrcvtime = tcp_jiffies32; if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); @@ -5571,7 +5577,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ - tp->lsndtime = tcp_time_stamp; + tp->lsndtime = tcp_jiffies32; tcp_init_buffer_space(sk); @@ -5672,7 +5678,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, - tcp_time_stamp)) { + tcp_time_stamp(tp))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; @@ -5917,7 +5923,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_SYN_SENT: tp->rx_opt.saw_tstamp = 0; - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; @@ -5929,7 +5935,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; } - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); tp->rx_opt.saw_tstamp = 0; req = tp->fastopen_rsk; if (req) { @@ -5948,13 +5954,17 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) /* step 5: check the ACK field */ acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | - FLAG_UPDATE_TS_RECENT) > 0; + FLAG_UPDATE_TS_RECENT | + FLAG_NO_CHALLENGE_ACK) > 0; + if (!acceptable) { + if (sk->sk_state == TCP_SYN_RECV) + return 1; /* send one RST */ + tcp_send_challenge_ack(sk, skb); + goto discard; + } switch (sk->sk_state) { case TCP_SYN_RECV: - if (!acceptable) - return 1; - if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); @@ -6008,7 +6018,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_update_pacing_rate(sk); /* Prevent spurious tcp_cwnd_restart() on first data packet */ - tp->lsndtime = tcp_time_stamp; + tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); tcp_fast_path_on(tp); @@ -6023,14 +6033,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * our SYNACK so stop the SYNACK timer. */ if (req) { - /* Return RST if ack_seq is invalid. - * Note that RFC793 only says to generate a - * DUPACK for it but for TCP Fast Open it seems - * better to treat this case like TCP_SYN_RECV - * above. - */ - if (!acceptable) - return 1; /* We no longer need the request sock. */ reqsk_fastopen_remove(sk, req, false); tcp_rearm_rto(sk); @@ -6202,7 +6204,7 @@ static void tcp_openreq_init(struct request_sock *req, req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - skb_mstamp_get(&tcp_rsk(req)->snt_synack); + tcp_rsk(req)->snt_synack = tcp_clock_us(); tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5ab2aac5ca19..191b2f78b19d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -376,8 +376,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) struct sock *sk; struct sk_buff *skb; struct request_sock *fastopen; - __u32 seq, snd_una; - __u32 remaining; + u32 seq, snd_una; + s32 remaining; + u32 delta_us; int err; struct net *net = dev_net(icmp_skb->dev); @@ -483,11 +484,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) skb = tcp_write_queue_head(sk); BUG_ON(!skb); + tcp_mstamp_refresh(tp); + delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp); remaining = icsk->icsk_rto - - min(icsk->icsk_rto, - tcp_time_stamp - tcp_skb_timestamp(skb)); + usecs_to_jiffies(delta_us); - if (remaining) { + if (remaining > 0) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, TCP_RTO_MAX); } else { @@ -811,7 +813,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp + tcptw->tw_ts_offset, + tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), @@ -839,7 +841,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp + tcp_rsk(req)->ts_off, + tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index d6fb6c067af4..ae10ed64fe13 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -37,7 +37,7 @@ #include <net/tcp.h> /* resolution of owd */ -#define LP_RESOL 1000 +#define LP_RESOL TCP_TS_HZ /** * enum tcp_lp_state @@ -147,9 +147,9 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk) tp->rx_opt.rcv_tsecr == lp->local_ref_time) goto out; - m = HZ * (tp->rx_opt.rcv_tsval - - lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - - lp->local_ref_time); + m = TCP_TS_HZ * + (tp->rx_opt.rcv_tsval - lp->remote_ref_time) / + (tp->rx_opt.rcv_tsecr - lp->local_ref_time); if (m < 0) m = -m; @@ -194,7 +194,7 @@ static u32 tcp_lp_owd_calculator(struct sock *sk) if (lp->flag & LP_VALID_RHZ) { owd = tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - - tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); + tp->rx_opt.rcv_tsecr * (LP_RESOL / TCP_TS_HZ); if (owd < 0) owd = -owd; } @@ -264,18 +264,19 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); + u32 now = tcp_time_stamp(tp); u32 delta; if (sample->rtt_us > 0) tcp_lp_rtt_sample(sk, sample->rtt_us); /* calc inference */ - delta = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + delta = now - tp->rx_opt.rcv_tsecr; if ((s32)delta > 0) lp->inference = 3 * delta; /* test if within inference */ - if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) + if (lp->last_drop && (now - lp->last_drop < lp->inference)) lp->flag |= LP_WITHIN_INF; else lp->flag &= ~LP_WITHIN_INF; @@ -312,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); /* record this drop time */ - lp->last_drop = tcp_time_stamp; + lp->last_drop = now; } static struct tcp_congestion_ops tcp_lp __read_mostly = { diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 653bbd67e3a3..102b2c90bb80 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -524,7 +524,7 @@ reset: tp->snd_cwnd = 1; else tp->snd_cwnd = tcp_init_cwnd(tp, dst); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 717be4de5324..d0642df73044 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -445,9 +445,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->srtt_us = 0; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); + minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); newicsk->icsk_rto = TCP_TIMEOUT_INIT; - newicsk->icsk_ack.lrcvtime = tcp_time_stamp; + newicsk->icsk_ack.lrcvtime = tcp_jiffies32; newtp->packets_out = 0; newtp->retrans_out = 0; @@ -455,7 +455,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; newtp->tlp_high_seq = 0; - newtp->lsndtime = treq->snt_synack.stamp_jiffies; + newtp->lsndtime = tcp_jiffies32; newsk->sk_txhash = treq->txhash; newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; @@ -526,7 +526,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; - newtp->rack.mstamp.v64 = 0; + newtp->rack.mstamp = 0; newtp->rack.advanced = 0; __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 5de82a8d4d87..6d650ed3cb59 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -424,8 +424,8 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) } /* Extract info for Tcp socket info provided via netlink */ -size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, - union tcp_cc_info *info) +static size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct tcpnv *ca = inet_csk_ca(sk); @@ -440,7 +440,6 @@ size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, } return 0; } -EXPORT_SYMBOL_GPL(tcpnv_get_info); static struct tcp_congestion_ops tcpnv __read_mostly = { .init = tcpnv_init, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a32172d69a03..e3aab1c1cf78 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -151,7 +151,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta) while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tp->snd_cwnd = max(cwnd, restart_cwnd); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_cwnd_used = 0; } @@ -160,7 +160,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - const u32 now = tcp_time_stamp; + const u32 now = tcp_jiffies32; if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); @@ -997,8 +997,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); + skb->skb_mstamp = tp->tcp_mstamp; if (clone_it) { - skb_mstamp_get(&skb->skb_mstamp); TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; tcp_rate_skb_sent(sk, skb); @@ -1328,9 +1328,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, return 0; } -/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c - * eventually). The difference is that pulled data not copied, but - * immediately discarded. +/* This is similar to __pskb_pull_tail(). The difference is that pulled + * data is not copied, but immediately discarded. */ static int __pskb_trim_head(struct sk_buff *skb, int len) { @@ -1365,7 +1364,6 @@ static int __pskb_trim_head(struct sk_buff *skb, int len) } shinfo->nr_frags = k; - skb_reset_tail_pointer(skb); skb->data_len -= len; skb->len = skb->data_len; return len; @@ -1475,7 +1473,7 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) - icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } EXPORT_SYMBOL(tcp_mtup_init); @@ -1576,7 +1574,7 @@ static void tcp_cwnd_application_limited(struct sock *sk) } tp->snd_cwnd_used = 0; } - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) @@ -1597,14 +1595,14 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } else { /* Network starves. */ if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; if (sysctl_tcp_slow_start_after_idle && - (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && + (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); @@ -1906,7 +1904,6 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, const struct inet_connection_sock *icsk = inet_csk(sk); u32 age, send_win, cong_win, limit, in_flight; struct tcp_sock *tp = tcp_sk(sk); - struct skb_mstamp now; struct sk_buff *head; int win_divisor; @@ -1919,7 +1916,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, /* Avoid bursty behavior by allowing defer * only if the last write was recent. */ - if ((s32)(tcp_time_stamp - tp->lsndtime) > 0) + if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); @@ -1962,8 +1959,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, } head = tcp_write_queue_head(sk); - skb_mstamp_get(&now); - age = skb_mstamp_us_delta(&now, &head->skb_mstamp); + + age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) goto send_now; @@ -1988,7 +1985,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) s32 delta; interval = net->ipv4.sysctl_tcp_probe_interval; - delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp; + delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; if (unlikely(delta >= interval * HZ)) { int mss = tcp_current_mss(sk); @@ -2000,7 +1997,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); /* Update probe time stamp */ - icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } } @@ -2203,7 +2200,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { - const u32 now = tcp_time_stamp; + const u32 now = tcp_jiffies32; if (tp->chrono_type > TCP_CHRONO_UNSPEC) tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; @@ -2280,6 +2277,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } max_segs = tcp_tso_segs(sk, mss_now); + tcp_mstamp_refresh(tp); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -2291,7 +2289,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - skb_mstamp_get(&skb->skb_mstamp); + skb->skb_mstamp = tp->tcp_mstamp; goto repair; /* Skip network transmission */ } @@ -2418,10 +2416,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) timeout = max_t(u32, timeout, msecs_to_jiffies(10)); /* If RTO is shorter, just schedule TLP in its place. */ - tlp_time_stamp = tcp_time_stamp + timeout; + tlp_time_stamp = tcp_jiffies32 + timeout; rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { - s32 delta = rto_time_stamp - tcp_time_stamp; + s32 delta = rto_time_stamp - tcp_jiffies32; if (delta > 0) timeout = delta; } @@ -2879,7 +2877,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - skb_mstamp_get(&skb->skb_mstamp); + skb->skb_mstamp = tp->tcp_mstamp; nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS; @@ -3095,7 +3093,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); - skb_mstamp_get(&skb->skb_mstamp); + tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); @@ -3191,10 +3189,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req); + skb->skb_mstamp = cookie_init_timestamp(req); else #endif - skb_mstamp_get(&skb->skb_mstamp); + skb->skb_mstamp = tcp_clock_us(); #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); @@ -3324,7 +3322,7 @@ static void tcp_connect_init(struct sock *sk) if (likely(!tp->repair)) tp->rcv_nxt = 0; else - tp->rcv_tstamp = tcp_time_stamp; + tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; tp->copied_seq = tp->rcv_nxt; @@ -3453,7 +3451,8 @@ int tcp_connect(struct sock *sk) return -ENOBUFS; tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); - tp->retrans_stamp = tcp_time_stamp; + tcp_mstamp_refresh(tp); + tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); @@ -3572,7 +3571,6 @@ void tcp_send_ack(struct sock *sk) skb_set_tcp_pure_ack(buff); /* Send it off, this clears delayed acks for us. */ - skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0); } EXPORT_SYMBOL_GPL(tcp_send_ack); @@ -3606,15 +3604,16 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) * send it. */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); - skb_mstamp_get(&skb->skb_mstamp); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } +/* Called from setsockopt( ... TCP_REPAIR ) */ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; + tcp_mstamp_refresh(tcp_sk(sk)); tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index c6a9fa894646..ad99569d4c1e 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -78,7 +78,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - if (!scb->tx.delivered_mstamp.v64) + if (!scb->tx.delivered_mstamp) return; if (!rs->prior_delivered || @@ -89,9 +89,9 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, rs->is_retrans = scb->sacked & TCPCB_RETRANS; /* Find the duration of the "send phase" of this window: */ - rs->interval_us = skb_mstamp_us_delta( - &skb->skb_mstamp, - &scb->tx.first_tx_mstamp); + rs->interval_us = tcp_stamp_us_delta( + skb->skb_mstamp, + scb->tx.first_tx_mstamp); /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = skb->skb_mstamp; @@ -101,7 +101,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, * we don't need to reset since it'll be freed soon. */ if (scb->sacked & TCPCB_SACKED_ACKED) - scb->tx.delivered_mstamp.v64 = 0; + scb->tx.delivered_mstamp = 0; } /* Update the connection delivery information and generate a rate sample. */ @@ -125,7 +125,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ /* Return an invalid sample if no timing information is available. */ - if (!rs->prior_mstamp.v64) { + if (!rs->prior_mstamp) { rs->delivered = -1; rs->interval_us = -1; return; @@ -138,8 +138,8 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * longer phase. */ snd_us = rs->interval_us; /* send phase */ - ack_us = skb_mstamp_us_delta(&tp->tcp_mstamp, - &rs->prior_mstamp); /* ack phase */ + ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); /* Normally we expect interval_us >= min-rtt. diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 362b8c75bfab..fe9a493d0208 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -17,12 +17,9 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) } } -static bool tcp_rack_sent_after(const struct skb_mstamp *t1, - const struct skb_mstamp *t2, - u32 seq1, u32 seq2) +static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) { - return skb_mstamp_after(t1, t2) || - (t1->v64 == t2->v64 && after(seq1, seq2)); + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): @@ -72,14 +69,14 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) scb->sacked & TCPCB_SACKED_ACKED) continue; - if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp, + if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, tp->rack.end_seq, scb->end_seq)) { /* Step 3 in draft-cheng-tcpm-rack-00.txt: * A packet is lost if its elapsed time is beyond * the recent RTT plus the reordering window. */ - u32 elapsed = skb_mstamp_us_delta(&tp->tcp_mstamp, - &skb->skb_mstamp); + u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp, + skb->skb_mstamp); s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; if (remaining < 0) { @@ -127,16 +124,16 @@ void tcp_rack_mark_lost(struct sock *sk) * draft-cheng-tcpm-rack-00.txt */ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - const struct skb_mstamp *xmit_time) + u64 xmit_time) { u32 rtt_us; - if (tp->rack.mstamp.v64 && - !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp, + if (tp->rack.mstamp && + !tcp_rack_sent_after(xmit_time, tp->rack.mstamp, end_seq, tp->rack.end_seq)) return; - rtt_us = skb_mstamp_us_delta(&tp->tcp_mstamp, xmit_time); + rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); if (sacked & TCPCB_RETRANS) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior @@ -152,7 +149,7 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, return; } tp->rack.rtt_us = rtt_us; - tp->rack.mstamp = *xmit_time; + tp->rack.mstamp = xmit_time; tp->rack.end_seq = end_seq; tp->rack.advanced = 1; } @@ -166,7 +163,6 @@ void tcp_rack_reo_timeout(struct sock *sk) u32 timeout, prior_inflight; prior_inflight = tcp_packets_in_flight(tp); - skb_mstamp_get(&tp->tcp_mstamp); tcp_rack_detect_loss(sk, &timeout); if (prior_inflight != tcp_packets_in_flight(tp)) { if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 86934bcf685a..c0feeeef962a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -63,7 +63,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ - if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) shift++; /* If some dubious ICMP arrived, penalize even more. */ @@ -73,7 +73,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ - if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN || /* 2. Window is closed. */ (!tp->snd_wnd && !tp->packets_out)) do_reset = true; @@ -115,7 +115,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) if (net->ipv4.sysctl_tcp_mtu_probing) { if (!icsk->icsk_mtup.enabled) { icsk->icsk_mtup.enabled = 1; - icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct net *net = sock_net(sk); @@ -139,22 +139,18 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) * @timeout: A custom timeout value. * If set to 0 the default timeout is calculated and used. * Using TCP_RTO_MIN and the number of unsuccessful retransmits. - * @syn_set: true if the SYN Bit was set. * * The default "timeout" value this function can calculate and use * is equivalent to the timeout of a TCP Connection * after "boundary" unsuccessful, exponentially backed-off - * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if - * syn_set flag is set. - * + * retransmissions with an initial RTO of TCP_RTO_MIN. */ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, - unsigned int timeout, - bool syn_set) + unsigned int timeout) { + const unsigned int rto_base = TCP_RTO_MIN; unsigned int linear_backoff_thresh, start_ts; - unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; if (!inet_csk(sk)->icsk_retransmits) return false; @@ -172,7 +168,7 @@ static bool retransmits_timed_out(struct sock *sk, timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; } - return (tcp_time_stamp - start_ts) >= timeout; + return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout); } /* A write timeout has occurred. Process the after effects. */ @@ -181,8 +177,8 @@ static int tcp_write_timeout(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); + bool expired, do_reset; int retry_until; - bool do_reset, syn_set = false; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) { @@ -196,9 +192,9 @@ static int tcp_write_timeout(struct sock *sk) sk_rethink_txhash(sk); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; - syn_set = true; + expired = icsk->icsk_retransmits >= retry_until; } else { - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) { + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { /* Some middle-boxes may black-hole Fast Open _after_ * the handshake. Therefore we conservatively disable * Fast Open on this path on recurring timeouts after @@ -224,15 +220,15 @@ static int tcp_write_timeout(struct sock *sk) retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || - !retransmits_timed_out(sk, retry_until, 0, 0); + !retransmits_timed_out(sk, retry_until, 0); if (tcp_out_of_resources(sk, do_reset)) return 1; } + expired = retransmits_timed_out(sk, retry_until, + icsk->icsk_user_timeout); } - - if (retransmits_timed_out(sk, retry_until, - syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { + if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -339,9 +335,10 @@ static void tcp_probe_timer(struct sock *sk) */ start_ts = tcp_skb_timestamp(tcp_send_head(sk)); if (!start_ts) - skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp); + tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) + (s32)(tcp_time_stamp(tp) - start_ts) > + jiffies_to_msecs(icsk->icsk_user_timeout)) goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; @@ -451,7 +448,7 @@ void tcp_retransmit_timer(struct sock *sk) tp->snd_una, tp->snd_nxt); } #endif - if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { + if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) { tcp_write_err(sk); goto out; } @@ -539,7 +536,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0)) + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) __sk_dst_reset(sk); out:; @@ -561,6 +558,7 @@ void tcp_write_timer_handler(struct sock *sk) goto out; } + tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; switch (event) { diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 9775453b8d17..bec9cafbe3f9 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -68,7 +68,7 @@ static void tcp_westwood_init(struct sock *sk) w->cumul_ack = 0; w->reset_rtt_min = 1; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; - w->rtt_win_sx = tcp_time_stamp; + w->rtt_win_sx = tcp_jiffies32; w->snd_una = tcp_sk(sk)->snd_una; w->first_ack = 1; } @@ -116,7 +116,7 @@ static void tcp_westwood_pkts_acked(struct sock *sk, static void westwood_update_window(struct sock *sk) { struct westwood *w = inet_csk_ca(sk); - s32 delta = tcp_time_stamp - w->rtt_win_sx; + s32 delta = tcp_jiffies32 - w->rtt_win_sx; /* Initialize w->snd_una with the first acked sequence number in order * to fix mismatch between tp->snd_una and w->snd_una for the first @@ -140,7 +140,7 @@ static void westwood_update_window(struct sock *sk) westwood_filter(w, delta); w->bk = 0; - w->rtt_win_sx = tcp_time_stamp; + w->rtt_win_sx = tcp_jiffies32; } } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7bd56c9889b3..fdcb7437cc15 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1218,7 +1218,7 @@ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL(udp_skb_destructor); /* as above, but the caller held the rx queue lock, too */ -void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) +static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) { udp_rmem_release(sk, skb->dev_scratch, 1, true); } @@ -1465,16 +1465,13 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, error = -EAGAIN; *peeked = 0; do { - int _off = *off; - spin_lock_bh(&queue->lock); skb = __skb_try_recv_from_queue(sk, queue, flags, udp_skb_destructor, - peeked, &_off, err, + peeked, off, err, &last); if (skb) { spin_unlock_bh(&queue->lock); - *off = _off; return skb; } @@ -1488,20 +1485,17 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, * the sk_receive_queue lock if fwd memory scheduling * is needed. */ - _off = *off; spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); skb = __skb_try_recv_from_queue(sk, queue, flags, udp_skb_dtor_locked, - peeked, &_off, err, + peeked, off, err, &last); spin_unlock(&sk_queue->lock); spin_unlock_bh(&queue->lock); - if (skb) { - *off = _off; + if (skb) return skb; - } busy_check: if (!sk_can_busy_loop(sk)) @@ -1733,7 +1727,7 @@ static void udp_v4_rehash(struct sock *sk) udp_lib_rehash(sk, new_hash); } -int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -1778,7 +1772,7 @@ EXPORT_SYMBOL(udp_encap_enable); * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ -int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index feb50a16398d..a8cf8c6fb60c 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -25,7 +25,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); -int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS |