Diffstat (limited to 'net/ipv4'): 64 files changed, 1160 insertions, 1162 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ef1528af7abf..1b5096a9875a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -466,8 +466,13 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; if (addr->sin_family != AF_INET) { + /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) + * only if s_addr is INADDR_ANY. + */ err = -EAFNOSUPPORT; - goto out; + if (addr->sin_family != AF_UNSPEC || + addr->sin_addr.s_addr != htonl(INADDR_ANY)) + goto out; } chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); @@ -888,7 +893,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) EXPORT_SYMBOL(inet_ioctl); #ifdef CONFIG_COMPAT -int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; int err = -ENOIOCTLCMD; @@ -1440,11 +1445,11 @@ EXPORT_SYMBOL_GPL(inet_ctl_sock_create); unsigned long snmp_fold_field(void __percpu *mib[], int offt) { unsigned long res = 0; - int i; + int i, j; for_each_possible_cpu(i) { - res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); - res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); + for (j = 0; j < SNMP_ARRAY_SZ; j++) + res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt); } return res; } @@ -1458,28 +1463,19 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) int cpu; for_each_possible_cpu(cpu) { - void *bhptr, *userptr; + void *bhptr; struct u64_stats_sync *syncp; - u64 v_bh, v_user; + u64 v; unsigned int start; - /* first mib used by softirq context, we must use _bh() accessors */ - bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu); + bhptr = per_cpu_ptr(mib[0], cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { start = u64_stats_fetch_begin_bh(syncp); - v_bh = *(((u64 *) bhptr) + offt); + v = *(((u64 *) bhptr) + offt); } while (u64_stats_fetch_retry_bh(syncp, start)); - /* second mib used in USER context */ - userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu); - syncp = (struct u64_stats_sync *)(userptr + syncp_offset); - do { - start = u64_stats_fetch_begin(syncp); - v_user = *(((u64 *) userptr) + offt); - } while (u64_stats_fetch_retry(syncp, start)); - - res += v_bh + v_user; + res += v; } return res; } @@ -1491,25 +1487,28 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) BUG_ON(ptr == NULL); ptr[0] = __alloc_percpu(mibsize, align); if (!ptr[0]) - goto err0; + return -ENOMEM; +#if SNMP_ARRAY_SZ == 2 ptr[1] = __alloc_percpu(mibsize, align); - if (!ptr[1]) - goto err1; + if (!ptr[1]) { + free_percpu(ptr[0]); + ptr[0] = NULL; + return -ENOMEM; + } +#endif return 0; -err1: - free_percpu(ptr[0]); - ptr[0] = NULL; -err0: - return -ENOMEM; } EXPORT_SYMBOL_GPL(snmp_mib_init); -void snmp_mib_free(void __percpu *ptr[2]) +void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ]) { + int i; + BUG_ON(ptr == NULL); - free_percpu(ptr[0]); - free_percpu(ptr[1]); - ptr[0] = ptr[1] = NULL; + for (i = 0; i < SNMP_ARRAY_SZ; i++) { + free_percpu(ptr[i]); + ptr[i] = NULL; + } } EXPORT_SYMBOL_GPL(snmp_mib_free); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index c1f4154552fc..36d14406261e 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -136,8 +136,6 @@ static void ah_output_done(struct crypto_async_request *base, int err) memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } - err = ah->nexthdr; - kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb, 
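The first af_inet.c hunk relaxes inet_bind(): an AF_UNSPEC family is now mapped onto AF_INET, but only when s_addr is the wildcard. A minimal userspace sketch of the call this change starts accepting (the fd and port are illustrative):

    #include <sys/socket.h>
    #include <netinet/in.h>

    struct sockaddr_in sin = {
            .sin_family      = AF_UNSPEC,           /* tolerated from now on...  */
            .sin_addr.s_addr = htonl(INADDR_ANY),   /* ...only with the wildcard */
            .sin_port        = htons(12345),
    };
    /* bind(fd, (struct sockaddr *)&sin, sizeof(sin)) now succeeds where
     * it used to fail with -EAFNOSUPPORT; any other s_addr still fails. */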
err); } @@ -264,12 +262,12 @@ static void ah_input_done(struct crypto_async_request *base, int err) if (err) goto out; + err = ah->nexthdr; + skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); skb_set_transport_header(skb, -ihl); - - err = ah->nexthdr; out: kfree(AH_SKB_CB(skb)->tmp); xfrm_input_resume(skb, err); @@ -371,8 +369,6 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) if (err == -EINPROGRESS) goto out; - if (err == -EBUSY) - err = NET_XMIT_DROP; goto out_free; } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 1b74d3b64371..96a164aa1367 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -97,7 +97,6 @@ #include <linux/init.h> #include <linux/net.h> #include <linux/rcupdate.h> -#include <linux/jhash.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -139,8 +138,6 @@ static const struct neigh_ops arp_generic_ops = { .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, - .hh_output = dev_queue_xmit, - .queue_xmit = dev_queue_xmit, }; static const struct neigh_ops arp_hh_ops = { @@ -149,16 +146,12 @@ static const struct neigh_ops arp_hh_ops = { .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, - .hh_output = dev_queue_xmit, - .queue_xmit = dev_queue_xmit, }; static const struct neigh_ops arp_direct_ops = { .family = AF_INET, - .output = dev_queue_xmit, - .connected_output = dev_queue_xmit, - .hh_output = dev_queue_xmit, - .queue_xmit = dev_queue_xmit, + .output = neigh_direct_output, + .connected_output = neigh_direct_output, }; static const struct neigh_ops arp_broken_ops = { @@ -167,8 +160,6 @@ static const struct neigh_ops arp_broken_ops = { .error_report = arp_error_report, .output = neigh_compat_output, .connected_output = neigh_compat_output, - .hh_output = dev_queue_xmit, - .queue_xmit = dev_queue_xmit, }; struct neigh_table arp_tbl = { @@ -232,7 +223,7 @@ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 hash_rnd) { - return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd); + return arp_hashfn(*(u32 *)pkey, dev, hash_rnd); } static int arp_constructor(struct neighbour *neigh) @@ -259,7 +250,7 @@ static int arp_constructor(struct neighbour *neigh) if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; - neigh->output = neigh->ops->queue_xmit; + neigh->output = neigh_direct_output; } else { /* Good devices (checked by reading texts, but only Ethernet is tested) @@ -518,30 +509,6 @@ EXPORT_SYMBOL(arp_find); /* END OF OBSOLETE FUNCTIONS */ -int arp_bind_neighbour(struct dst_entry *dst) -{ - struct net_device *dev = dst->dev; - struct neighbour *n = dst->neighbour; - - if (dev == NULL) - return -EINVAL; - if (n == NULL) { - __be32 nexthop = ((struct rtable *)dst)->rt_gateway; - if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) - nexthop = 0; - n = __neigh_lookup_errno( -#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) - dev->type == ARPHRD_ATM ? 
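The arp.c hunks drop the per-ops hh_output/queue_xmit function pointers: every ops table filled them with dev_queue_xmit, so they collapse into one core helper. A sketch of what neigh_direct_output() plausibly reduces to, consistent with the entries it replaces (the real body lives in net/core/neighbour.c):

    /* Devices without header_ops have no link-layer address to
     * resolve, so "output" is just a straight transmit. */
    int neigh_direct_output(struct sk_buff *skb)
    {
            return dev_queue_xmit(skb);
    }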
- clip_tbl_hook : -#endif - &arp_tbl, &nexthop, dev); - if (IS_ERR(n)) - return PTR_ERR(n); - dst->neighbour = n; - } - return 0; -} - /* * Check if we can use proxy ARP for this path */ diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 2b3c23c287cd..86f3b885b4f3 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -50,7 +50,7 @@ #include <net/tcp.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/bug.h> #include <asm/unaligned.h> @@ -476,7 +476,7 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def, doi = doi_def->doi; doi_type = doi_def->type; - if (doi_def == NULL || doi_def->doi == CIPSO_V4_DOI_UNKNOWN) + if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN) goto doi_add_return; for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) { switch (doi_def->tags[iter]) { diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 0d4a184af16f..c6b5092f29a1 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -258,7 +258,7 @@ static struct in_device *inetdev_init(struct net_device *dev) ip_mc_up(in_dev); /* we can receive as soon as ip_ptr is set -- do this last */ - rcu_assign_pointer(dev->ip_ptr, in_dev); + RCU_INIT_POINTER(dev->ip_ptr, in_dev); out: return in_dev; out_kfree: @@ -291,7 +291,7 @@ static void inetdev_destroy(struct in_device *in_dev) inet_free_ifa(ifa); } - rcu_assign_pointer(dev->ip_ptr, NULL); + RCU_INIT_POINTER(dev->ip_ptr, NULL); devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); @@ -1134,15 +1134,15 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) { - struct in_ifaddr *ifa = in_dev->ifa_list; - - if (!ifa) - return; + struct in_ifaddr *ifa; - arp_send(ARPOP_REQUEST, ETH_P_ARP, - ifa->ifa_local, dev, - ifa->ifa_local, NULL, - dev->dev_addr, NULL); + for (ifa = in_dev->ifa_list; ifa; + ifa = ifa->ifa_next) { + arp_send(ARPOP_REQUEST, ETH_P_ARP, + ifa->ifa_local, dev, + ifa->ifa_local, NULL, + dev->dev_addr, NULL); + } } /* Called only under RTNL semaphore */ @@ -1175,7 +1175,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, switch (event) { case NETDEV_REGISTER: printk(KERN_DEBUG "inetdev_event: bug\n"); - rcu_assign_pointer(dev->ip_ptr, NULL); + RCU_INIT_POINTER(dev->ip_ptr, NULL); break; case NETDEV_UP: if (!inetdev_valid_mtu(dev->mtu)) @@ -1833,8 +1833,8 @@ void __init devinet_init(void) rtnl_af_register(&inet_af_ops); - rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); - rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); - rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); + rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); + rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL); + rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 22524716fe70..92fc5f69f5da 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1124,9 +1124,9 @@ static struct pernet_operations fib_net_ops = { void __init ip_fib_init(void) { - rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL); - rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL); - rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib); + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); 
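Hunks throughout this diff (devinet.c here, and fib_trie.c, gre.c, igmp.c, ipip.c, ipmr.c and the NAT helpers below) convert rcu_assign_pointer() to RCU_INIT_POINTER(). The difference, sketched:

    /* rcu_assign_pointer() issues a write barrier so a reader that
     * sees the new pointer also sees fully initialised contents: */
    p->data = 42;
    rcu_assign_pointer(gp, p);

    /* RCU_INIT_POINTER() is a plain store.  It is safe when storing
     * NULL, or when no reader can reach the pointer yet: */
    RCU_INIT_POINTER(dev->ip_ptr, NULL);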
register_pernet_subsys(&fib_net_ops); register_netdevice_notifier(&fib_netdev_notifier); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index a53bb1b5b118..46339ba7a2d3 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -26,6 +26,7 @@ #include <linux/init.h> #include <linux/list.h> #include <linux/rcupdate.h> +#include <linux/export.h> #include <net/ip.h> #include <net/route.h> #include <net/tcp.h> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 33e2c35b74b7..80106d89d548 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -142,6 +142,14 @@ const struct fib_prop fib_props[RTN_MAX + 1] = { }; /* Release a nexthop info record */ +static void free_fib_info_rcu(struct rcu_head *head) +{ + struct fib_info *fi = container_of(head, struct fib_info, rcu); + + if (fi->fib_metrics != (u32 *) dst_default_metrics) + kfree(fi->fib_metrics); + kfree(fi); +} void free_fib_info(struct fib_info *fi) { @@ -156,7 +164,7 @@ void free_fib_info(struct fib_info *fi) } endfor_nexthops(fi); fib_info_cnt--; release_net(fi->fib_net); - kfree_rcu(fi, rcu); + call_rcu(&fi->rcu, free_fib_info_rcu); } void fib_release_info(struct fib_info *fi) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 58c25ea5a5c1..37b671185c81 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -73,6 +73,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/prefetch.h> +#include <linux/export.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> @@ -110,9 +111,10 @@ struct leaf { struct leaf_info { struct hlist_node hlist; - struct rcu_head rcu; int plen; + u32 mask_plen; /* ntohl(inet_make_mask(plen)) */ struct list_head falh; + struct rcu_head rcu; }; struct tnode { @@ -203,7 +205,7 @@ static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node) return (struct tnode *)(parent & ~NODE_TYPE_MASK); } -/* Same as rcu_assign_pointer +/* Same as RCU_INIT_POINTER * but that macro() assumes that value is a pointer. 
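In fib_semantics.c, free_fib_info() trades kfree_rcu() for an explicit call_rcu() callback because the grace-period work is no longer a single kfree(): the separately allocated fib_metrics must be freed too. The general pattern, with a hypothetical struct obj:

    struct obj {
            u32 *metrics;
            struct rcu_head rcu;
    };

    static void obj_free_rcu(struct rcu_head *head)
    {
            struct obj *o = container_of(head, struct obj, rcu);

            kfree(o->metrics);   /* extra work kfree_rcu() cannot express */
            kfree(o);
    }

    /* call_rcu(&o->rcu, obj_free_rcu);  instead of  kfree_rcu(o, rcu); */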
*/ static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) @@ -451,6 +453,7 @@ static struct leaf_info *leaf_info_new(int plen) struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); if (li) { li->plen = plen; + li->mask_plen = ntohl(inet_make_mask(plen)); INIT_LIST_HEAD(&li->falh); } return li; @@ -526,7 +529,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node * if (n) node_set_parent(n, tn); - rcu_assign_pointer(tn->child[i], n); + RCU_INIT_POINTER(tn->child[i], n); } #define MAX_WORK 10 @@ -1012,7 +1015,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) tp = node_parent((struct rt_trie_node *) tn); if (!tp) - rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); + RCU_INIT_POINTER(t->trie, (struct rt_trie_node *)tn); tnode_free_flush(); if (!tp) @@ -1024,7 +1027,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) if (IS_TNODE(tn)) tn = (struct tnode *)resize(t, (struct tnode *)tn); - rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); + RCU_INIT_POINTER(t->trie, (struct rt_trie_node *)tn); tnode_free_flush(); } @@ -1161,7 +1164,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)tn); } else { - rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); + RCU_INIT_POINTER(t->trie, (struct rt_trie_node *)tn); tp = tn; } } @@ -1359,10 +1362,8 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, hlist_for_each_entry_rcu(li, node, hhead, hlist) { struct fib_alias *fa; - int plen = li->plen; - __be32 mask = inet_make_mask(plen); - if (l->key != (key & ntohl(mask))) + if (l->key != (key & li->mask_plen)) continue; list_for_each_entry_rcu(fa, &li->falh, fa_list) { @@ -1394,7 +1395,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.semantic_match_passed++; #endif - res->prefixlen = plen; + res->prefixlen = li->plen; res->nh_sel = nhsel; res->type = fa->fa_type; res->scope = fa->fa_info->fib_scope; @@ -1402,7 +1403,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, res->table = tb; res->fa_head = &li->falh; if (!(fib_flags & FIB_LOOKUP_NOREF)) - atomic_inc(&res->fi->fib_clntref); + atomic_inc(&fi->fib_clntref); return 0; } } @@ -1621,7 +1622,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l) put_child(t, (struct tnode *)tp, cindex, NULL); trie_rebalance(t, tp); } else - rcu_assign_pointer(t->trie, NULL); + RCU_INIT_POINTER(t->trie, NULL); free_leaf(l); } diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index c6933f2ea310..8cb1ebb7cd74 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -15,8 +15,8 @@ #include <linux/kmod.h> #include <linux/skbuff.h> #include <linux/in.h> +#include <linux/ip.h> #include <linux/netdevice.h> -#include <linux/version.h> #include <linux/spinlock.h> #include <net/protocol.h> #include <net/gre.h> @@ -34,7 +34,7 @@ int gre_add_protocol(const struct gre_protocol *proto, u8 version) if (gre_proto[version]) goto err_out_unlock; - rcu_assign_pointer(gre_proto[version], proto); + RCU_INIT_POINTER(gre_proto[version], proto); spin_unlock(&gre_proto_lock); return 0; @@ -54,7 +54,7 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) if (rcu_dereference_protected(gre_proto[version], lockdep_is_held(&gre_proto_lock)) != proto) goto err_out_unlock; - rcu_assign_pointer(gre_proto[version], NULL); + RCU_INIT_POINTER(gre_proto[version], 
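Caching ntohl(inet_make_mask(plen)) in each leaf_info removes a mask construction plus byte swap from the lookup fast path in check_leaf(). For reference, inet_make_mask() (include/linux/inetdevice.h) derives the netmask from the prefix length:

    static inline __be32 inet_make_mask(int logmask)
    {
            if (logmask)
                    return htonl(~((1 << (32 - logmask)) - 1));
            return 0;
    }

    /* plen = 24  ->  mask_plen = 0xffffff00 (host order), so the
     * lookup becomes a single "l->key != (key & li->mask_plen)". */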
NULL); spin_unlock(&gre_proto_lock); synchronize_rcu(); return 0; @@ -97,27 +97,17 @@ drop: static void gre_err(struct sk_buff *skb, u32 info) { const struct gre_protocol *proto; - u8 ver; - - if (!pskb_may_pull(skb, 12)) - goto drop; + const struct iphdr *iph = (const struct iphdr *)skb->data; + u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; - ver = skb->data[1]&0x7f; if (ver >= GREPROTO_MAX) - goto drop; + return; rcu_read_lock(); proto = rcu_dereference(gre_proto[ver]); - if (!proto || !proto->err_handler) - goto drop_unlock; - proto->err_handler(skb, info); - rcu_read_unlock(); - return; - -drop_unlock: + if (proto && proto->err_handler) + proto->err_handler(skb, info); rcu_read_unlock(); -drop: - kfree_skb(skb); } static const struct net_protocol net_gre_protocol = { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5395e45dcce6..ab188ae12fd9 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -380,6 +380,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct icmp_bxm *param) { struct rtable *rt, *rt2; + struct flowi4 fl4_dec; int err; memset(fl4, 0, sizeof(*fl4)); @@ -408,19 +409,19 @@ static struct rtable *icmp_route_lookup(struct net *net, } else return rt; - err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(fl4), AF_INET); + err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET); if (err) goto relookup_failed; - if (inet_addr_type(net, fl4->saddr) == RTN_LOCAL) { - rt2 = __ip_route_output_key(net, fl4); + if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { + rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) err = PTR_ERR(rt2); } else { struct flowi4 fl4_2 = {}; unsigned long orefdst; - fl4_2.daddr = fl4->saddr; + fl4_2.daddr = fl4_dec.saddr; rt2 = ip_route_output_key(net, &fl4_2); if (IS_ERR(rt2)) { err = PTR_ERR(rt2); @@ -428,7 +429,7 @@ static struct rtable *icmp_route_lookup(struct net *net, } /* Ugh! */ orefdst = skb_in->_skb_refdst; /* save old refdst */ - err = ip_route_input(skb_in, fl4->daddr, fl4->saddr, + err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, RT_TOS(tos), rt2->dst.dev); dst_release(&rt2->dst); @@ -440,10 +441,11 @@ static struct rtable *icmp_route_lookup(struct net *net, goto relookup_failed; rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, - flowi4_to_flowi(fl4), NULL, + flowi4_to_flowi(&fl4_dec), NULL, XFRM_LOOKUP_ICMP); if (!IS_ERR(rt2)) { dst_release(&rt->dst); + memcpy(fl4, &fl4_dec, sizeof(*fl4)); rt = rt2; } else if (PTR_ERR(rt2) == -EPERM) { if (rt) @@ -1150,10 +1152,9 @@ static int __net_init icmp_sk_init(struct net *net) net->ipv4.icmp_sk[i] = sk; /* Enough space for 2 64K ICMP packets, including - * sk_buff struct overhead. + * sk_buff/skb_shared_info struct overhead. */ - sk->sk_sndbuf = - (2 * ((64 * 1024) + sizeof(struct sk_buff))); + sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); /* * Speedup sock_wfree() diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index f1d27f6c9351..c7472eff2d51 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -767,7 +767,7 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) break; for (i=0; i<nsrcs; i++) { /* skip inactive filters */ - if (pmc->sfcount[MCAST_INCLUDE] || + if (psf->sf_count[MCAST_INCLUDE] || pmc->sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) continue; @@ -1009,7 +1009,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr) /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. We will get multicast token leakage, when IFF_MULTICAST - is changed. 
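The icmp.c sndbuf hunk replaces an open-coded estimate that undercounted: each skb also carries a struct skb_shared_info plus alignment padding. SKB_TRUESIZE() (include/linux/skbuff.h) accounts for both:

    #define SKB_TRUESIZE(X) ((X) +                                         \
                             SKB_DATA_ALIGN(sizeof(struct sk_buff)) +      \
                             SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))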
This check should be done in dev->set_multicast_list + is changed. This check should be done in ndo_set_rx_mode routine. Something sort of: if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } --ANK @@ -1242,7 +1242,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) im->next_rcu = in_dev->mc_list; in_dev->mc_count++; - rcu_assign_pointer(in_dev->mc_list, im); + RCU_INIT_POINTER(in_dev->mc_list, im); #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im->multiaddr); @@ -1718,7 +1718,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfcount[sfmode]--; for (j=0; j<i; j++) - (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); + (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; @@ -1813,7 +1813,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) iml->next_rcu = inet->mc_list; iml->sflist = NULL; iml->sfmode = MCAST_EXCLUDE; - rcu_assign_pointer(inet->mc_list, iml); + RCU_INIT_POINTER(inet->mc_list, iml); ip_mc_inc_group(in_dev, addr); err = 0; done: @@ -1835,7 +1835,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, } err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, psf->sl_count, psf->sl_addr, 0); - rcu_assign_pointer(iml->sflist, NULL); + RCU_INIT_POINTER(iml->sflist, NULL); /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); kfree_rcu(psf, rcu); @@ -2000,7 +2000,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } - rcu_assign_pointer(pmc->sflist, newpsl); + RCU_INIT_POINTER(pmc->sflist, newpsl); psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ @@ -2103,7 +2103,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) } else (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); - rcu_assign_pointer(pmc->sflist, newpsl); + RCU_INIT_POINTER(pmc->sflist, newpsl); pmc->sfmode = msf->imsf_fmode; err = 0; done: diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 3267d3898437..68e8ac514383 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -108,6 +108,9 @@ static int inet_csk_diag_fill(struct sock *sk, icsk->icsk_ca_ops->name); } + if ((ext & (1 << (INET_DIAG_TOS - 1))) && (sk->sk_family != AF_INET6)) + RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos); + r->idiag_family = sk->sk_family; r->idiag_state = sk->sk_state; r->idiag_timer = 0; @@ -130,6 +133,8 @@ static int inet_csk_diag_fill(struct sock *sk, &np->rcv_saddr); ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, &np->daddr); + if (ext & (1 << (INET_DIAG_TCLASS - 1))) + RTA_PUT_U8(skb, INET_DIAG_TCLASS, np->tclass); } #endif @@ -869,7 +874,7 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } return netlink_dump_start(idiagnl, skb, nlh, - inet_diag_dump, NULL); + inet_diag_dump, NULL, 0); } return inet_diag_get_exact(skb, nlh); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 3c0369a3a663..984ec656b03b 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -21,6 +21,7 @@ #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> +#include <net/secure_seq.h> #include <net/ip.h> /* diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 85a0f75dae64..cc280a3f4f96 100644 --- a/net/ipv4/inet_lro.c +++ 
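The ip_mc_add_src() hunk above fixes a classic rollback bug: the undo loop advanced j but kept deleting entry i, removing the same source i times. The intended idiom, sketched with hypothetical add_src()/del_src() helpers:

    for (i = 0; i < n; i++) {
            err = add_src(&src[i]);
            if (err)
                    break;
    }
    if (err) {
            /* undo exactly what was added: index with j, not i */
            for (j = 0; j < i; j++)
                    del_src(&src[j]);
    }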
b/net/ipv4/inet_lro.c @@ -146,8 +146,7 @@ static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len) } static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, - struct iphdr *iph, struct tcphdr *tcph, - u16 vlan_tag, struct vlan_group *vgrp) + struct iphdr *iph, struct tcphdr *tcph) { int nr_frags; __be32 *ptr; @@ -173,8 +172,6 @@ static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, } lro_desc->mss = tcp_data_len; - lro_desc->vgrp = vgrp; - lro_desc->vlan_tag = vlan_tag; lro_desc->active = 1; lro_desc->data_csum = lro_tcp_data_csum(iph, tcph, @@ -247,11 +244,11 @@ static void lro_add_frags(struct net_lro_desc *lro_desc, skb->truesize += truesize; skb_frags[0].page_offset += hlen; - skb_frags[0].size -= hlen; + skb_frag_size_sub(&skb_frags[0], hlen); while (tcp_data_len > 0) { *(lro_desc->next_frag) = *skb_frags; - tcp_data_len -= skb_frags->size; + tcp_data_len -= skb_frag_size(skb_frags); lro_desc->next_frag++; skb_frags++; skb_shinfo(skb)->nr_frags++; @@ -309,29 +306,17 @@ static void lro_flush(struct net_lro_mgr *lro_mgr, skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss; - if (lro_desc->vgrp) { - if (lro_mgr->features & LRO_F_NAPI) - vlan_hwaccel_receive_skb(lro_desc->parent, - lro_desc->vgrp, - lro_desc->vlan_tag); - else - vlan_hwaccel_rx(lro_desc->parent, - lro_desc->vgrp, - lro_desc->vlan_tag); - - } else { - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(lro_desc->parent); - else - netif_rx(lro_desc->parent); - } + if (lro_mgr->features & LRO_F_NAPI) + netif_receive_skb(lro_desc->parent); + else + netif_rx(lro_desc->parent); LRO_INC_STATS(lro_mgr, flushed); lro_clear_desc(lro_desc); } static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, - struct vlan_group *vgrp, u16 vlan_tag, void *priv) + void *priv) { struct net_lro_desc *lro_desc; struct iphdr *iph; @@ -360,7 +345,7 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, goto out; skb->ip_summed = lro_mgr->ip_summed_aggr; - lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp); + lro_init_desc(lro_desc, skb, iph, tcph); LRO_INC_STATS(lro_mgr, aggregated); return 0; } @@ -415,14 +400,14 @@ static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, skb_frags = skb_shinfo(skb)->frags; while (data_len > 0) { *skb_frags = *frags; - data_len -= frags->size; + data_len -= skb_frag_size(frags); skb_frags++; frags++; skb_shinfo(skb)->nr_frags++; } skb_shinfo(skb)->frags[0].page_offset += hdr_len; - skb_shinfo(skb)->frags[0].size -= hdr_len; + skb_frag_size_sub(&skb_shinfo(skb)->frags[0], hdr_len); skb->ip_summed = ip_summed; skb->csum = sum; @@ -433,8 +418,7 @@ static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, struct skb_frag_struct *frags, int len, int true_size, - struct vlan_group *vgrp, - u16 vlan_tag, void *priv, __wsum sum) + void *priv, __wsum sum) { struct net_lro_desc *lro_desc; struct iphdr *iph; @@ -449,7 +433,7 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, if (!lro_mgr->get_frag_header || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, (void *)&tcph, &flags, priv)) { - mac_hdr = page_address(frags->page) + frags->page_offset; + mac_hdr = skb_frag_address(frags); goto out1; } @@ -480,7 +464,7 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, tcph = (void *)((u8 *)skb->data + vlan_hdr_len + IP_HDR_LEN(iph)); - lro_init_desc(lro_desc, skb, 
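inet_lro.c (and later ip_fragment.c and ip_output.c) stops poking frag->size directly in favour of accessors, decoupling callers from the skb_frag_t layout. Their shape as introduced in include/linux/skbuff.h:

    static inline unsigned int skb_frag_size(const skb_frag_t *frag)
    {
            return frag->size;
    }

    static inline void skb_frag_size_sub(skb_frag_t *frag, int delta)
    {
            frag->size -= delta;
    }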
iph, tcph, 0, NULL); + lro_init_desc(lro_desc, skb, iph, tcph); LRO_INC_STATS(lro_mgr, aggregated); return NULL; } @@ -514,7 +498,7 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, void *priv) { - if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) { + if (__lro_proc_skb(lro_mgr, skb, priv)) { if (lro_mgr->features & LRO_F_NAPI) netif_receive_skb(skb); else @@ -523,29 +507,13 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr, } EXPORT_SYMBOL(lro_receive_skb); -void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr, - struct sk_buff *skb, - struct vlan_group *vgrp, - u16 vlan_tag, - void *priv) -{ - if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) { - if (lro_mgr->features & LRO_F_NAPI) - vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); - else - vlan_hwaccel_rx(skb, vgrp, vlan_tag); - } -} -EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb); - void lro_receive_frags(struct net_lro_mgr *lro_mgr, struct skb_frag_struct *frags, int len, int true_size, void *priv, __wsum sum) { struct sk_buff *skb; - skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0, - priv, sum); + skb = __lro_proc_segment(lro_mgr, frags, len, true_size, priv, sum); if (!skb) return; @@ -556,26 +524,6 @@ void lro_receive_frags(struct net_lro_mgr *lro_mgr, } EXPORT_SYMBOL(lro_receive_frags); -void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, - struct vlan_group *vgrp, - u16 vlan_tag, void *priv, __wsum sum) -{ - struct sk_buff *skb; - - skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp, - vlan_tag, priv, sum); - if (!skb) - return; - - if (lro_mgr->features & LRO_F_NAPI) - vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); - else - vlan_hwaccel_rx(skb, vgrp, vlan_tag); -} -EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags); - void lro_flush_all(struct net_lro_mgr *lro_mgr) { int i; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 3c8dfa16614d..89168c6351ff 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/kmemcheck.h> #include <linux/slab.h> +#include <linux/module.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> @@ -183,6 +184,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat tw->tw_daddr = inet->inet_daddr; tw->tw_rcv_saddr = inet->inet_rcv_saddr; tw->tw_bound_dev_if = sk->sk_bound_dev_if; + tw->tw_tos = inet->tos; tw->tw_num = inet->inet_num; tw->tw_state = TCP_TIME_WAIT; tw->tw_substate = state; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index ce616d92cc54..86f13c67ea85 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -19,6 +19,7 @@ #include <linux/net.h> #include <net/ip.h> #include <net/inetpeer.h> +#include <net/secure_seq.h> /* * Theory of operations. @@ -54,15 +55,11 @@ * 1. Nodes may appear in the tree only with the pool lock held. * 2. Nodes may disappear from the tree only with the pool lock held * AND reference count being 0. - * 3. Nodes appears and disappears from unused node list only under - * "inet_peer_unused_lock". - * 4. Global variable peer_total is modified under the pool lock. - * 5. struct inet_peer fields modification: + * 3. Global variable peer_total is modified under the pool lock. + * 4. 
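The dedicated lro_vlan_hwaccel_* entry points disappear because hardware-accelerated VLAN tags now ride in skb->vlan_tci and the regular receive path untags them. A sketch of what a driver does instead of calling the removed helpers (vlan_tag being whatever the NIC reported; API of that era):

    __vlan_hwaccel_put_tag(skb, vlan_tag);  /* stash tag in skb->vlan_tci */
    netif_receive_skb(skb);                 /* core handles the VLAN demux */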
struct inet_peer fields modification: * avl_left, avl_right, avl_parent, avl_height: pool lock - * unused: unused node list lock * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing - * dtime: unused node list lock * daddr: unchangeable * ip_id_count: atomic value (no lock needed) */ @@ -104,19 +101,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m * aggressively at this stage */ int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ -int inet_peer_gc_mintime __read_mostly = 10 * HZ; -int inet_peer_gc_maxtime __read_mostly = 120 * HZ; - -static struct { - struct list_head list; - spinlock_t lock; -} unused_peers = { - .list = LIST_HEAD_INIT(unused_peers.list), - .lock = __SPIN_LOCK_UNLOCKED(unused_peers.lock), -}; - -static void peer_check_expire(unsigned long dummy); -static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); /* Called from ip_output.c:ip_init */ @@ -142,21 +126,6 @@ void __init inet_initpeers(void) 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - /* All the timers, started at system startup tend - to synchronize. Perturb it a bit. - */ - peer_periodic_timer.expires = jiffies - + net_random() % inet_peer_gc_maxtime - + inet_peer_gc_maxtime; - add_timer(&peer_periodic_timer); -} - -/* Called with or without local BH being disabled. */ -static void unlink_from_unused(struct inet_peer *p) -{ - spin_lock_bh(&unused_peers.lock); - list_del_init(&p->unused); - spin_unlock_bh(&unused_peers.lock); } static int addr_compare(const struct inetpeer_addr *a, @@ -203,20 +172,6 @@ static int addr_compare(const struct inetpeer_addr *a, u; \ }) -static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv) -{ - int cur, old = atomic_read(ptr); - - while (old != u) { - *newv = old + a; - cur = atomic_cmpxchg(ptr, old, *newv); - if (cur == old) - return true; - old = cur; - } - return false; -} - /* * Called with rcu_read_lock() * Because we hold no lock against a writer, its quite possible we fall @@ -225,8 +180,7 @@ static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv) * We exit from this function if number of links exceeds PEER_MAXDEPTH */ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, - struct inet_peer_base *base, - int *newrefcnt) + struct inet_peer_base *base) { struct inet_peer *u = rcu_dereference(base->root); int count = 0; @@ -235,11 +189,9 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, int cmp = addr_compare(daddr, &u->daddr); if (cmp == 0) { /* Before taking a reference, check if this entry was - * deleted, unlink_from_pool() sets refcnt=-1 to make - * distinction between an unused entry (refcnt=0) and - * a freed one. + * deleted (refcnt=-1) */ - if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt)) + if (!atomic_add_unless(&u->refcnt, 1, -1)) u = NULL; return u; } @@ -366,137 +318,99 @@ static void inetpeer_free_rcu(struct rcu_head *head) kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); } -/* May be called with local BH enabled. */ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, struct inet_peer __rcu **stack[PEER_MAXDEPTH]) { - int do_free; - - do_free = 0; - - write_seqlock_bh(&base->lock); - /* Check the reference counter. 
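With the unused-node list gone from inetpeer, an entry's lifetime is encoded entirely in refcnt, with -1 as the "being freed" sentinel. The lockless reference grab in lookup_rcu() then becomes:

    /* Under rcu_read_lock() the node memory stays valid, but the node
     * may already be unlinked.  "Increment unless it is -1" either
     * takes a safe reference or reports the entry as gone: */
    if (!atomic_add_unless(&u->refcnt, 1, -1))
            u = NULL;       /* lost the race: treat as a lookup miss */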
It was artificially incremented by 1 - * in cleanup() function to prevent sudden disappearing. If we can - * atomically (because of lockless readers) take this last reference, - * it's safe to remove the node and free it later. - * We use refcnt=-1 to alert lockless readers this entry is deleted. - */ - if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { - struct inet_peer __rcu ***stackptr, ***delp; - if (lookup(&p->daddr, stack, base) != p) - BUG(); - delp = stackptr - 1; /* *delp[0] == p */ - if (p->avl_left == peer_avl_empty_rcu) { - *delp[0] = p->avl_right; - --stackptr; - } else { - /* look for a node to insert instead of p */ - struct inet_peer *t; - t = lookup_rightempty(p, base); - BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); - **--stackptr = t->avl_left; - /* t is removed, t->daddr > x->daddr for any - * x in p->avl_left subtree. - * Put t in the old place of p. */ - RCU_INIT_POINTER(*delp[0], t); - t->avl_left = p->avl_left; - t->avl_right = p->avl_right; - t->avl_height = p->avl_height; - BUG_ON(delp[1] != &p->avl_left); - delp[1] = &t->avl_left; /* was &p->avl_left */ - } - peer_avl_rebalance(stack, stackptr, base); - base->total--; - do_free = 1; + struct inet_peer __rcu ***stackptr, ***delp; + + if (lookup(&p->daddr, stack, base) != p) + BUG(); + delp = stackptr - 1; /* *delp[0] == p */ + if (p->avl_left == peer_avl_empty_rcu) { + *delp[0] = p->avl_right; + --stackptr; + } else { + /* look for a node to insert instead of p */ + struct inet_peer *t; + t = lookup_rightempty(p, base); + BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); + **--stackptr = t->avl_left; + /* t is removed, t->daddr > x->daddr for any + * x in p->avl_left subtree. + * Put t in the old place of p. */ + RCU_INIT_POINTER(*delp[0], t); + t->avl_left = p->avl_left; + t->avl_right = p->avl_right; + t->avl_height = p->avl_height; + BUG_ON(delp[1] != &p->avl_left); + delp[1] = &t->avl_left; /* was &p->avl_left */ } - write_sequnlock_bh(&base->lock); - - if (do_free) - call_rcu(&p->rcu, inetpeer_free_rcu); - else - /* The node is used again. Decrease the reference counter - * back. The loop "cleanup -> unlink_from_unused - * -> unlink_from_pool -> putpeer -> link_to_unused - * -> cleanup (for the same node)" - * doesn't really exist because the entry will have a - * recent deletion time and will not be cleaned again soon. - */ - inet_putpeer(p); + peer_avl_rebalance(stack, stackptr, base); + base->total--; + call_rcu(&p->rcu, inetpeer_free_rcu); } static struct inet_peer_base *family_to_base(int family) { - return (family == AF_INET ? &v4_peers : &v6_peers); -} - -static struct inet_peer_base *peer_to_base(struct inet_peer *p) -{ - return family_to_base(p->daddr.family); + return family == AF_INET ? &v4_peers : &v6_peers; } -/* May be called with local BH enabled. */ -static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH]) +/* perform garbage collect on all items stacked during a lookup */ +static int inet_peer_gc(struct inet_peer_base *base, + struct inet_peer __rcu **stack[PEER_MAXDEPTH], + struct inet_peer __rcu ***stackptr) { - struct inet_peer *p = NULL; - - /* Remove the first entry from the list of unused nodes. */ - spin_lock_bh(&unused_peers.lock); - if (!list_empty(&unused_peers.list)) { - __u32 delta; - - p = list_first_entry(&unused_peers.list, struct inet_peer, unused); - delta = (__u32)jiffies - p->dtime; + struct inet_peer *p, *gchead = NULL; + __u32 delta, ttl; + int cnt = 0; - if (delta < ttl) { - /* Do not prune fresh entries. 
*/ - spin_unlock_bh(&unused_peers.lock); - return -1; + if (base->total >= inet_peer_threshold) + ttl = 0; /* be aggressive */ + else + ttl = inet_peer_maxttl + - (inet_peer_maxttl - inet_peer_minttl) / HZ * + base->total / inet_peer_threshold * HZ; + stackptr--; /* last stack slot is peer_avl_empty */ + while (stackptr > stack) { + stackptr--; + p = rcu_deref_locked(**stackptr, base); + if (atomic_read(&p->refcnt) == 0) { + smp_rmb(); + delta = (__u32)jiffies - p->dtime; + if (delta >= ttl && + atomic_cmpxchg(&p->refcnt, 0, -1) == 0) { + p->gc_next = gchead; + gchead = p; + } } - - list_del_init(&p->unused); - - /* Grab an extra reference to prevent node disappearing - * before unlink_from_pool() call. */ - atomic_inc(&p->refcnt); } - spin_unlock_bh(&unused_peers.lock); - - if (p == NULL) - /* It means that the total number of USED entries has - * grown over inet_peer_threshold. It shouldn't really - * happen because of entry limits in route cache. */ - return -1; - - unlink_from_pool(p, peer_to_base(p), stack); - return 0; + while ((p = gchead) != NULL) { + gchead = p->gc_next; + cnt++; + unlink_from_pool(p, base, stack); + } + return cnt; } -/* Called with or without local BH being disabled. */ -struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) +struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create) { struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; struct inet_peer_base *base = family_to_base(daddr->family); struct inet_peer *p; unsigned int sequence; - int invalidated, newrefcnt = 0; + int invalidated, gccnt = 0; - /* Look up for the address quickly, lockless. + /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ rcu_read_lock(); sequence = read_seqbegin(&base->lock); - p = lookup_rcu(daddr, base, &newrefcnt); + p = lookup_rcu(daddr, base); invalidated = read_seqretry(&base->lock, sequence); rcu_read_unlock(); - if (p) { -found: /* The existing node has been found. - * Remove the entry from unused list if it was there. - */ - if (newrefcnt == 1) - unlink_from_unused(p); + if (p) return p; - } /* If no writer did a change during our lookup, we can return early. */ if (!create && !invalidated) @@ -506,18 +420,27 @@ found: /* The existing node has been found. * At least, nodes should be hot in our cache. */ write_seqlock_bh(&base->lock); +relookup: p = lookup(daddr, stack, base); if (p != peer_avl_empty) { - newrefcnt = atomic_inc_return(&p->refcnt); + atomic_inc(&p->refcnt); write_sequnlock_bh(&base->lock); - goto found; + return p; + } + if (!gccnt) { + gccnt = inet_peer_gc(base, stack, stackptr); + if (gccnt && create) + goto relookup; } p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; if (p) { p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4)); + atomic_set(&p->ip_id_count, + (daddr->family == AF_INET) ? + secure_ip_id(daddr->addr.a4) : + secure_ipv6_id(daddr->addr.a6)); p->tcp_ts_stamp = 0; p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; @@ -525,7 +448,6 @@ found: /* The existing node has been found. p->pmtu_expires = 0; p->pmtu_orig = 0; memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); - INIT_LIST_HEAD(&p->unused); /* Link the node. */ @@ -534,63 +456,15 @@ found: /* The existing node has been found. } write_sequnlock_bh(&base->lock); - if (base->total >= inet_peer_threshold) - /* Remove one less-recently-used entry. 
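inet_peer_gc() interpolates the prune TTL linearly between maxttl and minttl as the pool fills. Worked through with the defaults declared earlier (minttl = 120 * HZ, maxttl = 600 * HZ, threshold = 65536 + 128 = 65664):

    /* ttl = maxttl - (maxttl - minttl)/HZ * total/threshold * HZ
     *
     *   nearly empty (total ->     0):  ttl ~ 600s, prune almost nothing
     *   half full    (total ~ 32832):   ttl ~ 360s
     *   at threshold (total >= 65664):  ttl = 0, prune any idle entry
     *
     * Dividing by HZ before multiplying by total keeps the
     * intermediate product within 32 bits for large HZ values. */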
*/ - cleanup_once(0, stack); - return p; } - -static int compute_total(void) -{ - return v4_peers.total + v6_peers.total; -} EXPORT_SYMBOL_GPL(inet_getpeer); -/* Called with local BH disabled. */ -static void peer_check_expire(unsigned long dummy) -{ - unsigned long now = jiffies; - int ttl, total; - struct inet_peer __rcu **stack[PEER_MAXDEPTH]; - - total = compute_total(); - if (total >= inet_peer_threshold) - ttl = inet_peer_minttl; - else - ttl = inet_peer_maxttl - - (inet_peer_maxttl - inet_peer_minttl) / HZ * - total / inet_peer_threshold * HZ; - while (!cleanup_once(ttl, stack)) { - if (jiffies != now) - break; - } - - /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime - * interval depending on the total number of entries (more entries, - * less interval). */ - total = compute_total(); - if (total >= inet_peer_threshold) - peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; - else - peer_periodic_timer.expires = jiffies - + inet_peer_gc_maxtime - - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * - total / inet_peer_threshold * HZ; - add_timer(&peer_periodic_timer); -} - void inet_putpeer(struct inet_peer *p) { - local_bh_disable(); - - if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) { - list_add_tail(&p->unused, &unused_peers.list); - p->dtime = (__u32)jiffies; - spin_unlock(&unused_peers.lock); - } - - local_bh_enable(); + p->dtime = (__u32)jiffies; + smp_mb__before_atomic_dec(); + atomic_dec(&p->refcnt); } EXPORT_SYMBOL_GPL(inet_putpeer); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 0ad6035f6366..fdaabf2f2b68 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -261,8 +261,9 @@ static void ip_expire(unsigned long arg) * Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. 
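inet_putpeer() now just timestamps the entry and drops the reference; its write barrier pairs with the smp_rmb() in inet_peer_gc() so a zero refcnt is never observed together with a stale dtime:

    /* release side (inet_putpeer)       acquire side (inet_peer_gc)
     *
     * p->dtime = (__u32)jiffies;        if (atomic_read(&p->refcnt) == 0) {
     * smp_mb__before_atomic_dec();              smp_rmb();
     * atomic_dec(&p->refcnt);                   delta = jiffies - p->dtime;
     *
     * dtime is published before refcnt can reach zero, and the reader
     * orders its refcnt check before loading dtime. */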
*/ - if (qp->user == IP_DEFRAG_CONNTRACK_IN && - skb_rtable(head)->rt_type != RTN_LOCAL) + if (qp->user == IP_DEFRAG_AF_PACKET || + (qp->user == IP_DEFRAG_CONNTRACK_IN && + skb_rtable(head)->rt_type != RTN_LOCAL)) goto out_rcu_unlock; @@ -598,8 +599,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); - for (i=0; i<skb_shinfo(head)->nr_frags; i++) - plen += skb_shinfo(head)->frags[i].size; + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; head->data_len -= clone->len; head->len -= clone->len; @@ -681,6 +682,42 @@ int ip_defrag(struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_defrag); +struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) +{ + const struct iphdr *iph; + u32 len; + + if (skb->protocol != htons(ETH_P_IP)) + return skb; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return skb; + + iph = ip_hdr(skb); + if (iph->ihl < 5 || iph->version != 4) + return skb; + if (!pskb_may_pull(skb, iph->ihl*4)) + return skb; + iph = ip_hdr(skb); + len = ntohs(iph->tot_len); + if (skb->len < len || len < (iph->ihl * 4)) + return skb; + + if (ip_is_fragment(ip_hdr(skb))) { + skb = skb_share_check(skb, GFP_ATOMIC); + if (skb) { + if (pskb_trim_rcsum(skb, len)) + return skb; + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + if (ip_defrag(skb, user)) + return NULL; + skb->rxhash = 0; + } + } + return skb; +} +EXPORT_SYMBOL(ip_check_defrag); + #ifdef CONFIG_SYSCTL static int zero; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8871067560db..d55110e93120 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -731,9 +731,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (skb->protocol == htons(ETH_P_IPV6)) { + struct neighbour *neigh = dst_get_neighbour(skb_dst(skb)); const struct in6_addr *addr6; int addr_type; - struct neighbour *neigh = skb_dst(skb)->neighbour; if (neigh == NULL) goto tx_error; @@ -835,8 +835,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; if (!new_skb) { ip_rt_put(rt); dev->stats.tx_dropped++; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c8f48efc5fd3..073a9b01c40c 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -165,7 +165,7 @@ int ip_call_ra_chain(struct sk_buff *skb) (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) && net_eq(sock_net(sk), dev_net(dev))) { - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) return 1; } @@ -256,7 +256,7 @@ int ip_local_deliver(struct sk_buff *skb) * Reassemble IP fragments. 
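ip_check_defrag() gives paths outside normal IP input a self-contained way to reassemble; the IP_DEFRAG_AF_PACKET user added to ip_expire() above suggests packet sockets as the consumer. A caller sketch:

    skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
    if (!skb)
            return;         /* fragment swallowed; reassembly incomplete */
    /* skb is either a non-fragment passed through untouched, or a
     * freshly reassembled datagram (rxhash was reset). */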
*/ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ec93335901dd..05d20cca9d66 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -640,6 +640,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) } if (srrptr <= srrspace) { opt->srr_is_hit = 1; + iph->daddr = nexthop; opt->is_changed = 1; } return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 84f26e8e6c60..0bc95f3977d2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -122,6 +122,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; WARN_ON(!skb_dst(newskb)); + skb_dst_force(newskb); netif_rx_ni(newskb); return 0; } @@ -182,6 +183,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); + struct neighbour *neigh; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); @@ -203,10 +205,15 @@ static inline int ip_finish_output2(struct sk_buff *skb) skb = skb2; } - if (dst->hh) - return neigh_hh_output(dst->hh, skb); - else if (dst->neighbour) - return dst->neighbour->output(skb); + rcu_read_lock(); + neigh = dst_get_neighbour(dst); + if (neigh) { + int res = neigh_output(neigh, skb); + + rcu_read_unlock(); + return res; + } + rcu_read_unlock(); if (net_ratelimit()) printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); @@ -489,7 +496,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) if (first_len - hlen > mtu || ((first_len - hlen) & 7) || - (iph->frag_off & htons(IP_MF|IP_OFFSET)) || + ip_is_fragment(iph) || skb_cloned(skb)) goto slow_path; @@ -734,7 +741,7 @@ static inline int ip_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, - int transhdrlen, int mtu, unsigned int flags) + int transhdrlen, int maxfraglen, unsigned int flags) { struct sk_buff *skb; int err; @@ -767,7 +774,7 @@ static inline int ip_ufo_append_data(struct sock *sk, skb->csum = 0; /* specify the length of each IP datagram fragment */ - skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; __skb_queue_tail(queue, skb); } @@ -831,7 +838,7 @@ static int __ip_append_data(struct sock *sk, (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, - mtu, flags); + maxfraglen, flags); if (err) goto error; return 0; @@ -982,13 +989,13 @@ alloc_new_skb: if (page && (left = PAGE_SIZE - off) > 0) { if (copy >= left) copy = left; - if (page != frag->page) { + if (page != skb_frag_page(frag)) { if (i == MAX_SKB_FRAGS) { err = -EMSGSIZE; goto error; } - get_page(page); skb_fill_page_desc(skb, i, page, off, 0); + skb_frag_ref(skb, i); frag = &skb_shinfo(skb)->frags[i]; } } else if (i < MAX_SKB_FRAGS) { @@ -1008,12 +1015,13 @@ alloc_new_skb: err = -EMSGSIZE; goto error; } - if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { + if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag), + offset, copy, skb->len, 
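The recurring `frag_off & htons(IP_MF | IP_OFFSET)` tests across this diff collapse into one helper; for reference, its definition in include/net/ip.h:

    static inline bool ip_is_fragment(const struct iphdr *iph)
    {
            return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0;
    }

    /* true for any fragment: more-fragments set (first/middle pieces)
     * or a non-zero fragment offset (middle/last pieces). */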
skb) < 0) { err = -EFAULT; goto error; } cork->off += copy; - frag->size += copy; + skb_frag_size_add(frag, copy); skb->len += copy; skb->data_len += copy; skb->truesize += copy; @@ -1222,7 +1230,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (len > size) len = size; if (skb_can_coalesce(skb, i, page, offset)) { - skb_shinfo(skb)->frags[i-1].size += len; + skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len); } else if (i < MAX_SKB_FRAGS) { get_page(page); skb_fill_page_desc(skb, i, page, offset, len); @@ -1458,7 +1466,7 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, * structure to pass arguments. */ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, - struct ip_reply_arg *arg, unsigned int len) + const struct ip_reply_arg *arg, unsigned int len) { struct inet_sock *inet = inet_sk(sk); struct ip_options_data replyopts; @@ -1481,7 +1489,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, } flowi4_init_output(&fl4, arg->bound_dev_if, 0, - RT_TOS(ip_hdr(skb)->tos), + RT_TOS(arg->tos), RT_SCOPE_UNIVERSE, sk->sk_protocol, ip_reply_arg_flowi_flags(arg), daddr, rt->rt_spec_dst, @@ -1498,7 +1506,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, with locally disabled BH and that sk cannot be already spinlocked. */ bh_lock_sock(sk); - inet->tos = ip_hdr(skb)->tos; + inet->tos = arg->tos; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ab0c9efd1efa..09ff51bf16a4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -33,6 +33,7 @@ #include <linux/netfilter.h> #include <linux/route.h> #include <linux/mroute.h> +#include <net/inet_ecn.h> #include <net/route.h> #include <net/xfrm.h> #include <net/compat.h> @@ -578,8 +579,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, break; case IP_TOS: /* This sets both TOS and Precedence */ if (sk->sk_type == SOCK_STREAM) { - val &= ~3; - val |= inet->tos & 3; + val &= ~INET_ECN_MASK; + val |= inet->tos & INET_ECN_MASK; } if (inet->tos != val) { inet->tos = val; @@ -961,7 +962,7 @@ mc_msf_out: break; case IP_TRANSPARENT: - if (!capable(CAP_NET_ADMIN)) { + if (!!val && !capable(CAP_NET_RAW) && !capable(CAP_NET_ADMIN)) { err = -EPERM; break; } @@ -1067,7 +1068,7 @@ EXPORT_SYMBOL(compat_ip_setsockopt); */ static int do_ip_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen, unsigned flags) { struct inet_sock *inet = inet_sk(sk); int val; @@ -1240,7 +1241,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, msg.msg_control = optval; msg.msg_controllen = len; - msg.msg_flags = 0; + msg.msg_flags = flags; if (inet->cmsg_flags & IP_CMSG_PKTINFO) { struct in_pktinfo info; @@ -1294,7 +1295,7 @@ int ip_getsockopt(struct sock *sk, int level, { int err; - err = do_ip_getsockopt(sk, level, optname, optval, optlen); + err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && @@ -1327,7 +1328,8 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, return compat_mc_getsockopt(sk, level, optname, optval, optlen, ip_getsockopt); - err = do_ip_getsockopt(sk, level, optname, optval, optlen); + err = do_ip_getsockopt(sk, level, 
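On SOCK_STREAM sockets IP_TOS must not clobber the two ECN bits TCP manages, so the bare `~3` becomes the named mask from <net/inet_ecn.h>:

    /* tos byte:  bits 7..2 = DSCP, bits 1..0 = ECN, INET_ECN_MASK == 3 */
    val &= ~INET_ECN_MASK;              /* caller may set DSCP bits only  */
    val |= inet->tos & INET_ECN_MASK;   /* ECN bits stay as TCP left them */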
optname, optval, optlen, + MSG_CMSG_COMPAT); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index ab7e5542c1cf..0da2afc97f32 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -54,6 +54,7 @@ #include <linux/delay.h> #include <linux/nfs_fs.h> #include <linux/slab.h> +#include <linux/export.h> #include <net/net_namespace.h> #include <net/arp.h> #include <net/ip.h> @@ -861,41 +862,44 @@ static void __init ic_do_bootp_ext(u8 *ext) #endif switch (*ext++) { - case 1: /* Subnet mask */ - if (ic_netmask == NONE) - memcpy(&ic_netmask, ext+1, 4); - break; - case 3: /* Default gateway */ - if (ic_gateway == NONE) - memcpy(&ic_gateway, ext+1, 4); - break; - case 6: /* DNS server */ - servers= *ext/4; - if (servers > CONF_NAMESERVERS_MAX) - servers = CONF_NAMESERVERS_MAX; - for (i = 0; i < servers; i++) { - if (ic_nameservers[i] == NONE) - memcpy(&ic_nameservers[i], ext+1+4*i, 4); - } - break; - case 12: /* Host name */ - ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN); - ic_host_name_set = 1; - break; - case 15: /* Domain name (DNS) */ - ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); - break; - case 17: /* Root path */ - if (!root_server_path[0]) - ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path)); - break; - case 26: /* Interface MTU */ - memcpy(&mtu, ext+1, sizeof(mtu)); - ic_dev_mtu = ntohs(mtu); - break; - case 40: /* NIS Domain name (_not_ DNS) */ - ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN); - break; + case 1: /* Subnet mask */ + if (ic_netmask == NONE) + memcpy(&ic_netmask, ext+1, 4); + break; + case 3: /* Default gateway */ + if (ic_gateway == NONE) + memcpy(&ic_gateway, ext+1, 4); + break; + case 6: /* DNS server */ + servers= *ext/4; + if (servers > CONF_NAMESERVERS_MAX) + servers = CONF_NAMESERVERS_MAX; + for (i = 0; i < servers; i++) { + if (ic_nameservers[i] == NONE) + memcpy(&ic_nameservers[i], ext+1+4*i, 4); + } + break; + case 12: /* Host name */ + ic_bootp_string(utsname()->nodename, ext+1, *ext, + __NEW_UTS_LEN); + ic_host_name_set = 1; + break; + case 15: /* Domain name (DNS) */ + ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); + break; + case 17: /* Root path */ + if (!root_server_path[0]) + ic_bootp_string(root_server_path, ext+1, *ext, + sizeof(root_server_path)); + break; + case 26: /* Interface MTU */ + memcpy(&mtu, ext+1, sizeof(mtu)); + ic_dev_mtu = ntohs(mtu); + break; + case 40: /* NIS Domain name (_not_ DNS) */ + ic_bootp_string(utsname()->domainname, ext+1, *ext, + __NEW_UTS_LEN); + break; } } @@ -932,7 +936,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str goto drop; /* Fragments are not supported */ - if (h->frag_off & htons(IP_OFFSET | IP_MF)) { + if (ip_is_fragment(h)) { if (net_ratelimit()) printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented " "reply.\n"); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 378b20b7ca6e..065effd8349a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -231,7 +231,7 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { - rcu_assign_pointer(*tp, t->next); + RCU_INIT_POINTER(*tp, t->next); break; } } @@ -241,8 +241,8 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) { struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); - rcu_assign_pointer(t->next, rtnl_dereference(*tp)); - 
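The new flags argument lets the compat entry point mark its dummy msghdr with MSG_CMSG_COMPAT, so put_cmsg() lays out IP_PKTOPTIONS control messages for 32-bit callers. The layouts differ in field width (per include/net/compat.h):

    struct cmsghdr {                      struct compat_cmsghdr {
            __kernel_size_t cmsg_len;             compat_size_t cmsg_len;
            int             cmsg_level;           compat_int_t  cmsg_level;
            int             cmsg_type;            compat_int_t  cmsg_type;
    };                                    };

    /* On 64-bit kernels cmsg_len is 8 bytes natively but 4 bytes in the
     * compat struct; without the flag a 32-bit process would misparse
     * the returned options. */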
rcu_assign_pointer(*tp, t); + RCU_INIT_POINTER(t->next, rtnl_dereference(*tp)); + RCU_INIT_POINTER(*tp, t); } static struct ip_tunnel * ipip_tunnel_locate(struct net *net, @@ -301,7 +301,7 @@ static void ipip_tunnel_uninit(struct net_device *dev) struct ipip_net *ipn = net_generic(net, ipip_net_id); if (dev == ipn->fb_tunnel_dev) - rcu_assign_pointer(ipn->tunnels_wc[0], NULL); + RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL); else ipip_tunnel_unlink(ipn, netdev_priv(dev)); dev_put(dev); @@ -791,7 +791,7 @@ static int __net_init ipip_fb_tunnel_init(struct net_device *dev) return -ENOMEM; dev_hold(dev); - rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); + RCU_INIT_POINTER(ipn->tunnels_wc[0], tunnel); return 0; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 30a7763c400e..76a7f07b38b6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -61,6 +61,7 @@ #include <linux/if_arp.h> #include <linux/netfilter_ipv4.h> #include <linux/compat.h> +#include <linux/export.h> #include <net/ipip.h> #include <net/checksum.h> #include <net/netlink.h> @@ -1176,7 +1177,7 @@ static void mrtsock_destruct(struct sock *sk) ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; - rcu_assign_pointer(mrt->mroute_sk, NULL); + RCU_INIT_POINTER(mrt->mroute_sk, NULL); mroute_clean_tables(mrt); } } @@ -1203,7 +1204,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -ENOENT; if (optname != MRT_INIT) { - if (sk != rcu_dereference_raw(mrt->mroute_sk) && + if (sk != rcu_access_pointer(mrt->mroute_sk) && !capable(CAP_NET_ADMIN)) return -EACCES; } @@ -1224,13 +1225,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi ret = ip_ra_control(sk, 1, mrtsock_destruct); if (ret == 0) { - rcu_assign_pointer(mrt->mroute_sk, sk); + RCU_INIT_POINTER(mrt->mroute_sk, sk); IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; } rtnl_unlock(); return ret; case MRT_DONE: - if (sk != rcu_dereference_raw(mrt->mroute_sk)) + if (sk != rcu_access_pointer(mrt->mroute_sk)) return -EACCES; return ip_ra_control(sk, 0, NULL); case MRT_ADD_VIF: @@ -1796,7 +1797,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = iph->tos, + .flowi4_tos = RT_TOS(iph->tos), .flowi4_oif = rt->rt_oif, .flowi4_iif = rt->rt_iif, .flowi4_mark = rt->rt_mark, @@ -2544,7 +2545,8 @@ int __init ip_mr_init(void) goto add_proto_fail; } #endif - rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute); + rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, + NULL, ipmr_rtm_dumproute, NULL); return 0; #ifdef CONFIG_IP_PIMSM_V2 diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 2e97e3ec1eb7..9899619ab9b8 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -5,6 +5,7 @@ #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/gfp.h> +#include <linux/export.h> #include <net/route.h> #include <net/xfrm.h> #include <net/ip.h> @@ -18,17 +19,15 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) struct rtable *rt; struct flowi4 fl4 = {}; __be32 saddr = iph->saddr; - __u8 flags = 0; + __u8 flags = skb->sk ? 
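Where mroute_sk is only compared against sk, the ipmr.c hunks swap rcu_dereference_raw() for rcu_access_pointer(): it fetches the pointer value without the full dereference machinery, which is legitimate precisely because the pointed-to memory is never touched:

    /* pointer comparison only -- no dereference, so no RCU read-side
     * critical section is needed here */
    if (sk != rcu_access_pointer(mrt->mroute_sk))
            return -EACCES;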
inet_sk_flowi_flags(skb->sk) : 0; unsigned int hh_len; - if (!skb->sk && addr_type != RTN_LOCAL) { - if (addr_type == RTN_UNSPEC) - addr_type = inet_addr_type(net, saddr); - if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) - flags |= FLOWI_FLAG_ANYSRC; - else - saddr = 0; - } + if (addr_type == RTN_UNSPEC) + addr_type = inet_addr_type(net, saddr); + if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) + flags |= FLOWI_FLAG_ANYSRC; + else + saddr = 0; /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. @@ -38,7 +37,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) fl4.flowi4_tos = RT_TOS(iph->tos); fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; fl4.flowi4_mark = skb->mark; - fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : flags; + fl4.flowi4_flags = flags; rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return -1; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 5c9b9d963918..e59aabd0eae4 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -218,6 +218,7 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) return skb; nlmsg_failure: + kfree_skb(skb); *errp = -EINVAL; printk(KERN_ERR "ip_queue: error creating packet message\n"); return NULL; @@ -313,7 +314,7 @@ ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) { struct nf_queue_entry *entry; - if (vmsg->value > NF_MAX_VERDICT) + if (vmsg->value > NF_MAX_VERDICT || vmsg->value == NF_STOLEN) return -EINVAL; entry = ipq_find_dequeue_entry(vmsg->id); @@ -358,12 +359,9 @@ ipq_receive_peer(struct ipq_peer_msg *pmsg, break; case IPQM_VERDICT: - if (pmsg->msg.verdict.value > NF_MAX_VERDICT) - status = -EINVAL; - else - status = ipq_set_verdict(&pmsg->msg.verdict, - len - sizeof(*pmsg)); - break; + status = ipq_set_verdict(&pmsg->msg.verdict, + len - sizeof(*pmsg)); + break; default: status = -EINVAL; } diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 5c9e97c79017..a639967eb727 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -317,19 +317,19 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) hash = clusterip_hashfn(skb, cipinfo->config); switch (ctinfo) { - case IP_CT_NEW: - ct->mark = hash; - break; - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - /* FIXME: we don't handle expectations at the - * moment. they can arrive on a different node than - * the master connection (e.g. FTP passive mode) */ - case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED_REPLY: - break; - default: - break; + case IP_CT_NEW: + ct->mark = hash; + break; + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + /* FIXME: we don't handle expectations at the moment. + * They can arrive on a different node than + * the master connection (e.g. 
FTP passive mode) */ + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + break; + default: /* Prevent gcc warnings */ + break; } #ifdef DEBUG @@ -395,7 +395,6 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) config = clusterip_config_init(cipinfo, e->ip.dst.s_addr, dev); if (!config) { - pr_info("cannot allocate config\n"); dev_put(dev); return -ENOMEM; } diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 446e0f467a17..b5508151e547 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -135,10 +135,8 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) * due to slab allocator restrictions */ n = max(size, nlbufsiz); - skb = alloc_skb(n, GFP_ATOMIC); + skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); if (!skb) { - pr_debug("cannot alloc whole buffer %ub!\n", n); - if (n > size) { /* try to allocate only as much as we need for * current packet */ diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 5585980fce2e..9682b36df38c 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -21,6 +21,7 @@ #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_acct.h> #include <linux/rculist_nulls.h> +#include <linux/export.h> struct ct_iter_state { struct seq_net_private p; diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index f3a9b42b16c6..9bb1b8a37a22 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -82,7 +82,7 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, #endif #endif /* Gather fragments. */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_is_fragment(ip_hdr(skb))) { enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb); if (nf_ct_ipv4_gather_frags(skb, user)) return NF_STOLEN; diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index 703f366fd235..7b22382ff0e9 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -70,14 +70,14 @@ static unsigned int help(struct sk_buff *skb, static void __exit nf_nat_amanda_fini(void) { - rcu_assign_pointer(nf_nat_amanda_hook, NULL); + RCU_INIT_POINTER(nf_nat_amanda_hook, NULL); synchronize_rcu(); } static int __init nf_nat_amanda_init(void) { BUG_ON(nf_nat_amanda_hook != NULL); - rcu_assign_pointer(nf_nat_amanda_hook, help); + RCU_INIT_POINTER(nf_nat_amanda_hook, help); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 3346de5d94d0..447bc5cfdc6c 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -514,7 +514,7 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto) ret = -EBUSY; goto out; } - rcu_assign_pointer(nf_nat_protos[proto->protonum], proto); + RCU_INIT_POINTER(nf_nat_protos[proto->protonum], proto); out: spin_unlock_bh(&nf_nat_lock); return ret; @@ -525,7 +525,7 @@ EXPORT_SYMBOL(nf_nat_protocol_register); void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) { spin_lock_bh(&nf_nat_lock); - rcu_assign_pointer(nf_nat_protos[proto->protonum], + RCU_INIT_POINTER(nf_nat_protos[proto->protonum], &nf_nat_unknown_protocol); spin_unlock_bh(&nf_nat_lock); synchronize_rcu(); @@ -736,10 +736,10 @@ static int __init nf_nat_init(void) /* Sew in builtin protocols. 
*/ spin_lock_bh(&nf_nat_lock); for (i = 0; i < MAX_IP_NAT_PROTO; i++) - rcu_assign_pointer(nf_nat_protos[i], &nf_nat_unknown_protocol); - rcu_assign_pointer(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp); - rcu_assign_pointer(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp); - rcu_assign_pointer(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp); + RCU_INIT_POINTER(nf_nat_protos[i], &nf_nat_unknown_protocol); + RCU_INIT_POINTER(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp); + RCU_INIT_POINTER(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp); + RCU_INIT_POINTER(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp); spin_unlock_bh(&nf_nat_lock); /* Initialize fake conntrack so that NAT will skip it */ @@ -748,12 +748,12 @@ static int __init nf_nat_init(void) l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); BUG_ON(nf_nat_seq_adjust_hook != NULL); - rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); + RCU_INIT_POINTER(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); - rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, + RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, nfnetlink_parse_nat_setup); BUG_ON(nf_ct_nat_offset != NULL); - rcu_assign_pointer(nf_ct_nat_offset, nf_nat_get_offset); + RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); return 0; cleanup_extend: @@ -766,9 +766,9 @@ static void __exit nf_nat_cleanup(void) unregister_pernet_subsys(&nf_nat_net_ops); nf_ct_l3proto_put(l3proto); nf_ct_extend_unregister(&nat_extend); - rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); - rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL); - rcu_assign_pointer(nf_ct_nat_offset, NULL); + RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); + RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); + RCU_INIT_POINTER(nf_ct_nat_offset, NULL); synchronize_net(); } diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c index dc73abb3fe27..e462a957d080 100644 --- a/net/ipv4/netfilter/nf_nat_ftp.c +++ b/net/ipv4/netfilter/nf_nat_ftp.c @@ -113,14 +113,14 @@ out: static void __exit nf_nat_ftp_fini(void) { - rcu_assign_pointer(nf_nat_ftp_hook, NULL); + RCU_INIT_POINTER(nf_nat_ftp_hook, NULL); synchronize_rcu(); } static int __init nf_nat_ftp_init(void) { BUG_ON(nf_nat_ftp_hook != NULL); - rcu_assign_pointer(nf_nat_ftp_hook, nf_nat_ftp); + RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 790f3160e012..b9a1136addbd 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -581,30 +581,30 @@ static int __init init(void) BUG_ON(nat_callforwarding_hook != NULL); BUG_ON(nat_q931_hook != NULL); - rcu_assign_pointer(set_h245_addr_hook, set_h245_addr); - rcu_assign_pointer(set_h225_addr_hook, set_h225_addr); - rcu_assign_pointer(set_sig_addr_hook, set_sig_addr); - rcu_assign_pointer(set_ras_addr_hook, set_ras_addr); - rcu_assign_pointer(nat_rtp_rtcp_hook, nat_rtp_rtcp); - rcu_assign_pointer(nat_t120_hook, nat_t120); - rcu_assign_pointer(nat_h245_hook, nat_h245); - rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding); - rcu_assign_pointer(nat_q931_hook, nat_q931); + RCU_INIT_POINTER(set_h245_addr_hook, set_h245_addr); + RCU_INIT_POINTER(set_h225_addr_hook, set_h225_addr); + RCU_INIT_POINTER(set_sig_addr_hook, set_sig_addr); + RCU_INIT_POINTER(set_ras_addr_hook, set_ras_addr); + RCU_INIT_POINTER(nat_rtp_rtcp_hook, nat_rtp_rtcp); + RCU_INIT_POINTER(nat_t120_hook, nat_t120); + 
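The rcu_assign_pointer() to RCU_INIT_POINTER() conversions in these hunks drop a write memory barrier in the cases where it buys nothing: storing NULL, or publishing a pointer whose target needs no prior initialization to be visible to readers. A sketch of the distinction in terms of C11 atomics; this is a userspace model, not the kernel macros, and struct helper / hook are illustrative names:

	#include <stdatomic.h>
	#include <stddef.h>

	struct helper { int ready; };			/* illustrative payload */

	static _Atomic(struct helper *) hook;		/* illustrative hook slot */

	/* rcu_assign_pointer(): release semantics, so a reader that sees
	 * the pointer is guaranteed to also see all prior writes to *p. */
	static void model_rcu_assign_pointer(struct helper *p)
	{
		atomic_store_explicit(&hook, p, memory_order_release);
	}

	/* RCU_INIT_POINTER(): plain store, no barrier. Safe when there is
	 * nothing to order against: storing NULL, or publishing before any
	 * reader can possibly run (module init, or under the readers' lock). */
	static void model_rcu_init_pointer(struct helper *p)
	{
		atomic_store_explicit(&hook, p, memory_order_relaxed);
	}

	int main(void)
	{
		static struct helper h = { .ready = 1 };

		model_rcu_assign_pointer(&h);	/* initialized object: barrier matters */
		model_rcu_init_pointer(NULL);	/* NULL carries no payload: none needed */
		return 0;
	}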
RCU_INIT_POINTER(nat_h245_hook, nat_h245); + RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding); + RCU_INIT_POINTER(nat_q931_hook, nat_q931); return 0; } /****************************************************************************/ static void __exit fini(void) { - rcu_assign_pointer(set_h245_addr_hook, NULL); - rcu_assign_pointer(set_h225_addr_hook, NULL); - rcu_assign_pointer(set_sig_addr_hook, NULL); - rcu_assign_pointer(set_ras_addr_hook, NULL); - rcu_assign_pointer(nat_rtp_rtcp_hook, NULL); - rcu_assign_pointer(nat_t120_hook, NULL); - rcu_assign_pointer(nat_h245_hook, NULL); - rcu_assign_pointer(nat_callforwarding_hook, NULL); - rcu_assign_pointer(nat_q931_hook, NULL); + RCU_INIT_POINTER(set_h245_addr_hook, NULL); + RCU_INIT_POINTER(set_h225_addr_hook, NULL); + RCU_INIT_POINTER(set_sig_addr_hook, NULL); + RCU_INIT_POINTER(set_ras_addr_hook, NULL); + RCU_INIT_POINTER(nat_rtp_rtcp_hook, NULL); + RCU_INIT_POINTER(nat_t120_hook, NULL); + RCU_INIT_POINTER(nat_h245_hook, NULL); + RCU_INIT_POINTER(nat_callforwarding_hook, NULL); + RCU_INIT_POINTER(nat_q931_hook, NULL); synchronize_rcu(); } diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c index 535e1a802356..979ae165f4ef 100644 --- a/net/ipv4/netfilter/nf_nat_irc.c +++ b/net/ipv4/netfilter/nf_nat_irc.c @@ -75,14 +75,14 @@ static unsigned int help(struct sk_buff *skb, static void __exit nf_nat_irc_fini(void) { - rcu_assign_pointer(nf_nat_irc_hook, NULL); + RCU_INIT_POINTER(nf_nat_irc_hook, NULL); synchronize_rcu(); } static int __init nf_nat_irc_init(void) { BUG_ON(nf_nat_irc_hook != NULL); - rcu_assign_pointer(nf_nat_irc_hook, help); + RCU_INIT_POINTER(nf_nat_irc_hook, help); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 4c060038d29f..3e8284ba46b8 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -282,25 +282,25 @@ static int __init nf_nat_helper_pptp_init(void) nf_nat_need_gre(); BUG_ON(nf_nat_pptp_hook_outbound != NULL); - rcu_assign_pointer(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); + RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); BUG_ON(nf_nat_pptp_hook_inbound != NULL); - rcu_assign_pointer(nf_nat_pptp_hook_inbound, pptp_inbound_pkt); + RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt); BUG_ON(nf_nat_pptp_hook_exp_gre != NULL); - rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, pptp_exp_gre); + RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre); BUG_ON(nf_nat_pptp_hook_expectfn != NULL); - rcu_assign_pointer(nf_nat_pptp_hook_expectfn, pptp_nat_expected); + RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected); return 0; } static void __exit nf_nat_helper_pptp_fini(void) { - rcu_assign_pointer(nf_nat_pptp_hook_expectfn, NULL); - rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, NULL); - rcu_assign_pointer(nf_nat_pptp_hook_inbound, NULL); - rcu_assign_pointer(nf_nat_pptp_hook_outbound, NULL); + RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL); + RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL); + RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL); + RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL); synchronize_rcu(); } diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c index 3e61faf23a9a..a3d997618602 100644 --- a/net/ipv4/netfilter/nf_nat_proto_common.c +++ b/net/ipv4/netfilter/nf_nat_proto_common.c @@ -12,6 +12,8 @@ #include <linux/ip.h> #include <linux/netfilter.h> +#include <linux/export.h> +#include <net/secure_seq.h> 
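The *_fini() routines above all rely on the same teardown order: unpublish the hook, then synchronize_rcu(), and only then let the module code and data go away. A compilable userspace model of that ordering; synchronize_grace_period() and nat_hook are stand-ins, not kernel symbols:

	#include <stdatomic.h>
	#include <stddef.h>

	struct nat_ops { void (*help)(void); };		/* stand-in for a helper hook */

	static _Atomic(struct nat_ops *) nat_hook;	/* illustrative slot */

	static void synchronize_grace_period(void)
	{
		/* Stand-in for synchronize_rcu(): in the kernel this blocks
		 * until every RCU read-side critical section that began
		 * before the call has completed. A no-op in this model. */
	}

	static void module_exit_path(void)
	{
		/* 1. Unpublish: readers arriving from now on see NULL. */
		atomic_store_explicit(&nat_hook, NULL, memory_order_relaxed);

		/* 2. Wait out readers that fetched the old pointer first. */
		synchronize_grace_period();

		/* 3. Only now is it safe to free what the hook pointed at. */
	}

	int main(void)
	{
		module_exit_path();
		return 0;
	}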
#include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_rule.h> diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 5744c3ec847c..9f4dc1235dc7 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -8,6 +8,7 @@ #include <linux/types.h> #include <linux/init.h> +#include <linux/export.h> #include <linux/ip.h> #include <linux/icmp.h> diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c index 756331d42661..bd5a80a62a5b 100644 --- a/net/ipv4/netfilter/nf_nat_proto_sctp.c +++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c @@ -10,6 +10,7 @@ #include <linux/init.h> #include <linux/ip.h> #include <linux/sctp.h> +#include <linux/module.h> #include <net/sctp/checksum.h> #include <net/netfilter/nf_nat_protocol.h> diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c index aa460a595d5d..0d67bb80130f 100644 --- a/net/ipv4/netfilter/nf_nat_proto_tcp.c +++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c @@ -8,6 +8,7 @@ #include <linux/types.h> #include <linux/init.h> +#include <linux/export.h> #include <linux/ip.h> #include <linux/tcp.h> diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c index dfe65c7e2925..0b1b8601cba7 100644 --- a/net/ipv4/netfilter/nf_nat_proto_udp.c +++ b/net/ipv4/netfilter/nf_nat_proto_udp.c @@ -7,6 +7,7 @@ */ #include <linux/types.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/udp.h> diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c index 3cc8c8af39ef..f83ef23e2ab7 100644 --- a/net/ipv4/netfilter/nf_nat_proto_udplite.c +++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c @@ -13,6 +13,7 @@ #include <linux/udp.h> #include <linux/netfilter.h> +#include <linux/module.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_protocol.h> diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index e40cf7816fdb..78844d9208f1 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -528,13 +528,13 @@ err1: static void __exit nf_nat_sip_fini(void) { - rcu_assign_pointer(nf_nat_sip_hook, NULL); - rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, NULL); - rcu_assign_pointer(nf_nat_sip_expect_hook, NULL); - rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL); - rcu_assign_pointer(nf_nat_sdp_port_hook, NULL); - rcu_assign_pointer(nf_nat_sdp_session_hook, NULL); - rcu_assign_pointer(nf_nat_sdp_media_hook, NULL); + RCU_INIT_POINTER(nf_nat_sip_hook, NULL); + RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, NULL); + RCU_INIT_POINTER(nf_nat_sip_expect_hook, NULL); + RCU_INIT_POINTER(nf_nat_sdp_addr_hook, NULL); + RCU_INIT_POINTER(nf_nat_sdp_port_hook, NULL); + RCU_INIT_POINTER(nf_nat_sdp_session_hook, NULL); + RCU_INIT_POINTER(nf_nat_sdp_media_hook, NULL); synchronize_rcu(); } @@ -547,13 +547,13 @@ static int __init nf_nat_sip_init(void) BUG_ON(nf_nat_sdp_port_hook != NULL); BUG_ON(nf_nat_sdp_session_hook != NULL); BUG_ON(nf_nat_sdp_media_hook != NULL); - rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip); - rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust); - rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect); - rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr); - rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port); - rcu_assign_pointer(nf_nat_sdp_session_hook, ip_nat_sdp_session); 
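The ulog_alloc_skb() change earlier in this diff adds __GFP_NOWARN because the oversized aggregation buffer is allowed to fail: the function falls back to an allocation sized for the single packet at hand, so the expected large-allocation failure should not warn. A userspace sketch of that two-stage strategy, with malloc() standing in for alloc_skb():

	#include <stdio.h>
	#include <stdlib.h>

	static void *alloc_log_buffer(size_t size, size_t nlbufsiz, size_t *out_len)
	{
		size_t n = size > nlbufsiz ? size : nlbufsiz;	/* max(size, nlbufsiz) */
		void *buf = malloc(n);

		if (!buf && n > size) {
			/* The big aggregation buffer is best-effort; retry
			 * with just enough room for the current packet. */
			n = size;
			buf = malloc(n);
		}
		if (buf)
			*out_len = n;
		return buf;
	}

	int main(void)
	{
		size_t len = 0;
		void *buf = alloc_log_buffer(512, 4096, &len);

		printf("got %zu bytes\n", buf ? len : 0);
		free(buf);
		return 0;
	}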
- rcu_assign_pointer(nf_nat_sdp_media_hook, ip_nat_sdp_media); + RCU_INIT_POINTER(nf_nat_sip_hook, ip_nat_sip); + RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust); + RCU_INIT_POINTER(nf_nat_sip_expect_hook, ip_nat_sip_expect); + RCU_INIT_POINTER(nf_nat_sdp_addr_hook, ip_nat_sdp_addr); + RCU_INIT_POINTER(nf_nat_sdp_port_hook, ip_nat_sdp_port); + RCU_INIT_POINTER(nf_nat_sdp_session_hook, ip_nat_sdp_session); + RCU_INIT_POINTER(nf_nat_sdp_media_hook, ip_nat_sdp_media); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 8812a02078ab..2133c30a4a5f 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -400,11 +400,8 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, *len = 0; *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); - if (*octets == NULL) { - if (net_ratelimit()) - pr_notice("OOM in bsalg (%d)\n", __LINE__); + if (*octets == NULL) return 0; - } ptr = *octets; while (ctx->pointer < eoc) { @@ -451,11 +448,8 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, return 0; *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); - if (*oid == NULL) { - if (net_ratelimit()) - pr_notice("OOM in bsalg (%d)\n", __LINE__); + if (*oid == NULL) return 0; - } optr = *oid; @@ -719,117 +713,103 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, l = 0; switch (type) { - case SNMP_INTEGER: - len = sizeof(long); - if (!asn1_long_decode(ctx, end, &l)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, - GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - if (net_ratelimit()) - pr_notice("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - (*obj)->syntax.l[0] = l; - break; - case SNMP_OCTETSTR: - case SNMP_OPAQUE: - if (!asn1_octets_decode(ctx, end, &p, &len)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, - GFP_ATOMIC); - if (*obj == NULL) { - kfree(p); - kfree(id); - if (net_ratelimit()) - pr_notice("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - memcpy((*obj)->syntax.c, p, len); + case SNMP_INTEGER: + len = sizeof(long); + if (!asn1_long_decode(ctx, end, &l)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + return 0; + } + (*obj)->syntax.l[0] = l; + break; + case SNMP_OCTETSTR: + case SNMP_OPAQUE: + if (!asn1_octets_decode(ctx, end, &p, &len)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { kfree(p); - break; - case SNMP_NULL: - case SNMP_NOSUCHOBJECT: - case SNMP_NOSUCHINSTANCE: - case SNMP_ENDOFMIBVIEW: - len = 0; - *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - if (net_ratelimit()) - pr_notice("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - if (!asn1_null_decode(ctx, end)) { - kfree(id); - kfree(*obj); - *obj = NULL; - return 0; - } - break; - case SNMP_OBJECTID: - if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { - kfree(id); - return 0; - } - len *= sizeof(unsigned long); - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(lp); - kfree(id); - if (net_ratelimit()) - pr_notice("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - memcpy((*obj)->syntax.ul, lp, len); + kfree(id); + return 0; + } + memcpy((*obj)->syntax.c, p, len); + kfree(p); + break; + case SNMP_NULL: + case SNMP_NOSUCHOBJECT: + case SNMP_NOSUCHINSTANCE: + 
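For reference, the ownership rule the reindented snmp_object_decode() cases follow: each case allocates one object sized header-plus-payload, copies the decoded bytes in, and on any failure frees whatever was acquired earlier, so the caller never sees a half-built object. A simplified, compilable sketch under those assumptions (types reduced; the kernel version stores into a union):

	#include <stdlib.h>
	#include <string.h>

	struct snmp_object_stub {
		unsigned int syntax_len;
		unsigned char syntax[];		/* payload follows the header */
	};

	/* On success the object (and 'id') belong to the caller; on failure
	 * everything acquired so far is released, including 'id'. */
	static struct snmp_object_stub *make_object(const unsigned char *payload,
						    unsigned int len,
						    unsigned long *id)
	{
		struct snmp_object_stub *obj = malloc(sizeof(*obj) + len);

		if (!obj) {
			free(id);
			return NULL;
		}
		memcpy(obj->syntax, payload, len);
		obj->syntax_len = len;
		return obj;
	}

	int main(void)
	{
		unsigned long *id = malloc(sizeof(*id));
		unsigned char addr[4] = { 192, 0, 2, 1 };	/* SNMP_IPADDR case */
		struct snmp_object_stub *obj = make_object(addr, sizeof(addr), id);

		if (obj) {
			free(obj);
			free(id);
		}
		return 0;
	}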
case SNMP_ENDOFMIBVIEW: + len = 0; + *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + return 0; + } + if (!asn1_null_decode(ctx, end)) { + kfree(id); + kfree(*obj); + *obj = NULL; + return 0; + } + break; + case SNMP_OBJECTID: + if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { + kfree(id); + return 0; + } + len *= sizeof(unsigned long); + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { kfree(lp); - break; - case SNMP_IPADDR: - if (!asn1_octets_decode(ctx, end, &p, &len)) { - kfree(id); - return 0; - } - if (len != 4) { - kfree(p); - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(p); - kfree(id); - if (net_ratelimit()) - pr_notice("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - memcpy((*obj)->syntax.uc, p, len); + kfree(id); + return 0; + } + memcpy((*obj)->syntax.ul, lp, len); + kfree(lp); + break; + case SNMP_IPADDR: + if (!asn1_octets_decode(ctx, end, &p, &len)) { + kfree(id); + return 0; + } + if (len != 4) { kfree(p); - break; - case SNMP_COUNTER: - case SNMP_GAUGE: - case SNMP_TIMETICKS: - len = sizeof(unsigned long); - if (!asn1_ulong_decode(ctx, end, &ul)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - if (net_ratelimit()) - pr_notice("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - (*obj)->syntax.ul[0] = ul; - break; - default: kfree(id); return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(p); + kfree(id); + return 0; + } + memcpy((*obj)->syntax.uc, p, len); + kfree(p); + break; + case SNMP_COUNTER: + case SNMP_GAUGE: + case SNMP_TIMETICKS: + len = sizeof(unsigned long); + if (!asn1_ulong_decode(ctx, end, &ul)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + return 0; + } + (*obj)->syntax.ul[0] = ul; + break; + default: + kfree(id); + return 0; } (*obj)->syntax_len = len; @@ -1312,7 +1292,7 @@ static int __init nf_nat_snmp_basic_init(void) int ret = 0; BUG_ON(nf_nat_snmp_hook != NULL); - rcu_assign_pointer(nf_nat_snmp_hook, help); + RCU_INIT_POINTER(nf_nat_snmp_hook, help); ret = nf_conntrack_helper_register(&snmp_trap_helper); if (ret < 0) { @@ -1324,7 +1304,7 @@ static int __init nf_nat_snmp_basic_init(void) static void __exit nf_nat_snmp_basic_fini(void) { - rcu_assign_pointer(nf_nat_snmp_hook, NULL); + RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); nf_conntrack_helper_unregister(&snmp_trap_helper); } diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 483b76d042da..92900482edea 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -88,7 +88,7 @@ nf_nat_fn(unsigned int hooknum, /* We never see fragments: conntrack defrags on pre-routing and local-out, and nf_nat_out protects post-routing. */ - NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); + NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); ct = nf_ct_get(skb, &ctinfo); /* Can't track? 
It's not due to stress, or conntrack would @@ -284,7 +284,7 @@ static int __init nf_nat_standalone_init(void) #ifdef CONFIG_XFRM BUG_ON(ip_nat_decode_session != NULL); - rcu_assign_pointer(ip_nat_decode_session, nat_decode_session); + RCU_INIT_POINTER(ip_nat_decode_session, nat_decode_session); #endif ret = nf_nat_rule_init(); if (ret < 0) { @@ -302,7 +302,7 @@ static int __init nf_nat_standalone_init(void) nf_nat_rule_cleanup(); cleanup_decode_session: #ifdef CONFIG_XFRM - rcu_assign_pointer(ip_nat_decode_session, NULL); + RCU_INIT_POINTER(ip_nat_decode_session, NULL); synchronize_net(); #endif return ret; @@ -313,7 +313,7 @@ static void __exit nf_nat_standalone_fini(void) nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); nf_nat_rule_cleanup(); #ifdef CONFIG_XFRM - rcu_assign_pointer(ip_nat_decode_session, NULL); + RCU_INIT_POINTER(ip_nat_decode_session, NULL); synchronize_net(); #endif /* Conntrack caches are unregistered in nf_conntrack_cleanup */ diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c index 7274a43c7a12..a2901bf829c0 100644 --- a/net/ipv4/netfilter/nf_nat_tftp.c +++ b/net/ipv4/netfilter/nf_nat_tftp.c @@ -36,14 +36,14 @@ static unsigned int help(struct sk_buff *skb, static void __exit nf_nat_tftp_fini(void) { - rcu_assign_pointer(nf_nat_tftp_hook, NULL); + RCU_INIT_POINTER(nf_nat_tftp_hook, NULL); synchronize_rcu(); } static int __init nf_nat_tftp_init(void) { BUG_ON(nf_nat_tftp_hook != NULL); - rcu_assign_pointer(nf_nat_tftp_hook, help); + RCU_INIT_POINTER(nf_nat_tftp_hook, help); return 0; } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 39b403f854c6..43d4c3b22369 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -39,6 +39,7 @@ #include <net/protocol.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> +#include <linux/export.h> #include <net/sock.h> #include <net/ping.h> #include <net/udp.h> @@ -338,7 +339,6 @@ void ping_err(struct sk_buff *skb, u32 info) sk = ping_v4_lookup(net, iph->daddr, iph->saddr, ntohs(icmph->un.echo.id), skb->dev->ifindex); if (sk == NULL) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); pr_debug("no socket, dropping\n"); return; /* No socket for error */ } @@ -678,7 +678,6 @@ static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", inet_sk(sk), inet_sk(sk)->inet_num, skb); if (sock_queue_rcv_skb(sk, skb) < 0) { - ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS); kfree_skb(skb); pr_debug("ping_queue_rcv_skb -> failed\n"); return -1; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b14ec7d03b6e..466ea8bb7a4d 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -42,6 +42,7 @@ #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/export.h> #include <net/sock.h> #include <net/raw.h> @@ -254,6 +255,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), + SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES), + SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index c9893d43242e..007e2eb769d3 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -38,7 +38,7 @@ */ #include <linux/types.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/byteorder.h> #include <asm/current.h> #include 
<asm/uaccess.h> @@ -48,6 +48,7 @@ #include <linux/errno.h> #include <linux/aio.h> #include <linux/kernel.h> +#include <linux/export.h> #include <linux/spinlock.h> #include <linux/sockios.h> #include <linux/socket.h> @@ -563,7 +564,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - FLOWI_FLAG_CAN_SLEEP, daddr, saddr, 0, 0); + inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP, + daddr, saddr, 0, 0); if (!inet->hdrincl) { err = raw_probe_proto_opt(&fl4, msg); @@ -825,28 +827,28 @@ static int compat_raw_getsockopt(struct sock *sk, int level, int optname, static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) { switch (cmd) { - case SIOCOUTQ: { - int amount = sk_wmem_alloc_get(sk); + case SIOCOUTQ: { + int amount = sk_wmem_alloc_get(sk); - return put_user(amount, (int __user *)arg); - } - case SIOCINQ: { - struct sk_buff *skb; - int amount = 0; - - spin_lock_bh(&sk->sk_receive_queue.lock); - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) - amount = skb->len; - spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); - } + return put_user(amount, (int __user *)arg); + } + case SIOCINQ: { + struct sk_buff *skb; + int amount = 0; + + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) + amount = skb->len; + spin_unlock_bh(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } - default: + default: #ifdef CONFIG_IP_MROUTE - return ipmr_ioctl(sk, cmd, (void __user *)arg); + return ipmr_ioctl(sk, cmd, (void __user *)arg); #else - return -ENOIOCTLCMD; + return -ENOIOCTLCMD; #endif } } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index aa13ef105110..0c74da8a0473 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -108,6 +108,8 @@ #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif +#include <net/atmclip.h> +#include <net/secure_seq.h> #define RT_FL_TOS(oldflp4) \ ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) @@ -118,7 +120,6 @@ static int ip_rt_max_size; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; -static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_redirect_number __read_mostly = 9; static int ip_rt_redirect_load __read_mostly = HZ / 50; @@ -184,6 +185,8 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) return p; } +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr); + static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), @@ -198,6 +201,7 @@ static struct dst_ops ipv4_dst_ops = { .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .local_out = __ip_local_out, + .neigh_lookup = ipv4_neigh_lookup, }; #define ECN_OR_COST(class) TC_PRIO_##class @@ -319,7 +323,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) struct rtable *r = NULL; for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { - if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)) + if (!rcu_access_pointer(rt_hash_table[st->bucket].chain)) continue; rcu_read_lock_bh(); r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); @@ -345,7 +349,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, do { if (--st->bucket < 0) return NULL; - } while 
(!rcu_dereference_raw(rt_hash_table[st->bucket].chain)); + } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain)); rcu_read_lock_bh(); r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); } @@ -411,8 +415,10 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) "HHUptod\tSpecDst"); else { struct rtable *r = v; + struct neighbour *n; int len; + n = dst_get_neighbour(&r->dst); seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", r->dst.dev ? r->dst.dev->name : "*", @@ -425,9 +431,8 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + dst_metric(&r->dst, RTAX_RTTVAR)), r->rt_key_tos, - r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, - r->dst.hh ? (r->dst.hh->hh_output == - dev_queue_xmit) : 0, + -1, + (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0, r->rt_spec_dst, &len); seq_printf(seq, "%*s\n", 127 - len, ""); @@ -716,7 +721,7 @@ static inline bool compare_hash_inputs(const struct rtable *rt1, { return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | - (rt1->rt_iif ^ rt2->rt_iif)) == 0); + (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0); } static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) @@ -725,8 +730,8 @@ static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | (rt1->rt_mark ^ rt2->rt_mark) | (rt1->rt_key_tos ^ rt2->rt_key_tos) | - (rt1->rt_oif ^ rt2->rt_oif) | - (rt1->rt_iif ^ rt2->rt_iif)) == 0; + (rt1->rt_route_iif ^ rt2->rt_route_iif) | + (rt1->rt_oif ^ rt2->rt_oif)) == 0; } static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) @@ -755,7 +760,7 @@ static void rt_do_flush(struct net *net, int process_context) if (process_context && need_resched()) cond_resched(); - rth = rcu_dereference_raw(rt_hash_table[i].chain); + rth = rcu_access_pointer(rt_hash_table[i].chain); if (!rth) continue; @@ -1006,6 +1011,37 @@ static int slow_chain_length(const struct rtable *head) return length >> FRACT_BITS; } +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) +{ + struct neigh_table *tbl = &arp_tbl; + static const __be32 inaddr_any = 0; + struct net_device *dev = dst->dev; + const __be32 *pkey = daddr; + struct neighbour *n; + +#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) + if (dev->type == ARPHRD_ATM) + tbl = clip_tbl_hook; +#endif + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + pkey = &inaddr_any; + + n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey); + if (n) + return n; + return neigh_create(tbl, pkey, dev); +} + +static int rt_bind_neighbour(struct rtable *rt) +{ + struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); + if (IS_ERR(n)) + return PTR_ERR(n); + dst_set_neighbour(&rt->dst, n); + + return 0; +} + static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, struct sk_buff *skb, int ifindex) { @@ -1042,7 +1078,7 @@ restart: rt->dst.flags |= DST_NOCACHE; if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { - int err = arp_bind_neighbour(&rt->dst); + int err = rt_bind_neighbour(rt); if (err) { if (net_ratelimit()) printk(KERN_WARNING @@ -1138,7 +1174,7 @@ restart: route or unicast forwarding path. 
*/ if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { - int err = arp_bind_neighbour(&rt->dst); + int err = rt_bind_neighbour(rt); if (err) { spin_unlock_bh(rt_hash_lock_addr(hash)); @@ -1268,11 +1304,42 @@ static void rt_del(unsigned hash, struct rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash)); } +static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +{ + struct rtable *rt = (struct rtable *) dst; + __be32 orig_gw = rt->rt_gateway; + struct neighbour *n, *old_n; + + dst_confirm(&rt->dst); + + rt->rt_gateway = peer->redirect_learned.a4; + + n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); + if (IS_ERR(n)) + return PTR_ERR(n); + old_n = xchg(&rt->dst._neighbour, n); + if (old_n) + neigh_release(old_n); + if (!n || !(n->nud_state & NUD_VALID)) { + if (n) + neigh_event_send(n, NULL); + rt->rt_gateway = orig_gw; + return -EAGAIN; + } else { + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); + } + return 0; +} + /* called in rcu_read_lock() section */ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { + int s, i; struct in_device *in_dev = __in_dev_get_rcu(dev); + __be32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; struct inet_peer *peer; struct net *net; @@ -1295,13 +1362,43 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - peer = inet_getpeer_v4(daddr, 1); - if (peer) { - peer->redirect_learned.a4 = new_gw; + for (s = 0; s < 2; s++) { + for (i = 0; i < 2; i++) { + unsigned int hash; + struct rtable __rcu **rthp; + struct rtable *rt; - inet_putpeer(peer); + hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); + + rthp = &rt_hash_table[hash].chain; + + while ((rt = rcu_dereference(*rthp)) != NULL) { + rthp = &rt->dst.rt_next; + + if (rt->rt_key_dst != daddr || + rt->rt_key_src != skeys[s] || + rt->rt_oif != ikeys[i] || + rt_is_input_route(rt) || + rt_is_expired(rt) || + !net_eq(dev_net(rt->dst.dev), net) || + rt->dst.error || + rt->dst.dev != dev || + rt->rt_gateway != old_gw) + continue; - atomic_inc(&__rt_peer_genid); + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + + peer = rt->peer; + if (peer) { + if (peer->redirect_learned.a4 != new_gw) { + peer->redirect_learned.a4 = new_gw; + atomic_inc(&__rt_peer_genid); + } + check_peer_redir(&rt->dst, peer); + } + } + } } return; @@ -1439,20 +1536,20 @@ static int ip_error(struct sk_buff *skb) int code; switch (rt->dst.error) { - case EINVAL: - default: - goto out; - case EHOSTUNREACH: - code = ICMP_HOST_UNREACH; - break; - case ENETUNREACH: - code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(dev_net(rt->dst.dev), - IPSTATS_MIB_INNOROUTES); - break; - case EACCES: - code = ICMP_PKT_FILTERED; - break; + case EINVAL: + default: + goto out; + case EHOSTUNREACH: + code = ICMP_HOST_UNREACH; + break; + case ENETUNREACH: + code = ICMP_NET_UNREACH; + IP_INC_STATS_BH(dev_net(rt->dst.dev), + IPSTATS_MIB_INNOROUTES); + break; + case EACCES: + code = ICMP_PKT_FILTERED; + break; } if (!rt->peer) @@ -1531,11 +1628,10 @@ unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, est_mtu = mtu; peer->pmtu_learned = mtu; peer->pmtu_expires = pmtu_expires; + atomic_inc(&__rt_peer_genid); } inet_putpeer(peer); - - atomic_inc(&__rt_peer_genid); } return est_mtu ? 
: new_mtu; } @@ -1588,30 +1684,6 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) } } -static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) -{ - struct rtable *rt = (struct rtable *) dst; - __be32 orig_gw = rt->rt_gateway; - - dst_confirm(&rt->dst); - - neigh_release(rt->dst.neighbour); - rt->dst.neighbour = NULL; - - rt->rt_gateway = peer->redirect_learned.a4; - if (arp_bind_neighbour(&rt->dst) || - !(rt->dst.neighbour->nud_state & NUD_VALID)) { - if (rt->dst.neighbour) - neigh_event_send(rt->dst.neighbour, NULL); - rt->rt_gateway = orig_gw; - return -EAGAIN; - } else { - rt->rt_flags |= RTCF_REDIRECTED; - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, - rt->dst.neighbour); - } - return 0; -} static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { @@ -1703,7 +1775,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) memset(&fl4, 0, sizeof(fl4)); fl4.daddr = iph->daddr; fl4.saddr = iph->saddr; - fl4.flowi4_tos = iph->tos; + fl4.flowi4_tos = RT_TOS(iph->tos); fl4.flowi4_oif = rt->dst.dev->ifindex; fl4.flowi4_iif = skb->dev->ifindex; fl4.flowi4_mark = skb->mark; @@ -2280,8 +2352,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth = rcu_dereference(rth->dst.rt_next)) { if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | - (rth->rt_iif ^ iif) | - rth->rt_oif | + (rth->rt_route_iif ^ iif) | (rth->rt_key_tos ^ tos)) == 0 && rth->rt_mark == skb->mark && net_eq(dev_net(rth->dst.dev), net) && @@ -2708,6 +2779,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { .default_advmss = ipv4_default_advmss, .update_pmtu = ipv4_rt_blackhole_update_pmtu, .cow_metrics = ipv4_rt_blackhole_cow_metrics, + .neigh_lookup = ipv4_neigh_lookup, }; struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) @@ -2781,7 +2853,7 @@ static int rt_fill_info(struct net *net, struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; - long expires = 0; + unsigned long expires = 0; const struct inet_peer *peer = rt->peer; u32 id = 0, ts = 0, tsage = 0, error; @@ -2838,8 +2910,12 @@ static int rt_fill_info(struct net *net, tsage = get_seconds() - peer->tcp_ts_stamp; } expires = ACCESS_ONCE(peer->pmtu_expires); - if (expires) - expires -= jiffies; + if (expires) { + if (time_before(jiffies, expires)) + expires -= jiffies; + else + expires = 0; + } } if (rt_is_input_route(rt)) { @@ -3081,13 +3157,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec_jiffies, }, { - .procname = "gc_interval", - .data = &ip_rt_gc_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { .procname = "redirect_load", .data = &ip_rt_redirect_load, .maxlen = sizeof(int), @@ -3303,7 +3372,7 @@ int __init ip_rt_init(void) xfrm_init(); xfrm4_init(ip_rt_max_size); #endif - rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 26461492a847..90f6544c13e2 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -15,6 +15,7 @@ #include <linux/random.h> #include <linux/cryptohash.h> #include <linux/kernel.h> +#include <linux/export.h> #include <net/tcp.h> #include <net/route.h> @@ -265,7 +266,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, 
struct ip_options *opt) { struct tcp_options_received tcp_opt; - u8 *hash_location; + const u8 *hash_location; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct tcp_sock *tp = tcp_sk(sk); @@ -276,7 +277,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, int mss; struct rtable *rt; __u8 rcv_wscale; - bool ecn_ok; + bool ecn_ok = false; if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; @@ -316,6 +317,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; + treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 57d0752e239a..69fd7201129a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -398,20 +398,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_jiffies, }, { - .procname = "inet_peer_gc_mintime", - .data = &inet_peer_gc_mintime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "inet_peer_gc_maxtime", - .data = &inet_peer_gc_maxtime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { .procname = "tcp_orphan_retries", .data = &sysctl_tcp_orphan_retries, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 46febcacb729..34f5db1e1c8b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -374,7 +374,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { unsigned int mask; struct sock *sk = sock->sk; - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == TCP_LISTEN) @@ -524,11 +524,11 @@ EXPORT_SYMBOL(tcp_ioctl); static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { - TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; tp->pushed_seq = tp->write_seq; } -static inline int forced_push(struct tcp_sock *tp) +static inline int forced_push(const struct tcp_sock *tp) { return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } @@ -540,7 +540,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) skb->csum = 0; tcb->seq = tcb->end_seq = tp->write_seq; - tcb->flags = TCPHDR_ACK; + tcb->tcp_flags = TCPHDR_ACK; tcb->sacked = 0; skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); @@ -813,7 +813,7 @@ new_segment: goto wait_for_memory; if (can_coalesce) { - skb_shinfo(skb)->frags[i - 1].size += copy; + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { get_page(page); skb_fill_page_desc(skb, i, page, offset, copy); @@ -830,7 +830,7 @@ new_segment: skb_shinfo(skb)->gso_segs = 0; if (!copied) - TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; copied += copy; poffset += copy; @@ -891,9 +891,9 @@ EXPORT_SYMBOL(tcp_sendpage); #define TCP_PAGE(sk) (sk->sk_sndmsg_page) #define TCP_OFF(sk) (sk->sk_sndmsg_off) -static inline int select_size(struct sock *sk, int sg) +static inline int select_size(const struct sock *sk, int sg) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sg) { @@ -1058,8 +1058,7 @@ new_segment: /* Update the skb. 
*/ if (merge) { - skb_shinfo(skb)->frags[i - 1].size += - copy; + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { skb_fill_page_desc(skb, i, page, off, copy); if (TCP_PAGE(sk)) { @@ -1074,7 +1073,7 @@ new_segment: } if (!copied) - TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; @@ -1194,13 +1193,11 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) struct tcp_sock *tp = tcp_sk(sk); int time_to_ack = 0; -#if TCP_DEBUG struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); -#endif if (inet_csk_ack_scheduled(sk)) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2409,7 +2406,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) return icsk->icsk_af_ops->setsockopt(sk, level, optname, @@ -2431,9 +2428,9 @@ EXPORT_SYMBOL(compat_tcp_setsockopt); #endif /* Return information about state of tcp endpoint in API format. */ -void tcp_get_info(struct sock *sk, struct tcp_info *info) +void tcp_get_info(const struct sock *sk, struct tcp_info *info) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; @@ -2455,8 +2452,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; } - if (tp->ecn_flags&TCP_ECN_OK) + if (tp->ecn_flags & TCP_ECN_OK) info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); @@ -2857,26 +2856,25 @@ EXPORT_SYMBOL(tcp_gro_complete); #ifdef CONFIG_TCP_MD5SIG static unsigned long tcp_md5sig_users; -static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool; +static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool; static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); -static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool) +static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) { int cpu; + for_each_possible_cpu(cpu) { - struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu); - if (p) { - if (p->md5_desc.tfm) - crypto_free_hash(p->md5_desc.tfm); - kfree(p); - } + struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu); + + if (p->md5_desc.tfm) + crypto_free_hash(p->md5_desc.tfm); } free_percpu(pool); } void tcp_free_md5sig_pool(void) { - struct tcp_md5sig_pool * __percpu *pool = NULL; + struct tcp_md5sig_pool __percpu *pool = NULL; spin_lock_bh(&tcp_md5sig_pool_lock); if (--tcp_md5sig_users == 0) { @@ -2889,30 +2887,24 @@ void tcp_free_md5sig_pool(void) } EXPORT_SYMBOL(tcp_free_md5sig_pool); -static struct tcp_md5sig_pool * __percpu * +static struct tcp_md5sig_pool __percpu * __tcp_alloc_md5sig_pool(struct sock *sk) { int cpu; - struct tcp_md5sig_pool * __percpu *pool; + struct tcp_md5sig_pool __percpu *pool; - pool = alloc_percpu(struct tcp_md5sig_pool *); + pool = alloc_percpu(struct tcp_md5sig_pool); if (!pool) return NULL; for_each_possible_cpu(cpu) { - struct tcp_md5sig_pool *p; struct crypto_hash *hash; - p = 
kzalloc(sizeof(*p), sk->sk_allocation); - if (!p) - goto out_free; - *per_cpu_ptr(pool, cpu) = p; - hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); if (!hash || IS_ERR(hash)) goto out_free; - p->md5_desc.tfm = hash; + per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; } return pool; out_free: @@ -2920,9 +2912,9 @@ out_free: return NULL; } -struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk) +struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk) { - struct tcp_md5sig_pool * __percpu *pool; + struct tcp_md5sig_pool __percpu *pool; int alloc = 0; retry: @@ -2941,7 +2933,7 @@ retry: if (alloc) { /* we cannot hold spinlock here because this may sleep. */ - struct tcp_md5sig_pool * __percpu *p; + struct tcp_md5sig_pool __percpu *p; p = __tcp_alloc_md5sig_pool(sk); spin_lock_bh(&tcp_md5sig_pool_lock); @@ -2974,7 +2966,7 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool); */ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) { - struct tcp_md5sig_pool * __percpu *p; + struct tcp_md5sig_pool __percpu *p; local_bh_disable(); @@ -2985,7 +2977,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) spin_unlock(&tcp_md5sig_pool_lock); if (p) - return *this_cpu_ptr(p); + return this_cpu_ptr(p); local_bh_enable(); return NULL; @@ -3000,23 +2992,25 @@ void tcp_put_md5sig_pool(void) EXPORT_SYMBOL(tcp_put_md5sig_pool); int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, - struct tcphdr *th) + const struct tcphdr *th) { struct scatterlist sg; + struct tcphdr hdr; int err; - __sum16 old_checksum = th->check; - th->check = 0; + /* We are not allowed to change tcphdr, make a local copy */ + memcpy(&hdr, th, sizeof(hdr)); + hdr.check = 0; + /* options aren't included in the hash */ - sg_init_one(&sg, th, sizeof(struct tcphdr)); - err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); - th->check = old_checksum; + sg_init_one(&sg, &hdr, sizeof(hdr)); + err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr)); return err; } EXPORT_SYMBOL(tcp_md5_hash_header); int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, - struct sk_buff *skb, unsigned header_len) + const struct sk_buff *skb, unsigned int header_len) { struct scatterlist sg; const struct tcphdr *tp = tcp_hdr(skb); @@ -3035,8 +3029,9 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, for (i = 0; i < shi->nr_frags; ++i) { const struct skb_frag_struct *f = &shi->frags[i]; - sg_set_page(&sg, f->page, f->size, f->page_offset); - if (crypto_hash_update(desc, &sg, f->size)) + struct page *page = skb_frag_page(f); + sg_set_page(&sg, page, skb_frag_size(f), f->page_offset); + if (crypto_hash_update(desc, &sg, skb_frag_size(f))) return 1; } @@ -3048,7 +3043,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, } EXPORT_SYMBOL(tcp_md5_hash_skb_data); -int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) +int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) { struct scatterlist sg; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bef9f04c22ba..52b5c2d0ecd0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -206,7 +206,7 @@ static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } -static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb) +static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) { if (tcp_hdr(skb)->cwr) tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; @@ -217,32 +217,41 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) 
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } -static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) +static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) { - if (tp->ecn_flags & TCP_ECN_OK) { - if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + if (!(tp->ecn_flags & TCP_ECN_OK)) + return; + + switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: /* Funny extension: if ECT is not set on a segment, - * it is surely retransmit. It is not in ECN RFC, - * but Linux follows this rule. */ - else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) + * and we already seen ECT on a previous segment, + * it is probably a retransmit. + */ + if (tp->ecn_flags & TCP_ECN_SEEN) tcp_enter_quickack_mode((struct sock *)tp); + break; + case INET_ECN_CE: + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + /* fallinto */ + default: + tp->ecn_flags |= TCP_ECN_SEEN; } } -static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th) +static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } -static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th) +static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) { if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } -static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) +static inline int TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) return 1; @@ -256,14 +265,11 @@ static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) static void tcp_fixup_sndbuf(struct sock *sk) { - int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + - sizeof(struct sk_buff); + int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER); - if (sk->sk_sndbuf < 3 * sndmem) { - sk->sk_sndbuf = 3 * sndmem; - if (sk->sk_sndbuf > sysctl_tcp_wmem[2]) - sk->sk_sndbuf = sysctl_tcp_wmem[2]; - } + sndmem *= TCP_INIT_CWND; + if (sk->sk_sndbuf < sndmem) + sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -309,7 +315,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) return 0; } -static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -339,17 +345,24 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) static void tcp_fixup_rcvbuf(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); + u32 mss = tcp_sk(sk)->advmss; + u32 icwnd = TCP_DEFAULT_INIT_RCVWND; + int rcvmem; - /* Try to select rcvbuf so that 4 mss-sized segments - * will fit to window and corresponding skbs will fit to our rcvbuf. - * (was 3; 4 is minimum to allow fast retransmit to work.) + /* Limit to 10 segments if mss <= 1460, + * or 14600/mss segments, with a minimum of two segments. 
*/ - while (tcp_win_from_space(rcvmem) < tp->advmss) + if (mss > 1460) + icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); + + rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER); + while (tcp_win_from_space(rcvmem) < mss) rcvmem += 128; - if (sk->sk_rcvbuf < 4 * rcvmem) - sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); + + rcvmem *= icwnd; + + if (sk->sk_rcvbuf < rcvmem) + sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); } /* 4. Try to fixup all. It is made immediately after connection enters @@ -416,7 +429,7 @@ static void tcp_clamp_window(struct sock *sk) */ void tcp_initialize_rcv_mss(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd / 2); @@ -531,8 +544,7 @@ void tcp_rcv_space_adjust(struct sock *sk) space /= tp->advmss; if (!space) space = 1; - rcvmem = (tp->advmss + MAX_TCP_HEADER + - 16 + sizeof(struct sk_buff)); + rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); while (tcp_win_from_space(rcvmem) < tp->advmss) rcvmem += 128; space *= rcvmem; @@ -812,7 +824,7 @@ void tcp_update_metrics(struct sock *sk) } } -__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) +__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); @@ -880,6 +892,11 @@ static void tcp_init_metrics(struct sock *sk) tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); if (tp->snd_ssthresh > tp->snd_cwnd_clamp) tp->snd_ssthresh = tp->snd_cwnd_clamp; + } else { + /* ssthresh may have been reduced unnecessarily during. + * 3WHS. Restore it back to its initial default. + */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; } if (dst_metric(dst, RTAX_REORDERING) && tp->reordering != dst_metric(dst, RTAX_REORDERING)) { @@ -887,10 +904,7 @@ static void tcp_init_metrics(struct sock *sk) tp->reordering = dst_metric(dst, RTAX_REORDERING); } - if (dst_metric(dst, RTAX_RTT) == 0) - goto reset; - - if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) + if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. @@ -916,19 +930,26 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); } tcp_set_rto(sk); - if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) { reset: - /* Play conservative. If timestamps are not - * supported, TCP will fail to recalculate correct - * rtt, if initial rto is too small. FORGET ALL AND RESET! + if (tp->srtt == 0) { + /* RFC2988bis: We've failed to get a valid RTT sample from + * 3WHS. This is most likely due to retransmission, + * including spurious one. Reset the RTO back to 3secs + * from the more aggressive 1sec to avoid more spurious + * retransmission. */ - if (!tp->rx_opt.saw_tstamp && tp->srtt) { - tp->srtt = 0; - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; - } + tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; } - tp->snd_cwnd = tcp_init_cwnd(tp, dst); + /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been + * retransmitted. In light of RFC2988bis' more aggressive 1sec + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK + * retransmission has occurred. 
+ */ + if (tp->total_retrans > 1) + tp->snd_cwnd = 1; + else + tp->snd_cwnd = tcp_init_cwnd(tp, dst); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -1115,7 +1136,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, return 0; /* ...Then it's D-SACK, and must reside below snd_una completely */ - if (!after(end_seq, tp->snd_una)) + if (after(end_seq, tp->snd_una)) return 0; if (!before(start_seq, tp->undo_marker)) @@ -1195,7 +1216,7 @@ static void tcp_mark_lost_retrans(struct sock *sk) tp->lost_retrans_low = new_low_seq; } -static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, +static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) { @@ -1289,7 +1310,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } -static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, +static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, struct tcp_sacktag_state *state, int dup_sack, int pcount) { @@ -1380,9 +1401,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, BUG_ON(!pcount); - /* Tweak before seqno plays */ - if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint && - !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) + if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; TCP_SKB_CB(prev)->end_seq += shifted; @@ -1431,7 +1450,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tp->lost_cnt_hint -= tcp_skb_pcount(prev); } - TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags; + TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags; if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); @@ -1446,13 +1465,13 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, /* I wish gso_size would have a bit more sane initialization than * something-or-zero which complicates things */ -static int tcp_skb_seglen(struct sk_buff *skb) +static int tcp_skb_seglen(const struct sk_buff *skb) { return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); } /* Shifting pages past head area doesn't work */ -static int skb_can_shift(struct sk_buff *skb) +static int skb_can_shift(const struct sk_buff *skb) { return !skb_headlen(skb) && skb_is_nonlinear(skb); } @@ -1701,19 +1720,19 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, return skb; } -static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache) +static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) { return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); } static int -tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, +tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, u32 prior_snd_una) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - unsigned char *ptr = (skb_transport_header(ack_skb) + - TCP_SKB_CB(ack_skb)->sacked); + const unsigned char *ptr = (skb_transport_header(ack_skb) + + TCP_SKB_CB(ack_skb)->sacked); struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); struct tcp_sack_block sp[TCP_NUM_SACKS]; struct tcp_sack_block *cache; @@ -2277,7 +2296,7 @@ static int tcp_check_sack_reneging(struct sock *sk, int flag) return 0; } -static inline int tcp_fackets_out(struct tcp_sock *tp) +static inline int tcp_fackets_out(const struct tcp_sock *tp) { return tcp_is_reno(tp) ? 
tp->sacked_out + 1 : tp->fackets_out; } @@ -2297,19 +2316,20 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) * they differ. Since neither occurs due to loss, TCP should really * ignore them. */ -static inline int tcp_dupack_heuristics(struct tcp_sock *tp) +static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) { return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } -static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) +static inline int tcp_skb_timedout(const struct sock *sk, + const struct sk_buff *skb) { return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; } -static inline int tcp_head_timedout(struct sock *sk) +static inline int tcp_head_timedout(const struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); return tp->packets_out && tcp_skb_timedout(sk, tcp_write_queue_head(sk)); @@ -2620,7 +2640,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag) /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ -static inline int tcp_packet_delayed(struct tcp_sock *tp) +static inline int tcp_packet_delayed(const struct tcp_sock *tp) { return !tp->retrans_stamp || (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -2681,7 +2701,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) tp->snd_cwnd_stamp = tcp_time_stamp; } -static inline int tcp_may_undo(struct tcp_sock *tp) +static inline int tcp_may_undo(const struct tcp_sock *tp) { return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } @@ -2745,9 +2765,9 @@ static void tcp_try_undo_dsack(struct sock *sk) * that successive retransmissions of a segment must not advance * retrans_stamp under any conditions. */ -static int tcp_any_retrans_done(struct sock *sk) +static int tcp_any_retrans_done(const struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; if (tp->retrans_out) @@ -2821,9 +2841,13 @@ static int tcp_try_undo_loss(struct sock *sk) static inline void tcp_complete_cwr(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - /* Do not moderate cwnd if it's already undone in cwr or recovery */ - if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) { - tp->snd_cwnd = tp->snd_ssthresh; + + /* Do not moderate cwnd if it's already undone in cwr or recovery. */ + if (tp->undo_marker) { + if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + else /* PRR */ + tp->snd_cwnd = tp->snd_ssthresh; tp->snd_cwnd_stamp = tcp_time_stamp; } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); @@ -2941,6 +2965,38 @@ void tcp_simple_retransmit(struct sock *sk) } EXPORT_SYMBOL(tcp_simple_retransmit); +/* This function implements the PRR algorithm, specifically the PRR-SSRB + * (proportional rate reduction with slow start reduction bound) as described in + * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt. + * It computes the number of packets to send (sndcnt) based on packets newly + * delivered: + * 1) If the number of packets in flight is larger than ssthresh, PRR spreads the + * cwnd reductions across a full RTT. + * 2) If the number of packets in flight is lower than ssthresh (such as due to excess + * losses and/or application stalls), do not perform any further cwnd + * reductions, but instead slow start up to ssthresh. 
+ */ +static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked, + int fast_rexmit, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + int sndcnt = 0; + int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); + + if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { + u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + + tp->prior_cwnd - 1; + sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; + } else { + sndcnt = min_t(int, delta, + max_t(int, tp->prr_delivered - tp->prr_out, + newly_acked_sacked) + 1); + } + + sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); + tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2952,7 +3008,8 @@ EXPORT_SYMBOL(tcp_simple_retransmit); * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ -static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) +static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, + int newly_acked_sacked, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3102,22 +3159,27 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; + tp->prior_cwnd = tp->snd_cwnd; + tp->prr_delivered = 0; + tp->prr_out = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); fast_rexmit = 1; } if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) tcp_update_scoreboard(sk, fast_rexmit); - tcp_cwnd_down(sk, flag); + tp->prr_delivered += newly_acked_sacked; + tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag); tcp_xmit_retransmit_queue(sk); } -static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) +void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) { tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; } +EXPORT_SYMBOL(tcp_valid_rtt_meas); /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Supersedes RFC1323) @@ -3184,7 +3246,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) */ static void tcp_rearm_rto(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); @@ -3288,7 +3350,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ - if (!(scb->flags & TCPHDR_SYN)) { + if (!(scb->tcp_flags & TCPHDR_SYN)) { flag |= FLAG_DATA_ACKED; } else { flag |= FLAG_SYN_ACKED; @@ -3436,7 +3498,7 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp, * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 * and in FreeBSD. NetBSD's one is even worse.) is wrong. */ -static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, +static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, u32 ack_seq) { struct tcp_sock *tp = tcp_sk(sk); @@ -3612,7 +3674,7 @@ static int tcp_process_frto(struct sock *sk, int flag) } /* This routine deals with incoming acks, but not outgoing ones. 
*/ -static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) +static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3622,6 +3684,8 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 prior_in_flight; u32 prior_fackets; int prior_packets; + int prior_sacked = tp->sacked_out; + int newly_acked_sacked = 0; int frto_cwnd = 0; /* If the ack is older than previous acks @@ -3693,6 +3757,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); + newly_acked_sacked = (prior_packets - prior_sacked) - + (tp->packets_out - tp->sacked_out); + if (tp->frto_counter) frto_cwnd = tcp_process_frto(sk, flag); /* Guarantee sacktag reordering detection against wrap-arounds */ @@ -3705,7 +3772,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight); tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, - flag); + newly_acked_sacked, flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) tcp_cong_avoid(sk, ack, prior_in_flight); @@ -3744,14 +3811,14 @@ old_ack: * But, this can also be called on packets in the established flow when * the fast version below fails. */ -void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, - u8 **hvpp, int estab) +void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, + const u8 **hvpp, int estab) { - unsigned char *ptr; - struct tcphdr *th = tcp_hdr(skb); + const unsigned char *ptr; + const struct tcphdr *th = tcp_hdr(skb); int length = (th->doff * 4) - sizeof(struct tcphdr); - ptr = (unsigned char *)(th + 1); + ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; while (length > 0) { @@ -3862,9 +3929,9 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } EXPORT_SYMBOL(tcp_parse_options); -static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) +static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) { - __be32 *ptr = (__be32 *)(th + 1); + const __be32 *ptr = (const __be32 *)(th + 1); if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { @@ -3881,8 +3948,9 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ -static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, - struct tcp_sock *tp, u8 **hvpp) +static int tcp_fast_parse_options(const struct sk_buff *skb, + const struct tcphdr *th, + struct tcp_sock *tp, const u8 **hvpp) { /* In the spirit of fast parsing, compare doff directly to constant * values. Because equality is used, short doff can be ignored here. 
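The PRR-SSRB arithmetic added in tcp_update_cwnd_in_recovery() above is dense, so here is a stand-alone sketch of the same two branches in isolation. It is an illustrative user-space re-implementation, not the kernel code: struct prr_state and prr_update_cwnd() are hypothetical names local to this sketch, and the fields merely mirror the tcp_sock members the patch uses.

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the tcp_sock fields PRR-SSRB reads and writes. */
struct prr_state {
	uint32_t snd_ssthresh;	/* ssthresh set at the start of recovery */
	uint32_t prior_cwnd;	/* cwnd just before recovery was entered */
	uint32_t prr_delivered;	/* packets newly delivered during recovery */
	uint32_t prr_out;	/* packets transmitted during recovery */
	uint32_t snd_cwnd;
};

/* Same two branches as tcp_update_cwnd_in_recovery() above. */
static void prr_update_cwnd(struct prr_state *tp, uint32_t in_flight,
			    int newly_acked_sacked, int fast_rexmit)
{
	int sndcnt;

	if (in_flight > tp->snd_ssthresh) {
		/* Proportional: spread the cwnd reduction over one RTT,
		 * rounding up so the sender is never starved. */
		uint64_t dividend = (uint64_t)tp->snd_ssthresh *
				    tp->prr_delivered + tp->prior_cwnd - 1;
		sndcnt = (int)(dividend / tp->prior_cwnd) - (int)tp->prr_out;
	} else {
		/* Slow-start reduction bound: grow back toward ssthresh,
		 * at most one packet beyond what was newly delivered. */
		int delta = (int)tp->snd_ssthresh - (int)in_flight;
		int limit = (int)(tp->prr_delivered - tp->prr_out);

		if (limit < newly_acked_sacked)
			limit = newly_acked_sacked;
		sndcnt = delta < limit + 1 ? delta : limit + 1;
	}
	if (sndcnt < (fast_rexmit ? 1 : 0))
		sndcnt = fast_rexmit ? 1 : 0;
	tp->snd_cwnd = in_flight + sndcnt;
}

int main(void)
{
	struct prr_state tp = { .snd_ssthresh = 10, .prior_cwnd = 20,
				.prr_delivered = 2, .prr_out = 0 };

	prr_update_cwnd(&tp, 18, 2, 1);		/* above ssthresh: proportional */
	printf("cwnd=%u\n", tp.snd_cwnd);	/* 19: stepping down toward 10 */
	return 0;
}

The point of the proportional branch is visible in the worked numbers: with ssthresh 10 and prior_cwnd 20, each delivered packet earns roughly half a packet of sending credit, so cwnd glides from 20 to 10 over one RTT instead of collapsing at once.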
@@ -3903,10 +3971,10 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, /* * Parse MD5 Signature option */ -u8 *tcp_parse_md5sig_option(struct tcphdr *th) +const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) { - int length = (th->doff << 2) - sizeof (*th); - u8 *ptr = (u8*)(th + 1); + int length = (th->doff << 2) - sizeof(*th); + const u8 *ptr = (const u8 *)(th + 1); /* If the TCP option is too short, we can short cut */ if (length < TCPOLEN_MD5SIG) @@ -3983,8 +4051,8 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { - struct tcp_sock *tp = tcp_sk(sk); - struct tcphdr *th = tcp_hdr(skb); + const struct tcp_sock *tp = tcp_sk(sk); + const struct tcphdr *th = tcp_hdr(skb); u32 seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -4023,7 +4091,7 @@ static inline int tcp_paws_discard(const struct sock *sk, * (borrowed from freebsd) */ -static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq) +static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) { return !before(end_seq, tp->rcv_wup) && !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); @@ -4068,7 +4136,7 @@ static void tcp_reset(struct sock *sk) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. */ -static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) +static void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -4180,7 +4248,7 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) tcp_sack_extend(tp->duplicate_sack, seq, end_seq); } -static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) +static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -4339,7 +4407,7 @@ static void tcp_ofo_queue(struct sock *sk) __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tcp_hdr(skb)->fin) - tcp_fin(skb, sk, tcp_hdr(skb)); + tcp_fin(sk); } } @@ -4367,7 +4435,7 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = tcp_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; @@ -4421,7 +4489,7 @@ queue_and_out: if (skb->len) tcp_event_data_recv(sk, skb); if (th->fin) - tcp_fin(skb, sk, th); + tcp_fin(sk); if (!skb_queue_empty(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); @@ -4851,9 +4919,9 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static int tcp_should_expand_sndbuf(struct sock *sk) +static int tcp_should_expand_sndbuf(const struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* If the user specified a specific send buffer setting, do * not modify it. @@ -4887,8 +4955,10 @@ static void tcp_new_space(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tcp_should_expand_sndbuf(sk)) { - int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + - MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); + int sndmem = SKB_TRUESIZE(max_t(u32, + tp->rx_opt.mss_clamp, + tp->mss_cache) + + MAX_TCP_HEADER); int demanded = max_t(unsigned int, tp->snd_cwnd, tp->reordering + 1); sndmem *= 2 * demanded; @@ -4960,7 +5030,7 @@ static inline void tcp_ack_snd_check(struct sock *sk) * either form (or just set the sysctl tcp_stdurg). 
*/ -static void tcp_check_urg(struct sock *sk, struct tcphdr *th) +static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); @@ -5026,7 +5096,7 @@ static void tcp_check_urg(struct sock *sk, struct tcphdr *th) } /* This is the 'fast' part of urgent handling. */ -static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) +static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); @@ -5147,9 +5217,9 @@ out: * play significant role here. */ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, int syn_inerr) + const struct tcphdr *th, int syn_inerr) { - u8 *hash_location; + const u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); /* RFC1323: H1. Apply PAWS check first. */ @@ -5230,7 +5300,7 @@ discard: * tcp_data_queue when everything is OK. */ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, unsigned len) + const struct tcphdr *th, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); int res; @@ -5441,9 +5511,9 @@ discard: EXPORT_SYMBOL(tcp_rcv_established); static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, unsigned len) + const struct tcphdr *th, unsigned int len) { - u8 *hash_location; + const u8 *hash_location; struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_cookie_values *cvp = tp->cookie_values; @@ -5718,7 +5788,7 @@ reset_and_undo: */ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, unsigned len) + const struct tcphdr *th, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -5806,12 +5876,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - /* tcp_ack considers this ACK as duplicate - * and does not calculate rtt. - * Force it here. - */ - tcp_ack_update_rtt(sk, 0, 0); - if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 708dc203b034..a9db4b1a2215 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -72,6 +72,7 @@ #include <net/timewait_sock.h> #include <net/xfrm.h> #include <net/netdma.h> +#include <net/secure_seq.h> #include <linux/inet.h> #include <linux/ipv6.h> @@ -91,7 +92,7 @@ EXPORT_SYMBOL(sysctl_tcp_low_latency); static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr); static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, - __be32 daddr, __be32 saddr, struct tcphdr *th); + __be32 daddr, __be32 saddr, const struct tcphdr *th); #else static inline struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) @@ -103,7 +104,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) +static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) { return secure_tcp_sequence_number(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, @@ -429,8 +430,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) break; icsk->icsk_backoff--; - inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << - icsk->icsk_backoff; + inet_csk(sk)->icsk_rto = (tp->srtt ? 
__tcp_set_rto(tp) : + TCP_TIMEOUT_INIT) << icsk->icsk_backoff; tcp_bound_rto(sk); skb = tcp_write_queue_head(sk); @@ -551,7 +552,7 @@ static void __tcp_v4_send_check(struct sk_buff *skb, /* This routine computes an IPv4 TCP checksum. */ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) { - struct inet_sock *inet = inet_sk(sk); + const struct inet_sock *inet = inet_sk(sk); __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); } @@ -589,7 +590,7 @@ int tcp_v4_gso_send_check(struct sk_buff *skb) static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = tcp_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; #ifdef CONFIG_TCP_MD5SIG @@ -651,6 +652,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; net = dev_net(skb_dst(skb)->dev); + arg.tos = ip_hdr(skb)->tos; ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, &arg, arg.iov[0].iov_len); @@ -665,9 +667,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts, int oif, struct tcp_md5sig_key *key, - int reply_flags) + int reply_flags, u8 tos) { - struct tcphdr *th = tcp_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) @@ -725,7 +727,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, arg.csumoffset = offsetof(struct tcphdr, check) / 2; if (oif) arg.bound_dev_if = oif; - + arg.tos = tos; ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, &arg, arg.iov[0].iov_len); @@ -742,7 +744,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0 + tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, + tw->tw_tos ); inet_twsk_put(tw); @@ -756,7 +759,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, req->ts_recent, 0, tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr), - inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0); + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, + ip_hdr(skb)->tos); } /* @@ -807,20 +811,38 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -static void syn_flood_warning(const struct sk_buff *skb) +/* + * Return 1 if a syncookie should be sent + */ +int tcp_syn_flood_action(struct sock *sk, + const struct sk_buff *skb, + const char *proto) { - const char *msg; + const char *msg = "Dropping request"; + int want_cookie = 0; + struct listen_sock *lopt; + + #ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) + if (sysctl_tcp_syncookies) { msg = "Sending cookies"; - else + want_cookie = 1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); + } else #endif - msg = "Dropping request"; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - pr_info("TCP: Possible SYN flooding on port %d. %s.\n", - ntohs(tcp_hdr(skb)->dest), msg); + lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; + if (!lopt->synflood_warned) { + lopt->synflood_warned = 1; + pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", + proto, ntohs(tcp_hdr(skb)->dest), msg); + } + return want_cookie; } +EXPORT_SYMBOL(tcp_syn_flood_action); /* * Save and compile IPv4 options into the request_sock if needed. 
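The rewritten SYN-flood path above folds two decisions into one helper: whether to fall back to syncookies, and whether to print the warning, which now fires once per listener instead of on every rate-limited SYN. A minimal sketch of that shape, with hypothetical names and plain ints standing in for the kernel's listen_opt state and SNMP counters:

#include <stdio.h>

/* Per-listener state; mirrors the new lopt->synflood_warned flag. */
struct listener {
	int synflood_warned;
};

/* Decide whether to send a syncookie for a SYN that finds the
 * accept queue full, warning at most once per listener. */
static int syn_flood_action(struct listener *l, int syncookies_enabled,
			    int port)
{
	int want_cookie = 0;
	const char *msg = "Dropping request";

	if (syncookies_enabled) {
		msg = "Sending cookies";
		want_cookie = 1;
	}
	if (!l->synflood_warned) {
		l->synflood_warned = 1;
		fprintf(stderr,
			"TCP: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
			port, msg);
	}
	return want_cookie;
}

int main(void)
{
	struct listener l = { 0 };

	/* The second call is silent: one warning per listener, while the
	 * cookie decision is still made for every overflowing SYN. */
	syn_flood_action(&l, 1, 80);
	syn_flood_action(&l, 1, 80);
	return 0;
}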
@@ -908,18 +930,21 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, } sk_nocaps_add(sk, NETIF_F_GSO_MASK); } - if (tcp_alloc_md5sig_pool(sk) == NULL) { + + md5sig = tp->md5sig_info; + if (md5sig->entries4 == 0 && + tcp_alloc_md5sig_pool(sk) == NULL) { kfree(newkey); return -ENOMEM; } - md5sig = tp->md5sig_info; if (md5sig->alloced4 == md5sig->entries4) { keys = kmalloc((sizeof(*keys) * (md5sig->entries4 + 1)), GFP_ATOMIC); if (!keys) { kfree(newkey); - tcp_free_md5sig_pool(); + if (md5sig->entries4 == 0) + tcp_free_md5sig_pool(); return -ENOMEM; } @@ -963,6 +988,7 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr) kfree(tp->md5sig_info->keys4); tp->md5sig_info->keys4 = NULL; tp->md5sig_info->alloced4 = 0; + tcp_free_md5sig_pool(); } else if (tp->md5sig_info->entries4 != i) { /* Need to do some manipulation */ memmove(&tp->md5sig_info->keys4[i], @@ -970,7 +996,6 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr) (tp->md5sig_info->entries4 - i) * sizeof(struct tcp4_md5sig_key)); } - tcp_free_md5sig_pool(); return 0; } } @@ -1068,7 +1093,7 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, } static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, - __be32 daddr, __be32 saddr, struct tcphdr *th) + __be32 daddr, __be32 saddr, const struct tcphdr *th) { struct tcp_md5sig_pool *hp; struct hash_desc *desc; @@ -1100,12 +1125,12 @@ clear_hash_noput: } int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, - struct sock *sk, struct request_sock *req, - struct sk_buff *skb) + const struct sock *sk, const struct request_sock *req, + const struct sk_buff *skb) { struct tcp_md5sig_pool *hp; struct hash_desc *desc; - struct tcphdr *th = tcp_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); __be32 saddr, daddr; if (sk) { @@ -1150,7 +1175,7 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); -static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) +static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) { /* * This gets called for each TCP segment that arrives @@ -1160,10 +1185,10 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) * o MD5 hash and we're not expecting one. * o MD5 hash and its wrong. */ - __u8 *hash_location = NULL; + const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; const struct iphdr *iph = ip_hdr(skb); - struct tcphdr *th = tcp_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); int genhash; unsigned char newhash[16]; @@ -1226,7 +1251,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; - u8 *hash_location; + const u8 *hash_location; struct request_sock *req; struct inet_request_sock *ireq; struct tcp_sock *tp = tcp_sk(sk); @@ -1234,11 +1259,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; -#ifdef CONFIG_SYN_COOKIES int want_cookie = 0; -#else -#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */ -#endif /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -1249,14 +1270,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * evidently real one. 
*/ if (inet_csk_reqsk_queue_is_full(sk) && !isn) { - if (net_ratelimit()) - syn_flood_warning(skb); -#ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) { - want_cookie = 1; - } else -#endif - goto drop; + want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); + if (!want_cookie) + goto drop; } /* Accept backlog is full. If we have already queued enough @@ -1302,9 +1318,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) while (l-- > 0) *c++ ^= *hash_location++; -#ifdef CONFIG_SYN_COOKIES want_cookie = 0; /* not our kind of cookie */ -#endif tmp_ext.cookie_out_never = 0; /* false */ tmp_ext.cookie_plus = tmp_opt.cookie_plus; } else if (!tp->rx_opt.cookie_in_always) { @@ -1384,6 +1398,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) isn = tcp_v4_init_sequence(skb); } tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->snt_synack = tcp_time_stamp; if (tcp_v4_send_synack(sk, dst, req, (struct request_values *)&tmp_ext) || @@ -1458,6 +1473,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; tcp_initialize_rcv_mss(newsk); + if (tcp_rsk(req)->snt_synack) + tcp_valid_rtt_meas(newsk, + tcp_time_stamp - tcp_rsk(req)->snt_synack); + newtp->total_retrans = req->retrans; #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ @@ -1491,6 +1510,8 @@ exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: + tcp_clear_xmit_timers(newsk); + bh_unlock_sock(newsk); sock_put(newsk); goto exit; } @@ -1572,7 +1593,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) #endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ - sock_rps_save_rxhash(sk, skb->rxhash); + sock_rps_save_rxhash(sk, skb); if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; @@ -1589,7 +1610,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto discard; if (nsk != sk) { - sock_rps_save_rxhash(nsk, skb->rxhash); + sock_rps_save_rxhash(nsk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; @@ -1597,7 +1618,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } } else - sock_rps_save_rxhash(sk, skb->rxhash); + sock_rps_save_rxhash(sk, skb); if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; @@ -1629,7 +1650,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv); int tcp_v4_rcv(struct sk_buff *skb) { const struct iphdr *iph; - struct tcphdr *th; + const struct tcphdr *th; struct sock *sk; int ret; struct net *net = dev_net(skb->dev); @@ -1664,7 +1685,7 @@ int tcp_v4_rcv(struct sk_buff *skb) skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->when = 0; - TCP_SKB_CB(skb)->flags = iph->tos; + TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); @@ -1793,7 +1814,7 @@ EXPORT_SYMBOL(tcp_v4_get_peer); void *tcp_v4_tw_get_peer(struct sock *sk) { - struct inet_timewait_sock *tw = inet_twsk(sk); + const struct inet_timewait_sock *tw = inet_twsk(sk); return inet_getpeer_v4(tw->tw_daddr, 1); } @@ -1855,7 +1876,7 @@ static int tcp_v4_init_sock(struct sock *sk) * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ - tp->snd_cwnd = 2; + tp->snd_cwnd = TCP_INIT_CWND; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. 
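The snt_synack changes above let the passive side seed its RTT estimator from the SYN-ACK round trip: the timestamp is recorded when the SYN-ACK goes out, consumed via tcp_valid_rtt_meas() when the handshake completes, and cleared on an ambiguous retransmission (see the tcp_minisocks.c hunk below). A toy sketch of that flow, with a fake tick counter standing in for tcp_time_stamp and all names hypothetical:

#include <stdint.h>
#include <stdio.h>

static uint32_t fake_jiffies;	/* pretend clock, HZ ticks */

struct request {
	uint32_t snt_synack;	/* 0 means "no usable sample" */
};

/* Record the send time when the SYN-ACK goes out. */
static void send_synack(struct request *req)
{
	req->snt_synack = fake_jiffies;
}

/* On a retransmission that timestamps cannot disambiguate,
 * discard the sample per Karn's rule. */
static void synack_retransmitted(struct request *req)
{
	req->snt_synack = 0;
}

/* When the final ACK of the handshake arrives, take one RTT sample. */
static void handshake_complete(const struct request *req)
{
	if (req->snt_synack)
		printf("seed RTT estimator with %u ticks\n",
		       fake_jiffies - req->snt_synack);
}

int main(void)
{
	struct request req = { 0 };

	send_synack(&req);
	fake_jiffies += 25;		/* ACK arrives 25 ticks later */
	handshake_complete(&req);	/* prints 25 */

	synack_retransmitted(&req);
	handshake_complete(&req);	/* silent: no valid sample */
	return 0;
}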
@@ -2320,7 +2341,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) } } -static int tcp_seq_open(struct inode *inode, struct file *file) +int tcp_seq_open(struct inode *inode, struct file *file) { struct tcp_seq_afinfo *afinfo = PDE(inode)->data; struct tcp_iter_state *s; @@ -2336,23 +2357,19 @@ static int tcp_seq_open(struct inode *inode, struct file *file) s->last_pos = 0; return 0; } +EXPORT_SYMBOL(tcp_seq_open); int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) { int rc = 0; struct proc_dir_entry *p; - afinfo->seq_fops.open = tcp_seq_open; - afinfo->seq_fops.read = seq_read; - afinfo->seq_fops.llseek = seq_lseek; - afinfo->seq_fops.release = seq_release_net; - afinfo->seq_ops.start = tcp_seq_start; afinfo->seq_ops.next = tcp_seq_next; afinfo->seq_ops.stop = tcp_seq_stop; p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, - &afinfo->seq_fops, afinfo); + afinfo->seq_fops, afinfo); if (!p) rc = -ENOMEM; return rc; @@ -2365,7 +2382,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) } EXPORT_SYMBOL(tcp_proc_unregister); -static void get_openreq4(struct sock *sk, struct request_sock *req, +static void get_openreq4(const struct sock *sk, const struct request_sock *req, struct seq_file *f, int i, int uid, int *len) { const struct inet_request_sock *ireq = inet_rsk(req); @@ -2395,9 +2412,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) { int timer_active; unsigned long timer_expires; - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); + const struct inet_sock *inet = inet_sk(sk); __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); @@ -2446,7 +2463,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) len); } -static void get_timewait4_sock(struct inet_timewait_sock *tw, +static void get_timewait4_sock(const struct inet_timewait_sock *tw, struct seq_file *f, int i, int *len) { __be32 dest, src; @@ -2501,12 +2518,18 @@ out: return 0; } +static const struct file_operations tcp_afinfo_seq_fops = { + .owner = THIS_MODULE, + .open = tcp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net +}; + static struct tcp_seq_afinfo tcp4_seq_afinfo = { .name = "tcp", .family = AF_INET, - .seq_fops = { - .owner = THIS_MODULE, - }, + .seq_fops = &tcp_afinfo_seq_fops, .seq_ops = { .show = tcp4_seq_show, }, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 80b1f80759ab..66363b689ad6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -141,7 +141,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th) { struct tcp_options_received tmp_opt; - u8 *hash_location; + const u8 *hash_location; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); int paws_reject = 0; @@ -328,6 +328,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); + tw->tw_transparent = inet_sk(sk)->transparent; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; @@ -344,6 +345,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw6 = inet6_twsk((struct sock *)tw); ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); 
ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_tclass = np->tclass; tw->tw_ipv6only = np->ipv6only; } #endif @@ -486,7 +488,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ - newtp->snd_cwnd = 2; + newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; newtp->bytes_acked = 0; @@ -566,7 +568,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock **prev) { struct tcp_options_received tmp_opt; - u8 *hash_location; + const u8 *hash_location; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); @@ -720,6 +722,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } + if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) + tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; + else if (req->retrans) /* don't take RTT sample if retrans && ~TS */ + tcp_rsk(req)->snt_synack = 0; /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 882e0b0964d0..63170e297540 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -65,7 +65,7 @@ EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; @@ -89,9 +89,9 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) * Anything in between SND.UNA...SND.UNA+SND.WND also can be already * invalid. OK, let's make this for now: */ -static inline __u32 tcp_acceptable_seq(struct sock *sk) +static inline __u32 tcp_acceptable_seq(const struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); if (!before(tcp_wnd_end(tp), tp->snd_nxt)) return tp->snd_nxt; @@ -116,7 +116,7 @@ static inline __u32 tcp_acceptable_seq(struct sock *sk) static __u16 tcp_advertise_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); + const struct dst_entry *dst = __sk_dst_get(sk); int mss = tp->advmss; if (dst) { @@ -133,7 +133,7 @@ static __u16 tcp_advertise_mss(struct sock *sk) /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ -static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) +static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst) { struct tcp_sock *tp = tcp_sk(sk); s32 delta = tcp_time_stamp - tp->lsndtime; @@ -154,7 +154,7 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) /* Congestion state accounting after a packet has been sent. 
*/ static void tcp_event_data_sent(struct tcp_sock *tp, - struct sk_buff *skb, struct sock *sk) + struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; @@ -295,11 +295,11 @@ static u16 tcp_select_window(struct sock *sk) } /* Packet ECN state for a SYN-ACK */ -static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) +static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) { - TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR; + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) - TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE; + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; } /* Packet ECN state for a SYN. */ @@ -309,13 +309,13 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) tp->ecn_flags = 0; if (sysctl_tcp_ecn == 1) { - TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; } } static __inline__ void -TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th) +TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) { if (inet_rsk(req)->ecn_ok) th->ece = 1; @@ -356,7 +356,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; - TCP_SKB_CB(skb)->flags = flags; + TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; skb_shinfo(skb)->gso_segs = 1; @@ -565,7 +565,8 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, */ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, - struct tcp_md5sig_key **md5) { + struct tcp_md5sig_key **md5) +{ struct tcp_sock *tp = tcp_sk(sk); struct tcp_cookie_values *cvp = tp->cookie_values; unsigned remaining = MAX_TCP_OPTION_SPACE; @@ -743,7 +744,8 @@ static unsigned tcp_synack_options(struct sock *sk, */ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, - struct tcp_md5sig_key **md5) { + struct tcp_md5sig_key **md5) +{ struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); unsigned size = 0; @@ -826,7 +828,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); - if (unlikely(tcb->flags & TCPHDR_SYN)) + if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); else tcp_options_size = tcp_established_options(sk, skb, &opts, @@ -850,9 +852,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->seq = htonl(tcb->seq); th->ack_seq = htonl(tp->rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | - tcb->flags); + tcb->tcp_flags); - if (unlikely(tcb->flags & TCPHDR_SYN)) { + if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. 
*/ @@ -875,7 +877,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } tcp_options_write((__be32 *)(th + 1), tp, &opts); - if (likely((tcb->flags & TCPHDR_SYN) == 0)) + if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) TCP_ECN_send(sk, skb, tcp_header_size); #ifdef CONFIG_TCP_MD5SIG @@ -889,11 +891,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, icsk->icsk_af_ops->send_check(sk, skb); - if (likely(tcb->flags & TCPHDR_ACK)) + if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); if (skb->len != tcp_header_size) - tcp_event_data_sent(tp, skb, sk); + tcp_event_data_sent(tp, sk); if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, @@ -926,7 +928,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, +static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { if (skb->len <= mss_now || !sk_can_gso(sk) || @@ -947,7 +949,7 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, /* When a modification to fackets out becomes necessary, we need to check * skb is counted to fackets_out or not. */ -static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, +static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, int decr) { struct tcp_sock *tp = tcp_sk(sk); @@ -962,7 +964,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ -static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr) +static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) { struct tcp_sock *tp = tcp_sk(sk); @@ -1032,9 +1034,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ - flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); - TCP_SKB_CB(buff)->flags = flags; + flags = TCP_SKB_CB(skb)->tcp_flags; + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); + TCP_SKB_CB(buff)->tcp_flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { @@ -1094,14 +1096,16 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) eat = len; k = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (skb_shinfo(skb)->frags[i].size <= eat) { - put_page(skb_shinfo(skb)->frags[i].page); - eat -= skb_shinfo(skb)->frags[i].size; + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (size <= eat) { + skb_frag_unref(skb, i); + eat -= size; } else { skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; if (eat) { skb_shinfo(skb)->frags[k].page_offset += eat; - skb_shinfo(skb)->frags[k].size -= eat; + skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); eat = 0; } k++; @@ -1144,10 +1148,10 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) } /* Calculate MSS. Not accounting for SACKs here. 
*/ -int tcp_mtu_to_mss(struct sock *sk, int pmtu) +int tcp_mtu_to_mss(const struct sock *sk, int pmtu) { - struct tcp_sock *tp = tcp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; /* Calculate base mss without TCP options: @@ -1173,10 +1177,10 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) } /* Inverse of above */ -int tcp_mss_to_mtu(struct sock *sk, int mss) +int tcp_mss_to_mtu(const struct sock *sk, int mss) { - struct tcp_sock *tp = tcp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); int mtu; mtu = mss + @@ -1250,8 +1254,8 @@ EXPORT_SYMBOL(tcp_sync_mss); */ unsigned int tcp_current_mss(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); + const struct tcp_sock *tp = tcp_sk(sk); + const struct dst_entry *dst = __sk_dst_get(sk); u32 mss_now; unsigned header_len; struct tcp_out_options opts; @@ -1311,10 +1315,10 @@ static void tcp_cwnd_validate(struct sock *sk) * modulo only when the receiver window alone is the limiting factor or * when we would be allowed to send the split-due-to-Nagle skb fully. */ -static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); u32 needed, window, cwnd_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; @@ -1334,13 +1338,14 @@ static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ -static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, - struct sk_buff *skb) +static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, + const struct sk_buff *skb) { u32 in_flight, cwnd; /* Don't be strict about the congestion window for the final FIN. */ - if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1) + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && + tcp_skb_pcount(skb) == 1) return 1; in_flight = tcp_packets_in_flight(tp); @@ -1355,7 +1360,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, * This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, +static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); @@ -1377,7 +1382,7 @@ static inline int tcp_minshall_check(const struct tcp_sock *tp) /* Return 0, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_NODELAY was set. + * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. */ @@ -1393,7 +1398,7 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp, /* Return non-zero if the Nagle test allows this packet to be * sent now. 
*/ -static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, +static inline int tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned int cur_mss, int nonagle) { /* Nagle rule does not apply to frames, which sit in the middle of the @@ -1409,7 +1414,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, * Nagle can be ignored during F-RTO too (see RFC4138). */ if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || - (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)) + (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return 1; if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) @@ -1419,7 +1424,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, } /* Does at least the first segment of SKB fit into the send window? */ -static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, +static inline int tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -1434,10 +1439,10 @@ static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, * should be put on the wire right now. If so, it returns the number of * packets allowed by the congestion window. */ -static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, unsigned int cur_mss, int nonagle) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); unsigned int cwnd_quota; tcp_init_tso_segs(sk, skb, cur_mss); @@ -1455,7 +1460,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, /* Test if sending is allowed right now. */ int tcp_may_send_now(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = tcp_send_head(sk); return skb && @@ -1497,9 +1502,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ - flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); - TCP_SKB_CB(buff)->flags = flags; + flags = TCP_SKB_CB(skb)->tcp_flags; + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); + TCP_SKB_CB(buff)->tcp_flags = flags; /* This packet was never sent out yet, so no SACK bits. */ TCP_SKB_CB(buff)->sacked = 0; @@ -1530,7 +1535,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) u32 send_win, cong_win, limit, in_flight; int win_divisor; - if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto send_now; if (icsk->icsk_ca_state != TCP_CA_Open) @@ -1657,7 +1662,7 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; - TCP_SKB_CB(nskb)->flags = TCPHDR_ACK; + TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; TCP_SKB_CB(nskb)->sacked = 0; nskb->csum = 0; nskb->ip_summed = skb->ip_summed; @@ -1677,11 +1682,11 @@ static int tcp_mtu_probe(struct sock *sk) if (skb->len <= copy) { /* We've eaten all the data from this skb. * Throw it away. 
*/ - TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); } else { - TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & + TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); if (!skb_shinfo(skb)->nr_frags) { skb_pull(skb, copy); @@ -1796,11 +1801,13 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, tcp_event_new_data_sent(sk, skb); tcp_minshall_update(tp, mss_now, skb); - sent_pkts++; + sent_pkts += tcp_skb_pcount(skb); if (push_one) break; } + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery) + tp->prr_out += sent_pkts; if (likely(sent_pkts)) { tcp_cwnd_validate(sk); @@ -1985,7 +1992,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; /* Merge over control information. This moves PSH/FIN etc. over */ - TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags; + TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; /* All done, get rid of second SKB and account for it so * packet counting does not break. @@ -2003,7 +2010,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) } /* Check if coalescing SKBs is legal. */ -static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) +static int tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) return 0; @@ -2033,7 +2040,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (!sysctl_tcp_retrans_collapse) return; - if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; tcp_for_write_queue_from_safe(skb, tmp, sk) { @@ -2125,12 +2132,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * since it is cheap to do so and saves bytes on the network. */ if (skb->len > 0 && - (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && + (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { /* Reuse, even though it does some unnecessary work */ tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, - TCP_SKB_CB(skb)->flags); + TCP_SKB_CB(skb)->tcp_flags); skb->ip_summed = CHECKSUM_NONE; } } @@ -2179,7 +2186,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) static int tcp_can_forward_retransmit(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* Forward retransmissions are possible only during Recovery. 
*/ if (icsk->icsk_ca_state != TCP_CA_Recovery) @@ -2294,6 +2301,9 @@ begin_fwd: return; NET_INC_STATS_BH(sock_net(sk), mib_idx); + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery) + tp->prr_out += tcp_skb_pcount(skb); + if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, @@ -2317,7 +2327,7 @@ void tcp_send_fin(struct sock *sk) mss_now = tcp_current_mss(sk); if (tcp_send_head(sk) != NULL) { - TCP_SKB_CB(skb)->flags |= TCPHDR_FIN; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(skb)->end_seq++; tp->write_seq++; } else { @@ -2379,11 +2389,11 @@ int tcp_send_synack(struct sock *sk) struct sk_buff *skb; skb = tcp_write_queue_head(sk); - if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) { + if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); return -EFAULT; } - if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) { + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) @@ -2397,7 +2407,7 @@ int tcp_send_synack(struct sock *sk) skb = nskb; } - TCP_SKB_CB(skb)->flags |= TCPHDR_ACK; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; TCP_ECN_send_synack(tcp_sk(sk), skb); } TCP_SKB_CB(skb)->when = tcp_time_stamp; @@ -2542,7 +2552,7 @@ EXPORT_SYMBOL(tcp_make_synack); /* Do all connect socket setups that can be done AF independent. */ static void tcp_connect_init(struct sock *sk) { - struct dst_entry *dst = __sk_dst_get(sk); + const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; @@ -2794,13 +2804,13 @@ int tcp_write_wakeup(struct sock *sk) if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || skb->len > mss) { seg_size = min(seg_size, mss); - TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; if (tcp_fragment(sk, skb, seg_size, mss)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb, mss); - TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ecd44b0c45f1..2e0f0af76c19 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -334,7 +334,6 @@ void tcp_retransmit_timer(struct sock *sk) * connection. If the socket is an orphan, time it out, * we cannot allow such beasts to hang infinitely. 
*/ -#ifdef TCP_DEBUG struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", @@ -349,7 +348,6 @@ void tcp_retransmit_timer(struct sock *sk) inet->inet_num, tp->snd_una, tp->snd_nxt); } #endif -#endif if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { tcp_write_err(sk); goto out; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 198f75b7bdd3..ab0966df1e2a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -105,6 +105,7 @@ #include <net/route.h> #include <net/checksum.h> #include <net/xfrm.h> +#include <trace/events/udp.h> #include "udp_impl.h" struct udp_table udp_table __read_mostly; @@ -1266,7 +1267,7 @@ int udp_disconnect(struct sock *sk, int flags) sk->sk_state = TCP_CLOSE; inet->inet_daddr = 0; inet->inet_dport = 0; - sock_rps_save_rxhash(sk, 0); + sock_rps_reset_rxhash(sk); sk->sk_bound_dev_if = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); @@ -1354,7 +1355,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) int rc; if (inet_sk(sk)->inet_daddr) - sock_rps_save_rxhash(sk, skb->rxhash); + sock_rps_save_rxhash(sk, skb); rc = ip_queue_rcv_skb(sk, skb); if (rc < 0) { @@ -1366,6 +1367,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) is_udplite); UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); + trace_udp_fail_queue_rcv_skb(rc, sk); return -1; } @@ -1395,6 +1397,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) nf_reset(skb); if (up->encap_type) { + int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); + /* * This is an encapsulation socket so pass the skb to * the socket's udp_encap_rcv() hook. Otherwise, just @@ -1407,11 +1411,11 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ /* if we're overly short, let UDP handle it */ - if (skb->len > sizeof(struct udphdr) && - up->encap_rcv != NULL) { + encap_rcv = ACCESS_ONCE(up->encap_rcv); + if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { int ret; - ret = (*up->encap_rcv)(sk, skb); + ret = encap_rcv(sk, skb); if (ret <= 0) { UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INDATAGRAMS, @@ -1459,10 +1463,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } } - if (rcu_dereference_raw(sk->sk_filter)) { - if (udp_lib_checksum_complete(skb)) - goto drop; - } + if (rcu_access_pointer(sk->sk_filter) && + udp_lib_checksum_complete(skb)) + goto drop; if (sk_rcvqueues_full(sk, skb)) @@ -2036,7 +2039,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v) spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); } -static int udp_seq_open(struct inode *inode, struct file *file) +int udp_seq_open(struct inode *inode, struct file *file) { struct udp_seq_afinfo *afinfo = PDE(inode)->data; struct udp_iter_state *s; @@ -2052,6 +2055,7 @@ static int udp_seq_open(struct inode *inode, struct file *file) s->udp_table = afinfo->udp_table; return err; } +EXPORT_SYMBOL(udp_seq_open); /* ------------------------------------------------------------------------ */ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) @@ -2059,17 +2063,12 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) struct proc_dir_entry *p; int rc = 0; - afinfo->seq_fops.open = udp_seq_open; - afinfo->seq_fops.read = seq_read; - afinfo->seq_fops.llseek = seq_lseek; - afinfo->seq_fops.release = seq_release_net; - afinfo->seq_ops.start = udp_seq_start; afinfo->seq_ops.next = udp_seq_next; 
afinfo->seq_ops.stop = udp_seq_stop; p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, - &afinfo->seq_fops, afinfo); + afinfo->seq_fops, afinfo); if (!p) rc = -ENOMEM; return rc; @@ -2119,14 +2118,20 @@ int udp4_seq_show(struct seq_file *seq, void *v) return 0; } +static const struct file_operations udp_afinfo_seq_fops = { + .owner = THIS_MODULE, + .open = udp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net +}; + /* ------------------------------------------------------------------------ */ static struct udp_seq_afinfo udp4_seq_afinfo = { .name = "udp", .family = AF_INET, .udp_table = &udp_table, - .seq_fops = { - .owner = THIS_MODULE, - }, + .seq_fops = &udp_afinfo_seq_fops, .seq_ops = { .show = udp4_seq_show, }, diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index aee9963f7f5a..12e9499a1a6c 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -10,6 +10,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#include <linux/export.h> #include "udp_impl.h" struct udp_table udplite_table __read_mostly; @@ -71,13 +72,20 @@ static struct inet_protosw udplite4_protosw = { }; #ifdef CONFIG_PROC_FS + +static const struct file_operations udplite_afinfo_seq_fops = { + .owner = THIS_MODULE, + .open = udp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net +}; + static struct udp_seq_afinfo udplite4_seq_afinfo = { .name = "udplite", .family = AF_INET, .udp_table = &udplite_table, - .seq_fops = { - .owner = THIS_MODULE, - }, + .seq_fops = &udplite_afinfo_seq_fops, .seq_ops = { .show = udp4_seq_show, }, diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 981e43eaf704..a0b4c5da8d43 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -79,13 +79,13 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, struct rtable *rt = (struct rtable *)xdst->route; const struct flowi4 *fl4 = &fl->u.ip4; - rt->rt_key_dst = fl4->daddr; - rt->rt_key_src = fl4->saddr; - rt->rt_key_tos = fl4->flowi4_tos; - rt->rt_route_iif = fl4->flowi4_iif; - rt->rt_iif = fl4->flowi4_iif; - rt->rt_oif = fl4->flowi4_oif; - rt->rt_mark = fl4->flowi4_mark; + xdst->u.rt.rt_key_dst = fl4->daddr; + xdst->u.rt.rt_key_src = fl4->saddr; + xdst->u.rt.rt_key_tos = fl4->flowi4_tos; + xdst->u.rt.rt_route_iif = fl4->flowi4_iif; + xdst->u.rt.rt_iif = fl4->flowi4_iif; + xdst->u.rt.rt_oif = fl4->flowi4_oif; + xdst->u.rt.rt_mark = fl4->flowi4_mark; xdst->u.dst.dev = dev; dev_hold(dev); @@ -117,7 +117,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; - if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { + if (!ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_UDP: case IPPROTO_UDPLITE: diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index d9ac0a0058b5..9258e751baba 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -12,6 +12,7 @@ #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/netfilter_ipv4.h> +#include <linux/export.h> static int xfrm4_init_flags(struct xfrm_state *x) { |
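The xfrm4_policy hunk above replaces the open-coded frag_off test with the ip_is_fragment() helper: a packet is a fragment if More-Fragments is set or the fragment offset is non-zero. A self-contained rendering of that predicate follows; it takes the raw network-order frag_off field rather than a struct iphdr so it compiles on its own, and the constants mirror the <net/ip.h> values.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define IP_OFFSET 0x1FFF	/* fragment offset mask (13 bits) */
#define IP_MF     0x2000	/* "more fragments" flag */

/* Illustrative equivalent of the kernel's ip_is_fragment(). */
static int ip_is_fragment(uint16_t frag_off_net)
{
	return (frag_off_net & htons(IP_MF | IP_OFFSET)) != 0;
}

int main(void)
{
	printf("%d\n", ip_is_fragment(htons(0)));	/* 0: not a fragment */
	printf("%d\n", ip_is_fragment(htons(IP_MF)));	/* 1: first fragment */
	printf("%d\n", ip_is_fragment(htons(100)));	/* 1: later fragment */
	return 0;
}

Note that the helper deliberately matches the first fragment too (offset 0 but MF set), which is why _decode_session4() can use it to skip port extraction for anything fragmented.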