From 0eae88f31ca2b88911ce843452054139e028771f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Apr 2010 19:06:52 -0700 Subject: net: Fix various endianness glitches Sparse can help us find endianness bugs, but we need to make some cleanups to be able to more easily spot real bugs. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cb562fdd9b9a..a947428ef0ae 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -258,10 +258,9 @@ static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); (__raw_get_cpu_var(rt_cache_stat).field++) static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, - int genid) + int genid) { - return jhash_3words((__force u32)(__be32)(daddr), - (__force u32)(__be32)(saddr), + return jhash_3words((__force u32)daddr, (__force u32)saddr, idx, genid) & rt_hash_mask; } @@ -378,12 +377,13 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) struct rtable *r = v; int len; - seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" - "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", + seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" + "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", r->u.dst.dev ? r->u.dst.dev->name : "*", - (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, + (__force u32)r->rt_dst, + (__force u32)r->rt_gateway, r->rt_flags, atomic_read(&r->u.dst.__refcnt), - r->u.dst.__use, 0, (unsigned long)r->rt_src, + r->u.dst.__use, 0, (__force u32)r->rt_src, (dst_metric(&r->u.dst, RTAX_ADVMSS) ? (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), dst_metric(&r->u.dst, RTAX_WINDOW), @@ -685,18 +685,17 @@ static inline bool rt_caching(const struct net *net) static inline bool compare_hash_inputs(const struct flowi *fl1, const struct flowi *fl2) { - return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | - (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) | + return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | + ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | (fl1->iif ^ fl2->iif)) == 0); } static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) { - return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | - (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) | + return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | + ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | (fl1->mark ^ fl2->mark) | - (*(u16 *)&fl1->nl_u.ip4_u.tos ^ - *(u16 *)&fl2->nl_u.ip4_u.tos) | + (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) | (fl1->oif ^ fl2->oif) | (fl1->iif ^ fl2->iif)) == 0; } @@ -2319,8 +2318,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->u.dst.rt_next)) { - if (((rth->fl.fl4_dst ^ daddr) | - (rth->fl.fl4_src ^ saddr) | + if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | + ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | (rth->fl.iif ^ iif) | rth->fl.oif | (rth->fl.fl4_tos ^ tos)) == 0 && -- cgit v1.2.3 From 3ee943728fff536edaf8f59faa58aaa1aa7366e3 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Sat, 8 May 2010 01:57:52 -0700 Subject: ipv4: remove ip_rt_secret timer (v4) A while back there was a discussion regarding the rt_secret_interval timer. Given that we've had the ability to do emergency route cache rebuilds for awhile now, based on a statistical analysis of the various hash chain lengths in the cache, the use of the flush timer is somewhat redundant. This patch removes the rt_secret_interval sysctl, allowing us to rely solely on the statistical analysis mechanism to determine the need for route cache flushes. Signed-off-by: Neil Horman Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 - kernel/sysctl_binary.c | 1 - net/ipv4/route.c | 108 ++++------------------------------------------- 3 files changed, 8 insertions(+), 102 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index ae07feec6446..d68c3f121774 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -55,7 +55,6 @@ struct netns_ipv4 { int sysctl_rt_cache_rebuild_count; int current_rt_cache_rebuild_count; - struct timer_list rt_secret_timer; atomic_t rt_genid; #ifdef CONFIG_IP_MROUTE diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 59030570f5ca..937d31dc8566 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -224,7 +224,6 @@ static const struct bin_table bin_net_ipv4_route_table[] = { { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" }, {} }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a947428ef0ae..dea3f9264250 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -129,7 +129,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; -static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; static int rt_chain_length_max __read_mostly = 20; static struct delayed_work expires_work; @@ -918,32 +917,11 @@ void rt_cache_flush_batch(void) rt_do_flush(!in_softirq()); } -/* - * We change rt_genid and let gc do the cleanup - */ -static void rt_secret_rebuild(unsigned long __net) -{ - struct net *net = (struct net *)__net; - rt_cache_invalidate(net); - mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); -} - -static void rt_secret_rebuild_oneshot(struct net *net) -{ - del_timer_sync(&net->ipv4.rt_secret_timer); - rt_cache_invalidate(net); - if (ip_rt_secret_interval) - mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); -} - static void rt_emergency_hash_rebuild(struct net *net) { - if (net_ratelimit()) { + if (net_ratelimit()) printk(KERN_WARNING "Route hash chain too long!\n"); - printk(KERN_WARNING "Adjust your secret_interval!\n"); - } - - rt_secret_rebuild_oneshot(net); + rt_cache_invalidate(net); } /* @@ -3101,48 +3079,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, return -EINVAL; } -static void rt_secret_reschedule(int old) -{ - struct net *net; - int new = ip_rt_secret_interval; - int diff = new - old; - - if (!diff) - return; - - rtnl_lock(); - for_each_net(net) { - int deleted = del_timer_sync(&net->ipv4.rt_secret_timer); - long time; - - if (!new) - continue; - - if (deleted) { - time = net->ipv4.rt_secret_timer.expires - jiffies; - - if (time <= 0 || (time += diff) <= 0) - time = 0; - } else - time = new; - - mod_timer(&net->ipv4.rt_secret_timer, jiffies + time); - } - rtnl_unlock(); -} - -static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int old = ip_rt_secret_interval; - int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); - - rt_secret_reschedule(old); - - return ret; -} - static ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", @@ -3251,13 +3187,6 @@ static ctl_table ipv4_route_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "secret_interval", - .data = &ip_rt_secret_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ipv4_sysctl_rt_secret_interval, - }, { } }; @@ -3336,34 +3265,15 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { }; #endif - -static __net_init int rt_secret_timer_init(struct net *net) +static __net_init int rt_genid_init(struct net *net) { - atomic_set(&net->ipv4.rt_genid, - (int) ((num_physpages ^ (num_physpages>>8)) ^ - (jiffies ^ (jiffies >> 7)))); - - net->ipv4.rt_secret_timer.function = rt_secret_rebuild; - net->ipv4.rt_secret_timer.data = (unsigned long)net; - init_timer_deferrable(&net->ipv4.rt_secret_timer); - - if (ip_rt_secret_interval) { - net->ipv4.rt_secret_timer.expires = - jiffies + net_random() % ip_rt_secret_interval + - ip_rt_secret_interval; - add_timer(&net->ipv4.rt_secret_timer); - } + get_random_bytes(&net->ipv4.rt_genid, + sizeof(net->ipv4.rt_genid)); return 0; } -static __net_exit void rt_secret_timer_exit(struct net *net) -{ - del_timer_sync(&net->ipv4.rt_secret_timer); -} - -static __net_initdata struct pernet_operations rt_secret_timer_ops = { - .init = rt_secret_timer_init, - .exit = rt_secret_timer_exit, +static __net_initdata struct pernet_operations rt_genid_ops = { + .init = rt_genid_init, }; @@ -3424,9 +3334,6 @@ int __init ip_rt_init(void) schedule_delayed_work(&expires_work, net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - if (register_pernet_subsys(&rt_secret_timer_ops)) - printk(KERN_ERR "Unable to setup rt_secret_timer\n"); - if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM @@ -3438,6 +3345,7 @@ int __init ip_rt_init(void) #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); #endif + register_pernet_subsys(&rt_genid_ops); return rc; } -- cgit v1.2.3 From 7fee226ad2397b635e2fd565a59ca3ae08a164cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 11 May 2010 23:19:48 +0000 Subject: net: add a noref bit on skb dst Use low order bit of skb->_skb_dst to tell dst is not refcounted. Change _skb_dst to _skb_refdst to make sure all uses are catched. skb_dst() returns the dst, regardless of noref bit set or not, but with a lockdep check to make sure a noref dst is not given if current user is not rcu protected. New skb_dst_set_noref() helper to set an notrefcounted dst on a skb. (with lockdep check) skb_dst_drop() drops a reference only if skb dst was refcounted. skb_dst_force() helper is used to force a refcount on dst, when skb is queued and not anymore RCU protected. Use skb_dst_force() in __sk_add_backlog(), __dev_xmit_skb() if !IFF_XMIT_DST_RELEASE or skb enqueued on qdisc queue, in sock_queue_rcv_skb(), in __nf_queue(). Use skb_dst_force() in dev_requeue_skb(). Note: dst_use_noref() still dirties dst, we might transform it later to do one dirtying per jiffies. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 58 ++++++++++++++++++++++++++++++++++++++++++++---- include/net/dst.h | 48 ++++++++++++++++++++++++++++++++++++--- include/net/sock.h | 13 ++++++----- net/core/dev.c | 3 +++ net/core/skbuff.c | 2 +- net/core/sock.c | 6 +++++ net/ipv4/icmp.c | 6 ++--- net/ipv4/ip_options.c | 9 ++++---- net/ipv4/netfilter.c | 6 ++--- net/ipv4/route.c | 2 +- net/netfilter/nf_queue.c | 2 ++ net/sched/sch_generic.c | 4 +++- 12 files changed, 134 insertions(+), 25 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c9525bce80f6..7cdfb4d52847 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -264,7 +264,7 @@ typedef unsigned char *sk_buff_data_t; * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header - * @_skb_dst: destination entry + * @_skb_refdst: destination entry (with norefcount bit) * @sp: the security path, used for xfrm * @cb: Control buffer. Free for use by every layer. Put private vars here * @len: Length of actual data @@ -328,7 +328,7 @@ struct sk_buff { */ char cb[48] __aligned(8); - unsigned long _skb_dst; + unsigned long _skb_refdst; #ifdef CONFIG_XFRM struct sec_path *sp; #endif @@ -419,14 +419,64 @@ struct sk_buff { #include +/* + * skb might have a dst pointer attached, refcounted or not. + * _skb_refdst low order bit is set if refcount was _not_ taken + */ +#define SKB_DST_NOREF 1UL +#define SKB_DST_PTRMASK ~(SKB_DST_NOREF) + +/** + * skb_dst - returns skb dst_entry + * @skb: buffer + * + * Returns skb dst_entry, regardless of reference taken or not. + */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { - return (struct dst_entry *)skb->_skb_dst; + /* If refdst was not refcounted, check we still are in a + * rcu_read_lock section + */ + WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) && + !rcu_read_lock_held() && + !rcu_read_lock_bh_held()); + return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK); } +/** + * skb_dst_set - sets skb dst + * @skb: buffer + * @dst: dst entry + * + * Sets skb dst, assuming a reference was taken on dst and should + * be released by skb_dst_drop() + */ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { - skb->_skb_dst = (unsigned long)dst; + skb->_skb_refdst = (unsigned long)dst; +} + +/** + * skb_dst_set_noref - sets skb dst, without a reference + * @skb: buffer + * @dst: dst entry + * + * Sets skb dst, assuming a reference was not taken on dst + * skb_dst_drop() should not dst_release() this dst + */ +static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) +{ + WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; +} + +/** + * skb_dst_is_noref - Test if skb dst isnt refcounted + * @skb: buffer + */ +static inline bool skb_dst_is_noref(const struct sk_buff *skb) +{ + return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb); } static inline struct rtable *skb_rtable(const struct sk_buff *skb) diff --git a/include/net/dst.h b/include/net/dst.h index aac5a5fcfda9..27207a13f2a6 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -168,6 +168,12 @@ static inline void dst_use(struct dst_entry *dst, unsigned long time) dst->lastuse = time; } +static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) +{ + dst->__use++; + dst->lastuse = time; +} + static inline struct dst_entry * dst_clone(struct dst_entry * dst) { @@ -177,11 +183,47 @@ struct dst_entry * dst_clone(struct dst_entry * dst) } extern void dst_release(struct dst_entry *dst); + +static inline void refdst_drop(unsigned long refdst) +{ + if (!(refdst & SKB_DST_NOREF)) + dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK)); +} + +/** + * skb_dst_drop - drops skb dst + * @skb: buffer + * + * Drops dst reference count if a reference was taken. + */ static inline void skb_dst_drop(struct sk_buff *skb) { - if (skb->_skb_dst) - dst_release(skb_dst(skb)); - skb->_skb_dst = 0UL; + if (skb->_skb_refdst) { + refdst_drop(skb->_skb_refdst); + skb->_skb_refdst = 0UL; + } +} + +static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb) +{ + nskb->_skb_refdst = oskb->_skb_refdst; + if (!(nskb->_skb_refdst & SKB_DST_NOREF)) + dst_clone(skb_dst(nskb)); +} + +/** + * skb_dst_force - makes sure skb dst is refcounted + * @skb: buffer + * + * If dst is not yet refcounted, let's do it + */ +static inline void skb_dst_force(struct sk_buff *skb) +{ + if (skb_dst_is_noref(skb)) { + WARN_ON(!rcu_read_lock_held()); + skb->_skb_refdst &= ~SKB_DST_NOREF; + dst_clone(skb_dst(skb)); + } } /* Children define the path of the packet through the diff --git a/include/net/sock.h b/include/net/sock.h index aed16eb9db4b..5697caf8cc76 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -600,12 +600,15 @@ static inline int sk_stream_memory_free(struct sock *sk) /* OOB backlog add */ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { - if (!sk->sk_backlog.tail) { - sk->sk_backlog.head = sk->sk_backlog.tail = skb; - } else { + /* dont let skb dst not refcounted, we are going to leave rcu lock */ + skb_dst_force(skb); + + if (!sk->sk_backlog.tail) + sk->sk_backlog.head = skb; + else sk->sk_backlog.tail->next = skb; - sk->sk_backlog.tail = skb; - } + + sk->sk_backlog.tail = skb; skb->next = NULL; } diff --git a/net/core/dev.c b/net/core/dev.c index cdcb9cbedf41..6c820650b80f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2052,6 +2052,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ + if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + skb_dst_force(skb); __qdisc_update_bstats(q, skb->len); if (sch_direct_xmit(skb, q, dev, txq, root_lock)) __qdisc_run(q); @@ -2060,6 +2062,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { + skb_dst_force(skb); rc = qdisc_enqueue_root(skb, q); qdisc_run(q); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a9b0e1f77806..c543dd252433 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -520,7 +520,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->transport_header = old->transport_header; new->network_header = old->network_header; new->mac_header = old->mac_header; - skb_dst_set(new, dst_clone(skb_dst(old))); + skb_dst_copy(new, old); new->rxhash = old->rxhash; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); diff --git a/net/core/sock.c b/net/core/sock.c index 63530a03b8c2..bf88a167c8f2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -307,6 +307,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ skb_len = skb->len; + /* we escape from rcu protected region, make sure we dont leak + * a norefcounted dst + */ + skb_dst_force(skb); + spin_lock_irqsave(&list->lock, flags); skb->dropcount = atomic_read(&sk->sk_drops); __skb_queue_tail(list, skb); @@ -1536,6 +1541,7 @@ static void __release_sock(struct sock *sk) do { struct sk_buff *next = skb->next; + WARN_ON_ONCE(skb_dst_is_noref(skb)); skb->next = NULL; sk_backlog_rcv(sk, skb); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f3d339f728b0..d65e9215bcd7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -587,20 +587,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) err = __ip_route_output_key(net, &rt2, &fl); else { struct flowi fl2 = {}; - struct dst_entry *odst; + unsigned long orefdst; fl2.fl4_dst = fl.fl4_src; if (ip_route_output_key(net, &rt2, &fl2)) goto relookup_failed; /* Ugh! */ - odst = skb_dst(skb_in); + orefdst = skb_in->_skb_refdst; /* save old refdst */ err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, RT_TOS(tos), rt2->u.dst.dev); dst_release(&rt2->u.dst); rt2 = skb_rtable(skb_in); - skb_dst_set(skb_in, odst); + skb_in->_skb_refdst = orefdst; /* restore old refdst */ } if (err) diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 4c09a31fd140..3244133c24f6 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -601,6 +601,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) unsigned char *optptr = skb_network_header(skb) + opt->srr; struct rtable *rt = skb_rtable(skb); struct rtable *rt2; + unsigned long orefdst; int err; if (!opt->srr) @@ -624,16 +625,16 @@ int ip_options_rcv_srr(struct sk_buff *skb) } memcpy(&nexthop, &optptr[srrptr-1], 4); - rt = skb_rtable(skb); + orefdst = skb->_skb_refdst; skb_dst_set(skb, NULL); err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { - ip_rt_put(rt2); - skb_dst_set(skb, &rt->u.dst); + skb_dst_drop(skb); + skb->_skb_refdst = orefdst; return -EINVAL; } - ip_rt_put(rt); + refdst_drop(orefdst); if (rt2->rt_type != RTN_LOCAL) break; /* Superfast 8) loopback forward */ diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 82fb43c5c59e..07de855e2175 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -17,7 +17,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi fl = {}; - struct dst_entry *odst; + unsigned long orefdst; unsigned int hh_len; unsigned int type; @@ -51,14 +51,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) if (ip_route_output_key(net, &rt, &fl) != 0) return -1; - odst = skb_dst(skb); + orefdst = skb->_skb_refdst; if (ip_route_input(skb, iph->daddr, iph->saddr, RT_TOS(iph->tos), rt->u.dst.dev) != 0) { dst_release(&rt->u.dst); return -1; } dst_release(&rt->u.dst); - dst_release(odst); + refdst_drop(orefdst); } if (skb_dst(skb)->error) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index dea3f9264250..705eccfb4769 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3033,7 +3033,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; if (rt_is_expired(rt)) continue; - skb_dst_set(skb, dst_clone(&rt->u.dst)); + skb_dst_set_noref(skb, &rt->u.dst); if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 0b1103c0b1f3..78b3cf9c519c 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "nf_internals.h" @@ -170,6 +171,7 @@ static int __nf_queue(struct sk_buff *skb, dev_hold(physoutdev); } #endif + skb_dst_force(skb); afinfo->saveroute(skb, entry); status = qh->outfn(entry, queuenum); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a969b111bd76..a63029ef3edd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -26,6 +26,7 @@ #include #include #include +#include /* Main transmission queue. */ @@ -40,6 +41,7 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { + skb_dst_force(skb); q->gso_skb = skb; q->qstats.requeues++; q->q.qlen++; /* it's still part of the queue */ @@ -179,7 +181,7 @@ static inline int qdisc_restart(struct Qdisc *q) skb = dequeue_skb(q); if (unlikely(!skb)) return 0; - + WARN_ON_ONCE(skb_dst_is_noref(skb)); root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); -- cgit v1.2.3 From 407eadd996dc62a827db85f1d0c286a98fd5d336 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 10 May 2010 11:32:55 +0000 Subject: net: implements ip_route_input_noref() ip_route_input() is the version returning a refcounted dst, while ip_route_input_noref() returns a non refcounted one. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/route.h | 17 ++++++++++++++++- net/ipv4/route.c | 15 ++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/include/net/route.h b/include/net/route.h index 2c9fba7f7731..af6cf4b4c9dc 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -112,7 +112,22 @@ extern void rt_cache_flush_batch(void); extern int __ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp); extern int ip_route_output_key(struct net *, struct rtable **, struct flowi *flp); extern int ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags); -extern int ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin); + +extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src, + u8 tos, struct net_device *devin, bool noref); + +static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, + u8 tos, struct net_device *devin) +{ + return ip_route_input_common(skb, dst, src, tos, devin, false); +} + +static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, + u8 tos, struct net_device *devin) +{ + return ip_route_input_common(skb, dst, src, tos, devin, true); +} + extern unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev); extern void ip_rt_send_redirect(struct sk_buff *skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 705eccfb4769..560acc677ce4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2277,8 +2277,8 @@ martian_source: goto e_inval; } -int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) +int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, bool noref) { struct rtable * rth; unsigned hash; @@ -2304,10 +2304,15 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.mark == skb->mark && net_eq(dev_net(rth->u.dst.dev), net) && !rt_is_expired(rth)) { - dst_use(&rth->u.dst, jiffies); + if (noref) { + dst_use_noref(&rth->u.dst, jiffies); + skb_dst_set_noref(skb, &rth->u.dst); + } else { + dst_use(&rth->u.dst, jiffies); + skb_dst_set(skb, &rth->u.dst); + } RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); - skb_dst_set(skb, &rth->u.dst); return 0; } RT_CACHE_STAT_INC(in_hlist_search); @@ -2350,6 +2355,7 @@ skip_cache: } return ip_route_input_slow(skb, daddr, saddr, tos, dev); } +EXPORT_SYMBOL(ip_route_input_common); static int __mkroute_output(struct rtable **result, struct fib_result *res, @@ -3361,5 +3367,4 @@ void __init ip_static_sysctl_init(void) #endif EXPORT_SYMBOL(__ip_select_ident); -EXPORT_SYMBOL(ip_route_input); EXPORT_SYMBOL(ip_route_output_key); -- cgit v1.2.3