From 0a5c047507aaaf00519921336d19c0f8f5f9f363 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 31 Mar 2011 01:51:35 -0700 Subject: fib: add __rcu annotations Add __rcu annotations and lockdep checks. Add const qualifiers node_parent() and node_parent_rcu() can use rcu_dereference_index_check() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 103 +++++++++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 45 deletions(-) (limited to 'net/ipv4/fib_trie.c') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b92c86f6e9b3..b9d1f33e5e04 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -126,7 +126,7 @@ struct tnode { struct work_struct work; struct tnode *tnode_free; }; - struct rt_trie_node *child[0]; + struct rt_trie_node __rcu *child[0]; }; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -151,7 +151,7 @@ struct trie_stat { }; struct trie { - struct rt_trie_node *trie; + struct rt_trie_node __rcu *trie; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats; #endif @@ -177,16 +177,29 @@ static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; -static inline struct tnode *node_parent(struct rt_trie_node *node) +/* + * caller must hold RTNL + */ +static inline struct tnode *node_parent(const struct rt_trie_node *node) { - return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); + unsigned long parent; + + parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held()); + + return (struct tnode *)(parent & ~NODE_TYPE_MASK); } -static inline struct tnode *node_parent_rcu(struct rt_trie_node *node) +/* + * caller must hold RCU read lock or RTNL + */ +static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node) { - struct tnode *ret = node_parent(node); + unsigned long parent; + + parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() || + 
lockdep_rtnl_is_held()); - return rcu_dereference_rtnl(ret); + return (struct tnode *)(parent & ~NODE_TYPE_MASK); } /* Same as rcu_assign_pointer @@ -198,18 +211,24 @@ static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) node->parent = (unsigned long)ptr | NODE_TYPE(node); } -static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i) +/* + * caller must hold RTNL + */ +static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i) { BUG_ON(i >= 1U << tn->bits); - return tn->child[i]; + return rtnl_dereference(tn->child[i]); } -static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) +/* + * caller must hold RCU read lock or RTNL + */ +static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i) { - struct rt_trie_node *ret = tnode_get_child(tn, i); + BUG_ON(i >= 1U << tn->bits); - return rcu_dereference_rtnl(ret); + return rcu_dereference_rtnl(tn->child[i]); } static inline int tnode_child_length(const struct tnode *tn) @@ -487,7 +506,7 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull) { - struct rt_trie_node *chi = tn->child[i]; + struct rt_trie_node *chi = rtnl_dereference(tn->child[i]); int isfull; BUG_ON(i >= 1<<tn->bits); @@ -665,7 +684,7 @@ one_child: for (i = 0; i < tnode_child_length(tn); i++) { struct rt_trie_node *n; - n = tn->child[i]; + n = rtnl_dereference(tn->child[i]); if (!n) continue; @@ -679,6 +698,20 @@ one_child: return (struct rt_trie_node *) tn; } + +static void tnode_clean_free(struct tnode *tn) +{ + int i; + struct tnode *tofree; + + for (i = 0; i < tnode_child_length(tn); i++) { + tofree = (struct tnode *)rtnl_dereference(tn->child[i]); + if (tofree) + tnode_free(tofree); + } + tnode_free(tn); +} + static struct tnode *inflate(struct trie *t, struct tnode *tn) { struct tnode 
*oldtnode = tn; @@ -755,8 +788,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) inode = (struct tnode *) node; if (inode->bits == 1) { - put_child(t, tn, 2*i, inode->child[0]); - put_child(t, tn, 2*i+1, inode->child[1]); + put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); + put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); tnode_free_safe(inode); continue; @@ -797,8 +830,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) size = tnode_child_length(left); for (j = 0; j < size; j++) { - put_child(t, left, j, inode->child[j]); - put_child(t, right, j, inode->child[j + size]); + put_child(t, left, j, rtnl_dereference(inode->child[j])); + put_child(t, right, j, rtnl_dereference(inode->child[j + size])); } put_child(t, tn, 2*i, resize(t, left)); put_child(t, tn, 2*i+1, resize(t, right)); @@ -808,18 +841,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) tnode_free_safe(oldtnode); return tn; nomem: - { - int size = tnode_child_length(tn); - int j; - - for (j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - return ERR_PTR(-ENOMEM); - } + tnode_clean_free(tn); + return ERR_PTR(-ENOMEM); } static struct tnode *halve(struct trie *t, struct tnode *tn) @@ -890,18 +913,8 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) tnode_free_safe(oldtnode); return tn; nomem: - { - int size = tnode_child_length(tn); - int j; - - for (j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - return ERR_PTR(-ENOMEM); - } + tnode_clean_free(tn); + return ERR_PTR(-ENOMEM); } /* readside must use rcu_read_lock currently dump routines @@ -1033,7 +1046,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) t_key cindex; pos = 0; - n = t->trie; + n = rtnl_dereference(t->trie); /* If we point to NULL, stop. 
Either the tree is empty and we should * just put a new leaf in if, or we have reached an empty child slot, @@ -1756,7 +1769,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) continue; if (IS_LEAF(c)) { - prefetch(p->child[idx]); + prefetch(rcu_dereference_rtnl(p->child[idx])); return (struct leaf *) c; } @@ -2272,7 +2285,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) /* walk rest of this hash chain */ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); - while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) { + while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) { tb = hlist_entry(tb_node, struct fib_table, tb_hlist); n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) -- cgit v1.2.3 From 21d8c49e01a0c1c6eb6c750cd04110db4a539284 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 14 Apr 2011 14:49:37 -0700 Subject: ipv4: Call fib_select_default() only when actually necessary. fib_select_default() is a complete NOP, and completely pointless to invoke, when we have no more than 1 default route installed. And this is far and away the common case. So remember how many prefixlen==0 routes we have in the routing table, and elide the call when we have no more than one of those. This cuts output route creation time by 157 cycles on Niagara2+. In order to add the new int to fib_table, we have to correct the type of ->tb_data[] to unsigned long, otherwise the private area will be unaligned on 64-bit systems. Signed-off-by: David S. 
Miller Reviewed-by: Eric Dumazet --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_trie.c | 7 +++++++ net/ipv4/route.c | 4 +++- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'net/ipv4/fib_trie.c') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 514627f56339..10422ef14e28 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -160,7 +160,8 @@ struct fib_table { struct hlist_node tb_hlist; u32 tb_id; int tb_default; - unsigned char tb_data[0]; + int tb_num_default; + unsigned long tb_data[0]; }; extern int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index bde80c450b52..9ac481a10d37 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1332,6 +1332,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) } } + if (!plen) + tb->tb_num_default++; + list_add_tail_rcu(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head)); @@ -1697,6 +1700,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) list_del_rcu(&fa->fa_list); + if (!plen) + tb->tb_num_default--; + if (list_empty(fa_head)) { hlist_del_rcu(&li->hlist); free_leaf_info(li); @@ -1987,6 +1993,7 @@ struct fib_table *fib_trie_table(u32 id) tb->tb_id = id; tb->tb_default = -1; + tb->tb_num_default = 0; t = (struct trie *) tb->tb_data; memset(t, 0, sizeof(*t)); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0e7430c327a7..e9aee81de3e3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2615,7 +2615,9 @@ static struct rtable *ip_route_output_slow(struct net *net, fib_select_multipath(&res); else #endif - if (!res.prefixlen && res.type == RTN_UNICAST && !fl4.flowi4_oif) + if (!res.prefixlen && + res.table->tb_num_default > 1 && + res.type == RTN_UNICAST && !fl4.flowi4_oif) fib_select_default(&res); if (!fl4.saddr) -- cgit v1.2.3