diff options
Diffstat (limited to 'net/ipv4/fib_trie.c')
-rw-r--r-- | net/ipv4/fib_trie.c | 1767 |
1 files changed, 953 insertions, 814 deletions
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3daf0224ff2e..e13fcc602da2 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -79,6 +79,7 @@ #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/switchdev.h> #include "fib_lookup.h" #define MAX_STAT_DEPTH 32 @@ -88,38 +89,35 @@ typedef unsigned int t_key; -#define IS_TNODE(n) ((n)->bits) -#define IS_LEAF(n) (!(n)->bits) +#define IS_TRIE(n) ((n)->pos >= KEYLENGTH) +#define IS_TNODE(n) ((n)->bits) +#define IS_LEAF(n) (!(n)->bits) -#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos) - -struct tnode { +struct key_vector { t_key key; - unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char pos; /* 2log(KEYLENGTH) bits needed */ + unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char slen; - struct tnode __rcu *parent; - struct rcu_head rcu; union { - /* The fields in this struct are valid if bits > 0 (TNODE) */ - struct { - t_key empty_children; /* KEYLENGTH bits needed */ - t_key full_children; /* KEYLENGTH bits needed */ - struct tnode __rcu *child[0]; - }; - /* This list pointer if valid if bits == 0 (LEAF) */ - struct hlist_head list; + /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ + struct hlist_head leaf; + /* This array is valid if (pos | bits) > 0 (TNODE) */ + struct key_vector __rcu *tnode[0]; }; }; -struct leaf_info { - struct hlist_node hlist; - int plen; - u32 mask_plen; /* ntohl(inet_make_mask(plen)) */ - struct list_head falh; +struct tnode { struct rcu_head rcu; + t_key empty_children; /* KEYLENGTH bits needed */ + t_key full_children; /* KEYLENGTH bits needed */ + struct key_vector __rcu *parent; + struct key_vector kv[1]; +#define tn_bits kv[0].bits }; +#define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n]) +#define LEAF_SIZE TNODE_SIZE(1) + #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats { unsigned int gets; @@ -142,13 +140,13 @@ struct trie_stat { }; struct trie { - struct tnode __rcu *trie; + struct key_vector kv[1]; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats; #endif }; -static void resize(struct trie *t, struct tnode *tn); +static struct key_vector *resize(struct trie *t, struct key_vector *tn); static size_t tnode_free_size; /* @@ -161,41 +159,46 @@ static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; +static inline struct tnode *tn_info(struct key_vector *kv) +{ + return container_of(kv, struct tnode, kv[0]); +} + /* caller must hold RTNL */ -#define node_parent(n) rtnl_dereference((n)->parent) +#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent) +#define get_child(tn, i) rtnl_dereference((tn)->tnode[i]) /* caller must hold RCU read lock or RTNL */ -#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent) +#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent) +#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i]) /* wrapper for rcu_assign_pointer */ -static inline void node_set_parent(struct tnode *n, struct tnode *tp) +static inline void node_set_parent(struct key_vector *n, struct key_vector *tp) { if (n) - rcu_assign_pointer(n->parent, tp); + rcu_assign_pointer(tn_info(n)->parent, tp); } -#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p) +#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p) /* This provides us with the number of children in this node, in the case of a * leaf this will return 0 meaning none of the children are accessible. */ -static inline unsigned long tnode_child_length(const struct tnode *tn) +static inline unsigned long child_length(const struct key_vector *tn) { return (1ul << tn->bits) & ~(1ul); } -/* caller must hold RTNL */ -static inline struct tnode *tnode_get_child(const struct tnode *tn, - unsigned long i) -{ - return rtnl_dereference(tn->child[i]); -} +#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos) -/* caller must hold RCU read lock or RTNL */ -static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, - unsigned long i) +static inline unsigned long get_index(t_key key, struct key_vector *kv) { - return rcu_dereference_rtnl(tn->child[i]); + unsigned long index = key ^ kv->key; + + if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos)) + return 0; + + return index >> kv->pos; } /* To understand this stuff, an understanding of keys and all their bits is @@ -274,106 +277,104 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa) } #define TNODE_KMALLOC_MAX \ - ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *)) + ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *)) +#define TNODE_VMALLOC_MAX \ + ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *)) static void __node_free_rcu(struct rcu_head *head) { struct tnode *n = container_of(head, struct tnode, rcu); - if (IS_LEAF(n)) + if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); - else if (n->bits <= TNODE_KMALLOC_MAX) + else if (n->tn_bits <= TNODE_KMALLOC_MAX) kfree(n); else vfree(n); } -#define node_free(n) call_rcu(&n->rcu, __node_free_rcu) +#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) -static inline void free_leaf_info(struct leaf_info *leaf) +static struct tnode *tnode_alloc(int bits) { - kfree_rcu(leaf, rcu); -} + size_t size; + + /* verify bits is within bounds */ + if (bits > TNODE_VMALLOC_MAX) + return NULL; + + /* determine size and verify it is non-zero and didn't overflow */ + size = TNODE_SIZE(1ul << bits); -static struct tnode *tnode_alloc(size_t size) -{ if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else return vzalloc(size); } -static inline void empty_child_inc(struct tnode *n) +static inline void empty_child_inc(struct key_vector *n) { - ++n->empty_children ? : ++n->full_children; + ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children; } -static inline void empty_child_dec(struct tnode *n) +static inline void empty_child_dec(struct key_vector *n) { - n->empty_children-- ? : n->full_children--; + tn_info(n)->empty_children-- ? : tn_info(n)->full_children--; } -static struct tnode *leaf_new(t_key key) +static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { - struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); - if (l) { - l->parent = NULL; - /* set key and pos to reflect full key value - * any trailing zeros in the key should be ignored - * as the nodes are searched - */ - l->key = key; - l->slen = 0; - l->pos = 0; - /* set bits to 0 indicating we are not a tnode */ - l->bits = 0; + struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); + struct key_vector *l = kv->kv; - INIT_HLIST_HEAD(&l->list); - } - return l; -} + if (!kv) + return NULL; -static struct leaf_info *leaf_info_new(int plen) -{ - struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); - if (li) { - li->plen = plen; - li->mask_plen = ntohl(inet_make_mask(plen)); - INIT_LIST_HEAD(&li->falh); - } - return li; + /* initialize key vector */ + l->key = key; + l->pos = 0; + l->bits = 0; + l->slen = fa->fa_slen; + + /* link leaf to fib alias */ + INIT_HLIST_HEAD(&l->leaf); + hlist_add_head(&fa->fa_list, &l->leaf); + + return l; } -static struct tnode *tnode_new(t_key key, int pos, int bits) +static struct key_vector *tnode_new(t_key key, int pos, int bits) { - size_t sz = offsetof(struct tnode, child[1ul << bits]); - struct tnode *tn = tnode_alloc(sz); + struct tnode *tnode = tnode_alloc(bits); unsigned int shift = pos + bits; + struct key_vector *tn = tnode->kv; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); - if (tn) { - tn->parent = NULL; - tn->slen = pos; - tn->pos = pos; - tn->bits = bits; - tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; - if (bits == KEYLENGTH) - tn->full_children = 1; - else - tn->empty_children = 1ul << bits; - } + pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), + sizeof(struct key_vector *) << bits); + + if (!tnode) + return NULL; + + if (bits == KEYLENGTH) + tnode->full_children = 1; + else + tnode->empty_children = 1ul << bits; + + tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; + tn->pos = pos; + tn->bits = bits; + tn->slen = pos; - pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), - sizeof(struct tnode *) << bits); return tn; } /* Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p. 6 */ -static inline int tnode_full(const struct tnode *tn, const struct tnode *n) +static inline int tnode_full(struct key_vector *tn, struct key_vector *n) { return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); } @@ -381,17 +382,18 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n) /* Add a child at position i overwriting the old value. * Update the value of full_children and empty_children. */ -static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) +static void put_child(struct key_vector *tn, unsigned long i, + struct key_vector *n) { - struct tnode *chi = tnode_get_child(tn, i); + struct key_vector *chi = get_child(tn, i); int isfull, wasfull; - BUG_ON(i >= tnode_child_length(tn)); + BUG_ON(i >= child_length(tn)); /* update emptyChildren, overflow into fullChildren */ - if (n == NULL && chi != NULL) + if (!n && chi) empty_child_inc(tn); - if (n != NULL && chi == NULL) + if (n && !chi) empty_child_dec(tn); /* update fullChildren */ @@ -399,23 +401,23 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) isfull = tnode_full(tn, n); if (wasfull && !isfull) - tn->full_children--; + tn_info(tn)->full_children--; else if (!wasfull && isfull) - tn->full_children++; + tn_info(tn)->full_children++; if (n && (tn->slen < n->slen)) tn->slen = n->slen; - rcu_assign_pointer(tn->child[i], n); + rcu_assign_pointer(tn->tnode[i], n); } -static void update_children(struct tnode *tn) +static void update_children(struct key_vector *tn) { unsigned long i; /* update all of the child parent pointers */ - for (i = tnode_child_length(tn); i;) { - struct tnode *inode = tnode_get_child(tn, --i); + for (i = child_length(tn); i;) { + struct key_vector *inode = get_child(tn, --i); if (!inode) continue; @@ -431,36 +433,37 @@ static void update_children(struct tnode *tn) } } -static inline void put_child_root(struct tnode *tp, struct trie *t, - t_key key, struct tnode *n) +static inline void put_child_root(struct key_vector *tp, t_key key, + struct key_vector *n) { - if (tp) - put_child(tp, get_index(key, tp), n); + if (IS_TRIE(tp)) + rcu_assign_pointer(tp->tnode[0], n); else - rcu_assign_pointer(t->trie, n); + put_child(tp, get_index(key, tp), n); } -static inline void tnode_free_init(struct tnode *tn) +static inline void tnode_free_init(struct key_vector *tn) { - tn->rcu.next = NULL; + tn_info(tn)->rcu.next = NULL; } -static inline void tnode_free_append(struct tnode *tn, struct tnode *n) +static inline void tnode_free_append(struct key_vector *tn, + struct key_vector *n) { - n->rcu.next = tn->rcu.next; - tn->rcu.next = &n->rcu; + tn_info(n)->rcu.next = tn_info(tn)->rcu.next; + tn_info(tn)->rcu.next = &tn_info(n)->rcu; } -static void tnode_free(struct tnode *tn) +static void tnode_free(struct key_vector *tn) { - struct callback_head *head = &tn->rcu; + struct callback_head *head = &tn_info(tn)->rcu; while (head) { head = head->next; - tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]); + tnode_free_size += TNODE_SIZE(1ul << tn->bits); node_free(tn); - tn = container_of(head, struct tnode, rcu); + tn = container_of(head, struct tnode, rcu)->kv; } if (tnode_free_size >= PAGE_SIZE * sync_pages) { @@ -469,14 +472,16 @@ static void tnode_free(struct tnode *tn) } } -static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn) +static struct key_vector *replace(struct trie *t, + struct key_vector *oldtnode, + struct key_vector *tn) { - struct tnode *tp = node_parent(oldtnode); + struct key_vector *tp = node_parent(oldtnode); unsigned long i; /* setup the parent pointer out of and back into this node */ NODE_INIT_PARENT(tn, tp); - put_child_root(tp, t, tn->key, tn); + put_child_root(tp, tn->key, tn); /* update all of the child parent pointers */ update_children(tn); @@ -485,18 +490,21 @@ static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn) tnode_free(oldtnode); /* resize children now that oldtnode is freed */ - for (i = tnode_child_length(tn); i;) { - struct tnode *inode = tnode_get_child(tn, --i); + for (i = child_length(tn); i;) { + struct key_vector *inode = get_child(tn, --i); /* resize child node */ if (tnode_full(tn, inode)) - resize(t, inode); + tn = resize(t, inode); } + + return tp; } -static int inflate(struct trie *t, struct tnode *oldtnode) +static struct key_vector *inflate(struct trie *t, + struct key_vector *oldtnode) { - struct tnode *tn; + struct key_vector *tn; unsigned long i; t_key m; @@ -504,7 +512,7 @@ static int inflate(struct trie *t, struct tnode *oldtnode) tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1); if (!tn) - return -ENOMEM; + goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); @@ -514,13 +522,13 @@ static int inflate(struct trie *t, struct tnode *oldtnode) * point to existing tnodes and the links between our allocated * nodes. */ - for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) { - struct tnode *inode = tnode_get_child(oldtnode, --i); - struct tnode *node0, *node1; + for (i = child_length(oldtnode), m = 1u << tn->pos; i;) { + struct key_vector *inode = get_child(oldtnode, --i); + struct key_vector *node0, *node1; unsigned long j, k; /* An empty child */ - if (inode == NULL) + if (!inode) continue; /* A leaf or an internal node with skipped bits */ @@ -534,8 +542,8 @@ static int inflate(struct trie *t, struct tnode *oldtnode) /* An internal node with two children */ if (inode->bits == 1) { - put_child(tn, 2 * i + 1, tnode_get_child(inode, 1)); - put_child(tn, 2 * i, tnode_get_child(inode, 0)); + put_child(tn, 2 * i + 1, get_child(inode, 1)); + put_child(tn, 2 * i, get_child(inode, 0)); continue; } @@ -564,11 +572,11 @@ static int inflate(struct trie *t, struct tnode *oldtnode) tnode_free_append(tn, node0); /* populate child pointers in new nodes */ - for (k = tnode_child_length(inode), j = k / 2; j;) { - put_child(node1, --j, tnode_get_child(inode, --k)); - put_child(node0, j, tnode_get_child(inode, j)); - put_child(node1, --j, tnode_get_child(inode, --k)); - put_child(node0, j, tnode_get_child(inode, j)); + for (k = child_length(inode), j = k / 2; j;) { + put_child(node1, --j, get_child(inode, --k)); + put_child(node0, j, get_child(inode, j)); + put_child(node1, --j, get_child(inode, --k)); + put_child(node0, j, get_child(inode, j)); } /* link new nodes to parent */ @@ -581,25 +589,25 @@ static int inflate(struct trie *t, struct tnode *oldtnode) } /* setup the parent pointers into and out of this node */ - replace(t, oldtnode, tn); - - return 0; + return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); - return -ENOMEM; +notnode: + return NULL; } -static int halve(struct trie *t, struct tnode *oldtnode) +static struct key_vector *halve(struct trie *t, + struct key_vector *oldtnode) { - struct tnode *tn; + struct key_vector *tn; unsigned long i; pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1); if (!tn) - return -ENOMEM; + goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); @@ -609,10 +617,10 @@ static int halve(struct trie *t, struct tnode *oldtnode) * point to existing tnodes and the links between our allocated * nodes. */ - for (i = tnode_child_length(oldtnode); i;) { - struct tnode *node1 = tnode_get_child(oldtnode, --i); - struct tnode *node0 = tnode_get_child(oldtnode, --i); - struct tnode *inode; + for (i = child_length(oldtnode); i;) { + struct key_vector *node1 = get_child(oldtnode, --i); + struct key_vector *node0 = get_child(oldtnode, --i); + struct key_vector *inode; /* At least one of the children is empty */ if (!node1 || !node0) { @@ -622,10 +630,8 @@ static int halve(struct trie *t, struct tnode *oldtnode) /* Two nonempty children */ inode = tnode_new(node0->key, oldtnode->pos, 1); - if (!inode) { - tnode_free(tn); - return -ENOMEM; - } + if (!inode) + goto nomem; tnode_free_append(tn, inode); /* initialize pointers out of node */ @@ -638,30 +644,36 @@ static int halve(struct trie *t, struct tnode *oldtnode) } /* setup the parent pointers into and out of this node */ - replace(t, oldtnode, tn); - - return 0; + return replace(t, oldtnode, tn); +nomem: + /* all pointers should be clean so we are done */ + tnode_free(tn); +notnode: + return NULL; } -static void collapse(struct trie *t, struct tnode *oldtnode) +static struct key_vector *collapse(struct trie *t, + struct key_vector *oldtnode) { - struct tnode *n, *tp; + struct key_vector *n, *tp; unsigned long i; /* scan the tnode looking for that one child that might still exist */ - for (n = NULL, i = tnode_child_length(oldtnode); !n && i;) - n = tnode_get_child(oldtnode, --i); + for (n = NULL, i = child_length(oldtnode); !n && i;) + n = get_child(oldtnode, --i); /* compress one level */ tp = node_parent(oldtnode); - put_child_root(tp, t, oldtnode->key, n); + put_child_root(tp, oldtnode->key, n); node_set_parent(n, tp); /* drop dead node */ node_free(oldtnode); + + return tp; } -static unsigned char update_suffix(struct tnode *tn) +static unsigned char update_suffix(struct key_vector *tn) { unsigned char slen = tn->pos; unsigned long stride, i; @@ -671,8 +683,8 @@ static unsigned char update_suffix(struct tnode *tn) * why we start with a stride of 2 since a stride of 1 would * represent the nodes with suffix length equal to tn->pos */ - for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) { - struct tnode *n = tnode_get_child(tn, i); + for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) { + struct key_vector *n = get_child(tn, i); if (!n || (n->slen <= slen)) continue; @@ -704,12 +716,12 @@ static unsigned char update_suffix(struct tnode *tn) * * 'high' in this instance is the variable 'inflate_threshold'. It * is expressed as a percentage, so we multiply it with - * tnode_child_length() and instead of multiplying by 2 (since the + * child_length() and instead of multiplying by 2 (since the * child array will be doubled by inflate()) and multiplying * the left-hand side by 100 (to handle the percentage thing) we * multiply the left-hand side by 50. * - * The left-hand side may look a bit weird: tnode_child_length(tn) + * The left-hand side may look a bit weird: child_length(tn) * - tn->empty_children is of course the number of non-null children * in the current node. tn->full_children is the number of "full" * children, that is non-null tnodes with a skip value of 0. @@ -719,10 +731,10 @@ static unsigned char update_suffix(struct tnode *tn) * A clearer way to write this would be: * * to_be_doubled = tn->full_children; - * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - + * not_to_be_doubled = child_length(tn) - tn->empty_children - * tn->full_children; * - * new_child_length = tnode_child_length(tn) * 2; + * new_child_length = child_length(tn) * 2; * * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / * new_child_length; @@ -739,57 +751,57 @@ static unsigned char update_suffix(struct tnode *tn) * inflate_threshold * new_child_length * * expand not_to_be_doubled and to_be_doubled, and shorten: - * 100 * (tnode_child_length(tn) - tn->empty_children + + * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: - * 100 * (tnode_child_length(tn) - tn->empty_children + + * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= - * inflate_threshold * tnode_child_length(tn) * 2 + * inflate_threshold * child_length(tn) * 2 * * shorten again: - * 50 * (tn->full_children + tnode_child_length(tn) - + * 50 * (tn->full_children + child_length(tn) - * tn->empty_children) >= inflate_threshold * - * tnode_child_length(tn) + * child_length(tn) * */ -static bool should_inflate(const struct tnode *tp, const struct tnode *tn) +static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn) { - unsigned long used = tnode_child_length(tn); + unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ - threshold *= tp ? inflate_threshold : inflate_threshold_root; - used -= tn->empty_children; - used += tn->full_children; + threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold; + used -= tn_info(tn)->empty_children; + used += tn_info(tn)->full_children; /* if bits == KEYLENGTH then pos = 0, and will fail below */ return (used > 1) && tn->pos && ((50 * used) >= threshold); } -static bool should_halve(const struct tnode *tp, const struct tnode *tn) +static inline bool should_halve(struct key_vector *tp, struct key_vector *tn) { - unsigned long used = tnode_child_length(tn); + unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ - threshold *= tp ? halve_threshold : halve_threshold_root; - used -= tn->empty_children; + threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold; + used -= tn_info(tn)->empty_children; /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold); } -static bool should_collapse(const struct tnode *tn) +static inline bool should_collapse(struct key_vector *tn) { - unsigned long used = tnode_child_length(tn); + unsigned long used = child_length(tn); - used -= tn->empty_children; + used -= tn_info(tn)->empty_children; /* account for bits == KEYLENGTH case */ - if ((tn->bits == KEYLENGTH) && tn->full_children) + if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children) used -= KEY_MAX; /* One child or none, time to drop us from the trie */ @@ -797,10 +809,13 @@ static bool should_collapse(const struct tnode *tn) } #define MAX_WORK 10 -static void resize(struct trie *t, struct tnode *tn) +static struct key_vector *resize(struct trie *t, struct key_vector *tn) { - struct tnode *tp = node_parent(tn); - struct tnode __rcu **cptr; +#ifdef CONFIG_IP_FIB_TRIE_STATS + struct trie_use_stats __percpu *stats = t->stats; +#endif + struct key_vector *tp = node_parent(tn); + unsigned long cindex = get_index(tn->key, tp); int max_work = MAX_WORK; pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", @@ -810,183 +825,128 @@ static void resize(struct trie *t, struct tnode *tn) * doing it ourselves. This way we can let RCU fully do its * thing without us interfering */ - cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie; - BUG_ON(tn != rtnl_dereference(*cptr)); + BUG_ON(tn != get_child(tp, cindex)); /* Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ while (should_inflate(tp, tn) && max_work) { - if (inflate(t, tn)) { + tp = inflate(t, tn); + if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(t->stats->resize_node_skipped); + this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; - tn = rtnl_dereference(*cptr); + tn = get_child(tp, cindex); } + /* update parent in case inflate failed */ + tp = node_parent(tn); + /* Return if at least one inflate is run */ if (max_work != MAX_WORK) - return; + return tp; /* Halve as long as the number of empty children in this * node is above threshold. */ while (should_halve(tp, tn) && max_work) { - if (halve(t, tn)) { + tp = halve(t, tn); + if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(t->stats->resize_node_skipped); + this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; - tn = rtnl_dereference(*cptr); + tn = get_child(tp, cindex); } /* Only one child remains */ - if (should_collapse(tn)) { - collapse(t, tn); - return; - } + if (should_collapse(tn)) + return collapse(t, tn); + + /* update parent in case halve failed */ + tp = node_parent(tn); /* Return if at least one deflate was run */ if (max_work != MAX_WORK) - return; + return tp; /* push the suffix length to the parent node */ if (tn->slen > tn->pos) { unsigned char slen = update_suffix(tn); - if (tp && (slen > tp->slen)) + if (slen > tp->slen) tp->slen = slen; } -} - -/* readside must use rcu_read_lock currently dump routines - via get_fa_head and dump */ - -static struct leaf_info *find_leaf_info(struct tnode *l, int plen) -{ - struct hlist_head *head = &l->list; - struct leaf_info *li; - - hlist_for_each_entry_rcu(li, head, hlist) - if (li->plen == plen) - return li; - - return NULL; -} - -static inline struct list_head *get_fa_head(struct tnode *l, int plen) -{ - struct leaf_info *li = find_leaf_info(l, plen); - - if (!li) - return NULL; - return &li->falh; + return tp; } -static void leaf_pull_suffix(struct tnode *l) +static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l) { - struct tnode *tp = node_parent(l); - - while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) { + while ((tp->slen > tp->pos) && (tp->slen > l->slen)) { if (update_suffix(tp) > l->slen) break; tp = node_parent(tp); } } -static void leaf_push_suffix(struct tnode *l) +static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l) { - struct tnode *tn = node_parent(l); - /* if this is a new leaf then tn will be NULL and we can sort * out parent suffix lengths as a part of trie_rebalance */ - while (tn && (tn->slen < l->slen)) { + while (tn->slen < l->slen) { tn->slen = l->slen; tn = node_parent(tn); } } -static void remove_leaf_info(struct tnode *l, struct leaf_info *old) -{ - /* record the location of the previous list_info entry */ - struct hlist_node **pprev = old->hlist.pprev; - struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next); - - /* remove the leaf info from the list */ - hlist_del_rcu(&old->hlist); - - /* only access li if it is pointing at the last valid hlist_node */ - if (hlist_empty(&l->list) || (*pprev)) - return; - - /* update the trie with the latest suffix length */ - l->slen = KEYLENGTH - li->plen; - leaf_pull_suffix(l); -} - -static void insert_leaf_info(struct tnode *l, struct leaf_info *new) +/* rcu_read_lock needs to be hold by caller from readside */ +static struct key_vector *fib_find_node(struct trie *t, + struct key_vector **tp, u32 key) { - struct hlist_head *head = &l->list; - struct leaf_info *li = NULL, *last = NULL; + struct key_vector *pn, *n = t->kv; + unsigned long index = 0; - if (hlist_empty(head)) { - hlist_add_head_rcu(&new->hlist, head); - } else { - hlist_for_each_entry(li, head, hlist) { - if (new->plen > li->plen) - break; - - last = li; - } - if (last) - hlist_add_behind_rcu(&new->hlist, &last->hlist); - else - hlist_add_before_rcu(&new->hlist, &li->hlist); - } - - /* if we added to the tail node then we need to update slen */ - if (l->slen < (KEYLENGTH - new->plen)) { - l->slen = KEYLENGTH - new->plen; - leaf_push_suffix(l); - } -} + do { + pn = n; + n = get_child_rcu(n, index); -/* rcu_read_lock needs to be hold by caller from readside */ -static struct tnode *fib_find_node(struct trie *t, u32 key) -{ - struct tnode *n = rcu_dereference_rtnl(t->trie); + if (!n) + break; - while (n) { - unsigned long index = get_index(key, n); + index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the bits in the cindex. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. - * if (index & (~0ul << bits)) + * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex + * + * This check is safe even if bits == KEYLENGTH due to the + * fact that we can only allocate a node with 32 bits if a + * long is greater than 32 bits. */ - if (index & (~0ul << n->bits)) - return NULL; - - /* we have found a leaf. Prefixes have already been compared */ - if (IS_LEAF(n)) + if (index >= (1ul << n->bits)) { + n = NULL; break; + } - n = tnode_get_child_rcu(n, index); - } + /* keep searching until we find a perfect match leaf or NULL */ + } while (IS_TNODE(n)); + + *tp = pn; return n; } @@ -994,14 +954,23 @@ static struct tnode *fib_find_node(struct trie *t, u32 key) /* Return the first fib alias matching TOS with * priority less than or equal to PRIO. */ -static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) +static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, + u8 tos, u32 prio, u32 tb_id) { struct fib_alias *fa; if (!fah) return NULL; - list_for_each_entry(fa, fah, fa_list) { + hlist_for_each_entry(fa, fah, fa_list) { + if (fa->fa_slen < slen) + continue; + if (fa->fa_slen != slen) + break; + if (fa->tb_id > tb_id) + continue; + if (fa->tb_id != tb_id) + break; if (fa->fa_tos > tos) continue; if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos) @@ -1011,77 +980,23 @@ static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) return NULL; } -static void trie_rebalance(struct trie *t, struct tnode *tn) +static void trie_rebalance(struct trie *t, struct key_vector *tn) { - struct tnode *tp; - - while ((tp = node_parent(tn)) != NULL) { - resize(t, tn); - tn = tp; - } - - /* Handle last (top) tnode */ - if (IS_TNODE(tn)) - resize(t, tn); + while (!IS_TRIE(tn)) + tn = resize(t, tn); } -/* only used from updater-side */ - -static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) +static int fib_insert_node(struct trie *t, struct key_vector *tp, + struct fib_alias *new, t_key key) { - struct list_head *fa_head = NULL; - struct tnode *l, *n, *tp = NULL; - struct leaf_info *li; - - li = leaf_info_new(plen); - if (!li) - return NULL; - fa_head = &li->falh; + struct key_vector *n, *l; - n = rtnl_dereference(t->trie); - - /* If we point to NULL, stop. Either the tree is empty and we should - * just put a new leaf in if, or we have reached an empty child slot, - * and we should just put our new leaf in that. - * - * If we hit a node with a key that does't match then we should stop - * and create a new tnode to replace that node and insert ourselves - * and the other node into the new tnode. - */ - while (n) { - unsigned long index = get_index(key, n); - - /* This bit of code is a bit tricky but it combines multiple - * checks into a single check. The prefix consists of the - * prefix plus zeros for the "bits" in the prefix. The index - * is the difference between the key and this value. From - * this we can actually derive several pieces of data. - * if !(index >> bits) - * we know the value is child index - * else - * we have a mismatch in skip bits and failed - */ - if (index >> n->bits) - break; - - /* we have found a leaf. Prefixes have already been compared */ - if (IS_LEAF(n)) { - /* Case 1: n is a leaf, and prefixes match*/ - insert_leaf_info(n, li); - return fa_head; - } - - tp = n; - n = tnode_get_child_rcu(n, index); - } - - l = leaf_new(key); - if (!l) { - free_leaf_info(li); - return NULL; - } + l = leaf_new(key, new); + if (!l) + goto noleaf; - insert_leaf_info(l, li); + /* retrieve child from parent node */ + n = get_child(tp, get_index(key, tp)); /* Case 2: n is a LEAF or a TNODE and the key doesn't match. * @@ -1090,21 +1005,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) * leaves us in position for handling as case 3 */ if (n) { - struct tnode *tn; + struct key_vector *tn; tn = tnode_new(key, __fls(key ^ n->key), 1); - if (!tn) { - free_leaf_info(li); - node_free(l); - return NULL; - } + if (!tn) + goto notnode; /* initialize routes out of node */ NODE_INIT_PARENT(tn, tp); put_child(tn, get_index(key, tn) ^ 1, n); /* start adding routes into the node */ - put_child_root(tp, t, key, tn); + put_child_root(tp, key, tn); node_set_parent(n, tn); /* parent now has a NULL spot where the leaf can go */ @@ -1112,69 +1024,93 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) } /* Case 3: n is NULL, and will just insert a new leaf */ - if (tp) { - NODE_INIT_PARENT(l, tp); - put_child(tp, get_index(key, tp), l); - trie_rebalance(t, tp); + NODE_INIT_PARENT(l, tp); + put_child_root(tp, key, l); + trie_rebalance(t, tp); + + return 0; +notnode: + node_free(l); +noleaf: + return -ENOMEM; +} + +static int fib_insert_alias(struct trie *t, struct key_vector *tp, + struct key_vector *l, struct fib_alias *new, + struct fib_alias *fa, t_key key) +{ + if (!l) + return fib_insert_node(t, tp, new, key); + + if (fa) { + hlist_add_before_rcu(&new->fa_list, &fa->fa_list); } else { - rcu_assign_pointer(t->trie, l); + struct fib_alias *last; + + hlist_for_each_entry(last, &l->leaf, fa_list) { + if (new->fa_slen < last->fa_slen) + break; + if ((new->fa_slen == last->fa_slen) && + (new->tb_id > last->tb_id)) + break; + fa = last; + } + + if (fa) + hlist_add_behind_rcu(&new->fa_list, &fa->fa_list); + else + hlist_add_head_rcu(&new->fa_list, &l->leaf); } - return fa_head; + /* if we added to the tail node then we need to update slen */ + if (l->slen < new->fa_slen) { + l->slen = new->fa_slen; + leaf_push_suffix(tp, l); + } + + return 0; } -/* - * Caller must hold RTNL. - */ +/* Caller must hold RTNL. */ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { - struct trie *t = (struct trie *) tb->tb_data; + struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; - struct list_head *fa_head = NULL; + struct key_vector *l, *tp; struct fib_info *fi; - int plen = cfg->fc_dst_len; + u8 plen = cfg->fc_dst_len; + u8 slen = KEYLENGTH - plen; u8 tos = cfg->fc_tos; - u32 key, mask; + u32 key; int err; - struct tnode *l; - if (plen > 32) + if (plen > KEYLENGTH) return -EINVAL; key = ntohl(cfg->fc_dst); pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); - mask = ntohl(inet_make_mask(plen)); - - if (key & ~mask) + if ((plen < KEYLENGTH) && (key << plen)) return -EINVAL; - key = key & mask; - fi = fib_create_info(cfg); if (IS_ERR(fi)) { err = PTR_ERR(fi); goto err; } - l = fib_find_node(t, key); - fa = NULL; - - if (l) { - fa_head = get_fa_head(l, plen); - fa = fib_find_alias(fa_head, tos, fi->fib_priority); - } + l = fib_find_node(t, &tp, key); + fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority, + tb->tb_id) : NULL; /* Now fa, if non-NULL, points to the first fib alias * with the same keys [prefix,tos,priority], if such key already * exists or to the node before which we will insert new one. * * If fa is NULL, we will need to allocate a new one and - * insert to the head of f. - * - * If f is NULL, no fib node matched the destination key - * and we need to allocate a new one of those as well. + * insert to the tail of the section matching the suffix length + * of the new alias. */ if (fa && fa->fa_tos == tos && @@ -1192,9 +1128,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) */ fa_match = NULL; fa_first = fa; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, fa_head, fa_list) { - if (fa->fa_tos != tos) + hlist_for_each_entry_from(fa, fa_list) { + if ((fa->fa_slen != slen) || + (fa->tb_id != tb->tb_id) || + (fa->fa_tos != tos)) break; if (fa->fa_info->fib_priority != fi->fib_priority) break; @@ -1217,7 +1154,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) } err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - if (new_fa == NULL) + if (!new_fa) goto out; fi_drop = fa->fa_info; @@ -1226,8 +1163,21 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_type = cfg->fc_type; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; + new_fa->fa_slen = fa->fa_slen; + + err = netdev_switch_fib_ipv4_add(key, plen, fi, + new_fa->fa_tos, + cfg->fc_type, + cfg->fc_nlflags, + tb->tb_id); + if (err) { + netdev_switch_fib_ipv4_abort(fi); + kmem_cache_free(fn_alias_kmem, new_fa); + goto out; + } + + hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); - list_replace_rcu(&fa->fa_list, &new_fa->fa_list); alias_free_mem_rcu(fa); fib_release_info(fi_drop); @@ -1254,37 +1204,42 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - if (new_fa == NULL) + if (!new_fa) goto out; new_fa->fa_info = fi; new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; new_fa->fa_state = 0; - /* - * Insert new entry to the list. - */ - - if (!fa_head) { - fa_head = fib_insert_node(t, key, plen); - if (unlikely(!fa_head)) { - err = -ENOMEM; - goto out_free_new_fa; - } + new_fa->fa_slen = slen; + new_fa->tb_id = tb->tb_id; + + /* (Optionally) offload fib entry to switch hardware. */ + err = netdev_switch_fib_ipv4_add(key, plen, fi, tos, + cfg->fc_type, + cfg->fc_nlflags, + tb->tb_id); + if (err) { + netdev_switch_fib_ipv4_abort(fi); + goto out_free_new_fa; } + /* Insert new entry to the list. */ + err = fib_insert_alias(t, tp, l, new_fa, fa, key); + if (err) + goto out_sw_fib_del; + if (!plen) tb->tb_num_default++; - list_add_tail_rcu(&new_fa->fa_list, - (fa ? &fa->fa_list : fa_head)); - rt_cache_flush(cfg->fc_nlinfo.nl_net); - rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, 0); succeeded: return 0; +out_sw_fib_del: + netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1293,7 +1248,7 @@ err: return err; } -static inline t_key prefix_mismatch(t_key key, struct tnode *n) +static inline t_key prefix_mismatch(t_key key, struct key_vector *n) { t_key prefix = n->key; @@ -1304,16 +1259,20 @@ static inline t_key prefix_mismatch(t_key key, struct tnode *n) int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { - struct trie *t = (struct trie *)tb->tb_data; + struct trie *t = (struct trie *) tb->tb_data; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif const t_key key = ntohl(flp->daddr); - struct tnode *n, *pn; - struct leaf_info *li; + struct key_vector *n, *pn; + struct fib_alias *fa; + unsigned long index; t_key cindex; - n = rcu_dereference(t->trie); + pn = t->kv; + cindex = 0; + + n = get_child_rcu(pn, cindex); if (!n) return -EAGAIN; @@ -1321,24 +1280,25 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, this_cpu_inc(stats->gets); #endif - pn = n; - cindex = 0; - /* Step 1: Travel to the longest prefix match in the trie */ for (;;) { - unsigned long index = get_index(key, n); + index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the "bits" in the prefix. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. - * if (index & (~0ul << bits)) + * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex + * + * This check is safe even if bits == KEYLENGTH due to the + * fact that we can only allocate a node with 32 bits if a + * long is greater than 32 bits. */ - if (index & (~0ul << n->bits)) + if (index >= (1ul << n->bits)) break; /* we have found a leaf. Prefixes have already been compared */ @@ -1353,7 +1313,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, cindex = index; } - n = tnode_get_child_rcu(n, index); + n = get_child_rcu(n, index); if (unlikely(!n)) goto backtrace; } @@ -1361,7 +1321,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, /* Step 2: Sort out leaves and begin backtracing for longest prefix */ for (;;) { /* record the pointer where our next node pointer is stored */ - struct tnode __rcu **cptr = n->child; + struct key_vector __rcu **cptr = n->tnode; /* This test verifies that none of the bits that differ * between the key and the prefix exist in the region of @@ -1393,13 +1353,17 @@ backtrace: while (!cindex) { t_key pkey = pn->key; - pn = node_parent_rcu(pn); - if (unlikely(!pn)) + /* If we don't have a parent then there is + * nothing for us to do as we do not have any + * further nodes to parse. + */ + if (IS_TRIE(pn)) return -EAGAIN; #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->backtrack); #endif /* Get Child's index */ + pn = node_parent_rcu(pn); cindex = get_index(pkey, pn); } @@ -1407,138 +1371,134 @@ backtrace: cindex &= cindex - 1; /* grab pointer for next child node */ - cptr = &pn->child[cindex]; + cptr = &pn->tnode[cindex]; } } found: + /* this line carries forward the xor from earlier in the function */ + index = key ^ n->key; + /* Step 3: Process the leaf, if that fails fall back to backtracing */ - hlist_for_each_entry_rcu(li, &n->list, hlist) { - struct fib_alias *fa; + hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + int nhsel, err; - if ((key ^ n->key) & li->mask_plen) + if ((index >= (1ul << fa->fa_slen)) && + ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH))) continue; - - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - struct fib_info *fi = fa->fa_info; - int nhsel, err; - - if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) - continue; - if (fi->fib_dead) - continue; - if (fa->fa_info->fib_scope < flp->flowi4_scope) - continue; - fib_alias_accessed(fa); - err = fib_props[fa->fa_type].error; - if (unlikely(err < 0)) { + if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + continue; + if (fi->fib_dead) + continue; + if (fa->fa_info->fib_scope < flp->flowi4_scope) + continue; + fib_alias_accessed(fa); + err = fib_props[fa->fa_type].error; + if (unlikely(err < 0)) { #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(stats->semantic_match_passed); + this_cpu_inc(stats->semantic_match_passed); #endif - return err; - } - if (fi->fib_flags & RTNH_F_DEAD) + return err; + } + if (fi->fib_flags & RTNH_F_DEAD) + continue; + for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + const struct fib_nh *nh = &fi->fib_nh[nhsel]; + + if (nh->nh_flags & RTNH_F_DEAD) continue; - for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { - const struct fib_nh *nh = &fi->fib_nh[nhsel]; - - if (nh->nh_flags & RTNH_F_DEAD) - continue; - if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) - continue; - - if (!(fib_flags & FIB_LOOKUP_NOREF)) - atomic_inc(&fi->fib_clntref); - - res->prefixlen = li->plen; - res->nh_sel = nhsel; - res->type = fa->fa_type; - res->scope = fi->fib_scope; - res->fi = fi; - res->table = tb; - res->fa_head = &li->falh; + if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) + continue; + + if (!(fib_flags & FIB_LOOKUP_NOREF)) + atomic_inc(&fi->fib_clntref); + + res->prefixlen = KEYLENGTH - fa->fa_slen; + res->nh_sel = nhsel; + res->type = fa->fa_type; + res->scope = fi->fib_scope; + res->fi = fi; + res->table = tb; + res->fa_head = &n->leaf; #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(stats->semantic_match_passed); + this_cpu_inc(stats->semantic_match_passed); #endif - return err; - } + return err; } - + } #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(stats->semantic_match_miss); + this_cpu_inc(stats->semantic_match_miss); #endif - } goto backtrace; } EXPORT_SYMBOL_GPL(fib_table_lookup); -/* - * Remove the leaf and return parent. - */ -static void trie_leaf_remove(struct trie *t, struct tnode *l) +static void fib_remove_alias(struct trie *t, struct key_vector *tp, + struct key_vector *l, struct fib_alias *old) { - struct tnode *tp = node_parent(l); + /* record the location of the previous list_info entry */ + struct hlist_node **pprev = old->fa_list.pprev; + struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next); - pr_debug("entering trie_leaf_remove(%p)\n", l); + /* remove the fib_alias from the list */ + hlist_del_rcu(&old->fa_list); - if (tp) { - put_child(tp, get_index(l->key, tp), NULL); + /* if we emptied the list this leaf will be freed and we can sort + * out parent suffix lengths as a part of trie_rebalance + */ + if (hlist_empty(&l->leaf)) { + put_child_root(tp, l->key, NULL); + node_free(l); trie_rebalance(t, tp); - } else { - RCU_INIT_POINTER(t->trie, NULL); + return; } - node_free(l); + /* only access fa if it is pointing at the last valid hlist_node */ + if (*pprev) + return; + + /* update the trie with the latest suffix length */ + l->slen = fa->fa_slen; + leaf_pull_suffix(tp, l); } -/* - * Caller must hold RTNL. - */ +/* Caller must hold RTNL. */ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; - u32 key, mask; - int plen = cfg->fc_dst_len; - u8 tos = cfg->fc_tos; struct fib_alias *fa, *fa_to_delete; - struct list_head *fa_head; - struct tnode *l; - struct leaf_info *li; + struct key_vector *l, *tp; + u8 plen = cfg->fc_dst_len; + u8 slen = KEYLENGTH - plen; + u8 tos = cfg->fc_tos; + u32 key; - if (plen > 32) + if (plen > KEYLENGTH) return -EINVAL; key = ntohl(cfg->fc_dst); - mask = ntohl(inet_make_mask(plen)); - if (key & ~mask) + if ((plen < KEYLENGTH) && (key << plen)) return -EINVAL; - key = key & mask; - l = fib_find_node(t, key); - + l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; - li = find_leaf_info(l, plen); - - if (!li) - return -ESRCH; - - fa_head = &li->falh; - fa = fib_find_alias(fa_head, tos, 0); - + fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id); if (!fa) return -ESRCH; pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); fa_to_delete = NULL; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, fa_head, fa_list) { + hlist_for_each_entry_from(fa, fa_list) { struct fib_info *fi = fa->fa_info; - if (fa->fa_tos != tos) + if ((fa->fa_slen != slen) || + (fa->tb_id != tb->tb_id) || + (fa->fa_tos != tos)) break; if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && @@ -1557,240 +1517,397 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if (!fa_to_delete) return -ESRCH; - fa = fa_to_delete; - rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, - &cfg->fc_nlinfo, 0); + netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, + cfg->fc_type, tb->tb_id); - list_del_rcu(&fa->fa_list); + rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, + &cfg->fc_nlinfo, 0); if (!plen) tb->tb_num_default--; - if (list_empty(fa_head)) { - remove_leaf_info(l, li); - free_leaf_info(li); - } + fib_remove_alias(t, tp, l, fa_to_delete); - if (hlist_empty(&l->list)) - trie_leaf_remove(t, l); - - if (fa->fa_state & FA_S_ACCESSED) + if (fa_to_delete->fa_state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); - fib_release_info(fa->fa_info); - alias_free_mem_rcu(fa); + fib_release_info(fa_to_delete->fa_info); + alias_free_mem_rcu(fa_to_delete); return 0; } -static int trie_flush_list(struct list_head *head) +/* Scan for the next leaf starting at the provided key value */ +static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key) { - struct fib_alias *fa, *fa_node; - int found = 0; + struct key_vector *pn, *n = *tn; + unsigned long cindex; - list_for_each_entry_safe(fa, fa_node, head, fa_list) { - struct fib_info *fi = fa->fa_info; + /* this loop is meant to try and find the key in the trie */ + do { + /* record parent and next child index */ + pn = n; + cindex = key ? get_index(key, pn) : 0; - if (fi && (fi->fib_flags & RTNH_F_DEAD)) { - list_del_rcu(&fa->fa_list); - fib_release_info(fa->fa_info); - alias_free_mem_rcu(fa); - found++; + if (cindex >> pn->bits) + break; + + /* descend into the next child */ + n = get_child_rcu(pn, cindex++); + if (!n) + break; + + /* guarantee forward progress on the keys */ + if (IS_LEAF(n) && (n->key >= key)) + goto found; + } while (IS_TNODE(n)); + + /* this loop will search for the next leaf with a greater key */ + while (!IS_TRIE(pn)) { + /* if we exhausted the parent node we will need to climb */ + if (cindex >= (1ul << pn->bits)) { + t_key pkey = pn->key; + + pn = node_parent_rcu(pn); + cindex = get_index(pkey, pn) + 1; + continue; } + + /* grab the next available node */ + n = get_child_rcu(pn, cindex++); + if (!n) + continue; + + /* no need to compare keys since we bumped the index */ + if (IS_LEAF(n)) + goto found; + + /* Rescan start scanning in new node */ + pn = n; + cindex = 0; } - return found; + + *tn = pn; + return NULL; /* Root of trie */ +found: + /* if we are at the limit for keys just return NULL for the tnode */ + *tn = pn; + return n; } -static int trie_flush_leaf(struct tnode *l) +static void fib_trie_free(struct fib_table *tb) { - int found = 0; - struct hlist_head *lih = &l->list; + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; struct hlist_node *tmp; - struct leaf_info *li = NULL; - unsigned char plen = KEYLENGTH; + struct fib_alias *fa; + + /* walk trie in reverse order and free everything */ + for (;;) { + struct key_vector *n; + + if (!(cindex--)) { + t_key pkey = pn->key; + + if (IS_TRIE(pn)) + break; + + n = pn; + pn = node_parent(pn); - hlist_for_each_entry_safe(li, tmp, lih, hlist) { - found += trie_flush_list(&li->falh); + /* drop emptied tnode */ + put_child_root(pn, n->key, NULL); + node_free(n); + + cindex = get_index(pkey, pn); - if (list_empty(&li->falh)) { - hlist_del_rcu(&li->hlist); - free_leaf_info(li); continue; } - plen = li->plen; - } + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; - l->slen = KEYLENGTH - plen; + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; - return found; + continue; + } + + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + hlist_del_rcu(&fa->fa_list); + alias_free_mem_rcu(fa); + } + + put_child_root(pn, n->key, NULL); + node_free(n); + } + +#ifdef CONFIG_IP_FIB_TRIE_STATS + free_percpu(t->stats); +#endif + kfree(tb); } -/* - * Scan for the next right leaf starting at node p->child[idx] - * Since we have back pointer, no recursion necessary. - */ -static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c) +struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) { - do { - unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0; + struct trie *ot = (struct trie *)oldtb->tb_data; + struct key_vector *l, *tp = ot->kv; + struct fib_table *local_tb; + struct fib_alias *fa; + struct trie *lt; + t_key key = 0; - while (idx < tnode_child_length(p)) { - c = tnode_get_child_rcu(p, idx++); - if (!c) + if (oldtb->tb_data == oldtb->__data) + return oldtb; + + local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL); + if (!local_tb) + return NULL; + + lt = (struct trie *)local_tb->tb_data; + + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { + struct key_vector *local_l = NULL, *local_tp; + + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + struct fib_alias *new_fa; + + if (local_tb->tb_id != fa->tb_id) continue; - if (IS_LEAF(c)) - return c; + /* clone fa for new local table */ + new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); + if (!new_fa) + goto out; + + memcpy(new_fa, fa, sizeof(*fa)); - /* Rescan start scanning in new node */ - p = c; - idx = 0; + /* insert clone into table */ + if (!local_l) + local_l = fib_find_node(lt, &local_tp, l->key); + + if (fib_insert_alias(lt, local_tp, local_l, new_fa, + NULL, l->key)) + goto out; } - /* Node empty, walk back up to parent */ - c = p; - } while ((p = node_parent_rcu(c)) != NULL); + /* stop loop if key wrapped back to 0 */ + key = l->key + 1; + if (key < l->key) + break; + } - return NULL; /* Root of trie */ + return local_tb; +out: + fib_trie_free(local_tb); + + return NULL; } -static struct tnode *trie_firstleaf(struct trie *t) +/* Caller must hold RTNL */ +void fib_table_flush_external(struct fib_table *tb) { - struct tnode *n = rcu_dereference_rtnl(t->trie); + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; + struct hlist_node *tmp; + struct fib_alias *fa; - if (!n) - return NULL; + /* walk trie in reverse order */ + for (;;) { + unsigned char slen = 0; + struct key_vector *n; - if (IS_LEAF(n)) /* trie is just a leaf */ - return n; + if (!(cindex--)) { + t_key pkey = pn->key; - return leaf_walk_rcu(n, NULL); -} + /* cannot resize the trie vector */ + if (IS_TRIE(pn)) + break; -static struct tnode *trie_nextleaf(struct tnode *l) -{ - struct tnode *p = node_parent_rcu(l); + /* resize completed node */ + pn = resize(t, pn); + cindex = get_index(pkey, pn); - if (!p) - return NULL; /* trie with just one leaf */ + continue; + } - return leaf_walk_rcu(p, l); -} + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; -static struct tnode *trie_leafindex(struct trie *t, int index) -{ - struct tnode *l = trie_firstleaf(t); + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; - while (l && index-- > 0) - l = trie_nextleaf(l); + continue; + } - return l; -} + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + /* if alias was cloned to local then we just + * need to remove the local copy from main + */ + if (tb->tb_id != fa->tb_id) { + hlist_del_rcu(&fa->fa_list); + alias_free_mem_rcu(fa); + continue; + } + /* record local slen */ + slen = fa->fa_slen; -/* - * Caller must hold RTNL. - */ + if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL)) + continue; + + netdev_switch_fib_ipv4_del(n->key, + KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, + fa->fa_type, tb->tb_id); + } + + /* update leaf slen */ + n->slen = slen; + + if (hlist_empty(&n->leaf)) { + put_child_root(pn, n->key, NULL); + node_free(n); + } else { + leaf_pull_suffix(pn, n); + } + } +} + +/* Caller must hold RTNL. */ int fib_table_flush(struct fib_table *tb) { - struct trie *t = (struct trie *) tb->tb_data; - struct tnode *l, *ll = NULL; + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; + struct hlist_node *tmp; + struct fib_alias *fa; int found = 0; - for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { - found += trie_flush_leaf(l); + /* walk trie in reverse order */ + for (;;) { + unsigned char slen = 0; + struct key_vector *n; + + if (!(cindex--)) { + t_key pkey = pn->key; - if (ll) { - if (hlist_empty(&ll->list)) - trie_leaf_remove(t, ll); - else - leaf_pull_suffix(ll); + /* cannot resize the trie vector */ + if (IS_TRIE(pn)) + break; + + /* resize completed node */ + pn = resize(t, pn); + cindex = get_index(pkey, pn); + + continue; } - ll = l; - } + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; - if (ll) { - if (hlist_empty(&ll->list)) - trie_leaf_remove(t, ll); - else - leaf_pull_suffix(ll); + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; + + continue; + } + + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) { + slen = fa->fa_slen; + continue; + } + + netdev_switch_fib_ipv4_del(n->key, + KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, + fa->fa_type, tb->tb_id); + hlist_del_rcu(&fa->fa_list); + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); + found++; + } + + /* update leaf slen */ + n->slen = slen; + + if (hlist_empty(&n->leaf)) { + put_child_root(pn, n->key, NULL); + node_free(n); + } else { + leaf_pull_suffix(pn, n); + } } pr_debug("trie_flush found=%d\n", found); return found; } -void fib_free_table(struct fib_table *tb) +static void __trie_free_rcu(struct rcu_head *head) { + struct fib_table *tb = container_of(head, struct fib_table, rcu); #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie *t = (struct trie *)tb->tb_data; - free_percpu(t->stats); + if (tb->tb_data == tb->__data) + free_percpu(t->stats); #endif /* CONFIG_IP_FIB_TRIE_STATS */ kfree(tb); } -static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, - struct fib_table *tb, - struct sk_buff *skb, struct netlink_callback *cb) +void fib_free_table(struct fib_table *tb) { - int i, s_i; + call_rcu(&tb->rcu, __trie_free_rcu); +} + +static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, + struct sk_buff *skb, struct netlink_callback *cb) +{ + __be32 xkey = htonl(l->key); struct fib_alias *fa; - __be32 xkey = htonl(key); + int i, s_i; - s_i = cb->args[5]; + s_i = cb->args[4]; i = 0; /* rcu_read_lock is hold by caller */ - - list_for_each_entry_rcu(fa, fah, fa_list) { + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { if (i < s_i) { i++; continue; } + if (tb->tb_id != fa->tb_id) { + i++; + continue; + } + if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, tb->tb_id, fa->fa_type, xkey, - plen, + KEYLENGTH - fa->fa_slen, fa->fa_tos, fa->fa_info, NLM_F_MULTI) < 0) { - cb->args[5] = i; - return -1; - } - i++; - } - cb->args[5] = i; - return skb->len; -} - -static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, - struct sk_buff *skb, struct netlink_callback *cb) -{ - struct leaf_info *li; - int i, s_i; - - s_i = cb->args[4]; - i = 0; - - /* rcu_read_lock is hold by caller */ - hlist_for_each_entry_rcu(li, &l->list, hlist) { - if (i < s_i) { - i++; - continue; - } - - if (i > s_i) - cb->args[5] = 0; - - if (list_empty(&li->falh)) - continue; - - if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) { cb->args[4] = i; return -1; } @@ -1801,44 +1918,38 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, return skb->len; } +/* rcu_read_lock needs to be hold by caller from readside */ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { - struct tnode *l; - struct trie *t = (struct trie *) tb->tb_data; - t_key key = cb->args[2]; - int count = cb->args[3]; - - rcu_read_lock(); + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *l, *tp = t->kv; /* Dump starting at last key. * Note: 0.0.0.0/0 (ie default) is first key. */ - if (count == 0) - l = trie_firstleaf(t); - else { - /* Normally, continue from last key, but if that is missing - * fallback to using slow rescan - */ - l = fib_find_node(t, key); - if (!l) - l = trie_leafindex(t, count); - } + int count = cb->args[2]; + t_key key = cb->args[3]; - while (l) { - cb->args[2] = l->key; + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { - cb->args[3] = count; - rcu_read_unlock(); + cb->args[3] = key; + cb->args[2] = count; return -1; } ++count; - l = trie_nextleaf(l); + key = l->key + 1; + memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); + + /* stop loop if key wrapped back to 0 */ + if (key < l->key) + break; } - cb->args[3] = count; - rcu_read_unlock(); + + cb->args[3] = key; + cb->args[2] = count; return skb->len; } @@ -1850,28 +1961,34 @@ void __init fib_trie_init(void) 0, SLAB_PANIC, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", - max(sizeof(struct tnode), - sizeof(struct leaf_info)), + LEAF_SIZE, 0, SLAB_PANIC, NULL); } - -struct fib_table *fib_trie_table(u32 id) +struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) { struct fib_table *tb; struct trie *t; + size_t sz = sizeof(*tb); + + if (!alias) + sz += sizeof(struct trie); - tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), - GFP_KERNEL); - if (tb == NULL) + tb = kzalloc(sz, GFP_KERNEL); + if (!tb) return NULL; tb->tb_id = id; tb->tb_default = -1; tb->tb_num_default = 0; + tb->tb_data = (alias ? alias->__data : tb->__data); + + if (alias) + return tb; t = (struct trie *) tb->tb_data; - RCU_INIT_POINTER(t->trie, NULL); + t->kv[0].pos = KEYLENGTH; + t->kv[0].slen = KEYLENGTH; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats = alloc_percpu(struct trie_use_stats); if (!t->stats) { @@ -1888,65 +2005,63 @@ struct fib_table *fib_trie_table(u32 id) struct fib_trie_iter { struct seq_net_private p; struct fib_table *tb; - struct tnode *tnode; + struct key_vector *tnode; unsigned int index; unsigned int depth; }; -static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter) +static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) { unsigned long cindex = iter->index; - struct tnode *tn = iter->tnode; - struct tnode *p; - - /* A single entry routing table */ - if (!tn) - return NULL; + struct key_vector *pn = iter->tnode; + t_key pkey; pr_debug("get_next iter={node=%p index=%d depth=%d}\n", iter->tnode, iter->index, iter->depth); -rescan: - while (cindex < tnode_child_length(tn)) { - struct tnode *n = tnode_get_child_rcu(tn, cindex); - if (n) { + while (!IS_TRIE(pn)) { + while (cindex < child_length(pn)) { + struct key_vector *n = get_child_rcu(pn, cindex++); + + if (!n) + continue; + if (IS_LEAF(n)) { - iter->tnode = tn; - iter->index = cindex + 1; + iter->tnode = pn; + iter->index = cindex; } else { /* push down one level */ iter->tnode = n; iter->index = 0; ++iter->depth; } + return n; } - ++cindex; - } - - /* Current node exhausted, pop back up */ - p = node_parent_rcu(tn); - if (p) { - cindex = get_index(tn->key, p) + 1; - tn = p; + /* Current node exhausted, pop back up */ + pkey = pn->key; + pn = node_parent_rcu(pn); + cindex = get_index(pkey, pn) + 1; --iter->depth; - goto rescan; } - /* got root? */ + /* record root node so further searches know we are done */ + iter->tnode = pn; + iter->index = 0; + return NULL; } -static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter, - struct trie *t) +static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, + struct trie *t) { - struct tnode *n; + struct key_vector *n, *pn = t->kv; if (!t) return NULL; - n = rcu_dereference(t->trie); + n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; @@ -1955,7 +2070,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter, iter->index = 0; iter->depth = 1; } else { - iter->tnode = NULL; + iter->tnode = pn; iter->index = 0; iter->depth = 0; } @@ -1965,7 +2080,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter, static void trie_collect_stats(struct trie *t, struct trie_stat *s) { - struct tnode *n; + struct key_vector *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); @@ -1973,20 +2088,20 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) rcu_read_lock(); for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { - struct leaf_info *li; + struct fib_alias *fa; s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; - hlist_for_each_entry_rcu(li, &n->list, hlist) + hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) ++s->prefixes; } else { s->tnodes++; if (n->bits < MAX_STAT_DEPTH) s->nodesizes[n->bits]++; - s->nullpointers += n->empty_children; + s->nullpointers += tn_info(n)->empty_children; } } rcu_read_unlock(); @@ -2009,13 +2124,13 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); - bytes = sizeof(struct tnode) * stat->leaves; + bytes = LEAF_SIZE * stat->leaves; seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); - bytes += sizeof(struct leaf_info) * stat->prefixes; + bytes += sizeof(struct fib_alias) * stat->prefixes; seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); - bytes += sizeof(struct tnode) * stat->tnodes; + bytes += TNODE_SIZE(0) * stat->tnodes; max = MAX_STAT_DEPTH; while (max > 0 && stat->nodesizes[max-1] == 0) @@ -2030,7 +2145,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); - bytes += sizeof(struct tnode *) * pointers; + bytes += sizeof(struct key_vector *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } @@ -2084,7 +2199,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "Basic info: size of leaf:" " %Zd bytes, size of tnode: %Zd bytes.\n", - sizeof(struct tnode), sizeof(struct tnode)); + LEAF_SIZE, TNODE_SIZE(0)); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; @@ -2123,7 +2238,7 @@ static const struct file_operations fib_triestat_fops = { .release = single_release_net, }; -static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos) +static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); @@ -2135,7 +2250,7 @@ static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos) struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { - struct tnode *n; + struct key_vector *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); @@ -2164,7 +2279,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; - struct tnode *n; + struct key_vector *n; ++*pos; /* next node in same table */ @@ -2250,9 +2365,9 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t) static int fib_trie_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; - struct tnode *n = v; + struct key_vector *n = v; - if (!node_parent_rcu(n)) + if (IS_TRIE(node_parent_rcu(n))) fib_table_print(seq, iter->tb); if (IS_TNODE(n)) { @@ -2261,30 +2376,28 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) seq_indent(seq, iter->depth-1); seq_printf(seq, " +-- %pI4/%zu %u %u %u\n", &prf, KEYLENGTH - n->pos - n->bits, n->bits, - n->full_children, n->empty_children); + tn_info(n)->full_children, + tn_info(n)->empty_children); } else { - struct leaf_info *li; __be32 val = htonl(n->key); + struct fib_alias *fa; seq_indent(seq, iter->depth); seq_printf(seq, " |-- %pI4\n", &val); - hlist_for_each_entry_rcu(li, &n->list, hlist) { - struct fib_alias *fa; - - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - char buf1[32], buf2[32]; - - seq_indent(seq, iter->depth+1); - seq_printf(seq, " /%d %s %s", li->plen, - rtn_scope(buf1, sizeof(buf1), - fa->fa_info->fib_scope), - rtn_type(buf2, sizeof(buf2), - fa->fa_type)); - if (fa->fa_tos) - seq_printf(seq, " tos=%d", fa->fa_tos); - seq_putc(seq, '\n'); - } + hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { + char buf1[32], buf2[32]; + + seq_indent(seq, iter->depth + 1); + seq_printf(seq, " /%zu %s %s", + KEYLENGTH - fa->fa_slen, + rtn_scope(buf1, sizeof(buf1), + fa->fa_info->fib_scope), + rtn_type(buf2, sizeof(buf2), + fa->fa_type)); + if (fa->fa_tos) + seq_printf(seq, " tos=%d", fa->fa_tos); + seq_putc(seq, '\n'); } } @@ -2314,31 +2427,47 @@ static const struct file_operations fib_trie_fops = { struct fib_route_iter { struct seq_net_private p; - struct trie *main_trie; + struct fib_table *main_tb; + struct key_vector *tnode; loff_t pos; t_key key; }; -static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) +static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, + loff_t pos) { - struct tnode *l = NULL; - struct trie *t = iter->main_trie; + struct fib_table *tb = iter->main_tb; + struct key_vector *l, **tp = &iter->tnode; + struct trie *t; + t_key key; - /* use cache location of last found key */ - if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key))) + /* use cache location of next-to-find key */ + if (iter->pos > 0 && pos >= iter->pos) { pos -= iter->pos; - else { + key = iter->key; + } else { + t = (struct trie *)tb->tb_data; + iter->tnode = t->kv; iter->pos = 0; - l = trie_firstleaf(t); + key = 0; } - while (l && pos-- > 0) { + while ((l = leaf_walk_rcu(tp, key)) != NULL) { + key = l->key + 1; iter->pos++; - l = trie_nextleaf(l); + + if (pos-- <= 0) + break; + + l = NULL; + + /* handle unlikely case of a key wrap */ + if (!key) + break; } if (l) - iter->key = pos; /* remember it */ + iter->key = key; /* remember it */ else iter->pos = 0; /* forget it */ @@ -2350,37 +2479,46 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) { struct fib_route_iter *iter = seq->private; struct fib_table *tb; + struct trie *t; rcu_read_lock(); + tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); if (!tb) return NULL; - iter->main_trie = (struct trie *) tb->tb_data; - if (*pos == 0) - return SEQ_START_TOKEN; - else - return fib_route_get_idx(iter, *pos - 1); + iter->main_tb = tb; + + if (*pos != 0) + return fib_route_get_idx(iter, *pos); + + t = (struct trie *)tb->tb_data; + iter->tnode = t->kv; + iter->pos = 0; + iter->key = 0; + + return SEQ_START_TOKEN; } static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; - struct tnode *l = v; + struct key_vector *l = NULL; + t_key key = iter->key; ++*pos; - if (v == SEQ_START_TOKEN) { - iter->pos = 0; - l = trie_firstleaf(iter->main_trie); - } else { + + /* only allow key of 0 for start of sequence */ + if ((v == SEQ_START_TOKEN) || key) + l = leaf_walk_rcu(&iter->tnode, key); + + if (l) { + iter->key = l->key + 1; iter->pos++; - l = trie_nextleaf(l); + } else { + iter->pos = 0; } - if (l) - iter->key = l->key; - else - iter->pos = 0; return l; } @@ -2412,8 +2550,11 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info */ static int fib_route_seq_show(struct seq_file *seq, void *v) { - struct tnode *l = v; - struct leaf_info *li; + struct fib_route_iter *iter = seq->private; + struct fib_table *tb = iter->main_tb; + struct fib_alias *fa; + struct key_vector *l = v; + __be32 prefix; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " @@ -2422,45 +2563,43 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) return 0; } - hlist_for_each_entry_rcu(li, &l->list, hlist) { - struct fib_alias *fa; - __be32 mask, prefix; + prefix = htonl(l->key); - mask = inet_make_mask(li->plen); - prefix = htonl(l->key); + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + const struct fib_info *fi = fa->fa_info; + __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); + unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - const struct fib_info *fi = fa->fa_info; - unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); + if ((fa->fa_type == RTN_BROADCAST) || + (fa->fa_type == RTN_MULTICAST)) + continue; - if (fa->fa_type == RTN_BROADCAST - || fa->fa_type == RTN_MULTICAST) - continue; + if (fa->tb_id != tb->tb_id) + continue; - seq_setwidth(seq, 127); - - if (fi) - seq_printf(seq, - "%s\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", - fi->fib_dev ? fi->fib_dev->name : "*", - prefix, - fi->fib_nh->nh_gw, flags, 0, 0, - fi->fib_priority, - mask, - (fi->fib_advmss ? - fi->fib_advmss + 40 : 0), - fi->fib_window, - fi->fib_rtt >> 3); - else - seq_printf(seq, - "*\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", - prefix, 0, flags, 0, 0, 0, - mask, 0, 0, 0); - - seq_pad(seq, '\n'); - } + seq_setwidth(seq, 127); + + if (fi) + seq_printf(seq, + "%s\t%08X\t%08X\t%04X\t%d\t%u\t" + "%d\t%08X\t%d\t%u\t%u", + fi->fib_dev ? fi->fib_dev->name : "*", + prefix, + fi->fib_nh->nh_gw, flags, 0, 0, + fi->fib_priority, + mask, + (fi->fib_advmss ? + fi->fib_advmss + 40 : 0), + fi->fib_window, + fi->fib_rtt >> 3); + else + seq_printf(seq, + "*\t%08X\t%08X\t%04X\t%d\t%u\t" + "%d\t%08X\t%d\t%u\t%u", + prefix, 0, flags, 0, 0, 0, + mask, 0, 0, 0); + + seq_pad(seq, '\n'); } return 0; |