From a02cec2155fbea457eca8881870fd2de1a4c4c76 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Sep 2010 20:43:57 +0000 Subject: net: return operator cleanup Change "return (EXPR);" to "return EXPR;" return is not a function, parentheses are not required. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core/neighbour.c') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a4e0a7482c2b..96b1a749abb4 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -122,7 +122,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) unsigned long neigh_rand_reach_time(unsigned long base) { - return (base ? (net_random() % base) + (base >> 1) : 0); + return base ? (net_random() % base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); @@ -766,9 +766,9 @@ next_elt: static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; - return (n->nud_state & NUD_PROBE ? + return (n->nud_state & NUD_PROBE) ? p->ucast_probes : - p->ucast_probes + p->app_probes + p->mcast_probes); + p->ucast_probes + p->app_probes + p->mcast_probes; } static void neigh_invalidate(struct neighbour *neigh) -- cgit v1.2.3 From c7d4426a98a5f6654cd0b4b33d9dab2e77192c18 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 Oct 2010 22:17:54 -0700 Subject: net: introduce DST_NOCACHE flag While doing stress tests with IP route cache disabled, and multi queue devices, I noticed a very high contention on one rwlock used in neighbour code. When many cpus are trying to send frames (possibly using a high performance multiqueue device) to the same neighbour, they fight for the neigh->lock rwlock in order to call neigh_hh_init(), and fight on hh->hh_refcnt (a pair of atomic_inc/atomic_dec_and_test()) But we dont need to call neigh_hh_init() for dst that are used only once. It costs four atomic operations at least, on two contended cache lines, plus the high contention on neigh->lock rwlock. Introduce a new dst flag, DST_NOCACHE, that is set when dst was not inserted in route cache. With the stress test bench, sending 160000000 frames on one neighbour, results are : Before patch: real 2m28.406s user 0m11.781s sys 36m17.964s After patch: real 1m26.532s user 0m12.185s sys 20m3.903s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 9 +++++---- net/core/neighbour.c | 4 +++- net/ipv4/route.c | 1 + 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net/core/neighbour.c') diff --git a/include/net/dst.h b/include/net/dst.h index aa53fbc34b2b..a217c838ec0d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -43,10 +43,11 @@ struct dst_entry { short error; short obsolete; int flags; -#define DST_HOST 1 -#define DST_NOXFRM 2 -#define DST_NOPOLICY 4 -#define DST_NOHASH 8 +#define DST_HOST 0x0001 +#define DST_NOXFRM 0x0002 +#define DST_NOPOLICY 0x0004 +#define DST_NOHASH 0x0008 +#define DST_NOCACHE 0x0010 unsigned long expires; unsigned short header_len; /* more space at head required */ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 96b1a749abb4..b142a0d76072 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1210,7 +1210,9 @@ int neigh_resolve_output(struct sk_buff *skb) if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; - if (dev->header_ops->cache && !dst->hh) { + if (dev->header_ops->cache && + !dst->hh && + !(dst->flags & DST_NOCACHE)) { write_lock_bh(&neigh->lock); if (!dst->hh) neigh_hh_init(neigh, dst, dst->ops->protocol); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a61acea975f1..c3cb8bd23638 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1107,6 +1107,7 @@ restart: * on the route gc list. */ + rt->dst.flags |= DST_NOCACHE; if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->dst); if (err) { -- cgit v1.2.3 From 110b2499370c401cdcc7c63e481084467291d556 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Oct 2010 04:27:36 +0000 Subject: net neigh: neigh_delete() and neigh_add() changes neigh_delete() and neigh_add() dont need to touch device refcount, we hold RTNL when calling them, so device cannot disappear under us. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'net/core/neighbour.c') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b142a0d76072..d6996e072a41 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1531,6 +1531,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct net_device *dev = NULL; int err = -EINVAL; + ASSERT_RTNL(); if (nlmsg_len(nlh) < sizeof(*ndm)) goto out; @@ -1540,7 +1541,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { - dev = dev_get_by_index(net, ndm->ndm_ifindex); + dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; @@ -1556,34 +1557,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) read_unlock(&neigh_tbl_lock); if (nla_len(dst_attr) < tbl->key_len) - goto out_dev_put; + goto out; if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); - goto out_dev_put; + goto out; } if (dev == NULL) - goto out_dev_put; + goto out; neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); if (neigh == NULL) { err = -ENOENT; - goto out_dev_put; + goto out; } err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN); neigh_release(neigh); - goto out_dev_put; + goto out; } read_unlock(&neigh_tbl_lock); err = -EAFNOSUPPORT; -out_dev_put: - if (dev) - dev_put(dev); out: return err; } @@ -1597,6 +1595,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct net_device *dev = NULL; int err; + ASSERT_RTNL(); err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); if (err < 0) goto out; @@ -1607,14 +1606,14 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { - dev = dev_get_by_index(net, ndm->ndm_ifindex); + dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) - goto out_dev_put; + goto out; } read_lock(&neigh_tbl_lock); @@ -1628,7 +1627,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) read_unlock(&neigh_tbl_lock); if (nla_len(tb[NDA_DST]) < tbl->key_len) - goto out_dev_put; + goto out; dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; @@ -1641,29 +1640,29 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) pn->flags = ndm->ndm_flags; err = 0; } - goto out_dev_put; + goto out; } if (dev == NULL) - goto out_dev_put; + goto out; neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; - goto out_dev_put; + goto out; } neigh = __neigh_lookup_errno(tbl, dst, dev); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); - goto out_dev_put; + goto out; } } else { if (nlh->nlmsg_flags & NLM_F_EXCL) { err = -EEXIST; neigh_release(neigh); - goto out_dev_put; + goto out; } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) @@ -1676,15 +1675,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } else err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); neigh_release(neigh); - goto out_dev_put; + goto out; } read_unlock(&neigh_tbl_lock); err = -EAFNOSUPPORT; - -out_dev_put: - if (dev) - dev_put(dev); out: return err; } -- cgit v1.2.3 From d6bf781712a1d25cc8987036b3a48535b331eb91 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Oct 2010 06:15:44 +0000 Subject: net neigh: RCU conversion of neigh hash table David This is the first step for RCU conversion of neigh code. Next patches will convert hash_buckets[] and "struct neighbour" to RCU protected objects. Thanks [PATCH net-next] net neigh: RCU conversion of neigh hash table Instead of storing hash_buckets, hash_mask and hash_rnd in "struct neigh_table", a new structure is defined : struct neigh_hash_table { struct neighbour **hash_buckets; unsigned int hash_mask; __u32 hash_rnd; struct rcu_head rcu; }; And "struct neigh_table" has an RCU protected pointer to such a neigh_hash_table. This means the signature of (*hash)() function changed: We need to add a third parameter with the actual hash_rnd value, since this is not anymore a neigh_table field. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/neighbour.h | 16 +++- net/atm/clip.c | 4 +- net/core/neighbour.c | 219 ++++++++++++++++++++++++++++++------------------ net/decnet/dn_neigh.c | 13 +-- net/ipv4/arp.c | 8 +- net/ipv6/ndisc.c | 10 ++- 6 files changed, 170 insertions(+), 100 deletions(-) (limited to 'net/core/neighbour.c') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 7d08fd1062f0..37845dae6488 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -138,13 +138,22 @@ struct pneigh_entry { * neighbour table manipulation */ +struct neigh_hash_table { + struct neighbour **hash_buckets; + unsigned int hash_mask; + __u32 hash_rnd; + struct rcu_head rcu; +}; + struct neigh_table { struct neigh_table *next; int family; int entry_size; int key_len; - __u32 (*hash)(const void *pkey, const struct net_device *); + __u32 (*hash)(const void *pkey, + const struct net_device *dev, + __u32 hash_rnd); int (*constructor)(struct neighbour *); int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); @@ -165,9 +174,7 @@ struct neigh_table { unsigned long last_rand; struct kmem_cache *kmem_cachep; struct neigh_statistics __percpu *stats; - struct neighbour **hash_buckets; - unsigned int hash_mask; - __u32 hash_rnd; + struct neigh_hash_table __rcu *nht; struct pneigh_entry **phash_buckets; }; @@ -237,6 +244,7 @@ extern void pneigh_for_each(struct neigh_table *tbl, void (*cb)(struct pneigh_en struct neigh_seq_state { struct seq_net_private p; struct neigh_table *tbl; + struct neigh_hash_table *nht; void *(*neigh_sub_iter)(struct neigh_seq_state *state, struct neighbour *n, loff_t *pos); unsigned int bucket; diff --git a/net/atm/clip.c b/net/atm/clip.c index 95fdd1185067..ff956d1115bc 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -310,9 +310,9 @@ static int clip_constructor(struct neighbour *neigh) return 0; } -static u32 clip_hash(const void *pkey, const struct net_device *dev) +static u32 clip_hash(const void *pkey, const struct net_device *dev, __u32 rnd) { - return jhash_2words(*(u32 *) pkey, dev->ifindex, clip_tbl.hash_rnd); + return jhash_2words(*(u32 *) pkey, dev->ifindex, rnd); } static struct neigh_table clip_tbl = { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d6996e072a41..dd8920e4f508 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -131,14 +131,17 @@ static int neigh_forced_gc(struct neigh_table *tbl) { int shrunk = 0; int i; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); - for (i = 0; i <= tbl->hash_mask; i++) { + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (i = 0; i <= nht->hash_mask; i++) { struct neighbour *n, **np; - np = &tbl->hash_buckets[i]; + np = &nht->hash_buckets[i]; while ((n = *np) != NULL) { /* Neighbour record may be discarded if: * - nobody refers to it. @@ -199,9 +202,13 @@ static void pneigh_queue_purge(struct sk_buff_head *list) static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) { int i; + struct neigh_hash_table *nht; - for (i = 0; i <= tbl->hash_mask; i++) { - struct neighbour *n, **np = &tbl->hash_buckets[i]; + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + for (i = 0; i <= nht->hash_mask; i++) { + struct neighbour *n, **np = &nht->hash_buckets[i]; while ((n = *np) != NULL) { if (dev && n->dev != dev) { @@ -297,64 +304,81 @@ out_entries: goto out; } -static struct neighbour **neigh_hash_alloc(unsigned int entries) +static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) { - unsigned long size = entries * sizeof(struct neighbour *); - struct neighbour **ret; + size_t size = entries * sizeof(struct neighbour *); + struct neigh_hash_table *ret; + struct neighbour **buckets; - if (size <= PAGE_SIZE) { - ret = kzalloc(size, GFP_ATOMIC); - } else { - ret = (struct neighbour **) - __get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size)); + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (!ret) + return NULL; + if (size <= PAGE_SIZE) + buckets = kzalloc(size, GFP_ATOMIC); + else + buckets = (struct neighbour **) + __get_free_pages(GFP_ATOMIC | __GFP_ZERO, + get_order(size)); + if (!buckets) { + kfree(ret); + return NULL; } + ret->hash_buckets = buckets; + ret->hash_mask = entries - 1; + get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); return ret; } -static void neigh_hash_free(struct neighbour **hash, unsigned int entries) +static void neigh_hash_free_rcu(struct rcu_head *head) { - unsigned long size = entries * sizeof(struct neighbour *); + struct neigh_hash_table *nht = container_of(head, + struct neigh_hash_table, + rcu); + size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *); + struct neighbour **buckets = nht->hash_buckets; if (size <= PAGE_SIZE) - kfree(hash); + kfree(buckets); else - free_pages((unsigned long)hash, get_order(size)); + free_pages((unsigned long)buckets, get_order(size)); + kfree(nht); } -static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries) +static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, + unsigned long new_entries) { - struct neighbour **new_hash, **old_hash; - unsigned int i, new_hash_mask, old_entries; + unsigned int i, hash; + struct neigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); BUG_ON(!is_power_of_2(new_entries)); - new_hash = neigh_hash_alloc(new_entries); - if (!new_hash) - return; + old_nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + new_nht = neigh_hash_alloc(new_entries); + if (!new_nht) + return old_nht; - old_entries = tbl->hash_mask + 1; - new_hash_mask = new_entries - 1; - old_hash = tbl->hash_buckets; - - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); - for (i = 0; i < old_entries; i++) { + for (i = 0; i <= old_nht->hash_mask; i++) { struct neighbour *n, *next; - for (n = old_hash[i]; n; n = next) { - unsigned int hash_val = tbl->hash(n->primary_key, n->dev); + for (n = old_nht->hash_buckets[i]; + n != NULL; + n = next) { + hash = tbl->hash(n->primary_key, n->dev, + new_nht->hash_rnd); - hash_val &= new_hash_mask; + hash &= new_nht->hash_mask; next = n->next; - n->next = new_hash[hash_val]; - new_hash[hash_val] = n; + n->next = new_nht->hash_buckets[hash]; + new_nht->hash_buckets[hash] = n; } } - tbl->hash_buckets = new_hash; - tbl->hash_mask = new_hash_mask; - neigh_hash_free(old_hash, old_entries); + rcu_assign_pointer(tbl->nht, new_nht); + call_rcu(&old_nht->rcu, neigh_hash_free_rcu); + return new_nht; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, @@ -363,19 +387,23 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct neighbour *n; int key_len = tbl->key_len; u32 hash_val; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); - read_lock_bh(&tbl->lock); - hash_val = tbl->hash(pkey, dev); - for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; + read_lock(&tbl->lock); + for (n = nht->hash_buckets[hash_val]; n; n = n->next) { if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { neigh_hold(n); NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); return n; } EXPORT_SYMBOL(neigh_lookup); @@ -386,12 +414,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, struct neighbour *n; int key_len = tbl->key_len; u32 hash_val; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); - read_lock_bh(&tbl->lock); - hash_val = tbl->hash(pkey, NULL); - for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask; + read_lock(&tbl->lock); + for (n = nht->hash_buckets[hash_val]; n; n = n->next) { if (!memcmp(n->primary_key, pkey, key_len) && net_eq(dev_net(n->dev), net)) { neigh_hold(n); @@ -399,7 +430,8 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, break; } } - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); return n; } EXPORT_SYMBOL(neigh_lookup_nodev); @@ -411,6 +443,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, int key_len = tbl->key_len; int error; struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + struct neigh_hash_table *nht; if (!n) { rc = ERR_PTR(-ENOBUFS); @@ -437,18 +470,20 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, n->confirmed = jiffies - (n->parms->base_reachable_time << 1); write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); - if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1)) - neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1); + if (atomic_read(&tbl->entries) > (nht->hash_mask + 1)) + nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1); - hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } - for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { + for (n1 = nht->hash_buckets[hash_val]; n1; n1 = n1->next) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { neigh_hold(n1); rc = n1; @@ -456,8 +491,8 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, } } - n->next = tbl->hash_buckets[hash_val]; - tbl->hash_buckets[hash_val] = n; + n->next = nht->hash_buckets[hash_val]; + nht->hash_buckets[hash_val] = n; n->dead = 0; neigh_hold(n); write_unlock_bh(&tbl->lock); @@ -698,10 +733,13 @@ static void neigh_periodic_work(struct work_struct *work) struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); struct neighbour *n, **np; unsigned int i; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); /* * periodically recompute ReachableTime from random function @@ -715,8 +753,8 @@ static void neigh_periodic_work(struct work_struct *work) neigh_rand_reach_time(p->base_reachable_time); } - for (i = 0 ; i <= tbl->hash_mask; i++) { - np = &tbl->hash_buckets[i]; + for (i = 0 ; i <= nht->hash_mask; i++) { + np = &nht->hash_buckets[i]; while ((n = *np) != NULL) { unsigned int state; @@ -1438,17 +1476,14 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl) panic("cannot create neighbour proc dir entry"); #endif - tbl->hash_mask = 1; - tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1); + tbl->nht = neigh_hash_alloc(8); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); - if (!tbl->hash_buckets || !tbl->phash_buckets) + if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); - rwlock_init(&tbl->lock); INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work); schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); @@ -1504,8 +1539,8 @@ int neigh_table_clear(struct neigh_table *tbl) } write_unlock(&neigh_tbl_lock); - neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); - tbl->hash_buckets = NULL; + call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu); + tbl->nht = NULL; kfree(tbl->phash_buckets); tbl->phash_buckets = NULL; @@ -1745,18 +1780,22 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, unsigned long now = jiffies; unsigned int flush_delta = now - tbl->last_flush; unsigned int rand_delta = now - tbl->last_rand; - + struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, .ndtc_entry_size = tbl->entry_size, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), - .ndtc_hash_rnd = tbl->hash_rnd, - .ndtc_hash_mask = tbl->hash_mask, .ndtc_proxy_qlen = tbl->proxy_queue.qlen, }; + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + ndc.ndtc_hash_rnd = nht->hash_rnd; + ndc.ndtc_hash_mask = nht->hash_mask; + rcu_read_unlock_bh(); + NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc); } @@ -2088,14 +2127,18 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; + struct neigh_hash_table *nht; - read_lock_bh(&tbl->lock); - for (h = 0; h <= tbl->hash_mask; h++) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + read_lock(&tbl->lock); + for (h = 0; h <= nht->hash_mask; h++) { if (h < s_h) continue; if (h > s_h) s_idx = 0; - for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) { + for (n = nht->hash_buckets[h], idx = 0; n; n = n->next) { if (!net_eq(dev_net(n->dev), net)) continue; if (idx < s_idx) @@ -2104,7 +2147,6 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI) <= 0) { - read_unlock_bh(&tbl->lock); rc = -1; goto out; } @@ -2112,9 +2154,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, idx++; } } - read_unlock_bh(&tbl->lock); rc = skb->len; out: + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); cb->args[1] = h; cb->args[2] = idx; return rc; @@ -2147,15 +2190,20 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; + struct neigh_hash_table *nht; - read_lock_bh(&tbl->lock); - for (chain = 0; chain <= tbl->hash_mask; chain++) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + read_lock(&tbl->lock); + for (chain = 0; chain <= nht->hash_mask; chain++) { struct neighbour *n; - for (n = tbl->hash_buckets[chain]; n; n = n->next) + for (n = nht->hash_buckets[chain]; n; n = n->next) cb(n, cookie); } - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_for_each); @@ -2164,11 +2212,14 @@ void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { int chain; + struct neigh_hash_table *nht; - for (chain = 0; chain <= tbl->hash_mask; chain++) { + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (chain = 0; chain <= nht->hash_mask; chain++) { struct neighbour *n, **np; - np = &tbl->hash_buckets[chain]; + np = &nht->hash_buckets[chain]; while ((n = *np) != NULL) { int release; @@ -2193,13 +2244,13 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); - struct neigh_table *tbl = state->tbl; + struct neigh_hash_table *nht = state->nht; struct neighbour *n = NULL; int bucket = state->bucket; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; - for (bucket = 0; bucket <= tbl->hash_mask; bucket++) { - n = tbl->hash_buckets[bucket]; + for (bucket = 0; bucket <= nht->hash_mask; bucket++) { + n = nht->hash_buckets[bucket]; while (n) { if (!net_eq(dev_net(n->dev), net)) @@ -2234,7 +2285,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); - struct neigh_table *tbl = state->tbl; + struct neigh_hash_table *nht = state->nht; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); @@ -2265,10 +2316,10 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (n) break; - if (++state->bucket > tbl->hash_mask) + if (++state->bucket > nht->hash_mask) break; - n = tbl->hash_buckets[state->bucket]; + n = nht->hash_buckets[state->bucket]; } if (n && pos) @@ -2367,6 +2418,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) __acquires(tbl->lock) + __acquires(rcu_bh) { struct neigh_seq_state *state = seq->private; @@ -2374,8 +2426,9 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl state->bucket = 0; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); - read_lock_bh(&tbl->lock); - + rcu_read_lock_bh(); + state->nht = rcu_dereference_bh(tbl->nht); + read_lock(&tbl->lock); return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); @@ -2409,11 +2462,13 @@ EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) __releases(tbl->lock) + __releases(rcu_bh) { struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_seq_stop); diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 0363bb95cc7d..a085dbcf5c7f 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -48,7 +48,6 @@ #include #include -static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev); static int dn_neigh_construct(struct neighbour *); static void dn_long_error_report(struct neighbour *, struct sk_buff *); static void dn_short_error_report(struct neighbour *, struct sk_buff *); @@ -93,6 +92,13 @@ static const struct neigh_ops dn_phase3_ops = { .queue_xmit = dev_queue_xmit }; +static u32 dn_neigh_hash(const void *pkey, + const struct net_device *dev, + __u32 hash_rnd) +{ + return jhash_2words(*(__u16 *)pkey, 0, hash_rnd); +} + struct neigh_table dn_neigh_table = { .family = PF_DECnet, .entry_size = sizeof(struct dn_neigh), @@ -122,11 +128,6 @@ struct neigh_table dn_neigh_table = { .gc_thresh3 = 1024, }; -static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev) -{ - return jhash_2words(*(__u16 *)pkey, 0, dn_neigh_table.hash_rnd); -} - static int dn_neigh_construct(struct neighbour *neigh) { struct net_device *dev = neigh->dev; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index d9031ad67826..f35309578170 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -127,7 +127,7 @@ EXPORT_SYMBOL(clip_tbl_hook); /* * Interface to generic neighbour cache. */ -static u32 arp_hash(const void *pkey, const struct net_device *dev); +static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -225,9 +225,11 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) } -static u32 arp_hash(const void *pkey, const struct net_device *dev) +static u32 arp_hash(const void *pkey, + const struct net_device *dev, + __u32 hash_rnd) { - return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd); + return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd); } static int arp_constructor(struct neighbour *neigh) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b3dd844cd34f..998d6d27e7cf 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -91,7 +91,9 @@ #include #include -static u32 ndisc_hash(const void *pkey, const struct net_device *dev); +static u32 ndisc_hash(const void *pkey, + const struct net_device *dev, + __u32 rnd); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -350,7 +352,9 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d EXPORT_SYMBOL(ndisc_mc_map); -static u32 ndisc_hash(const void *pkey, const struct net_device *dev) +static u32 ndisc_hash(const void *pkey, + const struct net_device *dev, + __u32 hash_rnd) { const u32 *p32 = pkey; u32 addr_hash, i; @@ -359,7 +363,7 @@ static u32 ndisc_hash(const void *pkey, const struct net_device *dev) for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++) addr_hash ^= *p32++; - return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd); + return jhash_2words(addr_hash, dev->ifindex, hash_rnd); } static int ndisc_constructor(struct neighbour *neigh) -- cgit v1.2.3 From 767e97e1e0db0d0f3152cd2f3bd3403596aedbad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Oct 2010 17:49:21 -0700 Subject: neigh: RCU conversion of struct neighbour This is the second step for neighbour RCU conversion. (first was commit d6bf7817 : RCU conversion of neigh hash table) neigh_lookup() becomes lockless, but still take a reference on found neighbour. (no more read_lock()/read_unlock() on tbl->lock) struct neighbour gets an additional rcu_head field and is freed after an RCU grace period. Future work would need to eventually not take a reference on neighbour for temporary dst (DST_NOCACHE), but this would need dst->_neighbour to use a noref bit like we did for skb->_dst. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/neighbour.h | 5 +- net/core/neighbour.c | 137 ++++++++++++++++++++++++++++++------------------ 2 files changed, 88 insertions(+), 54 deletions(-) (limited to 'net/core/neighbour.c') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 37845dae6488..a4538d553704 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -91,7 +91,7 @@ struct neigh_statistics { #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { - struct neighbour *next; + struct neighbour __rcu *next; struct neigh_table *tbl; struct neigh_parms *parms; struct net_device *dev; @@ -111,6 +111,7 @@ struct neighbour { struct sk_buff_head arp_queue; struct timer_list timer; const struct neigh_ops *ops; + struct rcu_head rcu; u8 primary_key[0]; }; @@ -139,7 +140,7 @@ struct pneigh_entry { */ struct neigh_hash_table { - struct neighbour **hash_buckets; + struct neighbour __rcu **hash_buckets; unsigned int hash_mask; __u32 hash_rnd; struct rcu_head rcu; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index dd8920e4f508..3ffafaa0414c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -139,10 +139,12 @@ static int neigh_forced_gc(struct neigh_table *tbl) nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (i = 0; i <= nht->hash_mask; i++) { - struct neighbour *n, **np; + struct neighbour *n; + struct neighbour __rcu **np; np = &nht->hash_buckets[i]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { /* Neighbour record may be discarded if: * - nobody refers to it. * - it is not permanent @@ -150,7 +152,9 @@ static int neigh_forced_gc(struct neigh_table *tbl) write_lock(&n->lock); if (atomic_read(&n->refcnt) == 1 && !(n->nud_state & NUD_PERMANENT)) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); n->dead = 1; shrunk = 1; write_unlock(&n->lock); @@ -208,14 +212,18 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) lockdep_is_held(&tbl->lock)); for (i = 0; i <= nht->hash_mask; i++) { - struct neighbour *n, **np = &nht->hash_buckets[i]; + struct neighbour *n; + struct neighbour __rcu **np = &nht->hash_buckets[i]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { if (dev && n->dev != dev) { np = &n->next; continue; } - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); n->dead = 1; @@ -323,7 +331,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) kfree(ret); return NULL; } - ret->hash_buckets = buckets; + rcu_assign_pointer(ret->hash_buckets, buckets); ret->hash_mask = entries - 1; get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); return ret; @@ -362,17 +370,22 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, for (i = 0; i <= old_nht->hash_mask; i++) { struct neighbour *n, *next; - for (n = old_nht->hash_buckets[i]; + for (n = rcu_dereference_protected(old_nht->hash_buckets[i], + lockdep_is_held(&tbl->lock)); n != NULL; n = next) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash &= new_nht->hash_mask; - next = n->next; - - n->next = new_nht->hash_buckets[hash]; - new_nht->hash_buckets[hash] = n; + next = rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock)); + + rcu_assign_pointer(n->next, + rcu_dereference_protected( + new_nht->hash_buckets[hash], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(new_nht->hash_buckets[hash], n); } } @@ -394,15 +407,18 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; - read_lock(&tbl->lock); - for (n = nht->hash_buckets[hash_val]; n; n = n->next) { + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { - neigh_hold(n); + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock(&tbl->lock); + rcu_read_unlock_bh(); return n; } @@ -421,16 +437,19 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask; - read_lock(&tbl->lock); - for (n = nht->hash_buckets[hash_val]; n; n = n->next) { + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { if (!memcmp(n->primary_key, pkey, key_len) && net_eq(dev_net(n->dev), net)) { - neigh_hold(n); + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock(&tbl->lock); + rcu_read_unlock_bh(); return n; } @@ -483,7 +502,11 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, goto out_tbl_unlock; } - for (n1 = nht->hash_buckets[hash_val]; n1; n1 = n1->next) { + for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock)); + n1 != NULL; + n1 = rcu_dereference_protected(n1->next, + lockdep_is_held(&tbl->lock))) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { neigh_hold(n1); rc = n1; @@ -491,10 +514,12 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, } } - n->next = nht->hash_buckets[hash_val]; - nht->hash_buckets[hash_val] = n; n->dead = 0; neigh_hold(n); + rcu_assign_pointer(n->next, + rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(nht->hash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); NEIGH_PRINTK2("neigh %p is created.\n", n); rc = n; @@ -651,6 +676,12 @@ static inline void neigh_parms_put(struct neigh_parms *parms) neigh_parms_destroy(parms); } +static void neigh_destroy_rcu(struct rcu_head *head) +{ + struct neighbour *neigh = container_of(head, struct neighbour, rcu); + + kmem_cache_free(neigh->tbl->kmem_cachep, neigh); +} /* * neighbour must already be out of the table; * @@ -690,7 +721,7 @@ void neigh_destroy(struct neighbour *neigh) NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); atomic_dec(&neigh->tbl->entries); - kmem_cache_free(neigh->tbl->kmem_cachep, neigh); + call_rcu(&neigh->rcu, neigh_destroy_rcu); } EXPORT_SYMBOL(neigh_destroy); @@ -731,7 +762,8 @@ static void neigh_connect(struct neighbour *neigh) static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); - struct neighbour *n, **np; + struct neighbour *n; + struct neighbour __rcu **np; unsigned int i; struct neigh_hash_table *nht; @@ -756,7 +788,8 @@ static void neigh_periodic_work(struct work_struct *work) for (i = 0 ; i <= nht->hash_mask; i++) { np = &nht->hash_buckets[i]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { unsigned int state; write_lock(&n->lock); @@ -1213,8 +1246,8 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, } /* This function can be used in contexts, where only old dev_queue_xmit - worked, f.e. if you want to override normal output path (eql, shaper), - but resolution is not made yet. + * worked, f.e. if you want to override normal output path (eql, shaper), + * but resolution is not made yet. */ int neigh_compat_output(struct sk_buff *skb) @@ -2123,7 +2156,7 @@ static void neigh_update_notify(struct neighbour *neigh) static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) { - struct net * net = sock_net(skb->sk); + struct net *net = sock_net(skb->sk); struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; @@ -2132,13 +2165,14 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); - read_lock(&tbl->lock); for (h = 0; h <= nht->hash_mask; h++) { if (h < s_h) continue; if (h > s_h) s_idx = 0; - for (n = nht->hash_buckets[h], idx = 0; n; n = n->next) { + for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; + n != NULL; + n = rcu_dereference_bh(n->next)) { if (!net_eq(dev_net(n->dev), net)) continue; if (idx < s_idx) @@ -2150,13 +2184,12 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, rc = -1; goto out; } - next: +next: idx++; } } rc = skb->len; out: - read_unlock(&tbl->lock); rcu_read_unlock_bh(); cb->args[1] = h; cb->args[2] = idx; @@ -2195,11 +2228,13 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); - read_lock(&tbl->lock); + read_lock(&tbl->lock); /* avoid resizes */ for (chain = 0; chain <= nht->hash_mask; chain++) { struct neighbour *n; - for (n = nht->hash_buckets[chain]; n; n = n->next) + for (n = rcu_dereference_bh(nht->hash_buckets[chain]); + n != NULL; + n = rcu_dereference_bh(n->next)) cb(n, cookie); } read_unlock(&tbl->lock); @@ -2217,16 +2252,20 @@ void __neigh_for_each_release(struct neigh_table *tbl, nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain <= nht->hash_mask; chain++) { - struct neighbour *n, **np; + struct neighbour *n; + struct neighbour __rcu **np; np = &nht->hash_buckets[chain]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { int release; write_lock(&n->lock); release = cb(n); if (release) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); n->dead = 1; } else np = &n->next; @@ -2250,7 +2289,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) state->flags &= ~NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= nht->hash_mask; bucket++) { - n = nht->hash_buckets[bucket]; + n = rcu_dereference_bh(nht->hash_buckets[bucket]); while (n) { if (!net_eq(dev_net(n->dev), net)) @@ -2267,8 +2306,8 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) break; if (n->nud_state & ~NUD_NOARP) break; - next: - n = n->next; +next: + n = rcu_dereference_bh(n->next); } if (n) @@ -2292,7 +2331,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (v) return n; } - n = n->next; + n = rcu_dereference_bh(n->next); while (1) { while (n) { @@ -2309,8 +2348,8 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (n->nud_state & ~NUD_NOARP) break; - next: - n = n->next; +next: + n = rcu_dereference_bh(n->next); } if (n) @@ -2319,7 +2358,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (++state->bucket > nht->hash_mask) break; - n = nht->hash_buckets[state->bucket]; + n = rcu_dereference_bh(nht->hash_buckets[state->bucket]); } if (n && pos) @@ -2417,7 +2456,6 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) - __acquires(tbl->lock) __acquires(rcu_bh) { struct neigh_seq_state *state = seq->private; @@ -2428,7 +2466,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl rcu_read_lock_bh(); state->nht = rcu_dereference_bh(tbl->nht); - read_lock(&tbl->lock); + return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); @@ -2461,13 +2499,8 @@ out: EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) - __releases(tbl->lock) __releases(rcu_bh) { - struct neigh_seq_state *state = seq->private; - struct neigh_table *tbl = state->tbl; - - read_unlock(&tbl->lock); rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_seq_stop); -- cgit v1.2.3 From 34d101dd6204bd100fc2e6f7b5f9a10f959ce2c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Oct 2010 09:16:57 -0700 Subject: neigh: speedup neigh_hh_init() When a new dst is used to send a frame, neigh_resolve_output() tries to associate an struct hh_cache to this dst, calling neigh_hh_init() with the neigh rwlock write locked. Most of the time, hh_cache is already known and linked into neighbour, so we find it and increment its refcount. This patch changes the logic so that we call neigh_hh_init() with neighbour lock read locked only, so that fast path can be run in parallel by concurrent cpus. This brings part of the speedup we got with commit c7d4426a98a5f (introduce DST_NOCACHE flag) for non cached dsts, even for cached ones, removing one of the contention point that routers hit on multiqueue enabled machines. Further improvements would need to use a seqlock instead of an rwlock to protect neigh->ha[], to not dirty neigh too often and remove two atomic ops. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++ net/core/dst.c | 4 +- net/core/neighbour.c | 99 +++++++++++++++++++++++++++++------------------ 3 files changed, 69 insertions(+), 40 deletions(-) (limited to 'net/core/neighbour.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6abcef67b178..4160db3721ba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -281,6 +281,12 @@ struct hh_cache { unsigned long hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)]; }; +static inline void hh_cache_put(struct hh_cache *hh) +{ + if (atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); +} + /* Reserve HH_DATA_MOD byte aligned hard_header_len, but at least that much. * Alternative is: * dev->hard_header_len ? (dev->hard_header_len + diff --git a/net/core/dst.c b/net/core/dst.c index 6c41b1fac3db..978a1ee1f7d0 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -228,8 +228,8 @@ again: child = dst->child; dst->hh = NULL; - if (hh && atomic_dec_and_test(&hh->hh_refcnt)) - kfree(hh); + if (hh) + hh_cache_put(hh); if (neigh) { dst->neighbour = NULL; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3ffafaa0414c..2044906ecd1a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -709,8 +709,7 @@ void neigh_destroy(struct neighbour *neigh) write_seqlock_bh(&hh->hh_lock); hh->hh_output = neigh_blackhole; write_sequnlock_bh(&hh->hh_lock); - if (atomic_dec_and_test(&hh->hh_refcnt)) - kfree(hh); + hh_cache_put(hh); } skb_queue_purge(&neigh->arp_queue); @@ -1210,39 +1209,67 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl, } EXPORT_SYMBOL(neigh_event_ns); +static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst, + __be16 protocol) +{ + struct hh_cache *hh; + + for (hh = n->hh; hh; hh = hh->hh_next) { + if (hh->hh_type == protocol) { + atomic_inc(&hh->hh_refcnt); + if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) + hh_cache_put(hh); + return true; + } + } + return false; +} + +/* called with read_lock_bh(&n->lock); */ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, __be16 protocol) { struct hh_cache *hh; struct net_device *dev = dst->dev; - for (hh = n->hh; hh; hh = hh->hh_next) - if (hh->hh_type == protocol) - break; + if (likely(neigh_hh_lookup(n, dst, protocol))) + return; - if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { - seqlock_init(&hh->hh_lock); - hh->hh_type = protocol; - atomic_set(&hh->hh_refcnt, 0); - hh->hh_next = NULL; + /* slow path */ + hh = kzalloc(sizeof(*hh), GFP_ATOMIC); + if (!hh) + return; - if (dev->header_ops->cache(n, hh)) { - kfree(hh); - hh = NULL; - } else { - atomic_inc(&hh->hh_refcnt); - hh->hh_next = n->hh; - n->hh = hh; - if (n->nud_state & NUD_CONNECTED) - hh->hh_output = n->ops->hh_output; - else - hh->hh_output = n->ops->output; - } + seqlock_init(&hh->hh_lock); + hh->hh_type = protocol; + atomic_set(&hh->hh_refcnt, 2); + + if (dev->header_ops->cache(n, hh)) { + kfree(hh); + return; } - if (hh) { - atomic_inc(&hh->hh_refcnt); - dst->hh = hh; + read_unlock(&n->lock); + write_lock(&n->lock); + + /* must check if another thread already did the insert */ + if (neigh_hh_lookup(n, dst, protocol)) { + kfree(hh); + goto end; } + + if (n->nud_state & NUD_CONNECTED) + hh->hh_output = n->ops->hh_output; + else + hh->hh_output = n->ops->output; + + hh->hh_next = n->hh; + n->hh = hh; + + if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) + hh_cache_put(hh); +end: + write_unlock(&n->lock); + read_lock(&n->lock); } /* This function can be used in contexts, where only old dev_queue_xmit @@ -1281,21 +1308,17 @@ int neigh_resolve_output(struct sk_buff *skb) if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; + + read_lock_bh(&neigh->lock); if (dev->header_ops->cache && !dst->hh && - !(dst->flags & DST_NOCACHE)) { - write_lock_bh(&neigh->lock); - if (!dst->hh) - neigh_hh_init(neigh, dst, dst->ops->protocol); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - write_unlock_bh(&neigh->lock); - } else { - read_lock_bh(&neigh->lock); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); - } + !(dst->flags & DST_NOCACHE)) + neigh_hh_init(neigh, dst, dst->ops->protocol); + + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + read_unlock_bh(&neigh->lock); + if (err >= 0) rc = neigh->ops->queue_xmit(skb); else -- cgit v1.2.3 From 0ed8ddf4045fcfcac36bad753dc4046118c603ec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Oct 2010 10:44:07 +0000 Subject: neigh: Protect neigh->ha[] with a seqlock Add a seqlock in struct neighbour to protect neigh->ha[], and avoid dirtying neighbour in stress situation (many different flows / dsts) Dirtying takes place because of read_lock(&n->lock) and n->used writes. Switching to a seqlock, and writing n->used only on jiffies changes permits less dirtying. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/neighbour.h | 16 +++++++++++++++- net/core/neighbour.c | 47 ++++++++++++++++++++++++++++++----------------- net/ipv4/arp.c | 6 ++---- net/sched/sch_teql.c | 8 ++++---- 4 files changed, 51 insertions(+), 26 deletions(-) (limited to 'net/core/neighbour.c') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index a4538d553704..f04e7a2522c5 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -105,6 +105,7 @@ struct neighbour { atomic_t refcnt; atomic_t probes; rwlock_t lock; + seqlock_t ha_lock; unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))]; struct hh_cache *hh; int (*output)(struct sk_buff *skb); @@ -302,7 +303,10 @@ static inline void neigh_confirm(struct neighbour *neigh) static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { - neigh->used = jiffies; + unsigned long now = ACCESS_ONCE(jiffies); + + if (neigh->used != now) + neigh->used = now; if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) return __neigh_event_send(neigh, skb); return 0; @@ -373,4 +377,14 @@ struct neighbour_cb { #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) +static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, + const struct net_device *dev) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&n->ha_lock); + memcpy(dst, n->ha, dev->addr_len); + } while (read_seqretry(&n->ha_lock, seq)); +} #endif diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 2044906ecd1a..b165b96355bf 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -294,6 +294,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); + seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; @@ -1015,7 +1016,7 @@ out_unlock_bh: } EXPORT_SYMBOL(__neigh_event_send); -static void neigh_update_hhs(struct neighbour *neigh) +static void neigh_update_hhs(const struct neighbour *neigh) { struct hh_cache *hh; void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) @@ -1151,7 +1152,9 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } if (lladdr != neigh->ha) { + write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); + write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - @@ -1214,6 +1217,7 @@ static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst, { struct hh_cache *hh; + smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */ for (hh = n->hh; hh; hh = hh->hh_next) { if (hh->hh_type == protocol) { atomic_inc(&hh->hh_refcnt); @@ -1248,8 +1252,8 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, kfree(hh); return; } - read_unlock(&n->lock); - write_lock(&n->lock); + + write_lock_bh(&n->lock); /* must check if another thread already did the insert */ if (neigh_hh_lookup(n, dst, protocol)) { @@ -1263,13 +1267,13 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, hh->hh_output = n->ops->output; hh->hh_next = n->hh; + smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */ n->hh = hh; if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) hh_cache_put(hh); end: - write_unlock(&n->lock); - read_lock(&n->lock); + write_unlock_bh(&n->lock); } /* This function can be used in contexts, where only old dev_queue_xmit @@ -1308,16 +1312,18 @@ int neigh_resolve_output(struct sk_buff *skb) if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; + unsigned int seq; - read_lock_bh(&neigh->lock); if (dev->header_ops->cache && !dst->hh && !(dst->flags & DST_NOCACHE)) neigh_hh_init(neigh, dst, dst->ops->protocol); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); + do { + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) rc = neigh->ops->queue_xmit(skb); @@ -1344,13 +1350,16 @@ int neigh_connected_output(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct neighbour *neigh = dst->neighbour; struct net_device *dev = neigh->dev; + unsigned int seq; __skb_pull(skb, skb_network_offset(skb)); - read_lock_bh(&neigh->lock); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); + do { + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); + if (err >= 0) err = neigh->ops->queue_xmit(skb); else { @@ -2148,10 +2157,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, read_lock_bh(&neigh->lock); ndm->ndm_state = neigh->nud_state; - if ((neigh->nud_state & NUD_VALID) && - nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) { - read_unlock_bh(&neigh->lock); - goto nla_put_failure; + if (neigh->nud_state & NUD_VALID) { + char haddr[MAX_ADDR_LEN]; + + neigh_ha_snapshot(haddr, neigh, neigh->dev); + if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { + read_unlock_bh(&neigh->lock); + goto nla_put_failure; + } } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f35309578170..d8e540c5b071 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -502,10 +502,8 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) if (n) { n->used = jiffies; - if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { - read_lock_bh(&n->lock); - memcpy(haddr, n->ha, dev->addr_len); - read_unlock_bh(&n->lock); + if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) { + neigh_ha_snapshot(haddr, n, dev); neigh_release(n); return 0; } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index feaabc103ce6..401af9596709 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -241,11 +241,11 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device * } if (neigh_event_send(n, skb_res) == 0) { int err; + char haddr[MAX_ADDR_LEN]; - read_lock(&n->lock); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - n->ha, NULL, skb->len); - read_unlock(&n->lock); + neigh_ha_snapshot(haddr, n, dev); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr, + NULL, skb->len); if (err < 0) { neigh_release(n); -- cgit v1.2.3 From a5c30b349b872aa2ac13babbd5ceb26737f17e95 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Oct 2010 06:04:42 +0000 Subject: net/neighbour: cancel_delayed_work() + flush_scheduled_work() -> cancel_delayed_work_sync() flush_scheduled_work() is going away. Prepare for it. Signed-off-by: Tejun Heo Signed-off-by: David S. Miller --- net/core/neighbour.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core/neighbour.c') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b165b96355bf..8cc8f9a79db9 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1588,8 +1588,7 @@ int neigh_table_clear(struct neigh_table *tbl) struct neigh_table **tp; /* It is not clean... Fix it to unload IPv6 module safely */ - cancel_delayed_work(&tbl->gc_work); - flush_scheduled_work(); + cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue); neigh_ifdown(tbl, NULL); -- cgit v1.2.3