summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2018-03-31 23:25:40 -0400
committerDavid S. Miller <davem@davemloft.net>2018-03-31 23:25:40 -0400
commit70ae7222c61d4f19c844c8fe75f053f8976b9552 (patch)
treefd2adc5d5bf77f50ac9f50e126e5b39695bd4427 /include
parent5749d6af4925a9da362b37df920c2a1dc54689a1 (diff)
parentf2d1c724fca176ddbd32a9397df49221afbcb227 (diff)
downloadlinux-70ae7222c61d4f19c844c8fe75f053f8976b9552.tar.bz2
Merge branch 'inet-frags-bring-rhashtables-to-IP-defrag'
Eric Dumazet says: ==================== inet: frags: bring rhashtables to IP defrag IP defrag processing is one of the remaining problematic layer in linux. It uses static hash tables of 1024 buckets, and up to 128 items per bucket. A work queue is supposed to garbage collect items when host is under memory pressure, and doing a hash rebuild, changing seed used in hash computations. This work queue blocks softirqs for up to 25 ms when doing a hash rebuild, occurring every 5 seconds if host is under fire. Then there is the problem of sharing this hash table for all netns. It is time to switch to rhashtables, and allocate one of them per netns to speedup netns dismantle, since this is a critical metric these days. Lookup is now using RCU, and 64bit hosts can now provision whatever amount of memory needed to handle the expected workloads. v2: Addressed Herbert and Kirill feedbacks (Use rhashtable_free_and_destroy(), and split the big patch into small units) v3: Removed the extra add_frag_mem_limit(...) from inet_frag_create() Removed the refcount_inc_not_zero() call from inet_frags_free_cb(), as we can exploit del_timer() return value. v4: kbuild robot feedback about one missing static (squashed) Additional patches : inet: frags: do not clone skb in ip_expire() ipv6: frags: rewrite ip6_expire_frag_queue() rhashtable: reorganize struct rhashtable layout inet: frags: reorganize struct netns_frags inet: frags: get rid of ipfrag_skb_cb/FRAG_CB ipv6: frags: get rid of ip6frag_skb_cb/FRAG6_CB inet: frags: get rid of nf_ct_frag6_skb_cb/NFCT_FRAG6_CB ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r--include/linux/rhashtable.h8
-rw-r--r--include/linux/skbuff.h1
-rw-r--r--include/net/inet_frag.h126
-rw-r--r--include/net/ip.h1
-rw-r--r--include/net/ipv6.h27
5 files changed, 62 insertions, 101 deletions
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 668a21f04b09..1f8ad121eb43 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -152,25 +152,25 @@ struct rhashtable_params {
/**
* struct rhashtable - Hash table handle
* @tbl: Bucket table
- * @nelems: Number of elements in table
* @key_len: Key length for hashfn
- * @p: Configuration parameters
* @max_elems: Maximum number of elements in table
+ * @p: Configuration parameters
* @rhlist: True if this is an rhltable
* @run_work: Deferred worker to expand/shrink asynchronously
* @mutex: Mutex to protect current/future table swapping
* @lock: Spin lock to protect walker list
+ * @nelems: Number of elements in table
*/
struct rhashtable {
struct bucket_table __rcu *tbl;
- atomic_t nelems;
unsigned int key_len;
- struct rhashtable_params p;
unsigned int max_elems;
+ struct rhashtable_params p;
bool rhlist;
struct work_struct run_work;
struct mutex mutex;
spinlock_t lock;
+ atomic_t nelems;
};
/**
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 47082f54ec1f..9065477ed255 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -672,6 +672,7 @@ struct sk_buff {
* UDP receive path is one user.
*/
unsigned long dev_scratch;
+ int ip_defrag_offset;
};
};
struct rb_node rbnode; /* used in netem & tcp stack */
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 351f0c3cdcd9..ed07e3786d98 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -2,14 +2,20 @@
#ifndef __NET_FRAG_H__
#define __NET_FRAG_H__
+#include <linux/rhashtable.h>
+
struct netns_frags {
- /* Keep atomic mem on separate cachelines in structs that include it */
- atomic_t mem ____cacheline_aligned_in_smp;
/* sysctls */
+ long high_thresh;
+ long low_thresh;
int timeout;
- int high_thresh;
- int low_thresh;
int max_dist;
+ struct inet_frags *f;
+
+ struct rhashtable rhashtable ____cacheline_aligned_in_smp;
+
+ /* Keep atomic mem on separate cachelines in structs that include it */
+ atomic_long_t mem ____cacheline_aligned_in_smp;
};
/**
@@ -25,12 +31,30 @@ enum {
INET_FRAG_COMPLETE = BIT(2),
};
+struct frag_v4_compare_key {
+ __be32 saddr;
+ __be32 daddr;
+ u32 user;
+ u32 vif;
+ __be16 id;
+ u16 protocol;
+};
+
+struct frag_v6_compare_key {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ u32 user;
+ __be32 id;
+ u32 iif;
+};
+
/**
* struct inet_frag_queue - fragment queue
*
- * @lock: spinlock protecting the queue
+ * @node: rhash node
+ * @key: keys identifying this frag.
* @timer: queue expiration timer
- * @list: hash bucket list
+ * @lock: spinlock protecting this frag
* @refcnt: reference count of the queue
* @fragments: received fragments head
* @fragments_tail: received fragments tail
@@ -40,12 +64,16 @@ enum {
* @flags: fragment queue flags
* @max_size: maximum received fragment size
* @net: namespace that this frag belongs to
- * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
+ * @rcu: rcu head for freeing deferall
*/
struct inet_frag_queue {
- spinlock_t lock;
+ struct rhash_head node;
+ union {
+ struct frag_v4_compare_key v4;
+ struct frag_v6_compare_key v6;
+ } key;
struct timer_list timer;
- struct hlist_node list;
+ spinlock_t lock;
refcount_t refcnt;
struct sk_buff *fragments;
struct sk_buff *fragments_tail;
@@ -54,101 +82,57 @@ struct inet_frag_queue {
int meat;
__u8 flags;
u16 max_size;
- struct netns_frags *net;
- struct hlist_node list_evictor;
-};
-
-#define INETFRAGS_HASHSZ 1024
-
-/* averaged:
- * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
- * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
- * struct frag_queue))
- */
-#define INETFRAGS_MAXDEPTH 128
-
-struct inet_frag_bucket {
- struct hlist_head chain;
- spinlock_t chain_lock;
+ struct netns_frags *net;
+ struct rcu_head rcu;
};
struct inet_frags {
- struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
-
- struct work_struct frags_work;
- unsigned int next_bucket;
- unsigned long last_rebuild_jiffies;
- bool rebuild;
-
- /* The first call to hashfn is responsible to initialize
- * rnd. This is best done with net_get_random_once.
- *
- * rnd_seqlock is used to let hash insertion detect
- * when it needs to re-lookup the hash chain to use.
- */
- u32 rnd;
- seqlock_t rnd_seqlock;
unsigned int qsize;
- unsigned int (*hashfn)(const struct inet_frag_queue *);
- bool (*match)(const struct inet_frag_queue *q,
- const void *arg);
void (*constructor)(struct inet_frag_queue *q,
const void *arg);
void (*destructor)(struct inet_frag_queue *);
void (*frag_expire)(struct timer_list *t);
struct kmem_cache *frags_cachep;
const char *frags_cache_name;
+ struct rhashtable_params rhash_params;
};
int inet_frags_init(struct inet_frags *);
void inet_frags_fini(struct inet_frags *);
-static inline void inet_frags_init_net(struct netns_frags *nf)
+static inline int inet_frags_init_net(struct netns_frags *nf)
{
- atomic_set(&nf->mem, 0);
+ atomic_long_set(&nf->mem, 0);
+ return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
}
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
-
-void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
- struct inet_frags *f, void *key, unsigned int hash);
+void inet_frags_exit_net(struct netns_frags *nf);
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
- const char *prefix);
+void inet_frag_kill(struct inet_frag_queue *q);
+void inet_frag_destroy(struct inet_frag_queue *q);
+struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
-static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
+static inline void inet_frag_put(struct inet_frag_queue *q)
{
if (refcount_dec_and_test(&q->refcnt))
- inet_frag_destroy(q, f);
-}
-
-static inline bool inet_frag_evicting(struct inet_frag_queue *q)
-{
- return !hlist_unhashed(&q->list_evictor);
+ inet_frag_destroy(q);
}
/* Memory Tracking Functions. */
-static inline int frag_mem_limit(struct netns_frags *nf)
-{
- return atomic_read(&nf->mem);
-}
-
-static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
+static inline long frag_mem_limit(const struct netns_frags *nf)
{
- atomic_sub(i, &nf->mem);
+ return atomic_long_read(&nf->mem);
}
-static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
+static inline void sub_frag_mem_limit(struct netns_frags *nf, long val)
{
- atomic_add(i, &nf->mem);
+ atomic_long_sub(val, &nf->mem);
}
-static inline int sum_frag_mem_limit(struct netns_frags *nf)
+static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
{
- return atomic_read(&nf->mem);
+ atomic_long_add(val, &nf->mem);
}
/* RFC 3168 support :
diff --git a/include/net/ip.h b/include/net/ip.h
index 36f8f7811093..ecffd843e7b8 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -588,7 +588,6 @@ static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *s
return skb;
}
#endif
-int ip_frag_mem(struct net *net);
/*
* Functions provided by ip_forward.c
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 50a6f0ddb878..37455e840347 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -379,13 +379,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
idev->cnf.accept_ra;
}
-#if IS_ENABLED(CONFIG_IPV6)
-static inline int ip6_frag_mem(struct net *net)
-{
- return sum_frag_mem_limit(&net->ipv6.frags);
-}
-#endif
-
#define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */
#define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */
#define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */
@@ -579,17 +572,8 @@ enum ip6_defrag_users {
__IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
};
-struct ip6_create_arg {
- __be32 id;
- u32 user;
- const struct in6_addr *src;
- const struct in6_addr *dst;
- int iif;
- u8 ecn;
-};
-
void ip6_frag_init(struct inet_frag_queue *q, const void *a);
-bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
+extern const struct rhashtable_params ip6_rhash_params;
/*
* Equivalent of ipv4 struct ip
@@ -597,19 +581,12 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
struct frag_queue {
struct inet_frag_queue q;
- __be32 id; /* fragment id */
- u32 user;
- struct in6_addr saddr;
- struct in6_addr daddr;
-
int iif;
- unsigned int csum;
__u16 nhoffset;
u8 ecn;
};
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
- struct inet_frags *frags);
+void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
static inline bool ipv6_addr_any(const struct in6_addr *a)
{