From adf0516845bcd0e626323c858ece28ee58c74455 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 12 Aug 2016 13:47:06 +0200 Subject: netfilter: remove ip_conntrack* sysctl compat code This backward compatibility has been around for more than ten years, since Yasuyuki Kozakai introduced IPv6 in conntrack. These days, we have alternate /proc/net/nf_conntrack* entries, the ctnetlink interface and the conntrack utility got adopted by many people in the user community according to what I observed on the netfilter user mailing list. So let's get rid of this. Note that nf_conntrack_htable_size and unsigned int nf_conntrack_max do not need to be exported as symbol anymore. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 8 -------- include/net/netns/conntrack.h | 8 -------- 2 files changed, 16 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 1a5fb36f165f..de629f1520df 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -134,14 +134,6 @@ void nf_ct_l4proto_pernet_unregister(struct net *net, int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto); void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto); -static inline void nf_ct_kfree_compat_sysctl_table(struct nf_proto_net *pn) -{ -#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - kfree(pn->ctl_compat_table); - pn->ctl_compat_table = NULL; -#endif -} - /* Generic netlink helpers */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 38b1a80517f0..e469e85de3f9 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -15,10 +15,6 @@ struct nf_proto_net { #ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_table_header; struct ctl_table *ctl_table; -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - struct ctl_table_header *ctl_compat_header; - struct ctl_table *ctl_compat_table; -#endif #endif unsigned int users; }; @@ -58,10 +54,6 @@ struct nf_ip_net { struct nf_udp_net udp; struct nf_icmp_net icmp; struct nf_icmp_net icmpv6; -#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - struct ctl_table_header *ctl_table_header; - struct ctl_table *ctl_table; -#endif }; struct ct_pcpu { -- cgit v1.2.3 From 92e47ba8839bacc185db89f3b11cd8036193e6a9 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 13 Aug 2016 22:35:36 +0800 Subject: netfilter: conntrack: simplify the code by using nf_conntrack_get_ht Since commit 64b87639c9cb ("netfilter: conntrack: fix race between nf_conntrack proc read and hash resize") introduce the nf_conntrack_get_ht, so there's no need to check nf_conntrack_generation again and again to get the hash table and hash size. And convert nf_conntrack_get_ht to inline function here. Suggested-by: Pablo Neira Ayuso Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 20 ++++++++++++++ include/net/netfilter/nf_conntrack_core.h | 3 -- net/netfilter/nf_conntrack_core.c | 46 +++++++------------------------ 3 files changed, 30 insertions(+), 39 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 445b019c2078..2a127480d4cc 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -303,9 +303,29 @@ struct kernel_param; int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); int nf_conntrack_hash_resize(unsigned int hashsize); + +extern struct hlist_nulls_head *nf_conntrack_hash; extern unsigned int nf_conntrack_htable_size; +extern seqcount_t nf_conntrack_generation; extern unsigned int nf_conntrack_max; +/* must be called with rcu read lock held */ +static inline void +nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) +{ + struct hlist_nulls_head *hptr; + unsigned int sequence, hsz; + + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hsz = nf_conntrack_htable_size; + hptr = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + *hash = hptr; + *hsize = hsz; +} + struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags); diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 79d7ac5c9740..62e17d1319ff 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -51,8 +51,6 @@ bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto); -void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize); - /* Find a connection corresponding to a tuple. */ struct nf_conntrack_tuple_hash * nf_conntrack_find_get(struct net *net, @@ -83,7 +81,6 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, #define CONNTRACK_LOCKS 1024 -extern struct hlist_nulls_head *nf_conntrack_hash; extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; void nf_conntrack_lock(spinlock_t *lock); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 22558b7ff7cd..aeba28c5512b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -74,7 +74,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash); static __read_mostly struct kmem_cache *nf_conntrack_cachep; static __read_mostly spinlock_t nf_conntrack_locks_all_lock; -static __read_mostly seqcount_t nf_conntrack_generation; static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; @@ -162,6 +161,7 @@ static void nf_conntrack_all_unlock(void) unsigned int nf_conntrack_htable_size __read_mostly; unsigned int nf_conntrack_max __read_mostly; +seqcount_t nf_conntrack_generation __read_mostly; DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); @@ -478,23 +478,6 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, net_eq(net, nf_ct_net(ct)); } -/* must be called with rcu read lock held */ -void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) -{ - struct hlist_nulls_head *hptr; - unsigned int sequence, hsz; - - do { - sequence = read_seqcount_begin(&nf_conntrack_generation); - hsz = nf_conntrack_htable_size; - hptr = nf_conntrack_hash; - } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); - - *hash = hptr; - *hsize = hsz; -} -EXPORT_SYMBOL_GPL(nf_conntrack_get_ht); - /* * Warning : * - Caller must take a reference on returned object @@ -507,14 +490,11 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; - unsigned int bucket, sequence; + unsigned int bucket, hsize; begin: - do { - sequence = read_seqcount_begin(&nf_conntrack_generation); - bucket = scale_hash(hash); - ct_hash = nf_conntrack_hash; - } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + nf_conntrack_get_ht(&ct_hash, &hsize); + bucket = reciprocal_scale(hash, hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { if (nf_ct_key_equal(h, tuple, zone, net)) { @@ -820,18 +800,15 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; - unsigned int hash, sequence; + unsigned int hash, hsize; struct hlist_nulls_node *n; struct nf_conn *ct; zone = nf_ct_zone(ignored_conntrack); rcu_read_lock(); - do { - sequence = read_seqcount_begin(&nf_conntrack_generation); - hash = hash_conntrack(net, tuple); - ct_hash = nf_conntrack_hash; - } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + nf_conntrack_get_ht(&ct_hash, &hsize); + hash = __hash_conntrack(net, tuple, hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); @@ -897,14 +874,11 @@ static noinline int early_drop(struct net *net, unsigned int _hash) for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { struct hlist_nulls_head *ct_hash; - unsigned hash, sequence, drops; + unsigned int hash, hsize, drops; rcu_read_lock(); - do { - sequence = read_seqcount_begin(&nf_conntrack_generation); - hash = scale_hash(_hash++); - ct_hash = nf_conntrack_hash; - } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + nf_conntrack_get_ht(&ct_hash, &hsize); + hash = reciprocal_scale(_hash++, hsize); drops = early_drop_list(net, &ct_hash[hash]); rcu_read_unlock(); -- cgit v1.2.3 From c016c7e45ddfa5085b35b644e659ec014969740d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Aug 2016 12:41:54 +0200 Subject: netfilter: nf_tables: honor NLM_F_EXCL flag in set element insertion If the NLM_F_EXCL flag is set, then new elements that clash with an existing one return EEXIST. In case you try to add an element whose data area differs from what we have, then this returns EBUSY. If no flag is specified at all, then this returns success to userspace. This patch also update the set insert operation so we can fetch the existing element that clashes with the one you want to add, we need this to make sure the element data doesn't differ. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- net/netfilter/nf_tables_api.c | 20 +++++++++++++++----- net/netfilter/nft_set_hash.c | 17 +++++++++++++---- net/netfilter/nft_set_rbtree.c | 12 ++++++++---- 4 files changed, 38 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f2f13399ce44..8972468bc94b 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -251,7 +251,8 @@ struct nft_set_ops { int (*insert)(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem); + const struct nft_set_elem *elem, + struct nft_set_ext **ext); void (*activate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 221d27f09623..bd9715e5ff26 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3483,12 +3483,12 @@ static int nft_setelem_parse_flags(const struct nft_set *set, } static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr) + const struct nlattr *attr, u32 nlmsg_flags) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_data_desc d1, d2; struct nft_set_ext_tmpl tmpl; - struct nft_set_ext *ext; + struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_userdata *udata; @@ -3615,9 +3615,19 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err4; ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; - err = set->ops->insert(ctx->net, set, &elem); - if (err < 0) + err = set->ops->insert(ctx->net, set, &elem, &ext2); + if (err) { + if (err == -EEXIST) { + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && + nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && + memcmp(nft_set_ext_data(ext), + nft_set_ext_data(ext2), set->dlen) != 0) + err = -EBUSY; + else if (!(nlmsg_flags & NLM_F_EXCL)) + err = 0; + } goto err5; + } nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -3673,7 +3683,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) return -ENFILE; - err = nft_add_set_elem(&ctx, set, attr); + err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags); if (err < 0) { atomic_dec(&set->nelems); break; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 564fa7929ed5..3794cb2fc788 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -126,7 +126,8 @@ err1: } static int nft_hash_insert(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he = elem->priv; @@ -135,9 +136,17 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set, .set = set, .key = elem->key.val.data, }; - - return rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, - nft_hash_params); + struct nft_hash_elem *prev; + + prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, + nft_hash_params); + if (IS_ERR(prev)) + return PTR_ERR(prev); + if (prev) { + *ext = &prev->ext; + return -EEXIST; + } + return 0; } static void nft_hash_activate(const struct net *net, const struct nft_set *set, diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 6473936d05c6..038682d48261 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -94,7 +94,8 @@ out: } static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, - struct nft_rbtree_elem *new) + struct nft_rbtree_elem *new, + struct nft_set_ext **ext) { struct nft_rbtree *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); @@ -122,8 +123,10 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, else if (!nft_rbtree_interval_end(rbe) && nft_rbtree_interval_end(new)) p = &parent->rb_right; - else + else { + *ext = &rbe->ext; return -EEXIST; + } } } } @@ -133,13 +136,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, } static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + struct nft_set_ext **ext) { struct nft_rbtree_elem *rbe = elem->priv; int err; spin_lock_bh(&nft_rbtree_lock); - err = __nft_rbtree_insert(net, set, rbe); + err = __nft_rbtree_insert(net, set, rbe, ext); spin_unlock_bh(&nft_rbtree_lock); return err; -- cgit v1.2.3 From 616b14b46957b52dc7e1f3ec2210d3f9051b1178 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Aug 2016 15:33:30 +0200 Subject: netfilter: don't rely on DYING bit to detect when destroy event was sent The reliable event delivery mode currently (ab)uses the DYING bit to detect which entries on the dying list have to be skipped when re-delivering events from the eache worker in reliable event mode. Currently when we delete the conntrack from main table we only set this bit if we could also deliver the netlink destroy event to userspace. If we fail we move it to the dying list, the ecache worker will reattempt event delivery for all confirmed conntracks on the dying list that do not have the DYING bit set. Once timer is gone, we can no longer use if (del_timer()) to detect when we 'stole' the reference count owned by the timer/hash entry, so we need some other way to avoid racing with other cpu. Pablo suggested to add a marker in the ecache extension that skips entries that have been unhashed from main table but are still waiting for the last reference count to be dropped (e.g. because one skb waiting on nfqueue verdict still holds a reference). We do this by adding a tristate. If we fail to deliver the destroy event, make a note of this in the eache extension. The worker can then skip all entries that are in a different state. Either they never delivered a destroy event, e.g. because the netlink backend was not loaded, or redelivery took place already. Once the conntrack timer is removed we will now be able to replace del_timer() test with test_and_set_bit(DYING, &ct->status) to avoid racing with other cpu that tries to evict the same conntrack. Because DYING will then be set right before we report the destroy event we can no longer skip event reporting when dying bit is set. Suggested-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_ecache.h | 17 ++++++++++++----- net/netfilter/nf_conntrack_ecache.c | 22 ++++++++++++++-------- 2 files changed, 26 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index fa36447371c6..12d967b58726 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -12,12 +12,19 @@ #include #include +enum nf_ct_ecache_state { + NFCT_ECACHE_UNKNOWN, /* destroy event not sent */ + NFCT_ECACHE_DESTROY_FAIL, /* tried but failed to send destroy event */ + NFCT_ECACHE_DESTROY_SENT, /* sent destroy event after failure */ +}; + struct nf_conntrack_ecache { - unsigned long cache; /* bitops want long */ - unsigned long missed; /* missed events */ - u16 ctmask; /* bitmask of ct events to be delivered */ - u16 expmask; /* bitmask of expect events to be delivered */ - u32 portid; /* netlink portid of destroyer */ + unsigned long cache; /* bitops want long */ + unsigned long missed; /* missed events */ + u16 ctmask; /* bitmask of ct events to be delivered */ + u16 expmask; /* bitmask of expect events to be delivered */ + u32 portid; /* netlink portid of destroyer */ + enum nf_ct_ecache_state state; /* ecache state */ }; static inline struct nf_conntrack_ecache * diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index d28011b42845..da9df2d56e66 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -49,8 +49,13 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct nf_conntrack_ecache *e; - if (nf_ct_is_dying(ct)) + if (!nf_ct_is_confirmed(ct)) + continue; + + e = nf_ct_ecache_find(ct); + if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL) continue; if (nf_conntrack_event(IPCT_DESTROY, ct)) { @@ -58,8 +63,7 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) break; } - /* we've got the event delivered, now it's dying */ - set_bit(IPS_DYING_BIT, &ct->status); + e->state = NFCT_ECACHE_DESTROY_SENT; refs[evicted] = ct; if (++evicted >= ARRAY_SIZE(refs)) { @@ -130,7 +134,7 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, if (!e) goto out_unlock; - if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) { + if (nf_ct_is_confirmed(ct)) { struct nf_ct_event item = { .ct = ct, .portid = e->portid ? e->portid : portid, @@ -150,11 +154,13 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, * triggered by a process, we store the PORTID * to include it in the retransmission. */ - if (eventmask & (1 << IPCT_DESTROY) && - e->portid == 0 && portid != 0) - e->portid = portid; - else + if (eventmask & (1 << IPCT_DESTROY)) { + if (e->portid == 0 && portid != 0) + e->portid = portid; + e->state = NFCT_ECACHE_DESTROY_FAIL; + } else { e->missed |= eventmask; + } } else { e->missed &= ~missed; } -- cgit v1.2.3 From f330a7fdbe1611104622faff7e614a246a7d20f0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Aug 2016 15:33:31 +0200 Subject: netfilter: conntrack: get rid of conntrack timer With stats enabled this eats 80 bytes on x86_64 per nf_conn entry, as Eric Dumazet pointed out during netfilter workshop 2016. Eric also says: "Another reason was the fact that Thomas was about to change max timer range [..]" (500462a9de657f8, 'timers: Switch to a non-cascading wheel'). Remove the timer and use a 32bit jiffies value containing timestamp until entry is valid. During conntrack lookup, even before doing tuple comparision, check the timeout value and evict the entry in case it is too old. The dying bit is used as a synchronization point to avoid races where multiple cpus try to evict the same entry. Because lookup is always lockless, we need to bump the refcnt once when we evict, else we could try to evict already-dead entry that is being recycled. This is the standard/expected way when conntrack entries are destroyed. Followup patches will introduce garbage colliction via work queue and further places where we can reap obsoleted entries (e.g. during netlink dumps), this is needed to avoid expired conntracks from hanging around for too long when lookup rate is low after a busy period. Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 23 +++++++-- net/netfilter/nf_conntrack_core.c | 91 ++++++++++++++++++++---------------- net/netfilter/nf_conntrack_netlink.c | 14 ++---- net/netfilter/nf_conntrack_pptp.c | 3 +- net/netfilter/nf_nat_core.c | 6 --- 5 files changed, 74 insertions(+), 63 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 2a127480d4cc..7277751128e8 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -42,7 +42,6 @@ union nf_conntrack_expect_proto { #include #include -#include #ifdef CONFIG_NETFILTER_DEBUG #define NF_CT_ASSERT(x) WARN_ON(!(x)) @@ -73,7 +72,7 @@ struct nf_conn_help { #include struct nf_conn { - /* Usage count in here is 1 for hash table/destruct timer, 1 per skb, + /* Usage count in here is 1 for hash table, 1 per skb, * plus 1 for any connection(s) we are `master' for * * Hint, SKB address this struct and refcnt via skb->nfct and @@ -96,8 +95,8 @@ struct nf_conn { /* Have we seen traffic both ways yet? (bitset) */ unsigned long status; - /* Timer function; drops refcnt when it goes off. */ - struct timer_list timeout; + /* jiffies32 when this ct is considered dead */ + u32 timeout; possible_net_t ct_net; @@ -291,14 +290,28 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK; } +#define nfct_time_stamp ((u32)(jiffies)) + /* jiffies until ct expires, 0 if already expired */ static inline unsigned long nf_ct_expires(const struct nf_conn *ct) { - long timeout = (long)ct->timeout.expires - (long)jiffies; + s32 timeout = ct->timeout - nfct_time_stamp; return timeout > 0 ? timeout : 0; } +static inline bool nf_ct_is_expired(const struct nf_conn *ct) +{ + return (__s32)(ct->timeout - nfct_time_stamp) <= 0; +} + +/* use after obtaining a reference count */ +static inline bool nf_ct_should_gc(const struct nf_conn *ct) +{ + return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) && + !nf_ct_is_dying(ct); +} + struct kernel_param; int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 887926aefc72..87ee6dad777c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -371,7 +371,6 @@ destroy_conntrack(struct nf_conntrack *nfct) pr_debug("destroy_conntrack(%p)\n", ct); NF_CT_ASSERT(atomic_read(&nfct->use) == 0); - NF_CT_ASSERT(!timer_pending(&ct->timeout)); if (unlikely(nf_ct_is_template(ct))) { nf_ct_tmpl_free(ct); @@ -434,35 +433,30 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { struct nf_conn_tstamp *tstamp; + if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) + return false; + tstamp = nf_conn_tstamp_find(ct); if (tstamp && tstamp->stop == 0) tstamp->stop = ktime_get_real_ns(); - if (nf_ct_is_dying(ct)) - goto delete; - if (nf_conntrack_event_report(IPCT_DESTROY, ct, portid, report) < 0) { - /* destroy event was not delivered */ + /* destroy event was not delivered. nf_ct_put will + * be done by event cache worker on redelivery. + */ nf_ct_delete_from_lists(ct); nf_conntrack_ecache_delayed_work(nf_ct_net(ct)); return false; } nf_conntrack_ecache_work(nf_ct_net(ct)); - set_bit(IPS_DYING_BIT, &ct->status); - delete: nf_ct_delete_from_lists(ct); nf_ct_put(ct); return true; } EXPORT_SYMBOL_GPL(nf_ct_delete); -static void death_by_timeout(unsigned long ul_conntrack) -{ - nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0); -} - static inline bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, const struct nf_conntrack_tuple *tuple, @@ -480,6 +474,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, net_eq(net, nf_ct_net(ct)); } +/* caller must hold rcu readlock and none of the nf_conntrack_locks */ +static void nf_ct_gc_expired(struct nf_conn *ct) +{ + if (!atomic_inc_not_zero(&ct->ct_general.use)) + return; + + if (nf_ct_should_gc(ct)) + nf_ct_kill(ct); + + nf_ct_put(ct); +} + /* * Warning : * - Caller must take a reference on returned object @@ -499,6 +505,17 @@ begin: bucket = reciprocal_scale(hash, hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { + struct nf_conn *ct; + + ct = nf_ct_tuplehash_to_ctrack(h); + if (nf_ct_is_expired(ct)) { + nf_ct_gc_expired(ct); + continue; + } + + if (nf_ct_is_dying(ct)) + continue; + if (nf_ct_key_equal(h, tuple, zone, net)) { NF_CT_STAT_INC_ATOMIC(net, found); return h; @@ -597,7 +614,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) zone, net)) goto out; - add_timer(&ct->timeout); smp_wmb(); /* The caller holds a reference to this object */ atomic_set(&ct->ct_general.use, 2); @@ -750,8 +766,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ - ct->timeout.expires += jiffies; - add_timer(&ct->timeout); + ct->timeout += nfct_time_stamp; atomic_inc(&ct->ct_general.use); ct->status |= IPS_CONFIRMED; @@ -815,8 +830,16 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); - if (ct != ignored_conntrack && - nf_ct_key_equal(h, tuple, zone, net)) { + + if (ct == ignored_conntrack) + continue; + + if (nf_ct_is_expired(ct)) { + nf_ct_gc_expired(ct); + continue; + } + + if (nf_ct_key_equal(h, tuple, zone, net)) { NF_CT_STAT_INC_ATOMIC(net, found); rcu_read_unlock(); return 1; @@ -850,6 +873,11 @@ static unsigned int early_drop_list(struct net *net, hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); + if (nf_ct_is_expired(tmp)) { + nf_ct_gc_expired(tmp); + continue; + } + if (test_bit(IPS_ASSURED_BIT, &tmp->status) || !net_eq(nf_ct_net(tmp), net) || nf_ct_is_dying(tmp)) @@ -867,7 +895,6 @@ static unsigned int early_drop_list(struct net *net, */ if (net_eq(nf_ct_net(tmp), net) && nf_ct_is_confirmed(tmp) && - del_timer(&tmp->timeout) && nf_ct_delete(tmp, 0, 0)) drops++; @@ -937,8 +964,6 @@ __nf_conntrack_alloc(struct net *net, /* save hash for reusing when confirming */ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; ct->status = 0; - /* Don't set timer yet: wait for confirmation */ - setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); write_pnet(&ct->ct_net, net); memset(&ct->__nfct_init_offset[0], 0, offsetof(struct nf_conn, proto) - @@ -1312,7 +1337,6 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, unsigned long extra_jiffies, int do_acct) { - NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); NF_CT_ASSERT(skb); /* Only update if this is not a fixed timeout */ @@ -1320,18 +1344,10 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, goto acct; /* If not in hash table, timer will not be active yet */ - if (!nf_ct_is_confirmed(ct)) { - ct->timeout.expires = extra_jiffies; - } else { - unsigned long newtime = jiffies + extra_jiffies; - - /* Only update the timeout if the new timeout is at least - HZ jiffies from the old timeout. Need del_timer for race - avoidance (may already be dying). */ - if (newtime - ct->timeout.expires >= HZ) - mod_timer_pending(&ct->timeout, newtime); - } + if (nf_ct_is_confirmed(ct)) + extra_jiffies += nfct_time_stamp; + ct->timeout = extra_jiffies; acct: if (do_acct) nf_ct_acct_update(ct, ctinfo, skb->len); @@ -1346,11 +1362,7 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, if (do_acct) nf_ct_acct_update(ct, ctinfo, skb->len); - if (del_timer(&ct->timeout)) { - ct->timeout.function((unsigned long)ct); - return true; - } - return false; + return nf_ct_delete(ct, 0, 0); } EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); @@ -1485,11 +1497,8 @@ void nf_ct_iterate_cleanup(struct net *net, while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ - if (del_timer(&ct->timeout)) - nf_ct_delete(ct, portid, report); - - /* ... else the timer will get him soon. */ + nf_ct_delete(ct, portid, report); nf_ct_put(ct); cond_resched(); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 68800c10a320..81fd34ce0a57 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1144,9 +1144,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, } } - if (del_timer(&ct->timeout)) - nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); - + nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_put(ct); return 0; @@ -1514,11 +1512,10 @@ static int ctnetlink_change_timeout(struct nf_conn *ct, { u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); - if (!del_timer(&ct->timeout)) - return -ETIME; + ct->timeout = nfct_time_stamp + timeout * HZ; - ct->timeout.expires = jiffies + timeout * HZ; - add_timer(&ct->timeout); + if (test_bit(IPS_DYING_BIT, &ct->status)) + return -ETIME; return 0; } @@ -1716,9 +1713,8 @@ ctnetlink_create_conntrack(struct net *net, if (!cda[CTA_TIMEOUT]) goto err1; - ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); - ct->timeout.expires = jiffies + ct->timeout.expires * HZ; + ct->timeout = nfct_time_stamp + ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; rcu_read_lock(); if (cda[CTA_HELP]) { diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 5588c7ae1ac2..f60a4755d71e 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -157,8 +157,7 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct, pr_debug("setting timeout of conntrack %p to 0\n", sibling); sibling->proto.gre.timeout = 0; sibling->proto.gre.stream_timeout = 0; - if (del_timer(&sibling->timeout)) - sibling->timeout.function((unsigned long)sibling); + nf_ct_kill(sibling); nf_ct_put(sibling); return 1; } else { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index de31818417b8..81ae41f85d3a 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -565,16 +565,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ - if (!del_timer(&ct->timeout)) - return 1; - ct->status &= ~IPS_NAT_DONE_MASK; - rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, nf_nat_bysource_params); - add_timer(&ct->timeout); - /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. */ -- cgit v1.2.3 From ad66713f5a20034b3b2a0cbc184319b2ede93f11 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Aug 2016 15:33:35 +0200 Subject: netfilter: remove __nf_ct_kill_acct helper After timer removal this just calls nf_ct_delete so remove the __ prefix version and make nf_ct_kill a shorthand for nf_ct_delete. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 13 +++---------- net/netfilter/nf_conntrack_core.c | 12 +++++------- 2 files changed, 8 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 7277751128e8..50418052a520 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -219,21 +219,14 @@ static inline void nf_ct_refresh(struct nf_conn *ct, __nf_ct_refresh_acct(ct, 0, skb, extra_jiffies, 0); } -bool __nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, int do_acct); - /* kill conntrack and do accounting */ -static inline bool nf_ct_kill_acct(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct sk_buff *skb) -{ - return __nf_ct_kill_acct(ct, ctinfo, skb, 1); -} +bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + const struct sk_buff *skb); /* kill conntrack without accounting */ static inline bool nf_ct_kill(struct nf_conn *ct) { - return __nf_ct_kill_acct(ct, 0, NULL, 0); + return nf_ct_delete(ct, 0, 0); } /* These are for NAT. Icky. */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 7c66ce401ce9..ac1db4019d5c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1430,17 +1430,15 @@ acct: } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); -bool __nf_ct_kill_acct(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, - int do_acct) +bool nf_ct_kill_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb) { - if (do_acct) - nf_ct_acct_update(ct, ctinfo, skb->len); + nf_ct_acct_update(ct, ctinfo, skb->len); return nf_ct_delete(ct, 0, 0); } -EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); +EXPORT_SYMBOL_GPL(nf_ct_kill_acct); #if IS_ENABLED(CONFIG_NF_CT_NETLINK) -- cgit v1.2.3 From 779994fa3636d46848edb402fe7517968e036e6f Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Mon, 29 Aug 2016 18:25:28 +0800 Subject: netfilter: log: Check param to avoid overflow in nf_log_set The nf_log_set is an interface function, so it should do the strict sanity check of parameters. Convert the return value of nf_log_set as int instead of void. When the pf is invalid, return -EOPNOTSUPP. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 3 +-- net/bridge/netfilter/nf_log_bridge.c | 3 +-- net/ipv4/netfilter/nf_log_arp.c | 3 +-- net/ipv4/netfilter/nf_log_ipv4.c | 3 +-- net/ipv6/netfilter/nf_log_ipv6.c | 3 +-- net/netfilter/nf_log.c | 8 +++++--- 6 files changed, 10 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 83d855ba6af1..ee07dc8b0a7b 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -60,8 +60,7 @@ struct nf_logger { int nf_log_register(u_int8_t pf, struct nf_logger *logger); void nf_log_unregister(struct nf_logger *logger); -void nf_log_set(struct net *net, u_int8_t pf, - const struct nf_logger *logger); +int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger); void nf_log_unset(struct net *net, const struct nf_logger *logger); int nf_log_bind_pf(struct net *net, u_int8_t pf, diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c index 5d9953a90929..1663df598545 100644 --- a/net/bridge/netfilter/nf_log_bridge.c +++ b/net/bridge/netfilter/nf_log_bridge.c @@ -50,8 +50,7 @@ static struct nf_logger nf_bridge_logger __read_mostly = { static int __net_init nf_log_bridge_net_init(struct net *net) { - nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); - return 0; + return nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); } static void __net_exit nf_log_bridge_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index cf8f2d4e867a..8945c2653814 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -111,8 +111,7 @@ static struct nf_logger nf_arp_logger __read_mostly = { static int __net_init nf_log_arp_net_init(struct net *net) { - nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); - return 0; + return nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); } static void __net_exit nf_log_arp_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 076aadda0473..20f225593a8b 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -347,8 +347,7 @@ static struct nf_logger nf_ip_logger __read_mostly = { static int __net_init nf_log_ipv4_net_init(struct net *net) { - nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); - return 0; + return nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); } static void __net_exit nf_log_ipv4_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index 8dd869642f45..c1bcf699a23d 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -379,8 +379,7 @@ static struct nf_logger nf_ip6_logger __read_mostly = { static int __net_init nf_log_ipv6_net_init(struct net *net) { - nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); - return 0; + return nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); } static void __net_exit nf_log_ipv6_net_exit(struct net *net) diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index aa5847a16713..30a17d649a83 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -39,12 +39,12 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger) return NULL; } -void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) +int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) { const struct nf_logger *log; - if (pf == NFPROTO_UNSPEC) - return; + if (pf == NFPROTO_UNSPEC || pf >= ARRAY_SIZE(net->nf.nf_loggers)) + return -EOPNOTSUPP; mutex_lock(&nf_log_mutex); log = nft_log_dereference(net->nf.nf_loggers[pf]); @@ -52,6 +52,8 @@ void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) rcu_assign_pointer(net->nf.nf_loggers[pf], logger); mutex_unlock(&nf_log_mutex); + + return 0; } EXPORT_SYMBOL(nf_log_set); -- cgit v1.2.3