From 470acf55a021713869b9bcc967268ac90c8a0fac Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Fri, 14 Apr 2017 10:00:08 +0800 Subject: netfilter: xt_CT: fix refcnt leak on error path There are two cases which causes refcnt leak. 1. When nf_ct_timeout_ext_add failed in xt_ct_set_timeout, it should free the timeout refcnt. Now goto the err_put_timeout error handler instead of going ahead. 2. When the time policy is not found, we should call module_put. Otherwise, the related cthelper module cannot be removed anymore. It is easy to reproduce by typing the following command: # iptables -t raw -A OUTPUT -p tcp -j CT --helper ftp --timeout xxx Signed-off-by: Gao Feng Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_CT.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index b008db0184b8..81fdcdca7457 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -167,8 +167,10 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, goto err_put_timeout; } timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC); - if (timeout_ext == NULL) + if (!timeout_ext) { ret = -ENOMEM; + goto err_put_timeout; + } rcu_read_unlock(); return ret; @@ -200,6 +202,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { struct nf_conntrack_zone zone; + struct nf_conn_help *help; struct nf_conn *ct; int ret = -EOPNOTSUPP; @@ -248,7 +251,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (info->timeout[0]) { ret = xt_ct_set_timeout(ct, par, info->timeout); if (ret < 0) - goto err3; + goto err4; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); nf_conntrack_get(&ct->ct_general); @@ -256,6 +259,10 @@ out: info->ct = ct; return 0; +err4: + help = nfct_help(ct); + if (help) + module_put(help->helper->me); err3: nf_ct_tmpl_free(ct); err2: -- cgit v1.2.3 From cf5d70918877c6a6655dc1e92e2ebb661ce904fd Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Fri, 14 Apr 2017 14:26:38 -0700 Subject: openvswitch: Delete conntrack entry clashing with an expectation. Conntrack helpers do not check for a potentially clashing conntrack entry when creating a new expectation. Also, nf_conntrack_in() will check expectations (via init_conntrack()) only if a conntrack entry can not be found. The expectation for a packet which also matches an existing conntrack entry will not be removed by conntrack, and is currently handled inconsistently by OVS, as OVS expects the expectation to be removed when the connection tracking entry matching that expectation is confirmed. It should be noted that normally an IP stack would not allow reuse of a 5-tuple of an old (possibly lingering) connection for a new data connection, so this is somewhat unlikely corner case. However, it is possible that a misbehaving source could cause conntrack entries be created that could then interfere with new related connections. Fix this in the OVS module by deleting the clashing conntrack entry after an expectation has been matched. This causes the following nf_conntrack_in() call also find the expectation and remove it when creating the new conntrack entry, as well as the forthcoming reply direction packets to match the new related connection instead of the old clashing conntrack entry. Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") Reported-by: Yang Song Signed-off-by: Jarno Rajahalme Acked-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- net/openvswitch/conntrack.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 7b2c2fce408a..c92a5795dcda 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -514,10 +514,38 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, u16 proto, const struct sk_buff *skb) { struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) return NULL; - return __nf_ct_expect_find(net, zone, &tuple); + + exp = __nf_ct_expect_find(net, zone, &tuple); + if (exp) { + struct nf_conntrack_tuple_hash *h; + + /* Delete existing conntrack entry, if it clashes with the + * expectation. This can happen since conntrack ALGs do not + * check for clashes between (new) expectations and existing + * conntrack entries. nf_conntrack_in() will check the + * expectations only if a conntrack entry can not be found, + * which can lead to OVS finding the expectation (here) in the + * init direction, but which will not be removed by the + * nf_conntrack_in() call, if a matching conntrack entry is + * found instead. In this case all init direction packets + * would be reported as new related packets, while reply + * direction packets would be reported as un-related + * established packets. + */ + h = nf_conntrack_find_get(net, zone, &tuple); + if (h) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + nf_ct_delete(ct, 0, 0); + nf_conntrack_put(&ct->ct_general); + } + } + + return exp; } /* This replicates logic from nf_conntrack_core.c that is not exported. */ -- cgit v1.2.3 From 66e5a6b18bd09d0431e97cd3c162e76c5c2aebba Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 15 Apr 2017 19:27:42 +0800 Subject: netfilter: nf_ct_helper: permit cthelpers with different names via nfnetlink cthelpers added via nfnetlink may have the same tuple, i.e. except for the l3proto and l4proto, other fields are all zero. So even with the different names, we will also fail to add them: # nfct helper add ssdp inet udp # nfct helper add tftp inet udp nfct v1.4.3: netlink error: File exists So in order to avoid unpredictable behaviour, we should: 1. cthelpers can be selected by nft ct helper obj or xt_CT target, so report error if duplicated { name, l3proto, l4proto } tuple exist. 2. cthelpers can be selected by nf_ct_tuple_src_mask_cmp when nf_ct_auto_assign_helper is enabled, so also report error if duplicated { l3proto, l4proto, src-port } tuple exist. Also note, if the cthelper is added from userspace, then the src-port will always be zero, it's invalid for nf_ct_auto_assign_helper, so there's no need to check the second point listed above. Fixes: 893e093c786c ("netfilter: nf_ct_helper: bail out on duplicated helpers") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_helper.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 4eeb3418366a..99bcd44aac70 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -386,17 +386,33 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; unsigned int h = helper_hash(&me->tuple); struct nf_conntrack_helper *cur; - int ret = 0; + int ret = 0, i; BUG_ON(me->expect_policy == NULL); BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); mutex_lock(&nf_ct_helper_mutex); - hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { - if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { - ret = -EEXIST; - goto out; + for (i = 0; i < nf_ct_helper_hsize; i++) { + hlist_for_each_entry(cur, &nf_ct_helper_hash[i], hnode) { + if (!strcmp(cur->name, me->name) && + (cur->tuple.src.l3num == NFPROTO_UNSPEC || + cur->tuple.src.l3num == me->tuple.src.l3num) && + cur->tuple.dst.protonum == me->tuple.dst.protonum) { + ret = -EEXIST; + goto out; + } + } + } + + /* avoid unpredictable behaviour for auto_assign_helper */ + if (!(me->flags & NF_CT_HELPER_F_USERSPACE)) { + hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { + if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, + &mask)) { + ret = -EEXIST; + goto out; + } } } hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); -- cgit v1.2.3 From 54a5f9d9abab639039eb7288bdc26c9c67f4e79b Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 15 Apr 2017 23:37:36 +0800 Subject: netfilter: nft_set_bitmap: free dummy elements when destroy the set We forget to free dummy elements when deleting the set. So when I was running nft-test.py, I saw many kmemleak warnings: kmemleak: 1344 new suspected memory leaks ... # cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800631345c8 (size 32): comm "nft", pid 9075, jiffies 4295743309 (age 1354.815s) hex dump (first 32 bytes): f8 63 13 63 00 88 ff ff 88 79 13 63 00 88 ff ff .c.c.....y.c.... 04 0c 00 00 00 00 00 00 00 00 00 00 08 03 00 00 ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc+0x164/0x310 [] nft_set_elem_init+0x3d/0x1b0 [nf_tables] [] nft_add_set_elem+0x45a/0x8c0 [nf_tables] [] nf_tables_newsetelem+0x105/0x1d0 [nf_tables] [] nfnetlink_rcv+0x414/0x770 [nfnetlink] [] netlink_unicast+0x1f6/0x310 [] netlink_sendmsg+0x306/0x3b0 ... Fixes: e920dde516088 ("netfilter: nft_set_bitmap: keep a list of dummy elements") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_bitmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 8ebbc2940f4c..b988162b5b15 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -257,6 +257,11 @@ static int nft_bitmap_init(const struct nft_set *set, static void nft_bitmap_destroy(const struct nft_set *set) { + struct nft_bitmap *priv = nft_set_priv(set); + struct nft_bitmap_elem *be, *n; + + list_for_each_entry_safe(be, n, &priv->list, head) + nft_set_elem_destroy(set, be, true); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, -- cgit v1.2.3 From 14e567615679a9999ce6bf4f23d6c9e00f03e00e Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 17 Apr 2017 21:18:55 +0800 Subject: netfilter: ctnetlink: drop the incorrect cthelper module request First, when creating a new ct, we will invoke request_module to try to load the related inkernel cthelper. So there's no need to call the request_module again when updating the ct helpinfo. Second, ctnetlink_change_helper may be called with rcu_read_lock held, i.e. rcu_read_lock -> nfqnl_recv_verdict -> nfqnl_ct_parse -> ctnetlink_glue_parse -> ctnetlink_glue_parse_ct -> ctnetlink_change_helper. But the request_module invocation may sleep, so we can't call it with the rcu_read_lock held. Remove it now. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index dc7dfd68fafe..48c184552de0 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1512,23 +1512,8 @@ static int ctnetlink_change_helper(struct nf_conn *ct, helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); - if (helper == NULL) { -#ifdef CONFIG_MODULES - spin_unlock_bh(&nf_conntrack_expect_lock); - - if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_expect_lock); - return -EOPNOTSUPP; - } - - spin_lock_bh(&nf_conntrack_expect_lock); - helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), - nf_ct_protonum(ct)); - if (helper) - return -EAGAIN; -#endif + if (helper == NULL) return -EOPNOTSUPP; - } if (help) { if (help->helper == helper) { -- cgit v1.2.3 From 88be4c09d9008f9ff337cbf48c5d0f06c8f872e7 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 17 Apr 2017 21:18:56 +0800 Subject: netfilter: ctnetlink: fix deadlock due to acquire _expect_lock twice Currently, ctnetlink_change_conntrack is always protected by _expect_lock, but this will cause a deadlock when deleting the helper from a conntrack, as the _expect_lock will be acquired again by nf_ct_remove_expectations: CPU0 ---- lock(nf_conntrack_expect_lock); lock(nf_conntrack_expect_lock); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by lt-conntrack_gr/12853: #0: (&table[i].mutex){+.+.+.}, at: [] nfnetlink_rcv_msg+0x399/0x6a9 [nfnetlink] #1: (nf_conntrack_expect_lock){+.....}, at: [] ctnetlink_new_conntrack+0x17f/0x408 [nf_conntrack_netlink] Call Trace: dump_stack+0x85/0xc2 __lock_acquire+0x1608/0x1680 ? ctnetlink_parse_tuple_proto+0x10f/0x1c0 [nf_conntrack_netlink] lock_acquire+0x100/0x1f0 ? nf_ct_remove_expectations+0x32/0x90 [nf_conntrack] _raw_spin_lock_bh+0x3f/0x50 ? nf_ct_remove_expectations+0x32/0x90 [nf_conntrack] nf_ct_remove_expectations+0x32/0x90 [nf_conntrack] ctnetlink_change_helper+0xc6/0x190 [nf_conntrack_netlink] ctnetlink_new_conntrack+0x1b2/0x408 [nf_conntrack_netlink] nfnetlink_rcv_msg+0x60a/0x6a9 [nfnetlink] ? nfnetlink_rcv_msg+0x1b9/0x6a9 [nfnetlink] ? nfnetlink_bind+0x1a0/0x1a0 [nfnetlink] netlink_rcv_skb+0xa4/0xc0 nfnetlink_rcv+0x87/0x770 [nfnetlink] Since the operations are unrelated to nf_ct_expect, so we can drop the _expect_lock. Also note, after removing the _expect_lock protection, another CPU may invoke nf_conntrack_helper_unregister, so we should use rcu_read_lock to protect __nf_conntrack_helper_find invoked by ctnetlink_change_helper. Fixes: ca7433df3a67 ("netfilter: conntrack: seperate expect locking from nf_conntrack_lock") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 48c184552de0..e5f97777b1f4 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1510,23 +1510,29 @@ static int ctnetlink_change_helper(struct nf_conn *ct, return 0; } + rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); - if (helper == NULL) + if (helper == NULL) { + rcu_read_unlock(); return -EOPNOTSUPP; + } if (help) { if (help->helper == helper) { /* update private helper data if allowed. */ if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); - return 0; + err = 0; } else - return -EBUSY; + err = -EBUSY; + } else { + /* we cannot set a helper for an existing conntrack */ + err = -EOPNOTSUPP; } - /* we cannot set a helper for an existing conntrack */ - return -EOPNOTSUPP; + rcu_read_unlock(); + return err; } static int ctnetlink_change_timeout(struct nf_conn *ct, @@ -1945,9 +1951,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - spin_lock_bh(&nf_conntrack_expect_lock); err = ctnetlink_change_conntrack(ct, cda); - spin_unlock_bh(&nf_conntrack_expect_lock); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | @@ -2342,11 +2346,7 @@ ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct) if (ret < 0) return ret; - spin_lock_bh(&nf_conntrack_expect_lock); - ret = ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); - spin_unlock_bh(&nf_conntrack_expect_lock); - - return ret; + return ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); } static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda, -- cgit v1.2.3 From 53b56da83d7899de375a9de153fd7f5397de85e6 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 17 Apr 2017 21:18:57 +0800 Subject: netfilter: ctnetlink: make it safer when updating ct->status After converting to use rcu for conntrack hash, one CPU may update the ct->status via ctnetlink, while another CPU may process the packets and update the ct->status. So the non-atomic operation "ct->status |= status;" via ctnetlink becomes unsafe, and this may clear the IPS_DYING_BIT bit set by another CPU unexpectedly. For example: CPU0 CPU1 ctnetlink_change_status __nf_conntrack_find_get old = ct->status nf_ct_gc_expired - nf_ct_kill - test_and_set_bit(IPS_DYING_BIT new = old | status; - ct->status = new; <-- oops, _DYING_ is cleared! Now using a series of atomic bit operation to solve the above issue. Also note, user shouldn't set IPS_TEMPLATE, IPS_SEQ_ADJUST directly, so make these two bits be unchangable too. If we set the IPS_TEMPLATE_BIT, ct will be freed by nf_ct_tmpl_free, but actually it is alloced by nf_conntrack_alloc. If we set the IPS_SEQ_ADJUST_BIT, this may cause the NULL pointer deference, as the nfct_seqadj(ct) maybe NULL. Last, add some comments to describe the logic change due to the commit a963d710f367 ("netfilter: ctnetlink: Fix regression in CTA_STATUS processing"), which makes me feel a little confusing. Fixes: 76507f69c44e ("[NETFILTER]: nf_conntrack: use RCU for conntrack hash") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 13 ++++++--- net/netfilter/nf_conntrack_netlink.c | 33 ++++++++++++++++------ 2 files changed, 33 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 6a8e33dd4ecb..38fc383139f0 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -82,10 +82,6 @@ enum ip_conntrack_status { IPS_DYING_BIT = 9, IPS_DYING = (1 << IPS_DYING_BIT), - /* Bits that cannot be altered from userland. */ - IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | - IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING), - /* Connection has fixed timeout. */ IPS_FIXED_TIMEOUT_BIT = 10, IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT), @@ -101,6 +97,15 @@ enum ip_conntrack_status { /* Conntrack got a helper explicitly attached via CT target. */ IPS_HELPER_BIT = 13, IPS_HELPER = (1 << IPS_HELPER_BIT), + + /* Be careful here, modifying these bits can make things messy, + * so don't let users modify them directly. + */ + IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | + IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | + IPS_SEQ_ADJUST | IPS_TEMPLATE), + + __IPS_MAX_BIT = 14, }; /* Connection tracking event types */ diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e5f97777b1f4..86deed6a8db4 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1419,6 +1419,24 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, } #endif +static void +__ctnetlink_change_status(struct nf_conn *ct, unsigned long on, + unsigned long off) +{ + unsigned int bit; + + /* Ignore these unchangable bits */ + on &= ~IPS_UNCHANGEABLE_MASK; + off &= ~IPS_UNCHANGEABLE_MASK; + + for (bit = 0; bit < __IPS_MAX_BIT; bit++) { + if (on & (1 << bit)) + set_bit(bit, &ct->status); + else if (off & (1 << bit)) + clear_bit(bit, &ct->status); + } +} + static int ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) { @@ -1438,10 +1456,7 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) /* ASSURED bit can only be set */ return -EBUSY; - /* Be careful here, modifying NAT bits can screw up things, - * so don't let users modify them directly if they don't pass - * nf_nat_range. */ - ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + __ctnetlink_change_status(ct, status, 0); return 0; } @@ -1628,7 +1643,7 @@ ctnetlink_change_seq_adj(struct nf_conn *ct, if (ret < 0) return ret; - ct->status |= IPS_SEQ_ADJUST; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } if (cda[CTA_SEQ_ADJ_REPLY]) { @@ -1637,7 +1652,7 @@ ctnetlink_change_seq_adj(struct nf_conn *ct, if (ret < 0) return ret; - ct->status |= IPS_SEQ_ADJUST; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } return 0; @@ -2289,10 +2304,10 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[]) /* This check is less strict than ctnetlink_change_status() * because callers often flip IPS_EXPECTED bits when sending * an NFQA_CT attribute to the kernel. So ignore the - * unchangeable bits but do not error out. + * unchangeable bits but do not error out. Also user programs + * are allowed to clear the bits that they are allowed to change. */ - ct->status = (status & ~IPS_UNCHANGEABLE_MASK) | - (ct->status & IPS_UNCHANGEABLE_MASK); + __ctnetlink_change_status(ct, status, ~status); return 0; } -- cgit v1.2.3 From 64f3967c7aacbfa1f0614cdb1a23e3f7e76eb61b Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 17 Apr 2017 21:18:58 +0800 Subject: netfilter: ctnetlink: acquire ct->lock before operating nf_ct_seqadj We should acquire the ct->lock before accessing or modifying the nf_ct_seqadj, as another CPU may modify the nf_ct_seqadj at the same time during its packet proccessing. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 86deed6a8db4..78f8c9adbd3c 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -417,8 +417,7 @@ nla_put_failure: return -1; } -static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, - const struct nf_conn *ct) +static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *seq; @@ -426,15 +425,20 @@ static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj) return 0; + spin_lock_bh(&ct->lock); seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1) - return -1; + goto err; seq = &seqadj->seq[IP_CT_DIR_REPLY]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1) - return -1; + goto err; + spin_unlock_bh(&ct->lock); return 0; +err: + spin_unlock_bh(&ct->lock); + return -1; } static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) @@ -1637,11 +1641,12 @@ ctnetlink_change_seq_adj(struct nf_conn *ct, if (!seqadj) return 0; + spin_lock_bh(&ct->lock); if (cda[CTA_SEQ_ADJ_ORIG]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], cda[CTA_SEQ_ADJ_ORIG]); if (ret < 0) - return ret; + goto err; set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } @@ -1650,12 +1655,16 @@ ctnetlink_change_seq_adj(struct nf_conn *ct, ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], cda[CTA_SEQ_ADJ_REPLY]); if (ret < 0) - return ret; + goto err; set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } + spin_unlock_bh(&ct->lock); return 0; +err: + spin_unlock_bh(&ct->lock); + return ret; } static int -- cgit v1.2.3 From 6bd3d1929246bb0cf4f3bd6e25a0ba442b0de845 Mon Sep 17 00:00:00 2001 From: Peter Tirsek Date: Tue, 18 Apr 2017 12:39:58 -0500 Subject: netfilter: xt_socket: Fix broken IPv6 handling Commit 834184b1f3a4 ("netfilter: defrag: only register defrag functionality if needed") used the outdated XT_SOCKET_HAVE_IPV6 macro which was removed earlier in commit 8db4c5be88f6 ("netfilter: move socket lookup infrastructure to nf_socket_ipv{4,6}.c"). With that macro never being defined, the xt_socket match emits an "Unknown family 10" warning when used with IPv6: WARNING: CPU: 0 PID: 1377 at net/netfilter/xt_socket.c:160 socket_mt_enable_defrag+0x47/0x50 [xt_socket] Unknown family 10 Modules linked in: xt_socket nf_socket_ipv4 nf_socket_ipv6 nf_defrag_ipv4 [...] CPU: 0 PID: 1377 Comm: ip6tables-resto Not tainted 4.10.10 #1 Hardware name: [...] Call Trace: ? __warn+0xe7/0x100 ? socket_mt_enable_defrag+0x47/0x50 [xt_socket] ? socket_mt_enable_defrag+0x47/0x50 [xt_socket] ? warn_slowpath_fmt+0x39/0x40 ? socket_mt_enable_defrag+0x47/0x50 [xt_socket] ? socket_mt_v2_check+0x12/0x40 [xt_socket] ? xt_check_match+0x6b/0x1a0 [x_tables] ? xt_find_match+0x93/0xd0 [x_tables] ? xt_request_find_match+0x20/0x80 [x_tables] ? translate_table+0x48e/0x870 [ip6_tables] ? translate_table+0x577/0x870 [ip6_tables] ? walk_component+0x3a/0x200 ? kmalloc_order+0x1d/0x50 ? do_ip6t_set_ctl+0x181/0x490 [ip6_tables] ? filename_lookup+0xa5/0x120 ? nf_setsockopt+0x3a/0x60 ? ipv6_setsockopt+0xb0/0xc0 ? sock_common_setsockopt+0x23/0x30 ? SyS_socketcall+0x41d/0x630 ? vfs_read+0xfa/0x120 ? do_fast_syscall_32+0x7a/0x110 ? entry_SYSENTER_32+0x47/0x71 This patch brings the conditional back in line with how the rest of the file handles IPv6. Fixes: 834184b1f3a4 ("netfilter: defrag: only register defrag functionality if needed") Signed-off-by: Peter Tirsek Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 770bbec878f1..e75ef39669c5 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -152,7 +152,7 @@ static int socket_mt_enable_defrag(struct net *net, int family) switch (family) { case NFPROTO_IPV4: return nf_defrag_ipv4_enable(net); -#ifdef XT_SOCKET_HAVE_IPV6 +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) case NFPROTO_IPV6: return nf_defrag_ipv6_enable(net); #endif -- cgit v1.2.3 From cf3cb246e277d98987aa8d62ef2730dbee2f5fa7 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Wed, 19 Apr 2017 21:47:33 +0200 Subject: bridge: ebtables: fix reception of frames DNAT-ed to bridge device/port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When trying to redirect bridged frames to the bridge device itself or a bridge port (brouting) via the dnat target then this currently fails: The ethernet destination of the frame is dnat'ed to the MAC address of the bridge device or port just fine. However, the IP code drops it in the beginning of ip_input.c/ip_rcv() as the dnat target left the skb->pkt_type as PACKET_OTHERHOST. Fixing this by resetting skb->pkt_type to an appropriate type after dnat'ing. Signed-off-by: Linus Lüssing Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebt_dnat.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index 4e0b0c359325..e0bb624c3845 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -9,6 +9,7 @@ */ #include #include +#include "../br_private.h" #include #include #include @@ -18,11 +19,30 @@ static unsigned int ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; + struct net_device *dev; if (!skb_make_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_dest, info->mac); + + if (is_multicast_ether_addr(info->mac)) { + if (is_broadcast_ether_addr(info->mac)) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } else { + if (xt_hooknum(par) != NF_BR_BROUTING) + dev = br_port_get_rcu(xt_in(par))->br->dev; + else + dev = xt_in(par); + + if (ether_addr_equal(info->mac, dev->dev_addr)) + skb->pkt_type = PACKET_HOST; + else + skb->pkt_type = PACKET_OTHERHOST; + } + return info->target; } -- cgit v1.2.3 From 277a292835c196894ef895d5e1fd6170bb916f55 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 23 Apr 2017 18:29:30 +0800 Subject: netfilter: nft_dynset: continue to next expr if _OP_ADD succeeded Currently, after adding the following nft rules: # nft add set x target1 { type ipv4_addr \; flags timeout \;} # nft add rule x y set add ip daddr timeout 1d @target1 counter the counters will always be zero despite of the elements are added to the dynamic set "target1" or not, as we will break the nft expr traversal unconditionally: # nft list ruleset ... set target1 { ... elements = { 8.8.8.8 expires 23h59m53s} } chain output { ... set add ip daddr timeout 1d @target1 counter packets 0 bytes 0 ^ ^ ... } Since we add the elements to the set successfully, we should continue to the next expression. Additionally, if elements are added to "flow table" successfully, we will _always_ continue to the next expr, even if the operation is _OP_ADD. So it's better to keep them to be consistent. Fixes: 22fe54d5fefc ("netfilter: nf_tables: add support for dynamic set updates") Reported-by: Robert White Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_dynset.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 049ad2d9ee66..fafbeea3ed04 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -82,8 +82,7 @@ static void nft_dynset_eval(const struct nft_expr *expr, nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { timeout = priv->timeout ? : set->timeout; *nft_set_ext_expiration(ext) = jiffies + timeout; - } else if (sexpr == NULL) - goto out; + } if (sexpr != NULL) sexpr->ops->eval(sexpr, regs, pkt); @@ -92,7 +91,7 @@ static void nft_dynset_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; return; } -out: + if (!priv->invert) regs->verdict.code = NFT_BREAK; } -- cgit v1.2.3 From 9dd2ab609eef736d5639e0de1bcc2e71e714b28e Mon Sep 17 00:00:00 2001 From: Dave Johnson Date: Mon, 24 Apr 2017 09:11:24 -0400 Subject: netfilter: Wrong icmp6 checksum for ICMPV6_TIME_EXCEED in reverse SNATv6 path When recalculating the outer ICMPv6 checksum for a reverse path NATv6 such as ICMPV6_TIME_EXCEED nf_nat_icmpv6_reply_translation() was accessing data beyond the headlen of the skb for non-linear skb. This resulted in incorrect ICMPv6 checksum as garbage data was used. Patch replaces csum_partial() with skb_checksum() which supports non-linear skbs similar to nf_nat_icmp_reply_translation() from ipv4. Signed-off-by: Dave Johnson Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index e0be97e636a4..69937b637ee5 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -235,7 +235,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, inside->icmp6.icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len - hdrlen, IPPROTO_ICMPV6, - csum_partial(&inside->icmp6, + skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } -- cgit v1.2.3 From 1442f6f7c1b77de1c508318164a527e240c24a4d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Apr 2017 11:44:16 +0200 Subject: ipvs: explicitly forbid ipv6 service/dest creation if ipv6 mod is disabled When creating a new ipvs service, ipv6 addresses are always accepted if CONFIG_IP_VS_IPV6 is enabled. On dest creation the address family is not explicitly checked. This allows the user-space to configure ipvs services even if the system is booted with ipv6.disable=1. On specific configuration, ipvs can try to call ipv6 routing code at setup time, causing the kernel to oops due to fib6_rules_ops being NULL. This change addresses the issue adding a check for the ipv6 module being enabled while validating ipv6 service operations and adding the same validation for dest operations. According to git history, this issue is apparently present since the introduction of ipv6 support, and the oops can be triggered since commit 09571c7ae30865ad ("IPVS: Add function to determine if IPv6 address is local") Fixes: 09571c7ae30865ad ("IPVS: Add function to determine if IPv6 address is local") Signed-off-by: Paolo Abeni Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5aeb0dde6ccc..4d753beaac32 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3078,6 +3078,17 @@ nla_put_failure: return skb->len; } +static bool ip_vs_is_af_valid(int af) +{ + if (af == AF_INET) + return true; +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6 && ipv6_mod_enabled()) + return true; +#endif + return false; +} + static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *usvc, struct nlattr *nla, int full_entry, @@ -3104,11 +3115,7 @@ static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, memset(usvc, 0, sizeof(*usvc)); usvc->af = nla_get_u16(nla_af); -#ifdef CONFIG_IP_VS_IPV6 - if (usvc->af != AF_INET && usvc->af != AF_INET6) -#else - if (usvc->af != AF_INET) -#endif + if (!ip_vs_is_af_valid(usvc->af)) return -EAFNOSUPPORT; if (nla_fwmark) { @@ -3610,6 +3617,11 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (udest.af == 0) udest.af = svc->af; + if (!ip_vs_is_af_valid(udest.af)) { + ret = -EAFNOSUPPORT; + goto out; + } + if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { /* The synchronization protocol is incompatible * with mixed family services -- cgit v1.2.3 From 7dde07e9c53617549d67dd3e1d791496d0d3868e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 28 Apr 2017 15:57:56 +0300 Subject: netfilter: x_tables: unlock on error in xt_find_table_lock() According to my static checker we should unlock here before the return. That seems reasonable to me as well. Fixes" b9e69e127397 ("netfilter: xtables: don't hook tables by default") Signed-off-by: Dan Carpenter Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 14857afc9937..f134d384852f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1051,8 +1051,10 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, list_for_each_entry(t, &init_net.xt.tables[af], list) { if (strcmp(t->name, name)) continue; - if (!try_module_get(t->me)) + if (!try_module_get(t->me)) { + mutex_unlock(&xt[af].mutex); return NULL; + } mutex_unlock(&xt[af].mutex); if (t->table_init(net) != 0) { -- cgit v1.2.3 From 9744a6fcefcb4d56501d69adb04c24559d353cad Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 1 May 2017 12:58:50 +0200 Subject: netfilter: nf_tables: check if same extensions are set when adding elements If no NLM_F_EXCL is set and the element already exists in the set, make sure that both elements have the same extensions. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 434c739dfeca..11a96e8dd3cd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3749,6 +3749,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = set->ops->insert(ctx->net, set, &elem, &ext2); if (err) { if (err == -EEXIST) { + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ + nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) || + nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ + nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) + return -EBUSY; if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && memcmp(nft_set_ext_data(ext), -- cgit v1.2.3 From 6d717134a1a6e1b34a7d0d70e953037bc2642046 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 May 2017 14:43:44 -0700 Subject: net: ipv6: Do not duplicate DAD on link up Andrey reported a warning triggered by the rcu code: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 5911 at lib/debugobjects.c:289 debug_print_object+0x175/0x210 ODEBUG: activate active (active state 1) object type: rcu_head hint: (null) Modules linked in: CPU: 1 PID: 5911 Comm: a.out Not tainted 4.11.0-rc8+ #271 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x192/0x22d lib/dump_stack.c:52 __warn+0x19f/0x1e0 kernel/panic.c:549 warn_slowpath_fmt+0xe0/0x120 kernel/panic.c:564 debug_print_object+0x175/0x210 lib/debugobjects.c:286 debug_object_activate+0x574/0x7e0 lib/debugobjects.c:442 debug_rcu_head_queue kernel/rcu/rcu.h:75 __call_rcu.constprop.76+0xff/0x9c0 kernel/rcu/tree.c:3229 call_rcu_sched+0x12/0x20 kernel/rcu/tree.c:3288 rt6_rcu_free net/ipv6/ip6_fib.c:158 rt6_release+0x1ea/0x290 net/ipv6/ip6_fib.c:188 fib6_del_route net/ipv6/ip6_fib.c:1461 fib6_del+0xa42/0xdc0 net/ipv6/ip6_fib.c:1500 __ip6_del_rt+0x100/0x160 net/ipv6/route.c:2174 ip6_del_rt+0x140/0x1b0 net/ipv6/route.c:2187 __ipv6_ifa_notify+0x269/0x780 net/ipv6/addrconf.c:5520 addrconf_ifdown+0xe60/0x1a20 net/ipv6/addrconf.c:3672 ... Andrey's reproducer program runs in a very tight loop, calling 'unshare -n' and then spawning 2 sets of 14 threads running random ioctl calls. The relevant networking sequence: 1. New network namespace created via unshare -n - ip6tnl0 device is created in down state 2. address added to ip6tnl0 - equivalent to ip -6 addr add dev ip6tnl0 fd00::bb/1 - DAD is started on the address and when it completes the host route is inserted into the FIB 3. ip6tnl0 is brought up - the new fixup_permanent_addr function restarts DAD on the address 4. exit namespace - teardown / cleanup sequence starts - once in a blue moon, lo teardown appears to happen BEFORE teardown of ip6tunl0 + down on 'lo' removes the host route from the FIB since the dst->dev for the route is loobback + host route added to rcu callback list * rcu callback has not run yet, so rt is NOT on the gc list so it has NOT been marked obsolete 5. in parallel to 4. worker_thread runs addrconf_dad_completed - DAD on the address on ip6tnl0 completes - calls ipv6_ifa_notify which inserts the host route All of that happens very quickly. The result is that a host route that has been deleted from the IPv6 FIB and added to the RCU list is re-inserted into the FIB. The exit namespace eventually gets to cleaning up ip6tnl0 which removes the host route from the FIB again, calls the rcu function for cleanup -- and triggers the double rcu trace. The root cause is duplicate DAD on the address -- steps 2 and 3. Arguably, DAD should not be started in step 2. The interface is in the down state, so it can not really send out requests for the address which makes starting DAD pointless. Since the second DAD was introduced by a recent change, seems appropriate to use it for the Fixes tag and have the fixup function only start DAD for addresses in the PREDAD state which occurs in addrconf_ifdown if the address is retained. Big thanks to Andrey for isolating a reliable reproducer for this problem. Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional") Reported-by: Andrey Konovalov Signed-off-by: David Ahern Tested-by: Andrey Konovalov Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b09ac38d8dc4..a2a370b71249 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3328,7 +3328,8 @@ static int fixup_permanent_addr(struct inet6_dev *idev, idev->dev, 0, 0); } - addrconf_dad_start(ifp); + if (ifp->state == INET6_IFADDR_STATE_PREDAD) + addrconf_dad_start(ifp); return 0; } @@ -3683,7 +3684,7 @@ restart: if (keep) { /* set state to skip the notifier below */ state = INET6_IFADDR_STATE_DEAD; - ifa->state = 0; + ifa->state = INET6_IFADDR_STATE_PREDAD; if (!(ifa->flags & IFA_F_NODAD)) ifa->flags |= IFA_F_TENTATIVE; -- cgit v1.2.3 From ab71632c457269a6f3ec5a233c7a83682ae27019 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 3 May 2017 14:18:43 +0200 Subject: netfilter: conntrack: Force inlining of build check to prevent build failure If gcc (e.g. 4.1.2) decides not to inline total_extension_size(), the build will fail with: net/built-in.o: In function `nf_conntrack_init_start': (.text+0x9baf6): undefined reference to `__compiletime_assert_1893' or ERROR: "__compiletime_assert_1893" [net/netfilter/nf_conntrack.ko] undefined! Fix this by forcing inlining of total_extension_size(). Fixes: b3a5db109e0670d6 ("netfilter: conntrack: use u8 for extension sizes again") Signed-off-by: Geert Uytterhoeven Acked-by: Arnd Bergmann Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f9245dbfe435..3c8f1ed2f555 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1853,7 +1853,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); -static unsigned int total_extension_size(void) +static __always_inline unsigned int total_extension_size(void) { /* remember to add new extensions below */ BUILD_BUG_ON(NF_CT_EXT_NUM > 9); -- cgit v1.2.3 From 8b485ce69876c65db12ed390e7f9c0d2a64eff2c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 May 2017 06:39:31 -0700 Subject: tcp: do not inherit fastopen_req from parent Under fuzzer stress, it is possible that a child gets a non NULL fastopen_req pointer from its parent at accept() time, when/if parent morphs from listener to active session. We need to make sure this can not happen, by clearing the field after socket cloning. BUG: Double free or freeing an invalid pointer Unexpected shadow byte: 0xFB CPU: 3 PID: 20933 Comm: syz-executor3 Not tainted 4.11.0+ #306 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x292/0x395 lib/dump_stack.c:52 kasan_object_err+0x1c/0x70 mm/kasan/report.c:164 kasan_report_double_free+0x5c/0x70 mm/kasan/report.c:185 kasan_slab_free+0x9d/0xc0 mm/kasan/kasan.c:580 slab_free_hook mm/slub.c:1357 [inline] slab_free_freelist_hook mm/slub.c:1379 [inline] slab_free mm/slub.c:2961 [inline] kfree+0xe8/0x2b0 mm/slub.c:3882 tcp_free_fastopen_req net/ipv4/tcp.c:1077 [inline] tcp_disconnect+0xc15/0x13e0 net/ipv4/tcp.c:2328 inet_child_forget+0xb8/0x600 net/ipv4/inet_connection_sock.c:898 inet_csk_reqsk_queue_add+0x1e7/0x250 net/ipv4/inet_connection_sock.c:928 tcp_get_cookie_sock+0x21a/0x510 net/ipv4/syncookies.c:217 cookie_v4_check+0x1a19/0x28b0 net/ipv4/syncookies.c:384 tcp_v4_cookie_check net/ipv4/tcp_ipv4.c:1384 [inline] tcp_v4_do_rcv+0x731/0x940 net/ipv4/tcp_ipv4.c:1421 tcp_v4_rcv+0x2dc0/0x31c0 net/ipv4/tcp_ipv4.c:1715 ip_local_deliver_finish+0x4cc/0xc20 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:257 [inline] ip_local_deliver+0x1ce/0x700 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:492 [inline] ip_rcv_finish+0xb1d/0x20b0 net/ipv4/ip_input.c:396 NF_HOOK include/linux/netfilter.h:257 [inline] ip_rcv+0xd8c/0x19c0 net/ipv4/ip_input.c:487 __netif_receive_skb_core+0x1ad1/0x3400 net/core/dev.c:4210 __netif_receive_skb+0x2a/0x1a0 net/core/dev.c:4248 process_backlog+0xe5/0x6c0 net/core/dev.c:4868 napi_poll net/core/dev.c:5270 [inline] net_rx_action+0xe70/0x18e0 net/core/dev.c:5335 __do_softirq+0x2fb/0xb99 kernel/softirq.c:284 do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:899 do_softirq.part.17+0x1e8/0x230 kernel/softirq.c:328 do_softirq kernel/softirq.c:176 [inline] __local_bh_enable_ip+0x1cf/0x1e0 kernel/softirq.c:181 local_bh_enable include/linux/bottom_half.h:31 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:931 [inline] ip_finish_output2+0x9ab/0x15e0 net/ipv4/ip_output.c:230 ip_finish_output+0xa35/0xdf0 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:246 [inline] ip_output+0x1f6/0x7b0 net/ipv4/ip_output.c:404 dst_output include/net/dst.h:486 [inline] ip_local_out+0x95/0x160 net/ipv4/ip_output.c:124 ip_queue_xmit+0x9a8/0x1a10 net/ipv4/ip_output.c:503 tcp_transmit_skb+0x1ade/0x3470 net/ipv4/tcp_output.c:1057 tcp_write_xmit+0x79e/0x55b0 net/ipv4/tcp_output.c:2265 __tcp_push_pending_frames+0xfa/0x3a0 net/ipv4/tcp_output.c:2450 tcp_push+0x4ee/0x780 net/ipv4/tcp.c:683 tcp_sendmsg+0x128d/0x39b0 net/ipv4/tcp.c:1342 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 SYSC_sendto+0x660/0x810 net/socket.c:1696 SyS_sendto+0x40/0x50 net/socket.c:1664 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x446059 RSP: 002b:00007faa6761fb58 EFLAGS: 00000282 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000000000017 RCX: 0000000000446059 RDX: 0000000000000001 RSI: 0000000020ba3fcd RDI: 0000000000000017 RBP: 00000000006e40a0 R08: 0000000020ba4ff0 R09: 0000000000000010 R10: 0000000020000000 R11: 0000000000000282 R12: 0000000000708150 R13: 0000000000000000 R14: 00007faa676209c0 R15: 00007faa67620700 Object at ffff88003b5bbcb8, in cache kmalloc-64 size: 64 Allocated: PID = 20909 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:513 set_track mm/kasan/kasan.c:525 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:616 kmem_cache_alloc_trace+0x82/0x270 mm/slub.c:2745 kmalloc include/linux/slab.h:490 [inline] kzalloc include/linux/slab.h:663 [inline] tcp_sendmsg_fastopen net/ipv4/tcp.c:1094 [inline] tcp_sendmsg+0x221a/0x39b0 net/ipv4/tcp.c:1139 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 SYSC_sendto+0x660/0x810 net/socket.c:1696 SyS_sendto+0x40/0x50 net/socket.c:1664 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed: PID = 20909 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:513 set_track mm/kasan/kasan.c:525 [inline] kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:589 slab_free_hook mm/slub.c:1357 [inline] slab_free_freelist_hook mm/slub.c:1379 [inline] slab_free mm/slub.c:2961 [inline] kfree+0xe8/0x2b0 mm/slub.c:3882 tcp_free_fastopen_req net/ipv4/tcp.c:1077 [inline] tcp_disconnect+0xc15/0x13e0 net/ipv4/tcp.c:2328 __inet_stream_connect+0x20c/0xf90 net/ipv4/af_inet.c:593 tcp_sendmsg_fastopen net/ipv4/tcp.c:1111 [inline] tcp_sendmsg+0x23a8/0x39b0 net/ipv4/tcp.c:1139 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 SYSC_sendto+0x660/0x810 net/socket.c:1696 SyS_sendto+0x40/0x50 net/socket.c:1664 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Fixes: 7db92362d2fe ("tcp: fix potential double free issue for fastopen_req") Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Acked-by: Wei Wang Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8f6373b0cd77..717be4de5324 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -523,6 +523,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); + newtp->fastopen_req = NULL; newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; newtp->rack.mstamp.v64 = 0; -- cgit v1.2.3 From 985538eee06f169bb13c7ecb1a7fcdeb700b05c7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 3 May 2017 14:50:40 +0100 Subject: net/sched: remove redundant null check on head head is previously null checked and so the 2nd null check on head is redundant and therefore can be removed. Detected by CoverityScan, CID#1399505 ("Logically dead code") Signed-off-by: Colin Ian King Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 2efb36c08f2a..dee469fed967 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -203,8 +203,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, *arg = (unsigned long) head; rcu_assign_pointer(tp->root, new); - if (head) - call_rcu(&head->rcu, mall_destroy_rcu); + call_rcu(&head->rcu, mall_destroy_rcu); return 0; err_replace_hw_filter: -- cgit v1.2.3 From 86f4c90a1c5c1493f07f2d12c1079f5bf01936f2 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 3 May 2017 17:06:58 +0200 Subject: ipv4, ipv6: ensure raw socket message is big enough to hold an IP header raw_send_hdrinc() and rawv6_send_hdrinc() expect that the buffer copied from the userspace contains the IPv4/IPv6 header, so if too few bytes are copied, parts of the header may remain uninitialized. This bug has been detected with KMSAN. For the record, the KMSAN report: ================================================================== BUG: KMSAN: use of unitialized memory in nf_ct_frag6_gather+0xf5a/0x44a0 inter: 0 CPU: 0 PID: 1036 Comm: probe Not tainted 4.11.0-rc5+ #2455 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x143/0x1b0 lib/dump_stack.c:52 kmsan_report+0x16b/0x1e0 mm/kmsan/kmsan.c:1078 __kmsan_warning_32+0x5c/0xa0 mm/kmsan/kmsan_instr.c:510 nf_ct_frag6_gather+0xf5a/0x44a0 net/ipv6/netfilter/nf_conntrack_reasm.c:577 ipv6_defrag+0x1d9/0x280 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c:68 nf_hook_entry_hookfn ./include/linux/netfilter.h:102 nf_hook_slow+0x13f/0x3c0 net/netfilter/core.c:310 nf_hook ./include/linux/netfilter.h:212 NF_HOOK ./include/linux/netfilter.h:255 rawv6_send_hdrinc net/ipv6/raw.c:673 rawv6_sendmsg+0x2fcb/0x41a0 net/ipv6/raw.c:919 inet_sendmsg+0x3f8/0x6d0 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 sock_sendmsg net/socket.c:643 SYSC_sendto+0x6a5/0x7c0 net/socket.c:1696 SyS_sendto+0xbc/0xe0 net/socket.c:1664 do_syscall_64+0x72/0xa0 arch/x86/entry/common.c:285 entry_SYSCALL64_slow_path+0x25/0x25 arch/x86/entry/entry_64.S:246 RIP: 0033:0x436e03 RSP: 002b:00007ffce48baf38 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00000000004002b0 RCX: 0000000000436e03 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 00007ffce48baf90 R08: 00007ffce48baf50 R09: 000000000000001c R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000401790 R14: 0000000000401820 R15: 0000000000000000 origin: 00000000d9400053 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:362 kmsan_internal_poison_shadow+0xb1/0x1a0 mm/kmsan/kmsan.c:257 kmsan_poison_shadow+0x6d/0xc0 mm/kmsan/kmsan.c:270 slab_alloc_node mm/slub.c:2735 __kmalloc_node_track_caller+0x1f4/0x390 mm/slub.c:4341 __kmalloc_reserve net/core/skbuff.c:138 __alloc_skb+0x2cd/0x740 net/core/skbuff.c:231 alloc_skb ./include/linux/skbuff.h:933 alloc_skb_with_frags+0x209/0xbc0 net/core/skbuff.c:4678 sock_alloc_send_pskb+0x9ff/0xe00 net/core/sock.c:1903 sock_alloc_send_skb+0xe4/0x100 net/core/sock.c:1920 rawv6_send_hdrinc net/ipv6/raw.c:638 rawv6_sendmsg+0x2918/0x41a0 net/ipv6/raw.c:919 inet_sendmsg+0x3f8/0x6d0 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 sock_sendmsg net/socket.c:643 SYSC_sendto+0x6a5/0x7c0 net/socket.c:1696 SyS_sendto+0xbc/0xe0 net/socket.c:1664 do_syscall_64+0x72/0xa0 arch/x86/entry/common.c:285 return_from_SYSCALL_64+0x0/0x6a arch/x86/entry/entry_64.S:246 ================================================================== , triggered by the following syscalls: socket(PF_INET6, SOCK_RAW, IPPROTO_RAW) = 3 sendto(3, NULL, 0, 0, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "ff00::", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = -1 EPERM A similar report is triggered in net/ipv4/raw.c if we use a PF_INET socket instead of a PF_INET6 one. Signed-off-by: Alexander Potapenko Signed-off-by: David S. Miller --- net/ipv4/raw.c | 3 +++ net/ipv6/raw.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 9d943974de2b..bdffad875691 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -358,6 +358,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, rt->dst.dev->mtu); return -EMSGSIZE; } + if (length < sizeof(struct iphdr)) + return -EINVAL; + if (flags&MSG_PROBE) goto out; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0da6a12b5472..1f992d9e261d 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -632,6 +632,8 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); return -EMSGSIZE; } + if (length < sizeof(struct ipv6hdr)) + return -EINVAL; if (flags&MSG_PROBE) goto out; -- cgit v1.2.3 From 77ef033b687c3e030017c94a29bf6ea3aaaef678 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Thu, 4 May 2017 16:48:58 +0200 Subject: rtnetlink: NUL-terminate IFLA_PHYS_PORT_NAME string IFLA_PHYS_PORT_NAME is a string attribute, so terminate it with \0. Otherwise libnl3 fails to validate netlink messages with this attribute. "ip -detail a" assumes too that the attribute is NUL-terminated when printing it. It often was, due to padding. I noticed this as libvirtd failing to start on a system with sfc driver after upgrading it to Linux 4.11, i.e. when sfc added support for phys_port_name. Signed-off-by: Michal Schmidt Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6e67315ec368..bcb0f610ee42 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1054,7 +1054,7 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) return err; } - if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name)) + if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; -- cgit v1.2.3 From 2f460933f58eee3393aba64f0f6d14acb08d1724 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 3 May 2017 22:07:31 -0700 Subject: ipv6: initialize route null entry in addrconf_init() Andrey reported a crash on init_net.ipv6.ip6_null_entry->rt6i_idev since it is always NULL. This is clearly wrong, we have code to initialize it to loopback_dev, unfortunately the order is still not correct. loopback_dev is registered very early during boot, we lose a chance to re-initialize it in notifier. addrconf_init() is called after ip6_route_init(), which means we have no chance to correct it. Fix it by moving this initialization explicitly after ipv6_add_dev(init_net.loopback_dev) in addrconf_init(). Reported-by: Andrey Konovalov Signed-off-by: Cong Wang Tested-by: Andrey Konovalov Signed-off-by: David S. Miller --- include/net/ip6_route.h | 1 + net/ipv6/addrconf.c | 2 ++ net/ipv6/route.c | 26 +++++++++++++++----------- 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9dc2c182a263..f5e625f53367 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -84,6 +84,7 @@ struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int ifindex, struct flowi6 *fl6, int flags); +void ip6_route_init_special_entries(void); int ip6_route_init(void); void ip6_route_cleanup(void); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a2a370b71249..77a4bd526d6e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6573,6 +6573,8 @@ int __init addrconf_init(void) goto errlo; } + ip6_route_init_special_entries(); + for (i = 0; i < IN6_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&inet6_addr_lst[i]); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a1bf426c959b..2f1136627dcb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4027,6 +4027,21 @@ static struct notifier_block ip6_route_dev_notifier = { .priority = 0, }; +void __init ip6_route_init_special_entries(void) +{ + /* Registering of the loopback is done before this portion of code, + * the loopback reference in rt6_info will not be taken, do it + * manually for init_net */ + init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES + init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #endif +} + int __init ip6_route_init(void) { int ret; @@ -4053,17 +4068,6 @@ int __init ip6_route_init(void) ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; - /* Registering of the loopback is done before this portion of code, - * the loopback reference in rt6_info will not be taken, do it - * manually for init_net */ - init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - #ifdef CONFIG_IPV6_MULTIPLE_TABLES - init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - #endif ret = fib6_init(); if (ret) goto out_register_subsys; -- cgit v1.2.3