diff options
33 files changed, 916 insertions, 930 deletions
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 4dd9306c9d56..dc4f58a3cdcc 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -380,16 +380,16 @@ static inline unsigned long ifname_compare_aligned(const char *_a, * allows us to return 0 for single core systems without forcing * callers to deal with SMP vs. NONSMP issues. */ -static inline u64 xt_percpu_counter_alloc(void) +static inline unsigned long xt_percpu_counter_alloc(void) { if (nr_cpu_ids > 1) { void __percpu *res = __alloc_percpu(sizeof(struct xt_counters), sizeof(struct xt_counters)); if (res == NULL) - return (u64) -ENOMEM; + return -ENOMEM; - return (u64) (__force unsigned long) res; + return (__force unsigned long) res; } return 0; diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a6cc576fd467..af4c10ebb241 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -731,6 +731,12 @@ struct ip_vs_pe { u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, bool inverse); int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); + /* create connections for real-server outgoing packets */ + struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, __be16 cport); }; /* The application module object (a.k.a. app incarnation) */ @@ -874,6 +880,7 @@ struct netns_ipvs { /* Service counters */ atomic_t ftpsvc_counter; atomic_t nullsvc_counter; + atomic_t conn_out_counter; #ifdef CONFIG_SYSCTL /* 1/rate drop and drop-entry variables */ @@ -1147,6 +1154,12 @@ static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs) */ const char *ip_vs_proto_name(unsigned int proto); void ip_vs_init_hash_table(struct list_head *table, int rows); +struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport); #define IP_VS_INIT_HASH_TABLE(t) ip_vs_init_hash_table((t), ARRAY_SIZE((t))) #define IP_VS_APP_TYPE_FTP 1 @@ -1378,6 +1391,10 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport); +struct ip_vs_dest * +ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, + const union nf_inet_addr *daddr, __be16 dport); + int ip_vs_use_count_inc(void); void ip_vs_use_count_dec(void); int ip_vs_register_nl_ioctl(void); diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index fde4068eec0b..dd78bea227c8 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -289,8 +289,6 @@ struct kernel_param; int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); extern unsigned int nf_conntrack_htable_size; extern unsigned int nf_conntrack_max; -extern unsigned int nf_conntrack_hash_rnd; -void init_nf_conntrack_hash_rnd(void); struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 62e17d1319ff..3e2f3328945c 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -81,6 +81,7 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, #define CONNTRACK_LOCKS 1024 +extern struct hlist_nulls_head *nf_conntrack_hash; extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; void nf_conntrack_lock(spinlock_t *lock); diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index dce56f09ac9a..5ed33ea4718e 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -10,6 +10,7 @@ extern unsigned int nf_ct_expect_hsize; extern unsigned int nf_ct_expect_max; +extern struct hlist_head *nf_ct_expect_hash; struct nf_conntrack_expect { /* Conntrack expectation list member */ diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 956d8a6ac069..1a5fb36f165f 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -23,6 +23,9 @@ struct nf_conntrack_l4proto { /* L4 Protocol number. */ u_int8_t l4proto; + /* Resolve clashes on insertion races. */ + bool allow_clash; + /* Try to fill in the third arg: dataoff is offset past network protocol hdr. Return true if possible. */ bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int dataoff, diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f6b1daf2e698..092235458691 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -303,7 +303,7 @@ void nft_unregister_set(struct nft_set_ops *ops); struct nft_set { struct list_head list; struct list_head bindings; - char name[IFNAMSIZ]; + char name[NFT_SET_MAXNAMELEN]; u32 ktype; u32 dtype; u32 size; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 723b61c82b3f..38b1a80517f0 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -84,7 +84,6 @@ struct netns_ct { struct ctl_table_header *event_sysctl_header; struct ctl_table_header *helper_sysctl_header; #endif - char *slabname; unsigned int sysctl_log_invalid; /* Log invalid packets */ int sysctl_events; int sysctl_acct; @@ -93,11 +92,6 @@ struct netns_ct { int sysctl_tstamp; int sysctl_checksum; - unsigned int htable_size; - seqcount_t generation; - struct kmem_cache *nf_conntrack_cachep; - struct hlist_nulls_head *hash; - struct hlist_head *expect_hash; struct ct_pcpu __percpu *pcpu_lists; struct ip_conntrack_stat __percpu *stat; struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb; @@ -107,9 +101,5 @@ struct netns_ct { unsigned int labels_used; u8 label_words; #endif -#ifdef CONFIG_NF_NAT_NEEDED - struct hlist_head *nat_bysource; - unsigned int nat_htable_size; -#endif }; #endif diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 660231363bb5..6a4dbe04f09e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -3,6 +3,7 @@ #define NFT_TABLE_MAXNAMELEN 32 #define NFT_CHAIN_MAXNAMELEN 32 +#define NFT_SET_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 /** diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 60f5161abcb4..2033f929aa66 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -34,27 +34,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); MODULE_DESCRIPTION("arptables core"); -/*#define DEBUG_ARP_TABLES*/ -/*#define DEBUG_ARP_TABLES_USER*/ - -#ifdef DEBUG_ARP_TABLES -#define dprintf(format, args...) pr_debug(format, ## args) -#else -#define dprintf(format, args...) -#endif - -#ifdef DEBUG_ARP_TABLES_USER -#define duprintf(format, args...) pr_debug(format, ## args) -#else -#define duprintf(format, args...) -#endif - -#ifdef CONFIG_NETFILTER_DEBUG -#define ARP_NF_ASSERT(x) WARN_ON(!(x)) -#else -#define ARP_NF_ASSERT(x) -#endif - void *arpt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(arpt, ARPT); @@ -113,36 +92,20 @@ static inline int arp_packet_match(const struct arphdr *arphdr, #define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, - ARPT_INV_ARPOP)) { - dprintf("ARP operation field mismatch.\n"); - dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n", - arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask); + ARPT_INV_ARPOP)) return 0; - } if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd, - ARPT_INV_ARPHRD)) { - dprintf("ARP hardware address format mismatch.\n"); - dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n", - arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask); + ARPT_INV_ARPHRD)) return 0; - } if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro, - ARPT_INV_ARPPRO)) { - dprintf("ARP protocol address format mismatch.\n"); - dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n", - arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask); + ARPT_INV_ARPPRO)) return 0; - } if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln, - ARPT_INV_ARPHLN)) { - dprintf("ARP hardware address length mismatch.\n"); - dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n", - arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask); + ARPT_INV_ARPHLN)) return 0; - } src_devaddr = arpptr; arpptr += dev->addr_len; @@ -155,49 +118,25 @@ static inline int arp_packet_match(const struct arphdr *arphdr, if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len), ARPT_INV_SRCDEVADDR) || FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len), - ARPT_INV_TGTDEVADDR)) { - dprintf("Source or target device address mismatch.\n"); - + ARPT_INV_TGTDEVADDR)) return 0; - } if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr, ARPT_INV_SRCIP) || FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr), - ARPT_INV_TGTIP)) { - dprintf("Source or target IP address mismatch.\n"); - - dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", - &src_ipaddr, - &arpinfo->smsk.s_addr, - &arpinfo->src.s_addr, - arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : ""); - dprintf("TGT: %pI4 Mask: %pI4 Target: %pI4.%s\n", - &tgt_ipaddr, - &arpinfo->tmsk.s_addr, - &arpinfo->tgt.s_addr, - arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : ""); + ARPT_INV_TGTIP)) return 0; - } /* Look for ifname matches. */ ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { - dprintf("VIA in mismatch (%s vs %s).%s\n", - indev, arpinfo->iniface, - arpinfo->invflags & ARPT_INV_VIA_IN ? " (INV)" : ""); + if (FWINV(ret != 0, ARPT_INV_VIA_IN)) return 0; - } ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { - dprintf("VIA out mismatch (%s vs %s).%s\n", - outdev, arpinfo->outiface, - arpinfo->invflags & ARPT_INV_VIA_OUT ? " (INV)" : ""); + if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) return 0; - } return 1; #undef FWINV @@ -205,16 +144,10 @@ static inline int arp_packet_match(const struct arphdr *arphdr, static inline int arp_checkentry(const struct arpt_arp *arp) { - if (arp->flags & ~ARPT_F_MASK) { - duprintf("Unknown flag bits set: %08X\n", - arp->flags & ~ARPT_F_MASK); + if (arp->flags & ~ARPT_F_MASK) return 0; - } - if (arp->invflags & ~ARPT_INV_MASK) { - duprintf("Unknown invflag bits set: %08X\n", - arp->invflags & ~ARPT_INV_MASK); + if (arp->invflags & ~ARPT_INV_MASK) return 0; - } return 1; } @@ -406,11 +339,9 @@ static int mark_source_chains(const struct xt_table_info *newinfo, = (void *)arpt_get_target_c(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) { - pr_notice("arptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) return 0; - } + e->comefrom |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); @@ -423,12 +354,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if ((strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) { - duprintf("mark_source_chains: bad " - "negative verdict (%i)\n", - t->verdict); + t->verdict < -NF_MAX_VERDICT - 1) return 0; - } /* Return: backtrack through the last * big jump. @@ -462,8 +389,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u\n", - pos, newpos); e = (struct arpt_entry *) (entry0 + newpos); if (!find_jump_target(newinfo, e)) @@ -480,8 +405,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } -next: - duprintf("Finished chain %u\n", hook); +next: ; } return 1; } @@ -489,7 +413,6 @@ next: static inline int check_target(struct arpt_entry *e, const char *name) { struct xt_entry_target *t = arpt_get_target(e); - int ret; struct xt_tgchk_param par = { .table = name, .entryinfo = e, @@ -499,13 +422,7 @@ static inline int check_target(struct arpt_entry *e, const char *name) .family = NFPROTO_ARP, }; - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); - if (ret < 0) { - duprintf("arp_tables: check failed for `%s'.\n", - t->u.kernel.target->name); - return ret; - } - return 0; + return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); } static inline int @@ -513,17 +430,18 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) { struct xt_entry_target *t; struct xt_target *target; + unsigned long pcnt; int ret; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = PTR_ERR(target); goto out; } @@ -569,17 +487,12 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p\n", e); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset - < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) return -EINVAL; - } if (!arp_checkentry(&e->arp)) return -EINVAL; @@ -596,12 +509,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { - if (!check_underflow(e)) { - pr_debug("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + if (!check_underflow(e)) return -EINVAL; - } + newinfo->underflow[h] = underflows[h]; } } @@ -646,7 +556,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } - duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ @@ -663,31 +572,21 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } - duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) return ret; - if (i != repl->num_entries) { - duprintf("translate_table: %u not %u entries\n", - i, repl->num_entries); + if (i != repl->num_entries) return -EINVAL; - } /* Check hooks all assigned */ for (i = 0; i < NF_ARP_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(repl->valid_hooks & (1 << i))) continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) { - duprintf("Invalid hook entry %u %u\n", - i, repl->hook_entry[i]); + if (newinfo->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; - } - if (newinfo->underflow[i] == 0xFFFFFFFF) { - duprintf("Invalid underflow %u %u\n", - i, repl->underflow[i]); + if (newinfo->underflow[i] == 0xFFFFFFFF) return -EINVAL; - } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) @@ -895,11 +794,8 @@ static int get_info(struct net *net, void __user *user, struct xt_table *t; int ret; - if (*len != sizeof(struct arpt_getinfo)) { - duprintf("length %u != %Zu\n", *len, - sizeof(struct arpt_getinfo)); + if (*len != sizeof(struct arpt_getinfo)) return -EINVAL; - } if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; @@ -955,33 +851,25 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, struct arpt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct arpt_get_entries) + get.size) { - duprintf("get_entries: %u != %Zu\n", *len, - sizeof(struct arpt_get_entries) + get.size); + if (*len != sizeof(struct arpt_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", - private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, get.size); + else ret = -EAGAIN; - } + module_put(t->me); xt_table_unlock(t); } else @@ -1019,8 +907,6 @@ static int __do_replace(struct net *net, const char *name, /* You lied! */ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", - valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } @@ -1030,8 +916,6 @@ static int __do_replace(struct net *net, const char *name, goto put_module; /* Update module usage count based on number of rules */ - duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", - oldinfo->number, oldinfo->initial_entries, newinfo->number); if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); @@ -1101,8 +985,6 @@ static int do_replace(struct net *net, const void __user *user, if (ret != 0) goto free_newinfo; - duprintf("arp_tables: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) @@ -1200,20 +1082,14 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, unsigned int entry_offset; int ret, off; - duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p, limit = %p\n", e, limit); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset < sizeof(struct compat_arpt_entry) + - sizeof(struct compat_xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + sizeof(struct compat_xt_entry_target)) return -EINVAL; - } if (!arp_checkentry(&e->arp)) return -EINVAL; @@ -1230,8 +1106,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); ret = PTR_ERR(target); goto out; } @@ -1301,7 +1175,6 @@ static int translate_compat_table(struct xt_table_info **pinfo, size = compatr->size; info->number = compatr->num_entries; - duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(NFPROTO_ARP); xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries); @@ -1316,11 +1189,8 @@ static int translate_compat_table(struct xt_table_info **pinfo, } ret = -EINVAL; - if (j != compatr->num_entries) { - duprintf("translate_compat_table: %u not %u entries\n", - j, compatr->num_entries); + if (j != compatr->num_entries) goto out_unlock; - } ret = -ENOMEM; newinfo = xt_alloc_table_info(size); @@ -1411,8 +1281,6 @@ static int compat_do_replace(struct net *net, void __user *user, if (ret != 0) goto free_newinfo; - duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) @@ -1445,7 +1313,6 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1528,17 +1395,13 @@ static int compat_get_entries(struct net *net, struct compat_arpt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct compat_arpt_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct compat_arpt_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(NFPROTO_ARP); @@ -1547,16 +1410,13 @@ static int compat_get_entries(struct net *net, const struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); if (!ret && get.size == info.size) { ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); - } else if (!ret) { - duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, get.size); + } else if (!ret) ret = -EAGAIN; - } + xt_compat_flush_offsets(NFPROTO_ARP); module_put(t->me); xt_table_unlock(t); @@ -1608,7 +1468,6 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned break; default: - duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1651,7 +1510,6 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len } default: - duprintf("do_arpt_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1696,7 +1554,6 @@ int arpt_register_table(struct net *net, memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(newinfo, loc_cpu_entry, repl); - duprintf("arpt_register_table: translate table gives %d\n", ret); if (ret != 0) goto out_free; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 735d1ee8c1ab..54906e0e8e0c 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -35,34 +35,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv4 packet filter"); -/*#define DEBUG_IP_FIREWALL*/ -/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ -/*#define DEBUG_IP_FIREWALL_USER*/ - -#ifdef DEBUG_IP_FIREWALL -#define dprintf(format, args...) pr_info(format , ## args) -#else -#define dprintf(format, args...) -#endif - -#ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) pr_info(format , ## args) -#else -#define duprintf(format, args...) -#endif - #ifdef CONFIG_NETFILTER_DEBUG #define IP_NF_ASSERT(x) WARN_ON(!(x)) #else #define IP_NF_ASSERT(x) #endif -#if 0 -/* All the better to debug you with... */ -#define static -#define inline -#endif - void *ipt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ipt, IPT); @@ -85,52 +63,28 @@ ip_packet_match(const struct iphdr *ip, if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, IPT_INV_SRCIP) || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, - IPT_INV_DSTIP)) { - dprintf("Source or dest mismatch.\n"); - - dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", - &ip->saddr, &ipinfo->smsk.s_addr, &ipinfo->src.s_addr, - ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : ""); - dprintf("DST: %pI4 Mask: %pI4 Target: %pI4.%s\n", - &ip->daddr, &ipinfo->dmsk.s_addr, &ipinfo->dst.s_addr, - ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : ""); + IPT_INV_DSTIP)) return false; - } ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_IN)) { - dprintf("VIA in mismatch (%s vs %s).%s\n", - indev, ipinfo->iniface, - ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : ""); + if (FWINV(ret != 0, IPT_INV_VIA_IN)) return false; - } ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { - dprintf("VIA out mismatch (%s vs %s).%s\n", - outdev, ipinfo->outiface, - ipinfo->invflags & IPT_INV_VIA_OUT ? " (INV)" : ""); + if (FWINV(ret != 0, IPT_INV_VIA_OUT)) return false; - } /* Check specific protocol */ if (ipinfo->proto && - FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { - dprintf("Packet protocol %hi does not match %hi.%s\n", - ip->protocol, ipinfo->proto, - ipinfo->invflags & IPT_INV_PROTO ? " (INV)" : ""); + FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) return false; - } /* If we have a fragment rule but the packet is not a fragment * then we return zero */ - if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) { - dprintf("Fragment rule but not fragment.%s\n", - ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : ""); + if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) return false; - } return true; } @@ -138,16 +92,10 @@ ip_packet_match(const struct iphdr *ip, static bool ip_checkentry(const struct ipt_ip *ip) { - if (ip->flags & ~IPT_F_MASK) { - duprintf("Unknown flag bits set: %08X\n", - ip->flags & ~IPT_F_MASK); + if (ip->flags & ~IPT_F_MASK) return false; - } - if (ip->invflags & ~IPT_INV_MASK) { - duprintf("Unknown invflag bits set: %08X\n", - ip->invflags & ~IPT_INV_MASK); + if (ip->invflags & ~IPT_INV_MASK) return false; - } return true; } @@ -346,10 +294,6 @@ ipt_do_table(struct sk_buff *skb, e = get_entry(table_base, private->hook_entry[hook]); - pr_debug("Entering %s(hook %u), UF %p\n", - table->name, hook, - get_entry(table_base, private->underflow[hook])); - do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; @@ -396,22 +340,15 @@ ipt_do_table(struct sk_buff *skb, if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); - pr_debug("Underflow (this is normal) " - "to %p\n", e); } else { e = jumpstack[--stackidx]; - pr_debug("Pulled %p out from pos %u\n", - e, stackidx); e = ipt_next_entry(e); } continue; } if (table_base + v != ipt_next_entry(e) && - !(e->ip.flags & IPT_F_GOTO)) { + !(e->ip.flags & IPT_F_GOTO)) jumpstack[stackidx++] = e; - pr_debug("Pushed %p into pos %u\n", - e, stackidx - 1); - } e = get_entry(table_base, v); continue; @@ -429,18 +366,13 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - pr_debug("Exiting %s; sp at %u\n", __func__, stackidx); xt_write_recseq_end(addend); local_bh_enable(); -#ifdef DEBUG_ALLOW_ALL - return NF_ACCEPT; -#else if (acpar.hotdrop) return NF_DROP; else return verdict; -#endif } static bool find_jump_target(const struct xt_table_info *t, @@ -480,11 +412,9 @@ mark_source_chains(const struct xt_table_info *newinfo, = (void *)ipt_get_target_c(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { - pr_err("iptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; - } + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ @@ -496,26 +426,13 @@ mark_source_chains(const struct xt_table_info *newinfo, if ((strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) { - duprintf("mark_source_chains: bad " - "negative verdict (%i)\n", - t->verdict); + t->verdict < -NF_MAX_VERDICT - 1) return 0; - } /* Return: backtrack through the last big jump. */ do { e->comefrom ^= (1<<NF_INET_NUMHOOKS); -#ifdef DEBUG_IP_FIREWALL_USER - if (e->comefrom - & (1 << NF_INET_NUMHOOKS)) { - duprintf("Back unset " - "on hook %u " - "rule %u\n", - hook, pos); - } -#endif oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; @@ -543,8 +460,6 @@ mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u\n", - pos, newpos); e = (struct ipt_entry *) (entry0 + newpos); if (!find_jump_target(newinfo, e)) @@ -561,8 +476,7 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } -next: - duprintf("Finished chain %u\n", hook); +next: ; } return 1; } @@ -584,18 +498,12 @@ static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ipt_ip *ip = par->entryinfo; - int ret; par->match = m->u.kernel.match; par->matchinfo = m->data; - ret = xt_check_match(par, m->u.match_size - sizeof(*m), - ip->proto, ip->invflags & IPT_INV_PROTO); - if (ret < 0) { - duprintf("check failed for `%s'.\n", par->match->name); - return ret; - } - return 0; + return xt_check_match(par, m->u.match_size - sizeof(*m), + ip->proto, ip->invflags & IPT_INV_PROTO); } static int @@ -606,10 +514,8 @@ find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("find_check_match: `%s' not found\n", m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } m->u.kernel.match = match; ret = check_match(m, par); @@ -634,16 +540,9 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name) .hook_mask = e->comefrom, .family = NFPROTO_IPV4, }; - int ret; - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), - e->ip.proto, e->ip.invflags & IPT_INV_PROTO); - if (ret < 0) { - duprintf("check failed for `%s'.\n", - t->u.kernel.target->name); - return ret; - } - return 0; + return xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ip.proto, e->ip.invflags & IPT_INV_PROTO); } static int @@ -656,10 +555,12 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + unsigned long pcnt; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; j = 0; mtpar.net = net; @@ -678,7 +579,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = PTR_ERR(target); goto cleanup_matches; } @@ -732,17 +632,12 @@ check_entry_size_and_hooks(struct ipt_entry *e, if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p\n", e); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset - < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) return -EINVAL; - } if (!ip_checkentry(&e->ip)) return -EINVAL; @@ -759,12 +654,9 @@ check_entry_size_and_hooks(struct ipt_entry *e, if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { - if (!check_underflow(e)) { - pr_debug("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + if (!check_underflow(e)) return -EINVAL; - } + newinfo->underflow[h] = underflows[h]; } } @@ -816,7 +708,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } - duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -833,27 +724,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, ++newinfo->stacksize; } - if (i != repl->num_entries) { - duprintf("translate_table: %u not %u entries\n", - i, repl->num_entries); + if (i != repl->num_entries) return -EINVAL; - } /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(repl->valid_hooks & (1 << i))) continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) { - duprintf("Invalid hook entry %u %u\n", - i, repl->hook_entry[i]); + if (newinfo->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; - } - if (newinfo->underflow[i] == 0xFFFFFFFF) { - duprintf("Invalid underflow %u %u\n", - i, repl->underflow[i]); + if (newinfo->underflow[i] == 0xFFFFFFFF) return -EINVAL; - } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) @@ -1081,11 +963,8 @@ static int get_info(struct net *net, void __user *user, struct xt_table *t; int ret; - if (*len != sizeof(struct ipt_getinfo)) { - duprintf("length %u != %zu\n", *len, - sizeof(struct ipt_getinfo)); + if (*len != sizeof(struct ipt_getinfo)) return -EINVAL; - } if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; @@ -1143,31 +1022,23 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, struct ipt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct ipt_get_entries) + get.size) { - duprintf("get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct ipt_get_entries) + get.size) return -EINVAL; - } get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, get.size); + else ret = -EAGAIN; - } + module_put(t->me); xt_table_unlock(t); } else @@ -1203,8 +1074,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* You lied! */ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", - valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } @@ -1214,8 +1083,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto put_module; /* Update module usage count based on number of rules */ - duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", - oldinfo->number, oldinfo->initial_entries, newinfo->number); if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); @@ -1284,8 +1151,6 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) @@ -1411,11 +1276,9 @@ compat_find_calc_match(struct xt_entry_match *m, match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("compat_check_calc_match: `%s' not found\n", - m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } + m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; @@ -1447,20 +1310,14 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, unsigned int j; int ret, off; - duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p, limit = %p\n", e, limit); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset < sizeof(struct compat_ipt_entry) + - sizeof(struct compat_xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + sizeof(struct compat_xt_entry_target)) return -EINVAL; - } if (!ip_checkentry(&e->ip)) return -EINVAL; @@ -1484,8 +1341,6 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); ret = PTR_ERR(target); goto release_matches; } @@ -1567,7 +1422,6 @@ translate_compat_table(struct net *net, size = compatr->size; info->number = compatr->num_entries; - duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET); xt_compat_init_offsets(AF_INET, compatr->num_entries); @@ -1582,11 +1436,8 @@ translate_compat_table(struct net *net, } ret = -EINVAL; - if (j != compatr->num_entries) { - duprintf("translate_compat_table: %u not %u entries\n", - j, compatr->num_entries); + if (j != compatr->num_entries) goto out_unlock; - } ret = -ENOMEM; newinfo = xt_alloc_table_info(size); @@ -1683,8 +1534,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) @@ -1718,7 +1567,6 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1768,19 +1616,15 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, struct compat_ipt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct compat_ipt_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct compat_ipt_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET); @@ -1788,16 +1632,13 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); - if (!ret && get.size == info.size) { + if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); - } else if (!ret) { - duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, get.size); + else if (!ret) ret = -EAGAIN; - } + xt_compat_flush_offsets(AF_INET); module_put(t->me); xt_table_unlock(t); @@ -1850,7 +1691,6 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) break; default: - duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1902,7 +1742,6 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } default: - duprintf("do_ipt_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -2004,7 +1843,6 @@ icmp_match(const struct sk_buff *skb, struct xt_action_param *par) /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ - duprintf("Dropping evil ICMP tinygram.\n"); par->hotdrop = true; return false; } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index e3c46e8e2762..ae1a71a97132 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -360,7 +360,7 @@ static int ipv4_init_net(struct net *net) in->ctl_table[0].data = &nf_conntrack_max; in->ctl_table[1].data = &net->ct.count; - in->ctl_table[2].data = &net->ct.htable_size; + in->ctl_table[2].data = &nf_conntrack_htable_size; in->ctl_table[3].data = &net->ct.sysctl_checksum; in->ctl_table[4].data = &net->ct.sysctl_log_invalid; #endif diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index f0dfe92a00d6..c6f3c406f707 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -31,15 +31,14 @@ struct ct_iter_state { static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < net->ct.htable_size; + st->bucket < nf_conntrack_htable_size; st->bucket++) { n = rcu_dereference( - hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -49,17 +48,16 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct hlist_nulls_node *head) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= net->ct.htable_size) + if (++st->bucket >= nf_conntrack_htable_size) return NULL; } head = rcu_dereference( - hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); } return head; } @@ -114,6 +112,23 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) } #endif +static bool ct_seq_should_skip(const struct nf_conn *ct, + const struct net *net, + const struct nf_conntrack_tuple_hash *hash) +{ + /* we only want to print DIR_ORIGINAL */ + if (NF_CT_DIRECTION(hash)) + return true; + + if (nf_ct_l3num(ct) != AF_INET) + return true; + + if (!net_eq(nf_ct_net(ct), net)) + return true; + + return false; +} + static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; @@ -123,14 +138,15 @@ static int ct_seq_show(struct seq_file *s, void *v) int ret = 0; NF_CT_ASSERT(ct); - if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + if (ct_seq_should_skip(ct, seq_file_net(s), hash)) return 0; + if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + return 0; - /* we only want to print DIR_ORIGINAL */ - if (NF_CT_DIRECTION(hash)) - goto release; - if (nf_ct_l3num(ct) != AF_INET) + /* check if we raced w. object reuse */ + if (!nf_ct_is_confirmed(ct) || + ct_seq_should_skip(ct, seq_file_net(s), hash)) goto release; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); @@ -220,13 +236,12 @@ struct ct_expect_iter_state { static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { n = rcu_dereference( - hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } @@ -236,7 +251,6 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); @@ -244,7 +258,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, if (++st->bucket >= nf_ct_expect_hsize) return NULL; head = rcu_dereference( - hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } @@ -285,6 +299,9 @@ static int exp_seq_show(struct seq_file *s, void *v) exp = hlist_entry(n, struct nf_conntrack_expect, hnode); + if (!net_eq(nf_ct_net(exp->master), seq_file_net(s))) + return 0; + if (exp->tuple.src.l3num != AF_INET) return 0; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 73e606c719ef..63e06c3dd319 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -39,34 +39,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv6 packet filter"); -/*#define DEBUG_IP_FIREWALL*/ -/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ -/*#define DEBUG_IP_FIREWALL_USER*/ - -#ifdef DEBUG_IP_FIREWALL -#define dprintf(format, args...) pr_info(format , ## args) -#else -#define dprintf(format, args...) -#endif - -#ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) pr_info(format , ## args) -#else -#define duprintf(format, args...) -#endif - #ifdef CONFIG_NETFILTER_DEBUG #define IP_NF_ASSERT(x) WARN_ON(!(x)) #else #define IP_NF_ASSERT(x) #endif -#if 0 -/* All the better to debug you with... */ -#define static -#define inline -#endif - void *ip6t_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ip6t, IP6T); @@ -100,35 +78,18 @@ ip6_packet_match(const struct sk_buff *skb, if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, &ip6info->src), IP6T_INV_SRCIP) || FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, - &ip6info->dst), IP6T_INV_DSTIP)) { - dprintf("Source or dest mismatch.\n"); -/* - dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr, - ipinfo->smsk.s_addr, ipinfo->src.s_addr, - ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : ""); - dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr, - ipinfo->dmsk.s_addr, ipinfo->dst.s_addr, - ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/ + &ip6info->dst), IP6T_INV_DSTIP)) return false; - } ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); - if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { - dprintf("VIA in mismatch (%s vs %s).%s\n", - indev, ip6info->iniface, - ip6info->invflags & IP6T_INV_VIA_IN ? " (INV)" : ""); + if (FWINV(ret != 0, IP6T_INV_VIA_IN)) return false; - } ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); - if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { - dprintf("VIA out mismatch (%s vs %s).%s\n", - outdev, ip6info->outiface, - ip6info->invflags & IP6T_INV_VIA_OUT ? " (INV)" : ""); + if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) return false; - } /* ... might want to do something with class and flowlabel here ... */ @@ -145,11 +106,6 @@ ip6_packet_match(const struct sk_buff *skb, } *fragoff = _frag_off; - dprintf("Packet protocol %hi ?= %s%hi.\n", - protohdr, - ip6info->invflags & IP6T_INV_PROTO ? "!":"", - ip6info->proto); - if (ip6info->proto == protohdr) { if (ip6info->invflags & IP6T_INV_PROTO) return false; @@ -169,16 +125,11 @@ ip6_packet_match(const struct sk_buff *skb, static bool ip6_checkentry(const struct ip6t_ip6 *ipv6) { - if (ipv6->flags & ~IP6T_F_MASK) { - duprintf("Unknown flag bits set: %08X\n", - ipv6->flags & ~IP6T_F_MASK); + if (ipv6->flags & ~IP6T_F_MASK) return false; - } - if (ipv6->invflags & ~IP6T_INV_MASK) { - duprintf("Unknown invflag bits set: %08X\n", - ipv6->invflags & ~IP6T_INV_MASK); + if (ipv6->invflags & ~IP6T_INV_MASK) return false; - } + return true; } @@ -446,13 +397,9 @@ ip6t_do_table(struct sk_buff *skb, xt_write_recseq_end(addend); local_bh_enable(); -#ifdef DEBUG_ALLOW_ALL - return NF_ACCEPT; -#else if (acpar.hotdrop) return NF_DROP; else return verdict; -#endif } static bool find_jump_target(const struct xt_table_info *t, @@ -492,11 +439,9 @@ mark_source_chains(const struct xt_table_info *newinfo, = (void *)ip6t_get_target_c(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { - pr_err("iptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; - } + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ @@ -508,26 +453,13 @@ mark_source_chains(const struct xt_table_info *newinfo, if ((strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) { - duprintf("mark_source_chains: bad " - "negative verdict (%i)\n", - t->verdict); + t->verdict < -NF_MAX_VERDICT - 1) return 0; - } /* Return: backtrack through the last big jump. */ do { e->comefrom ^= (1<<NF_INET_NUMHOOKS); -#ifdef DEBUG_IP_FIREWALL_USER - if (e->comefrom - & (1 << NF_INET_NUMHOOKS)) { - duprintf("Back unset " - "on hook %u " - "rule %u\n", - hook, pos); - } -#endif oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; @@ -555,8 +487,6 @@ mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u\n", - pos, newpos); e = (struct ip6t_entry *) (entry0 + newpos); if (!find_jump_target(newinfo, e)) @@ -573,8 +503,7 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } -next: - duprintf("Finished chain %u\n", hook); +next: ; } return 1; } @@ -595,19 +524,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ip6t_ip6 *ipv6 = par->entryinfo; - int ret; par->match = m->u.kernel.match; par->matchinfo = m->data; - ret = xt_check_match(par, m->u.match_size - sizeof(*m), - ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); - if (ret < 0) { - duprintf("ip_tables: check failed for `%s'.\n", - par.match->name); - return ret; - } - return 0; + return xt_check_match(par, m->u.match_size - sizeof(*m), + ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); } static int @@ -618,10 +540,9 @@ find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("find_check_match: `%s' not found\n", m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } + m->u.kernel.match = match; ret = check_match(m, par); @@ -646,17 +567,11 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name) .hook_mask = e->comefrom, .family = NFPROTO_IPV6, }; - int ret; t = ip6t_get_target(e); - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), - e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); - if (ret < 0) { - duprintf("ip_tables: check failed for `%s'.\n", - t->u.kernel.target->name); - return ret; - } - return 0; + return xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ipv6.proto, + e->ipv6.invflags & IP6T_INV_PROTO); } static int @@ -669,10 +584,12 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + unsigned long pcnt; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; j = 0; mtpar.net = net; @@ -691,7 +608,6 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = PTR_ERR(target); goto cleanup_matches; } @@ -744,17 +660,12 @@ check_entry_size_and_hooks(struct ip6t_entry *e, if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p\n", e); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset - < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) return -EINVAL; - } if (!ip6_checkentry(&e->ipv6)) return -EINVAL; @@ -771,12 +682,9 @@ check_entry_size_and_hooks(struct ip6t_entry *e, if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { - if (!check_underflow(e)) { - pr_debug("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + if (!check_underflow(e)) return -EINVAL; - } + newinfo->underflow[h] = underflows[h]; } } @@ -828,7 +736,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } - duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -845,27 +752,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, ++newinfo->stacksize; } - if (i != repl->num_entries) { - duprintf("translate_table: %u not %u entries\n", - i, repl->num_entries); + if (i != repl->num_entries) return -EINVAL; - } /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(repl->valid_hooks & (1 << i))) continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) { - duprintf("Invalid hook entry %u %u\n", - i, repl->hook_entry[i]); + if (newinfo->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; - } - if (newinfo->underflow[i] == 0xFFFFFFFF) { - duprintf("Invalid underflow %u %u\n", - i, repl->underflow[i]); + if (newinfo->underflow[i] == 0xFFFFFFFF) return -EINVAL; - } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) @@ -1093,11 +991,8 @@ static int get_info(struct net *net, void __user *user, struct xt_table *t; int ret; - if (*len != sizeof(struct ip6t_getinfo)) { - duprintf("length %u != %zu\n", *len, - sizeof(struct ip6t_getinfo)); + if (*len != sizeof(struct ip6t_getinfo)) return -EINVAL; - } if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; @@ -1155,31 +1050,24 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, struct ip6t_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct ip6t_get_entries) + get.size) { - duprintf("get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct ip6t_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR_OR_NULL(t)) { struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, get.size); + else ret = -EAGAIN; - } + module_put(t->me); xt_table_unlock(t); } else @@ -1215,8 +1103,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* You lied! */ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", - valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } @@ -1226,8 +1112,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto put_module; /* Update module usage count based on number of rules */ - duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", - oldinfo->number, oldinfo->initial_entries, newinfo->number); if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); @@ -1296,8 +1180,6 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("ip_tables: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) @@ -1422,11 +1304,9 @@ compat_find_calc_match(struct xt_entry_match *m, match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("compat_check_calc_match: `%s' not found\n", - m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } + m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; @@ -1458,20 +1338,14 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, unsigned int j; int ret, off; - duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p, limit = %p\n", e, limit); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset < sizeof(struct compat_ip6t_entry) + - sizeof(struct compat_xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + sizeof(struct compat_xt_entry_target)) return -EINVAL; - } if (!ip6_checkentry(&e->ipv6)) return -EINVAL; @@ -1495,8 +1369,6 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); ret = PTR_ERR(target); goto release_matches; } @@ -1575,7 +1447,6 @@ translate_compat_table(struct net *net, size = compatr->size; info->number = compatr->num_entries; - duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET6); xt_compat_init_offsets(AF_INET6, compatr->num_entries); @@ -1590,11 +1461,8 @@ translate_compat_table(struct net *net, } ret = -EINVAL; - if (j != compatr->num_entries) { - duprintf("translate_compat_table: %u not %u entries\n", - j, compatr->num_entries); + if (j != compatr->num_entries) goto out_unlock; - } ret = -ENOMEM; newinfo = xt_alloc_table_info(size); @@ -1685,8 +1553,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) @@ -1720,7 +1586,6 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1770,19 +1635,15 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, struct compat_ip6t_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET6); @@ -1790,16 +1651,13 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); - if (!ret && get.size == info.size) { + if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); - } else if (!ret) { - duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, get.size); + else if (!ret) ret = -EAGAIN; - } + xt_compat_flush_offsets(AF_INET6); module_put(t->me); xt_table_unlock(t); @@ -1852,7 +1710,6 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) break; default: - duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1904,7 +1761,6 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } default: - duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -2006,7 +1862,6 @@ icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ - duprintf("Dropping evil ICMP tinygram.\n"); par->hotdrop = true; return false; } diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 5d778dd11f66..06bed74cf5ee 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -60,7 +60,7 @@ synproxy_send_tcp(struct net *net, fl6.fl6_dport = nth->dest; security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); dst = ip6_route_output(net, NULL, &fl6); - if (dst == NULL || dst->error) { + if (dst->error) { dst_release(dst); goto free_nskb; } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 85ca189bdc3d..2cb3c626cd43 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -104,6 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key) spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } +static void ip_vs_conn_expire(unsigned long data); /* * Returns hash value for IPVS connection entry @@ -453,10 +454,16 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, } EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); +static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp) +{ + __ip_vs_conn_put(cp); + ip_vs_conn_expire((unsigned long)cp); +} + /* * Put back the conn and restart its timer with its timeout */ -void ip_vs_conn_put(struct ip_vs_conn *cp) +static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp) { unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? 0 : cp->timeout; @@ -465,6 +472,16 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) __ip_vs_conn_put(cp); } +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && + (atomic_read(&cp->refcnt) == 1) && + !timer_pending(&cp->timer)) + /* expire connection immediately */ + __ip_vs_conn_put_notimer(cp); + else + __ip_vs_conn_put_timer(cp); +} /* * Fill a no_client_port connection with a client port number @@ -819,7 +836,8 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->control) ip_vs_control_del(cp); - if (cp->flags & IP_VS_CONN_F_NFCT) { + if ((cp->flags & IP_VS_CONN_F_NFCT) && + !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) { /* Do not access conntracks during subsys cleanup * because nf_conntrack_find_get can not be used after * conntrack cleanup for the net. @@ -834,7 +852,10 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_unbind_dest(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); - call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + ip_vs_conn_rcu_free(&cp->rcu_head); + else + call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); atomic_dec(&ipvs->conn_count); return; } @@ -850,7 +871,7 @@ static void ip_vs_conn_expire(unsigned long data) if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs)); - ip_vs_conn_put(cp); + __ip_vs_conn_put_timer(cp); } /* Modify timer, so that it expires as soon as possible. @@ -1240,6 +1261,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp) return 1; } +static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) +{ + struct ip_vs_service *svc; + + if (!cp->dest) + return false; + svc = rcu_dereference(cp->dest->svc); + return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); +} + /* Called from keventd and must protect itself from softirqs */ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { @@ -1254,11 +1285,16 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->flags & IP_VS_CONN_F_TEMPLATE) - /* connection template */ - continue; if (cp->ipvs != ipvs) continue; + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + if (atomic_read(&cp->n_control) || + !ip_vs_conn_ops_mode(cp)) + continue; + else + /* connection template of OPS */ + goto try_drop; + } if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { case IP_VS_TCP_S_SYN_RECV: @@ -1286,6 +1322,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) continue; } } else { +try_drop: if (!todrop_entry(cp)) continue; } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index b9a4082afa3a..1207f20d24e4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -68,6 +68,7 @@ EXPORT_SYMBOL(ip_vs_conn_put); #ifdef CONFIG_IP_VS_DEBUG EXPORT_SYMBOL(ip_vs_get_debug_level); #endif +EXPORT_SYMBOL(ip_vs_new_conn_out); static int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ @@ -611,7 +612,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ret = cp->packet_xmit(skb, cp, pd->pp, iph); /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + atomic_inc(&cp->control->in_pkts); + else + atomic_inc(&cp->in_pkts); ip_vs_conn_put(cp); return ret; } @@ -1100,6 +1104,143 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, } } +/* Generic function to create new connections for outgoing RS packets + * + * Pre-requisites for successful connection creation: + * 1) Virtual Service is NOT fwmark based: + * In fwmark-VS actual vaddr and vport are unknown to IPVS + * 2) Real Server and Virtual Service were NOT configured without port: + * This is to allow match of different VS to the same RS ip-addr + */ +struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport) +{ + struct ip_vs_conn_param param; + struct ip_vs_conn *ct = NULL, *cp = NULL; + const union nf_inet_addr *vaddr, *daddr, *caddr; + union nf_inet_addr snet; + __be16 vport; + unsigned int flags; + + EnterFunction(12); + vaddr = &svc->addr; + vport = svc->port; + daddr = &iph->saddr; + caddr = &iph->daddr; + + /* check pre-requisites are satisfied */ + if (svc->fwmark) + return NULL; + if (!vport || !dport) + return NULL; + + /* for persistent service first create connection template */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) { + /* apply netmask the same way ingress-side does */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &caddr->in6, + (__force __u32)svc->netmask); + else +#endif + snet.ip = caddr->ip & svc->netmask; + /* fill params and create template if not existent */ + if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, + &snet, 0, vaddr, + vport, ¶m) < 0) + return NULL; + ct = ip_vs_ct_in_get(¶m); + if (!ct) { + ct = ip_vs_conn_new(¶m, dest->af, daddr, dport, + IP_VS_CONN_F_TEMPLATE, dest, 0); + if (!ct) { + kfree(param.pe_data); + return NULL; + } + ct->timeout = svc->timeout; + } else { + kfree(param.pe_data); + } + } + + /* connection flags */ + flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && + iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; + /* create connection */ + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, + caddr, cport, vaddr, vport, ¶m); + cp = ip_vs_conn_new(¶m, dest->af, daddr, dport, flags, dest, 0); + if (!cp) { + if (ct) + ip_vs_conn_put(ct); + return NULL; + } + if (ct) { + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + } + ip_vs_conn_stats(cp, svc); + + /* return connection (will be used to handle outgoing packet) */ + IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + LeaveFunction(12); + return cp; +} + +/* Handle outgoing packets which are considered requests initiated by + * real servers, so that subsequent responses from external client can be + * routed to the right real server. + * Used also for outgoing responses in OPS mode. + * + * Connection management is handled by persistent-engine specific callback. + */ +static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, + struct netns_ipvs *ipvs, + int af, struct sk_buff *skb, + const struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_conn *cp = NULL; + __be16 _ports[2], *pptr; + + if (hooknum == NF_INET_LOCAL_IN) + return NULL; + + pptr = frag_safe_skb_hp(skb, iph->len, + sizeof(_ports), _ports, iph); + if (!pptr) + return NULL; + + rcu_read_lock(); + dest = ip_vs_find_real_service(ipvs, af, iph->protocol, + &iph->saddr, pptr[0]); + if (dest) { + struct ip_vs_service *svc; + struct ip_vs_pe *pe; + + svc = rcu_dereference(dest->svc); + if (svc) { + pe = rcu_dereference(svc->pe); + if (pe && pe->conn_out) + cp = pe->conn_out(svc, dest, skb, iph, + pptr[0], pptr[1]); + } + } + rcu_read_unlock(); + + return cp; +} + /* Handle response packets: rewrite addresses and send away... */ static unsigned int @@ -1245,6 +1386,22 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); + + /* Check for real-server-started requests */ + if (atomic_read(&ipvs->conn_out_counter)) { + /* Currently only for UDP: + * connection oriented protocols typically use + * ephemeral ports for outgoing connections, so + * related incoming responses would not match any VS + */ + if (pp->protocol == IPPROTO_UDP) { + cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph); + if (likely(cp)) + return handle_response(af, skb, pd, cp, &iph, + hooknum); + } + } + if (sysctl_nat_icmp_send(ipvs) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || @@ -1837,6 +1994,9 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, pkts); + else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + /* increment is done inside ip_vs_sync_conn too */ + atomic_inc(&cp->control->in_pkts); ip_vs_conn_put(cp); return ret; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index f35ebc02fa5c..c3c809b2e712 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -567,6 +567,36 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, return false; } +/* Find real service record by <proto,addr,port>. + * In case of multiple records with the same <proto,addr,port>, only + * the first found record is returned. + * + * To be called under RCU lock. + */ +struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, + __u16 protocol, + const union nf_inet_addr *daddr, + __be16 dport) +{ + unsigned int hash; + struct ip_vs_dest *dest; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, dport); + + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + /* Lookup destination by {addr,port} in the given service * Called under RCU lock. */ @@ -1253,6 +1283,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); + if (svc->pe && svc->pe->conn_out) + atomic_inc(&ipvs->conn_out_counter); ip_vs_start_estimator(ipvs, &svc->stats); @@ -1293,6 +1325,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; + bool new_pe_conn_out, old_pe_conn_out; /* * Lookup the scheduler, by 'u->sched_name' @@ -1355,8 +1388,16 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) svc->netmask = u->netmask; old_pe = rcu_dereference_protected(svc->pe, 1); - if (pe != old_pe) + if (pe != old_pe) { rcu_assign_pointer(svc->pe, pe); + /* check for optional methods in new pe */ + new_pe_conn_out = (pe && pe->conn_out) ? true : false; + old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; + if (new_pe_conn_out && !old_pe_conn_out) + atomic_inc(&svc->ipvs->conn_out_counter); + if (old_pe_conn_out && !new_pe_conn_out) + atomic_dec(&svc->ipvs->conn_out_counter); + } out: ip_vs_scheduler_put(old_sched); @@ -1389,6 +1430,8 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* Unbind persistence engine, keep svc->pe */ old_pe = rcu_dereference_protected(svc->pe, 1); + if (old_pe && old_pe->conn_out) + atomic_dec(&ipvs->conn_out_counter); ip_vs_pe_put(old_pe); /* @@ -3969,6 +4012,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) (unsigned long) ipvs); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); + atomic_set(&ipvs->conn_out_counter, 0); /* procfs stats */ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 30434fb133df..f04fd8df210b 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -93,6 +93,10 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return; + /* Never alter conntrack for OPS conns (no reply is expected) */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return; + /* Alter reply only in original direction */ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return; diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 0a6eb5c0d9e9..d07ef9e31c12 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -143,6 +143,20 @@ static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) return cp->pe_data_len; } +static struct ip_vs_conn * +ip_vs_sip_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport) +{ + if (likely(iph->protocol == IPPROTO_UDP)) + return ip_vs_new_conn_out(svc, dest, skb, iph, dport, cport); + /* currently no need to handle other than UDP */ + return NULL; +} + static struct ip_vs_pe ip_vs_sip_pe = { .name = "sip", @@ -153,6 +167,7 @@ static struct ip_vs_pe ip_vs_sip_pe = .ct_match = ip_vs_sip_ct_match, .hashkey_raw = ip_vs_sip_hashkey_raw, .show_pe_data = ip_vs_sip_show_pe_data, + .conn_out = ip_vs_sip_conn_out, }; static int __init ip_vs_sip_init(void) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2fd607408998..0cd29365004f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -54,6 +54,7 @@ #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_helper.h> +#include <net/netns/hash.h> #define NF_CONNTRACK_VERSION "0.5.0" @@ -68,7 +69,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks); __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); +struct hlist_nulls_head *nf_conntrack_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_hash); + +static __read_mostly struct kmem_cache *nf_conntrack_cachep; static __read_mostly spinlock_t nf_conntrack_locks_all_lock; +static __read_mostly seqcount_t nf_conntrack_generation; static __read_mostly bool nf_conntrack_locks_all; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) @@ -107,7 +113,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, spin_lock_nested(&nf_conntrack_locks[h1], SINGLE_DEPTH_NESTING); } - if (read_seqcount_retry(&net->ct.generation, sequence)) { + if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { nf_conntrack_double_unlock(h1, h2); return true; } @@ -141,43 +147,43 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max); DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); -unsigned int nf_conntrack_hash_rnd __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd); +static unsigned int nf_conntrack_hash_rnd __read_mostly; -static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple) +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, + const struct net *net) { unsigned int n; + u32 seed; + + get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); /* The direction must be ignored, so we hash everything up to the * destination ports (which is a multiple of 4) and treat the last * three bytes manually. */ + seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^ + return jhash2((u32 *)tuple, n, seed ^ (((__force __u16)tuple->dst.u.all << 16) | tuple->dst.protonum)); } -static u32 __hash_bucket(u32 hash, unsigned int size) -{ - return reciprocal_scale(hash, size); -} - -static u32 hash_bucket(u32 hash, const struct net *net) +static u32 scale_hash(u32 hash) { - return __hash_bucket(hash, net->ct.htable_size); + return reciprocal_scale(hash, nf_conntrack_htable_size); } -static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, - unsigned int size) +static u32 __hash_conntrack(const struct net *net, + const struct nf_conntrack_tuple *tuple, + unsigned int size) { - return __hash_bucket(hash_conntrack_raw(tuple), size); + return reciprocal_scale(hash_conntrack_raw(tuple, net), size); } -static inline u_int32_t hash_conntrack(const struct net *net, - const struct nf_conntrack_tuple *tuple) +static u32 hash_conntrack(const struct net *net, + const struct nf_conntrack_tuple *tuple) { - return __hash_conntrack(tuple, net->ct.htable_size); + return scale_hash(hash_conntrack_raw(tuple, net)); } bool @@ -358,7 +364,7 @@ destroy_conntrack(struct nf_conntrack *nfct) } rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); - if (l4proto && l4proto->destroy) + if (l4proto->destroy) l4proto->destroy(ct); rcu_read_unlock(); @@ -393,7 +399,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); reply_hash = hash_conntrack(net, @@ -445,7 +451,8 @@ static void death_by_timeout(unsigned long ul_conntrack) static inline bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const struct nf_conntrack_zone *zone, + const struct net *net) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -454,7 +461,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, */ return nf_ct_tuple_equal(tuple, &h->tuple) && nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && - nf_ct_is_confirmed(ct); + nf_ct_is_confirmed(ct) && + net_eq(net, nf_ct_net(ct)); } /* @@ -467,21 +475,23 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; - unsigned int bucket = hash_bucket(hash, net); + unsigned int bucket, sequence; - /* Disable BHs the entire time since we normally need to disable them - * at least once for the stats anyway. - */ - local_bh_disable(); begin: - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { - if (nf_ct_key_equal(h, tuple, zone)) { - NF_CT_STAT_INC(net, found); - local_bh_enable(); + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + bucket = scale_hash(hash); + ct_hash = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { + if (nf_ct_key_equal(h, tuple, zone, net)) { + NF_CT_STAT_INC_ATOMIC(net, found); return h; } - NF_CT_STAT_INC(net, searched); + NF_CT_STAT_INC_ATOMIC(net, searched); } /* * if the nulls value we got at the end of this lookup is @@ -489,10 +499,9 @@ begin: * We probably met an item that was moved to another chain. */ if (get_nulls_value(n) != bucket) { - NF_CT_STAT_INC(net, search_restart); + NF_CT_STAT_INC_ATOMIC(net, search_restart); goto begin; } - local_bh_enable(); return NULL; } @@ -514,7 +523,7 @@ begin: !atomic_inc_not_zero(&ct->ct_general.use))) h = NULL; else { - if (unlikely(!nf_ct_key_equal(h, tuple, zone))) { + if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) { nf_ct_put(ct); goto begin; } @@ -530,7 +539,7 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple)); + hash_conntrack_raw(tuple, net)); } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -538,12 +547,10 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, unsigned int reply_hash) { - struct net *net = nf_ct_net(ct); - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &net->ct.hash[hash]); + &nf_conntrack_hash[hash]); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, - &net->ct.hash[reply_hash]); + &nf_conntrack_hash[reply_hash]); } int @@ -560,7 +567,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); reply_hash = hash_conntrack(net, @@ -568,17 +575,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* See if there's one in the list already, including reverse */ - hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone, net)) goto out; add_timer(&ct->timeout); @@ -599,6 +603,63 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); +static inline void nf_ct_acct_update(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int len) +{ + struct nf_conn_acct *acct; + + acct = nf_conn_acct_find(ct); + if (acct) { + struct nf_conn_counter *counter = acct->counter; + + atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); + atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes); + } +} + +static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + const struct nf_conn *loser_ct) +{ + struct nf_conn_acct *acct; + + acct = nf_conn_acct_find(loser_ct); + if (acct) { + struct nf_conn_counter *counter = acct->counter; + enum ip_conntrack_info ctinfo; + unsigned int bytes; + + /* u32 should be fine since we must have seen one packet. */ + bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); + nf_ct_acct_update(ct, ctinfo, bytes); + } +} + +/* Resolve race on insertion if this protocol allows this. */ +static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_tuple_hash *h) +{ + /* This is the conntrack entry already in hashes that won race. */ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct nf_conntrack_l4proto *l4proto; + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto->allow_clash && + !nf_ct_is_dying(ct) && + atomic_inc_not_zero(&ct->ct_general.use)) { + nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct); + nf_conntrack_put(skb->nfct); + /* Assign conntrack already in hashes to this skbuff. Don't + * modify skb->nfctinfo to ensure consistent stateful filtering. + */ + skb->nfct = &ct->ct_general; + return NF_ACCEPT; + } + NF_CT_STAT_INC(net, drop); + return NF_DROP; +} + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) @@ -613,6 +674,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) enum ip_conntrack_info ctinfo; struct net *net; unsigned int sequence; + int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); net = nf_ct_net(ct); @@ -628,10 +690,10 @@ __nf_conntrack_confirm(struct sk_buff *skb) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); /* reuse the hash saved before */ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; - hash = hash_bucket(hash, net); + hash = scale_hash(hash); reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -655,23 +717,22 @@ __nf_conntrack_confirm(struct sk_buff *skb) */ nf_ct_del_from_dying_or_unconfirmed_list(ct); - if (unlikely(nf_ct_is_dying(ct))) - goto out; + if (unlikely(nf_ct_is_dying(ct))) { + nf_ct_add_to_dying_list(ct); + goto dying; + } /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone, net)) goto out; /* Timer relative to confirmation time, not original @@ -710,10 +771,12 @@ __nf_conntrack_confirm(struct sk_buff *skb) out: nf_ct_add_to_dying_list(ct); + ret = nf_ct_resolve_clash(net, skb, ctinfo, h); +dying: nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); local_bh_enable(); - return NF_DROP; + return ret; } EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); @@ -726,29 +789,31 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, struct net *net = nf_ct_net(ignored_conntrack); const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_head *ct_hash; + unsigned int hash, sequence; struct hlist_nulls_node *n; struct nf_conn *ct; - unsigned int hash; zone = nf_ct_zone(ignored_conntrack); - hash = hash_conntrack(net, tuple); - /* Disable BHs the entire time since we need to disable them at - * least once for the stats anyway. - */ - rcu_read_lock_bh(); - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { + rcu_read_lock(); + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hash = hash_conntrack(net, tuple); + ct_hash = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && - nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) { - NF_CT_STAT_INC(net, found); - rcu_read_unlock_bh(); + nf_ct_key_equal(h, tuple, zone, net)) { + NF_CT_STAT_INC_ATOMIC(net, found); + rcu_read_unlock(); return 1; } - NF_CT_STAT_INC(net, searched); + NF_CT_STAT_INC_ATOMIC(net, searched); } - rcu_read_unlock_bh(); + rcu_read_unlock(); return 0; } @@ -762,71 +827,63 @@ static noinline int early_drop(struct net *net, unsigned int _hash) { /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; - struct nf_conn *ct = NULL, *tmp; + struct nf_conn *tmp; struct hlist_nulls_node *n; - unsigned int i = 0, cnt = 0; - int dropped = 0; - unsigned int hash, sequence; + unsigned int i, hash, sequence; + struct nf_conn *ct = NULL; spinlock_t *lockp; + bool ret = false; + + i = 0; local_bh_disable(); restart: - sequence = read_seqcount_begin(&net->ct.generation); - hash = hash_bucket(_hash, net); - for (; i < net->ct.htable_size; i++) { + sequence = read_seqcount_begin(&nf_conntrack_generation); + for (; i < NF_CT_EVICTION_RANGE; i++) { + hash = scale_hash(_hash++); lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); - if (read_seqcount_retry(&net->ct.generation, sequence)) { + if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { spin_unlock(lockp); goto restart; } - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], - hnnode) { + hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], + hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); - if (!test_bit(IPS_ASSURED_BIT, &tmp->status) && - !nf_ct_is_dying(tmp) && - atomic_inc_not_zero(&tmp->ct_general.use)) { + + if (test_bit(IPS_ASSURED_BIT, &tmp->status) || + !net_eq(nf_ct_net(tmp), net) || + nf_ct_is_dying(tmp)) + continue; + + if (atomic_inc_not_zero(&tmp->ct_general.use)) { ct = tmp; break; } - cnt++; } - hash = (hash + 1) % net->ct.htable_size; spin_unlock(lockp); - - if (ct || cnt >= NF_CT_EVICTION_RANGE) + if (ct) break; - } + local_bh_enable(); if (!ct) - return dropped; + return false; - if (del_timer(&ct->timeout)) { + /* kill only if in same netns -- might have moved due to + * SLAB_DESTROY_BY_RCU rules + */ + if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) { if (nf_ct_delete(ct, 0, 0)) { - dropped = 1; NF_CT_STAT_INC_ATOMIC(net, early_drop); + ret = true; } } - nf_ct_put(ct); - return dropped; -} - -void init_nf_conntrack_hash_rnd(void) -{ - unsigned int rand; - /* - * Why not initialize nf_conntrack_rnd in a "init()" function ? - * Because there isn't enough entropy when system initializing, - * and we initialize it as late as possible. - */ - do { - get_random_bytes(&rand, sizeof(rand)); - } while (!rand); - cmpxchg(&nf_conntrack_hash_rnd, 0, rand); + nf_ct_put(ct); + return ret; } static struct nf_conn * @@ -838,12 +895,6 @@ __nf_conntrack_alloc(struct net *net, { struct nf_conn *ct; - if (unlikely(!nf_conntrack_hash_rnd)) { - init_nf_conntrack_hash_rnd(); - /* recompute the hash as nf_conntrack_hash_rnd is initialized */ - hash = hash_conntrack_raw(orig); - } - /* We don't want any race condition at early drop stage */ atomic_inc(&net->ct.count); @@ -860,7 +911,7 @@ __nf_conntrack_alloc(struct net *net, * Do not use kmem_cache_zalloc(), as this cache uses * SLAB_DESTROY_BY_RCU. */ - ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); + ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) goto out; @@ -887,7 +938,7 @@ __nf_conntrack_alloc(struct net *net, atomic_set(&ct->ct_general.use, 0); return ct; out_free: - kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + kmem_cache_free(nf_conntrack_cachep, ct); out: atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); @@ -914,7 +965,7 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); - kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + kmem_cache_free(nf_conntrack_cachep, ct); smp_mb__before_atomic(); atomic_dec(&net->ct.count); } @@ -1061,7 +1112,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - hash = hash_conntrack_raw(&tuple); + hash = hash_conntrack_raw(&tuple, net); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, @@ -1270,17 +1321,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, } acct: - if (do_acct) { - struct nf_conn_acct *acct; - - acct = nf_conn_acct_find(ct); - if (acct) { - struct nf_conn_counter *counter = acct->counter; - - atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); - atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes); - } - } + if (do_acct) + nf_ct_acct_update(ct, ctinfo, skb->len); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -1289,18 +1331,8 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, const struct sk_buff *skb, int do_acct) { - if (do_acct) { - struct nf_conn_acct *acct; - - acct = nf_conn_acct_find(ct); - if (acct) { - struct nf_conn_counter *counter = acct->counter; - - atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); - atomic64_add(skb->len - skb_network_offset(skb), - &counter[CTINFO2DIR(ctinfo)].bytes); - } - } + if (do_acct) + nf_ct_acct_update(ct, ctinfo, skb->len); if (del_timer(&ct->timeout)) { ct->timeout.function((unsigned long)ct); @@ -1396,16 +1428,17 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), int cpu; spinlock_t *lockp; - for (; *bucket < net->ct.htable_size; (*bucket)++) { + for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); nf_conntrack_lock(lockp); - if (*bucket < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { + if (*bucket < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) + if (net_eq(nf_ct_net(ct), net) && + iter(ct, data)) goto found; } } @@ -1443,6 +1476,9 @@ void nf_ct_iterate_cleanup(struct net *net, might_sleep(); + if (atomic_read(&net->ct.count) == 0) + return; + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) @@ -1494,6 +1530,8 @@ void nf_conntrack_cleanup_end(void) while (untrack_refs() > 0) schedule(); + nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); + #ifdef CONFIG_NF_CONNTRACK_ZONES nf_ct_extend_unregister(&nf_ct_zone_extend); #endif @@ -1544,15 +1582,12 @@ i_see_dead_people: } list_for_each_entry(net, net_exit_list, exit_list) { - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); nf_conntrack_proto_pernet_fini(net); nf_conntrack_helper_pernet_fini(net); nf_conntrack_ecache_pernet_fini(net); nf_conntrack_tstamp_pernet_fini(net); nf_conntrack_acct_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); - kmem_cache_destroy(net->ct.nf_conntrack_cachep); - kfree(net->ct.slabname); free_percpu(net->ct.stat); free_percpu(net->ct.pcpu_lists); } @@ -1607,7 +1642,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) local_bh_disable(); nf_conntrack_all_lock(); - write_seqcount_begin(&init_net.ct.generation); + write_seqcount_begin(&nf_conntrack_generation); /* Lookups in the old hash might happen in parallel, which means we * might get false negatives during connection lookup. New connections @@ -1615,26 +1650,28 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) * though since that required taking the locks. */ - for (i = 0; i < init_net.ct.htable_size; i++) { - while (!hlist_nulls_empty(&init_net.ct.hash[i])) { - h = hlist_nulls_entry(init_net.ct.hash[i].first, - struct nf_conntrack_tuple_hash, hnnode); + for (i = 0; i < nf_conntrack_htable_size; i++) { + while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { + h = hlist_nulls_entry(nf_conntrack_hash[i].first, + struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); - bucket = __hash_conntrack(&h->tuple, hashsize); + bucket = __hash_conntrack(nf_ct_net(ct), + &h->tuple, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } - old_size = init_net.ct.htable_size; - old_hash = init_net.ct.hash; + old_size = nf_conntrack_htable_size; + old_hash = nf_conntrack_hash; - init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; - init_net.ct.hash = hash; + nf_conntrack_hash = hash; + nf_conntrack_htable_size = hashsize; - write_seqcount_end(&init_net.ct.generation); + write_seqcount_end(&nf_conntrack_generation); nf_conntrack_all_unlock(); local_bh_enable(); + synchronize_net(); nf_ct_free_hashtable(old_hash, old_size); return 0; } @@ -1655,7 +1692,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); int nf_conntrack_init_start(void) { int max_factor = 8; - int i, ret, cpu; + int ret = -ENOMEM; + int i, cpu; + + seqcount_init(&nf_conntrack_generation); for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_conntrack_locks[i]); @@ -1682,8 +1722,19 @@ int nf_conntrack_init_start(void) * entries. */ max_factor = 4; } + + nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); + if (!nf_conntrack_hash) + return -ENOMEM; + nf_conntrack_max = max_factor * nf_conntrack_htable_size; + nf_conntrack_cachep = kmem_cache_create("nf_conntrack", + sizeof(struct nf_conn), 0, + SLAB_DESTROY_BY_RCU, NULL); + if (!nf_conntrack_cachep) + goto err_cachep; + printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n", NF_CONNTRACK_VERSION, nf_conntrack_htable_size, nf_conntrack_max); @@ -1760,6 +1811,9 @@ err_tstamp: err_acct: nf_conntrack_expect_fini(); err_expect: + kmem_cache_destroy(nf_conntrack_cachep); +err_cachep: + nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); return ret; } @@ -1783,7 +1837,6 @@ int nf_conntrack_init_net(struct net *net) int cpu; atomic_set(&net->ct.count, 0); - seqcount_init(&net->ct.generation); net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); if (!net->ct.pcpu_lists) @@ -1801,24 +1854,6 @@ int nf_conntrack_init_net(struct net *net) if (!net->ct.stat) goto err_pcpu_lists; - net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); - if (!net->ct.slabname) - goto err_slabname; - - net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname, - sizeof(struct nf_conn), 0, - SLAB_DESTROY_BY_RCU, NULL); - if (!net->ct.nf_conntrack_cachep) { - printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - goto err_cache; - } - - net->ct.htable_size = nf_conntrack_htable_size; - net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); - if (!net->ct.hash) { - printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); - goto err_hash; - } ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) goto err_expect; @@ -1850,12 +1885,6 @@ err_tstamp: err_acct: nf_conntrack_expect_pernet_fini(net); err_expect: - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); -err_hash: - kmem_cache_destroy(net->ct.nf_conntrack_cachep); -err_cache: - kfree(net->ct.slabname); -err_slabname: free_percpu(net->ct.stat); err_pcpu_lists: free_percpu(net->ct.pcpu_lists); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 278927ab0948..9e3693128313 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -24,6 +24,7 @@ #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> +#include <net/netns/hash.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> @@ -35,9 +36,13 @@ unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); +struct hlist_head *nf_ct_expect_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_expect_hash); + unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; +static unsigned int nf_ct_expect_hashrnd __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, @@ -72,21 +77,32 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect) nf_ct_expect_put(exp); } -static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple) +static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { - unsigned int hash; + unsigned int hash, seed; - if (unlikely(!nf_conntrack_hash_rnd)) { - init_nf_conntrack_hash_rnd(); - } + get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); + + seed = nf_ct_expect_hashrnd ^ net_hash_mix(n); hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | - (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd); + (__force __u16)tuple->dst.u.all) ^ seed); return reciprocal_scale(hash, nf_ct_expect_hsize); } +static bool +nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_expect *i, + const struct nf_conntrack_zone *zone, + const struct net *net) +{ + return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && + net_eq(net, nf_ct_net(i->master)) && + nf_ct_zone_equal_any(i->master, zone); +} + struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, @@ -98,10 +114,9 @@ __nf_ct_expect_find(struct net *net, if (!net->ct.expect_count) return NULL; - h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { - if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal_any(i->master, zone)) + h = nf_ct_expect_dst_hash(net, tuple); + hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) { + if (nf_ct_exp_equal(tuple, i, zone, net)) return i; } return NULL; @@ -139,11 +154,10 @@ nf_ct_find_expectation(struct net *net, if (!net->ct.expect_count) return NULL; - h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { + h = nf_ct_expect_dst_hash(net, tuple); + hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && - nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal_any(i->master, zone)) { + nf_ct_exp_equal(tuple, i, zone, net)) { exp = i; break; } @@ -223,6 +237,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && + net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } @@ -232,6 +247,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a, return a->master == b->master && a->class == b->class && nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && + net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } @@ -342,7 +358,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); - unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); + unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple); /* two references : one for hash insert, one for the timer */ atomic_add(2, &exp->use); @@ -350,7 +366,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) hlist_add_head(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; - hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); + hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); net->ct.expect_count++; setup_timer(&exp->timeout, nf_ct_expectation_timed_out, @@ -401,8 +417,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) ret = -ESHUTDOWN; goto out; } - h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) { + h = nf_ct_expect_dst_hash(net, &expect->tuple); + hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (expect_matches(i, expect)) { if (del_timer(&i->timeout)) { nf_ct_unlink_expect(i); @@ -468,12 +484,11 @@ struct ct_expect_iter_state { static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } @@ -483,14 +498,13 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } @@ -623,28 +637,13 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { - int err = -ENOMEM; - net->ct.expect_count = 0; - net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); - if (net->ct.expect_hash == NULL) - goto err1; - - err = exp_proc_init(net); - if (err < 0) - goto err2; - - return 0; -err2: - nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); -err1: - return err; + return exp_proc_init(net); } void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); - nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); } int nf_conntrack_expect_init(void) @@ -660,6 +659,13 @@ int nf_conntrack_expect_init(void) 0, 0, NULL); if (!nf_ct_expect_cachep) return -ENOMEM; + + nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); + if (!nf_ct_expect_hash) { + kmem_cache_destroy(nf_ct_expect_cachep); + return -ENOMEM; + } + return 0; } @@ -667,4 +673,5 @@ void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); + nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 3b40ec575cd5..f703adb7e5f7 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -38,10 +38,10 @@ unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; -static bool nf_ct_auto_assign_helper __read_mostly = true; +static bool nf_ct_auto_assign_helper __read_mostly = false; module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); MODULE_PARM_DESC(nf_conntrack_helper, - "Enable automatic conntrack helper assignment (default 1)"); + "Enable automatic conntrack helper assignment (default 0)"); #ifdef CONFIG_SYSCTL static struct ctl_table helper_sysctl_table[] = { @@ -400,7 +400,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], hnode) { + &nf_ct_expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((rcu_dereference_protected( help->helper, @@ -424,10 +424,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, spin_unlock_bh(&pcpu->lock); } local_bh_disable(); - for (i = 0; i < net->ct.htable_size; i++) { + for (i = 0; i < nf_conntrack_htable_size; i++) { nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + if (i < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) unhelp(h, me); } spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 294a8e28cec4..a18d1ceabad5 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -824,19 +824,22 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conn *)cb->args[1]; local_bh_disable(); - for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); - if (cb->args[0] >= net->ct.htable_size) { + if (cb->args[0] >= nf_conntrack_htable_size) { spin_unlock(lockp); goto out; } - hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], - hnnode) { + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]], + hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); + if (!net_eq(net, nf_ct_net(ct))) + continue; + /* Dump entries of a given L3 protocol number. * If it is not specified, ie. l3proto == 0, * then dump everything. */ @@ -2629,10 +2632,14 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: - hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]], + hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]], hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; + + if (!net_eq(nf_ct_net(exp->master), net)) + continue; + if (cb->args[1]) { if (exp != last) continue; @@ -2883,8 +2890,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], + &nf_ct_expect_hash[i], hnode) { + + if (!net_eq(nf_ct_exp_net(exp), net)) + continue; + m_help = nfct_help(exp->master); if (!strcmp(m_help->helper->name, name) && del_timer(&exp->timeout)) { @@ -2901,8 +2912,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], + &nf_ct_expect_hash[i], hnode) { + + if (!net_eq(nf_ct_exp_net(exp), net)) + continue; + if (del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 478f92f834b6..4fd040575ffe 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -309,6 +309,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = .l3proto = PF_INET, .l4proto = IPPROTO_UDP, .name = "udp", + .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, @@ -341,6 +342,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = .l3proto = PF_INET6, .l4proto = IPPROTO_UDP, .name = "udp", + .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 1ac8ee13a873..9d692f5adb94 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -274,6 +274,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = .l3proto = PF_INET, .l4proto = IPPROTO_UDPLITE, .name = "udplite", + .allow_clash = true, .pkt_to_tuple = udplite_pkt_to_tuple, .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, @@ -306,6 +307,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .l3proto = PF_INET6, .l4proto = IPPROTO_UDPLITE, .name = "udplite", + .allow_clash = true, .pkt_to_tuple = udplite_pkt_to_tuple, .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0f1a45bcacb2..f87e84ebcec3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -54,14 +54,13 @@ struct ct_iter_state { static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < net->ct.htable_size; + st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -71,18 +70,17 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct hlist_nulls_node *head) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= net->ct.htable_size) + if (++st->bucket >= nf_conntrack_htable_size) return NULL; } head = rcu_dereference( hlist_nulls_first_rcu( - &net->ct.hash[st->bucket])); + &nf_conntrack_hash[st->bucket])); } return head; } @@ -458,7 +456,7 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, { .procname = "nf_conntrack_buckets", - .data = &init_net.ct.htable_size, + .data = &nf_conntrack_htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, .proc_handler = proc_dointvec, @@ -512,7 +510,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) goto out_kmemdup; table[1].data = &net->ct.count; - table[2].data = &net->ct.htable_size; table[3].data = &net->ct.sysctl_checksum; table[4].data = &net->ct.sysctl_log_invalid; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 06a9f45771ab..6877a396f8fc 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -38,6 +38,9 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; +static struct hlist_head *nf_nat_bysource __read_mostly; +static unsigned int nf_nat_htable_size __read_mostly; +static unsigned int nf_nat_hash_rnd __read_mostly; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -118,15 +121,17 @@ EXPORT_SYMBOL(nf_xfrm_me_harder); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { unsigned int hash; + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + /* Original src, to ensure we map it consistently if poss. */ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_conntrack_hash_rnd); + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - return reciprocal_scale(hash, net->ct.nat_htable_size); + return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? (not by us) */ @@ -196,9 +201,10 @@ find_appropriate_src(struct net *net, const struct nf_conn_nat *nat; const struct nf_conn *ct; - hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { + hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) { ct = nat->ct; if (same_src(ct, tuple) && + net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuplepr(result, @@ -431,7 +437,7 @@ nf_nat_setup_info(struct nf_conn *ct, nat = nfct_nat(ct); nat->ct = ct; hlist_add_head_rcu(&nat->bysource, - &net->ct.nat_bysource[srchash]); + &nf_nat_bysource[srchash]); spin_unlock_bh(&nf_nat_lock); } @@ -819,27 +825,14 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, } #endif -static int __net_init nf_nat_net_init(struct net *net) -{ - /* Leave them the same for the moment. */ - net->ct.nat_htable_size = net->ct.htable_size; - net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0); - if (!net->ct.nat_bysource) - return -ENOMEM; - return 0; -} - static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); - synchronize_rcu(); - nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { - .init = nf_nat_net_init, .exit = nf_nat_net_exit, }; @@ -852,8 +845,16 @@ static int __init nf_nat_init(void) { int ret; + /* Leave them the same for the moment. */ + nf_nat_htable_size = nf_conntrack_htable_size; + + nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); + if (!nf_nat_bysource) + return -ENOMEM; + ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -877,6 +878,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend: + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); nf_ct_extend_unregister(&nat_extend); return ret; } @@ -895,6 +897,7 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); synchronize_net(); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); } MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7a85a9dd37ad..4d292b933b5c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2317,7 +2317,7 @@ nft_select_set_ops(const struct nlattr * const nla[], static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_TABLE] = { .type = NLA_STRING }, [NFTA_SET_NAME] = { .type = NLA_STRING, - .len = IFNAMSIZ - 1 }, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, @@ -2401,7 +2401,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, unsigned long *inuse; unsigned int n = 0, min = 0; - p = strnchr(name, IFNAMSIZ, '%'); + p = strnchr(name, NFT_SET_MAXNAMELEN, '%'); if (p != NULL) { if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; @@ -2696,7 +2696,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; - char name[IFNAMSIZ]; + char name[NFT_SET_MAXNAMELEN]; unsigned int size; bool create; u64 timeout; @@ -3375,6 +3375,22 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem) } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); +static int nft_setelem_parse_flags(const struct nft_set *set, + const struct nlattr *attr, u32 *flags) +{ + if (attr == NULL) + return 0; + + *flags = ntohl(nla_get_be32(attr)); + if (*flags & ~NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + if (!(set->flags & NFT_SET_INTERVAL) && + *flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + + return 0; +} + static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -3388,8 +3404,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; + u32 flags = 0; u64 timeout; - u32 flags; u8 ulen; int err; @@ -3403,17 +3419,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_prepare(&tmpl); - flags = 0; - if (nla[NFTA_SET_ELEM_FLAGS] != NULL) { - flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS])); - if (flags & ~NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (!(set->flags & NFT_SET_INTERVAL) && - flags & NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (flags != 0) - nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); - } + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + if (flags != 0) + nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); if (set->flags & NFT_SET_MAP) { if (nla[NFTA_SET_ELEM_DATA] == NULL && @@ -3582,9 +3592,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + struct nft_set_ext_tmpl tmpl; struct nft_data_desc desc; struct nft_set_elem elem; + struct nft_set_ext *ext; struct nft_trans *trans; + u32 flags = 0; + void *priv; int err; err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, @@ -3596,6 +3610,14 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_KEY] == NULL) goto err1; + nft_set_ext_prepare(&tmpl); + + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + if (flags != 0) + nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, nla[NFTA_SET_ELEM_KEY]); if (err < 0) @@ -3605,24 +3627,40 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) goto err2; + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len); + + err = -ENOMEM; + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, + GFP_KERNEL); + if (elem.priv == NULL) + goto err2; + + ext = nft_set_elem_ext(set, elem.priv); + if (flags) + *nft_set_ext_flags(ext) = flags; + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (trans == NULL) { err = -ENOMEM; - goto err2; + goto err3; } - elem.priv = set->ops->deactivate(set, &elem); - if (elem.priv == NULL) { + priv = set->ops->deactivate(set, &elem); + if (priv == NULL) { err = -ENOENT; - goto err3; + goto err4; } + kfree(elem.priv); + elem.priv = priv; nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err3: +err4: kfree(trans); +err3: + kfree(elem.priv); err2: nft_data_uninit(&elem.key.val, desc.type); err1: diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 2671b9deb103..3c84f14326f5 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -306,10 +306,10 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) int i; local_bh_disable(); - for (i = 0; i < net->ct.htable_size; i++) { + for (i = 0; i < nf_conntrack_htable_size; i++) { nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + if (i < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) untimeout(h, timeout); } spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 25998facefd0..137e308d5b24 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -198,6 +198,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr, } break; #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + nf_connlabels_replace(ct, + ®s->data[priv->sreg], + ®s->data[priv->sreg], + NF_CT_LABELS_MAX_SIZE / sizeof(u32)); + break; +#endif default: break; } @@ -365,6 +373,16 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, len = FIELD_SIZEOF(struct nf_conn, mark); break; #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = NF_CT_LABELS_MAX_SIZE; + err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); + if (err) + return err; + break; +#endif default: return -EOPNOTSUPP; } @@ -384,6 +402,18 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, static void nft_ct_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { + struct nft_ct *priv = nft_expr_priv(expr); + + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + nf_connlabels_put(ctx->net); + break; +#endif + default: + break; + } + nft_ct_l3proto_module_put(ctx->afi->family); } diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index 1c30f41cff5b..f762094af7c1 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -29,6 +29,17 @@ struct nft_rbtree_elem { struct nft_set_ext ext; }; +static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) +{ + return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && + (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); +} + +static bool nft_rbtree_equal(const struct nft_set *set, const void *this, + const struct nft_rbtree_elem *interval) +{ + return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; +} static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) @@ -37,6 +48,7 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, const struct nft_rbtree_elem *rbe, *interval = NULL; const struct rb_node *parent; u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); + const void *this; int d; spin_lock_bh(&nft_rbtree_lock); @@ -44,9 +56,16 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); + this = nft_set_ext_key(&rbe->ext); + d = memcmp(this, key, set->klen); if (d < 0) { parent = parent->rb_left; + /* In case of adjacent ranges, we always see the high + * part of the range in first place, before the low one. + * So don't update interval if the keys are equal. + */ + if (interval && nft_rbtree_equal(set, this, interval)) + continue; interval = rbe; } else if (d > 0) parent = parent->rb_right; @@ -56,9 +75,7 @@ found: parent = parent->rb_left; continue; } - if (nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(&rbe->ext) & - NFT_SET_ELEM_INTERVAL_END) + if (nft_rbtree_interval_end(rbe)) goto out; spin_unlock_bh(&nft_rbtree_lock); @@ -98,9 +115,16 @@ static int __nft_rbtree_insert(const struct nft_set *set, else if (d > 0) p = &parent->rb_right; else { - if (nft_set_elem_active(&rbe->ext, genmask)) - return -EEXIST; - p = &parent->rb_left; + if (nft_set_elem_active(&rbe->ext, genmask)) { + if (nft_rbtree_interval_end(rbe) && + !nft_rbtree_interval_end(new)) + p = &parent->rb_left; + else if (!nft_rbtree_interval_end(rbe) && + nft_rbtree_interval_end(new)) + p = &parent->rb_right; + else + return -EEXIST; + } } } rb_link_node(&new->node, parent, p); @@ -145,7 +169,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, { const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; - struct nft_rbtree_elem *rbe; + struct nft_rbtree_elem *rbe, *this = elem->priv; u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); int d; @@ -163,6 +187,15 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, parent = parent->rb_left; continue; } + if (nft_rbtree_interval_end(rbe) && + !nft_rbtree_interval_end(this)) { + parent = parent->rb_left; + continue; + } else if (!nft_rbtree_interval_end(rbe) && + nft_rbtree_interval_end(this)) { + parent = parent->rb_right; + continue; + } nft_set_elem_change_active(set, &rbe->ext); return rbe; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 9741a76c7405..9f0bc49fa969 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -439,20 +439,12 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, u8 protonum; l3proto = __nf_ct_l3proto_find(l3num); - if (!l3proto) { - pr_debug("ovs_ct_find_existing: Can't get l3proto\n"); - return NULL; - } if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, &protonum) <= 0) { pr_debug("ovs_ct_find_existing: Can't get protonum\n"); return NULL; } l4proto = __nf_ct_l4proto_find(l3num, protonum); - if (!l4proto) { - pr_debug("ovs_ct_find_existing: Can't get l4proto\n"); - return NULL; - } if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, protonum, net, &tuple, l3proto, l4proto)) { pr_debug("ovs_ct_find_existing: Can't get tuple\n"); |