summaryrefslogtreecommitdiffstats
path: root/net/netfilter
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter')
-rw-r--r--net/netfilter/Kconfig9
-rw-r--r--net/netfilter/Makefile5
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h71
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c19
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c24
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c26
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c40
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c452
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c883
-rw-r--r--net/netfilter/nf_conntrack_bpf.c17
-rw-r--r--net/netfilter/nf_conntrack_core.c30
-rw-r--r--net/netfilter/nf_conntrack_helper.c100
-rw-r--r--net/netfilter/nf_conntrack_proto.c124
-rw-r--r--net/netfilter/nf_conntrack_proto_icmpv6.c53
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c104
-rw-r--r--net/netfilter/nf_conntrack_standalone.c8
-rw-r--r--net/netfilter/nf_flow_table_ip.c8
-rw-r--r--net/netfilter/nf_nat_ovs.c135
-rw-r--r--net/netfilter/nf_tables_api.c90
-rw-r--r--net/netfilter/nf_tables_core.c2
-rw-r--r--net/netfilter/nft_bitwise.c6
-rw-r--r--net/netfilter/nft_byteorder.c3
-rw-r--r--net/netfilter/nft_cmp.c9
-rw-r--r--net/netfilter/nft_compat.c9
-rw-r--r--net/netfilter/nft_connlimit.c3
-rw-r--r--net/netfilter/nft_counter.c5
-rw-r--r--net/netfilter/nft_ct.c6
-rw-r--r--net/netfilter/nft_dup_netdev.c3
-rw-r--r--net/netfilter/nft_dynset.c7
-rw-r--r--net/netfilter/nft_exthdr.c10
-rw-r--r--net/netfilter/nft_fib.c2
-rw-r--r--net/netfilter/nft_flow_offload.c3
-rw-r--r--net/netfilter/nft_fwd_netdev.c6
-rw-r--r--net/netfilter/nft_hash.c4
-rw-r--r--net/netfilter/nft_immediate.c3
-rw-r--r--net/netfilter/nft_inner.c385
-rw-r--r--net/netfilter/nft_last.c3
-rw-r--r--net/netfilter/nft_limit.c5
-rw-r--r--net/netfilter/nft_log.c3
-rw-r--r--net/netfilter/nft_lookup.c3
-rw-r--r--net/netfilter/nft_masq.c3
-rw-r--r--net/netfilter/nft_meta.c67
-rw-r--r--net/netfilter/nft_nat.c3
-rw-r--r--net/netfilter/nft_numgen.c6
-rw-r--r--net/netfilter/nft_objref.c28
-rw-r--r--net/netfilter/nft_osf.c3
-rw-r--r--net/netfilter/nft_payload.c141
-rw-r--r--net/netfilter/nft_queue.c6
-rw-r--r--net/netfilter/nft_quota.c5
-rw-r--r--net/netfilter/nft_range.c3
-rw-r--r--net/netfilter/nft_redir.c3
-rw-r--r--net/netfilter/nft_reject.c3
-rw-r--r--net/netfilter/nft_rt.c2
-rw-r--r--net/netfilter/nft_socket.c2
-rw-r--r--net/netfilter/nft_synproxy.c3
-rw-r--r--net/netfilter/nft_tproxy.c2
-rw-r--r--net/netfilter/nft_tunnel.c2
-rw-r--r--net/netfilter/nft_xfrm.c2
-rw-r--r--net/netfilter/xt_sctp.c1
59 files changed, 2517 insertions, 446 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 4b8d04640ff3..f71b41c7ce2f 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -459,6 +459,9 @@ config NF_NAT_REDIRECT
config NF_NAT_MASQUERADE
bool
+config NF_NAT_OVS
+ bool
+
config NETFILTER_SYNPROXY
tristate
@@ -568,12 +571,6 @@ config NFT_TUNNEL
This option adds the "tunnel" expression that you can use to set
tunneling policies.
-config NFT_OBJREF
- tristate "Netfilter nf_tables stateful object reference module"
- help
- This option adds the "objref" expression that allows you to refer to
- stateful objects, such as counters and quotas.
-
config NFT_QUEUE
depends on NETFILTER_NETLINK_QUEUE
tristate "Netfilter nf_tables queue module"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 0f060d100880..3754eb06fb41 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_NF_LOG_SYSLOG) += nf_log_syslog.o
obj-$(CONFIG_NF_NAT) += nf_nat.o
nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o
+nf_nat-$(CONFIG_NF_NAT_OVS) += nf_nat_ovs.o
ifeq ($(CONFIG_NF_NAT),m)
nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o
@@ -86,7 +87,8 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \
- nft_counter.o nft_chain_route.o nf_tables_offload.o \
+ nft_counter.o nft_objref.o nft_inner.o \
+ nft_chain_route.o nf_tables_offload.o \
nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
nft_set_pipapo.o
@@ -104,7 +106,6 @@ obj-$(CONFIG_NFT_CT) += nft_ct.o
obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o
obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
obj-$(CONFIG_NFT_NAT) += nft_nat.o
-obj-$(CONFIG_NFT_OBJREF) += nft_objref.o
obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
obj-$(CONFIG_NFT_REJECT) += nft_reject.o
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 7499192af586..7c2399541771 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -159,6 +159,17 @@ htable_size(u8 hbits)
(SET_WITH_TIMEOUT(set) && \
ip_set_timeout_expired(ext_timeout(d, set)))
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+static const union nf_inet_addr onesmask = {
+ .all[0] = 0xffffffff,
+ .all[1] = 0xffffffff,
+ .all[2] = 0xffffffff,
+ .all[3] = 0xffffffff
+};
+
+static const union nf_inet_addr zeromask = {};
+#endif
+
#endif /* _IP_SET_HASH_GEN_H */
#ifndef MTYPE
@@ -283,8 +294,9 @@ struct htype {
u32 markmask; /* markmask value for mark mask to store */
#endif
u8 bucketsize; /* max elements in an array block */
-#ifdef IP_SET_HASH_WITH_NETMASK
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
u8 netmask; /* netmask value for subnets to store */
+ union nf_inet_addr bitmask; /* stores bitmask */
#endif
struct list_head ad; /* Resize add|del backlist */
struct mtype_elem next; /* temporary storage for uadd */
@@ -459,8 +471,8 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
/* Resizing changes htable_bits, so we ignore it */
return x->maxelem == y->maxelem &&
a->timeout == b->timeout &&
-#ifdef IP_SET_HASH_WITH_NETMASK
- x->netmask == y->netmask &&
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+ nf_inet_addr_cmp(&x->bitmask, &y->bitmask) &&
#endif
#ifdef IP_SET_HASH_WITH_MARKMASK
x->markmask == y->markmask &&
@@ -1264,9 +1276,21 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
htonl(jhash_size(htable_bits))) ||
nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem)))
goto nla_put_failure;
+#ifdef IP_SET_HASH_WITH_BITMASK
+ /* if netmask is set to anything other than HOST_MASK we know that the user supplied netmask
+ * and not bitmask. These two are mutually exclusive. */
+ if (h->netmask == HOST_MASK && !nf_inet_addr_cmp(&onesmask, &h->bitmask)) {
+ if (set->family == NFPROTO_IPV4) {
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_BITMASK, h->bitmask.ip))
+ goto nla_put_failure;
+ } else if (set->family == NFPROTO_IPV6) {
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_BITMASK, &h->bitmask.in6))
+ goto nla_put_failure;
+ }
+ }
+#endif
#ifdef IP_SET_HASH_WITH_NETMASK
- if (h->netmask != HOST_MASK &&
- nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
+ if (h->netmask != HOST_MASK && nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
goto nla_put_failure;
#endif
#ifdef IP_SET_HASH_WITH_MARKMASK
@@ -1429,8 +1453,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
u32 markmask;
#endif
u8 hbits;
-#ifdef IP_SET_HASH_WITH_NETMASK
- u8 netmask;
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+ int ret __attribute__((unused)) = 0;
+ u8 netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
+ union nf_inet_addr bitmask = onesmask;
#endif
size_t hsize;
struct htype *h;
@@ -1468,7 +1494,6 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#endif
#ifdef IP_SET_HASH_WITH_NETMASK
- netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
if (tb[IPSET_ATTR_NETMASK]) {
netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
@@ -1476,6 +1501,33 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
(set->family == NFPROTO_IPV6 && netmask > 128) ||
netmask == 0)
return -IPSET_ERR_INVALID_NETMASK;
+
+ /* we convert netmask to bitmask and store it */
+ if (set->family == NFPROTO_IPV4)
+ bitmask.ip = ip_set_netmask(netmask);
+ else
+ ip6_netmask(&bitmask, netmask);
+ }
+#endif
+
+#ifdef IP_SET_HASH_WITH_BITMASK
+ if (tb[IPSET_ATTR_BITMASK]) {
+ /* bitmask and netmask do the same thing, allow only one of these options */
+ if (tb[IPSET_ATTR_NETMASK])
+ return -IPSET_ERR_BITMASK_NETMASK_EXCL;
+
+ if (set->family == NFPROTO_IPV4) {
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_BITMASK], &bitmask.ip);
+ if (ret || !bitmask.ip)
+ return -IPSET_ERR_INVALID_NETMASK;
+ } else if (set->family == NFPROTO_IPV6) {
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_BITMASK], &bitmask);
+ if (ret || ipv6_addr_any(&bitmask.in6))
+ return -IPSET_ERR_INVALID_NETMASK;
+ }
+
+ if (nf_inet_addr_cmp(&bitmask, &zeromask))
+ return -IPSET_ERR_INVALID_NETMASK;
}
#endif
@@ -1518,7 +1570,8 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
for (i = 0; i < ahash_numof_locks(hbits); i++)
spin_lock_init(&t->hregion[i].lock);
h->maxelem = maxelem;
-#ifdef IP_SET_HASH_WITH_NETMASK
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+ h->bitmask = bitmask;
h->netmask = netmask;
#endif
#ifdef IP_SET_HASH_WITH_MARKMASK
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 75d556d71652..e30513cefd90 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -24,7 +24,8 @@
/* 2 Comments support */
/* 3 Forceadd support */
/* 4 skbinfo support */
-#define IPSET_TYPE_REV_MAX 5 /* bucketsize, initval support */
+/* 5 bucketsize, initval support */
+#define IPSET_TYPE_REV_MAX 6 /* bitmask support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -34,6 +35,7 @@ MODULE_ALIAS("ip_set_hash:ip");
/* Type specific function prefix */
#define HTYPE hash_ip
#define IP_SET_HASH_WITH_NETMASK
+#define IP_SET_HASH_WITH_BITMASK
/* IPv4 variant */
@@ -86,7 +88,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
__be32 ip;
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip);
- ip &= ip_set_netmask(h->netmask);
+ ip &= h->bitmask.ip;
if (ip == 0)
return -EINVAL;
@@ -119,7 +121,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
- ip &= ip_set_hostmask(h->netmask);
+ ip &= ntohl(h->bitmask.ip);
e.ip = htonl(ip);
if (e.ip == 0)
return -IPSET_ERR_HASH_ELEM;
@@ -185,12 +187,6 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6);
}
-static void
-hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix)
-{
- ip6_netmask(ip, prefix);
-}
-
static bool
hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e)
{
@@ -227,7 +223,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
- hash_ip6_netmask(&e.ip, h->netmask);
+ nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
if (ipv6_addr_any(&e.ip.in6))
return -EINVAL;
@@ -266,7 +262,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
- hash_ip6_netmask(&e.ip, h->netmask);
+ nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
if (ipv6_addr_any(&e.ip.in6))
return -IPSET_ERR_HASH_ELEM;
@@ -293,6 +289,7 @@ static struct ip_set_type hash_ip_type __read_mostly = {
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 7303138e46be..2ffbd0b78a8c 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -26,7 +26,8 @@
/* 3 Comments support added */
/* 4 Forceadd support added */
/* 5 skbinfo support added */
-#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */
+/* 6 bucketsize, initval support added */
+#define IPSET_TYPE_REV_MAX 7 /* bitmask support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -35,6 +36,8 @@ MODULE_ALIAS("ip_set_hash:ip,port");
/* Type specific function prefix */
#define HTYPE hash_ipport
+#define IP_SET_HASH_WITH_NETMASK
+#define IP_SET_HASH_WITH_BITMASK
/* IPv4 variant */
@@ -92,12 +95,16 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipport4_elem e = { .ip = 0 };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ const struct MTYPE *h = set->data;
if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
&e.port, &e.proto))
return -EINVAL;
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= h->bitmask.ip;
+ if (e.ip == 0)
+ return -EINVAL;
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -129,6 +136,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
+ e.ip &= h->bitmask.ip;
+ if (e.ip == 0)
+ return -EINVAL;
+
e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
@@ -253,12 +264,17 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipport6_elem e = { .ip = { .all = { 0 } } };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ const struct MTYPE *h = set->data;
if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
&e.port, &e.proto))
return -EINVAL;
ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
+ if (ipv6_addr_any(&e.ip.in6))
+ return -EINVAL;
+
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -298,6 +314,10 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
+ nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
+ if (ipv6_addr_any(&e.ip.in6))
+ return -EINVAL;
+
e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
@@ -356,6 +376,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index 3d09eefe998a..cdfb78c6e0d3 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -23,7 +23,8 @@
#define IPSET_TYPE_REV_MIN 0
/* 1 Forceadd support added */
/* 2 skbinfo support added */
-#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */
+/* 3 bucketsize, initval support added */
+#define IPSET_TYPE_REV_MAX 4 /* bitmask support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
@@ -33,6 +34,8 @@ MODULE_ALIAS("ip_set_hash:net,net");
/* Type specific function prefix */
#define HTYPE hash_netnet
#define IP_SET_HASH_WITH_NETS
+#define IP_SET_HASH_WITH_NETMASK
+#define IP_SET_HASH_WITH_BITMASK
#define IPSET_NET_COUNT 2
/* IPv4 variants */
@@ -153,8 +156,8 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]);
ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]);
- e.ip[0] &= ip_set_netmask(e.cidr[0]);
- e.ip[1] &= ip_set_netmask(e.cidr[1]);
+ e.ip[0] &= (ip_set_netmask(e.cidr[0]) & h->bitmask.ip);
+ e.ip[1] &= (ip_set_netmask(e.cidr[1]) & h->bitmask.ip);
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -213,8 +216,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] ||
tb[IPSET_ATTR_IP2_TO])) {
- e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0]));
- e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1]));
+ e.ip[0] = htonl(ip & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[0]));
+ e.ip[1] = htonl(ip2_from & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[1]));
ret = adtfn(set, &e, &ext, &ext, flags);
return ip_set_enomatch(ret, flags, adt, set) ? -ret :
ip_set_eexist(ret, flags) ? 0 : ret;
@@ -404,6 +407,11 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
ip6_netmask(&e.ip[0], e.cidr[0]);
ip6_netmask(&e.ip[1], e.cidr[1]);
+ nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask);
+ nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask);
+ if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6))
+ return -EINVAL;
+
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -414,6 +422,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ const struct hash_netnet6 *h = set->data;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -453,6 +462,11 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
ip6_netmask(&e.ip[0], e.cidr[0]);
ip6_netmask(&e.ip[1], e.cidr[1]);
+ nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask);
+ nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask);
+ if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6))
+ return -IPSET_ERR_HASH_ELEM;
+
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
@@ -484,6 +498,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = {
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 51ad557a525b..2fcc26507d69 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -132,21 +132,21 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
s = this_cpu_ptr(dest->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.inpkts++;
- s->cnt.inbytes += skb->len;
+ u64_stats_inc(&s->cnt.inpkts);
+ u64_stats_add(&s->cnt.inbytes, skb->len);
u64_stats_update_end(&s->syncp);
svc = rcu_dereference(dest->svc);
s = this_cpu_ptr(svc->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.inpkts++;
- s->cnt.inbytes += skb->len;
+ u64_stats_inc(&s->cnt.inpkts);
+ u64_stats_add(&s->cnt.inbytes, skb->len);
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.inpkts++;
- s->cnt.inbytes += skb->len;
+ u64_stats_inc(&s->cnt.inpkts);
+ u64_stats_add(&s->cnt.inbytes, skb->len);
u64_stats_update_end(&s->syncp);
local_bh_enable();
@@ -168,21 +168,21 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
s = this_cpu_ptr(dest->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.outpkts++;
- s->cnt.outbytes += skb->len;
+ u64_stats_inc(&s->cnt.outpkts);
+ u64_stats_add(&s->cnt.outbytes, skb->len);
u64_stats_update_end(&s->syncp);
svc = rcu_dereference(dest->svc);
s = this_cpu_ptr(svc->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.outpkts++;
- s->cnt.outbytes += skb->len;
+ u64_stats_inc(&s->cnt.outpkts);
+ u64_stats_add(&s->cnt.outbytes, skb->len);
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.outpkts++;
- s->cnt.outbytes += skb->len;
+ u64_stats_inc(&s->cnt.outpkts);
+ u64_stats_add(&s->cnt.outbytes, skb->len);
u64_stats_update_end(&s->syncp);
local_bh_enable();
@@ -200,17 +200,17 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
s = this_cpu_ptr(cp->dest->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.conns++;
+ u64_stats_inc(&s->cnt.conns);
u64_stats_update_end(&s->syncp);
s = this_cpu_ptr(svc->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.conns++;
+ u64_stats_inc(&s->cnt.conns);
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.conns++;
+ u64_stats_inc(&s->cnt.conns);
u64_stats_update_end(&s->syncp);
local_bh_enable();
@@ -2448,6 +2448,10 @@ static void __exit ip_vs_cleanup(void)
ip_vs_conn_cleanup();
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
+ /* common rcu_barrier() used by:
+ * - ip_vs_control_cleanup()
+ */
+ rcu_barrier();
pr_info("ipvs unloaded.\n");
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 988222fff9f0..c9f598505642 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -49,8 +49,7 @@
MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);
-/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
-static DEFINE_MUTEX(__ip_vs_mutex);
+DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */
/* sysctl variables */
@@ -241,6 +240,47 @@ static void defense_work_handler(struct work_struct *work)
}
#endif
+static void est_reload_work_handler(struct work_struct *work)
+{
+ struct netns_ipvs *ipvs =
+ container_of(work, struct netns_ipvs, est_reload_work.work);
+ int genid_done = atomic_read(&ipvs->est_genid_done);
+ unsigned long delay = HZ / 10; /* repeat startups after failure */
+ bool repeat = false;
+ int genid;
+ int id;
+
+ mutex_lock(&ipvs->est_mutex);
+ genid = atomic_read(&ipvs->est_genid);
+ for (id = 0; id < ipvs->est_kt_count; id++) {
+ struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
+
+ /* netns clean up started, abort delayed work */
+ if (!ipvs->enable)
+ goto unlock;
+ if (!kd)
+ continue;
+ /* New config ? Stop kthread tasks */
+ if (genid != genid_done)
+ ip_vs_est_kthread_stop(kd);
+ if (!kd->task && !ip_vs_est_stopped(ipvs)) {
+ /* Do not start kthreads above 0 in calc phase */
+ if ((!id || !ipvs->est_calc_phase) &&
+ ip_vs_est_kthread_start(ipvs, kd) < 0)
+ repeat = true;
+ }
+ }
+
+ atomic_set(&ipvs->est_genid_done, genid);
+
+ if (repeat)
+ queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
+ delay);
+
+unlock:
+ mutex_unlock(&ipvs->est_mutex);
+}
+
int
ip_vs_use_count_inc(void)
{
@@ -471,7 +511,7 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
static void ip_vs_service_free(struct ip_vs_service *svc)
{
- free_percpu(svc->stats.cpustats);
+ ip_vs_stats_release(&svc->stats);
kfree(svc);
}
@@ -483,17 +523,14 @@ static void ip_vs_service_rcu_free(struct rcu_head *head)
ip_vs_service_free(svc);
}
-static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
+static void __ip_vs_svc_put(struct ip_vs_service *svc)
{
if (atomic_dec_and_test(&svc->refcnt)) {
IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port));
- if (do_delay)
- call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
- else
- ip_vs_service_free(svc);
+ call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
}
}
@@ -780,14 +817,22 @@ out:
return dest;
}
+static void ip_vs_dest_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest *dest;
+
+ dest = container_of(head, struct ip_vs_dest, rcu_head);
+ ip_vs_stats_release(&dest->stats);
+ ip_vs_dest_put_and_free(dest);
+}
+
static void ip_vs_dest_free(struct ip_vs_dest *dest)
{
struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
__ip_vs_dst_cache_reset(dest);
- __ip_vs_svc_put(svc, false);
- free_percpu(dest->stats.cpustats);
- ip_vs_dest_put_and_free(dest);
+ __ip_vs_svc_put(svc);
+ call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
}
/*
@@ -811,12 +856,22 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
}
}
+static void ip_vs_stats_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_stats_rcu *rs = container_of(head,
+ struct ip_vs_stats_rcu,
+ rcu_head);
+
+ ip_vs_stats_release(&rs->s);
+ kfree(rs);
+}
+
static void
ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
{
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
- spin_lock_bh(&src->lock);
+ spin_lock(&src->lock);
IP_VS_SHOW_STATS_COUNTER(conns);
IP_VS_SHOW_STATS_COUNTER(inpkts);
@@ -826,7 +881,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
ip_vs_read_estimator(dst, src);
- spin_unlock_bh(&src->lock);
+ spin_unlock(&src->lock);
}
static void
@@ -847,7 +902,7 @@ ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
- spin_lock_bh(&stats->lock);
+ spin_lock(&stats->lock);
/* get current counters as zero point, rates are zeroed */
@@ -861,7 +916,48 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
ip_vs_zero_estimator(stats);
- spin_unlock_bh(&stats->lock);
+ spin_unlock(&stats->lock);
+}
+
+/* Allocate fields after kzalloc */
+int ip_vs_stats_init_alloc(struct ip_vs_stats *s)
+{
+ int i;
+
+ spin_lock_init(&s->lock);
+ s->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!s->cpustats)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i);
+
+ u64_stats_init(&cs->syncp);
+ }
+ return 0;
+}
+
+struct ip_vs_stats *ip_vs_stats_alloc(void)
+{
+ struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL);
+
+ if (s && ip_vs_stats_init_alloc(s) >= 0)
+ return s;
+ kfree(s);
+ return NULL;
+}
+
+void ip_vs_stats_release(struct ip_vs_stats *stats)
+{
+ free_percpu(stats->cpustats);
+}
+
+void ip_vs_stats_free(struct ip_vs_stats *stats)
+{
+ if (stats) {
+ ip_vs_stats_release(stats);
+ kfree(stats);
+ }
}
/*
@@ -923,7 +1019,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
if (old_svc != svc) {
ip_vs_zero_stats(&dest->stats);
__ip_vs_bind_svc(dest, svc);
- __ip_vs_svc_put(old_svc, true);
+ __ip_vs_svc_put(old_svc);
}
}
@@ -942,7 +1038,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
spin_unlock_bh(&dest->dst_lock);
if (add) {
- ip_vs_start_estimator(svc->ipvs, &dest->stats);
list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
sched = rcu_dereference_protected(svc->scheduler, 1);
@@ -963,14 +1058,13 @@ static int
ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
- unsigned int atype, i;
+ unsigned int atype;
+ int ret;
EnterFunction(2);
#ifdef CONFIG_IP_VS_IPV6
if (udest->af == AF_INET6) {
- int ret;
-
atype = ipv6_addr_type(&udest->addr.in6);
if ((!(atype & IPV6_ADDR_UNICAST) ||
atype & IPV6_ADDR_LINKLOCAL) &&
@@ -992,15 +1086,13 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
if (dest == NULL)
return -ENOMEM;
- dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!dest->stats.cpustats)
+ ret = ip_vs_stats_init_alloc(&dest->stats);
+ if (ret < 0)
goto err_alloc;
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *ip_vs_dest_stats;
- ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
- u64_stats_init(&ip_vs_dest_stats->syncp);
- }
+ ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+ if (ret < 0)
+ goto err_stats;
dest->af = udest->af;
dest->protocol = svc->protocol;
@@ -1017,15 +1109,17 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
INIT_HLIST_NODE(&dest->d_list);
spin_lock_init(&dest->dst_lock);
- spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
LeaveFunction(2);
return 0;
+err_stats:
+ ip_vs_stats_release(&dest->stats);
+
err_alloc:
kfree(dest);
- return -ENOMEM;
+ return ret;
}
@@ -1087,14 +1181,18 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
+ ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+ if (ret < 0)
+ goto err;
__ip_vs_update_dest(svc, dest, udest, 1);
- ret = 0;
} else {
/*
* Allocate and initialize the dest structure
*/
ret = ip_vs_new_dest(svc, udest);
}
+
+err:
LeaveFunction(2);
return ret;
@@ -1284,7 +1382,7 @@ static int
ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
- int ret = 0, i;
+ int ret = 0;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
@@ -1344,18 +1442,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
ret = -ENOMEM;
goto out_err;
}
- svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!svc->stats.cpustats) {
- ret = -ENOMEM;
+ ret = ip_vs_stats_init_alloc(&svc->stats);
+ if (ret < 0)
goto out_err;
- }
-
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *ip_vs_stats;
- ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
- u64_stats_init(&ip_vs_stats->syncp);
- }
-
/* I'm the first user of the service */
atomic_set(&svc->refcnt, 0);
@@ -1372,7 +1461,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
INIT_LIST_HEAD(&svc->destinations);
spin_lock_init(&svc->sched_lock);
- spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
if (sched) {
@@ -1382,6 +1470,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
sched = NULL;
}
+ ret = ip_vs_start_estimator(ipvs, &svc->stats);
+ if (ret < 0)
+ goto out_err;
+
/* Bind the ct retriever */
RCU_INIT_POINTER(svc->pe, pe);
pe = NULL;
@@ -1394,8 +1486,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
if (svc->pe && svc->pe->conn_out)
atomic_inc(&ipvs->conn_out_counter);
- ip_vs_start_estimator(ipvs, &svc->stats);
-
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
ipvs->num_services++;
@@ -1406,8 +1496,15 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
ip_vs_svc_hash(svc);
*svc_p = svc;
- /* Now there is a service - full throttle */
- ipvs->enable = 1;
+
+ if (!ipvs->enable) {
+ /* Now there is a service - full throttle */
+ ipvs->enable = 1;
+
+ /* Start estimation for first time */
+ ip_vs_est_reload_start(ipvs);
+ }
+
return 0;
@@ -1571,7 +1668,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
/*
* Free the service if nobody refers to it
*/
- __ip_vs_svc_put(svc, true);
+ __ip_vs_svc_put(svc);
/* decrease the module use count */
ip_vs_use_count_dec();
@@ -1761,7 +1858,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
}
}
- ip_vs_zero_stats(&ipvs->tot_stats);
+ ip_vs_zero_stats(&ipvs->tot_stats->s);
return 0;
}
@@ -1843,6 +1940,148 @@ proc_do_sync_ports(struct ctl_table *table, int write,
return rc;
}
+static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ cpumask_var_t *valp = table->data;
+ cpumask_var_t newmask;
+ int ret;
+
+ if (!zalloc_cpumask_var(&newmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = cpulist_parse(buffer, newmask);
+ if (ret)
+ goto out;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ if (!ipvs->est_cpulist_valid) {
+ if (!zalloc_cpumask_var(valp, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ ipvs->est_cpulist_valid = 1;
+ }
+ cpumask_and(newmask, newmask, &current->cpus_mask);
+ cpumask_copy(*valp, newmask);
+ /* est_max_threads may depend on cpulist size */
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+ ipvs->est_calc_phase = 1;
+ ip_vs_est_reload_start(ipvs);
+
+unlock:
+ mutex_unlock(&ipvs->est_mutex);
+
+out:
+ free_cpumask_var(newmask);
+ return ret;
+}
+
+static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer,
+ size_t size)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ cpumask_var_t *valp = table->data;
+ struct cpumask *mask;
+ int ret;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ if (ipvs->est_cpulist_valid)
+ mask = *valp;
+ else
+ mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
+ ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
+
+ mutex_unlock(&ipvs->est_mutex);
+
+ return ret;
+}
+
+static int ipvs_proc_est_cpulist(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ /* Ignore both read and write(append) if *ppos not 0 */
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+ if (write) {
+ /* proc_sys_call_handler() appends terminator */
+ ret = ipvs_proc_est_cpumask_set(table, buffer);
+ if (ret >= 0)
+ *ppos += *lenp;
+ } else {
+ /* proc_sys_call_handler() allocates 1 byte for terminator */
+ ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1);
+ if (ret >= 0) {
+ *lenp = ret;
+ *ppos += *lenp;
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+static int ipvs_proc_est_nice(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ int *valp = table->data;
+ int val = *valp;
+ int ret;
+
+ struct ctl_table tmp_table = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (write && ret >= 0) {
+ if (val < MIN_NICE || val > MAX_NICE) {
+ ret = -EINVAL;
+ } else {
+ mutex_lock(&ipvs->est_mutex);
+ if (*valp != val) {
+ *valp = val;
+ ip_vs_est_reload_start(ipvs);
+ }
+ mutex_unlock(&ipvs->est_mutex);
+ }
+ }
+ return ret;
+}
+
+static int ipvs_proc_run_estimation(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ int *valp = table->data;
+ int val = *valp;
+ int ret;
+
+ struct ctl_table tmp_table = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (write && ret >= 0) {
+ mutex_lock(&ipvs->est_mutex);
+ if (*valp != val) {
+ *valp = val;
+ ip_vs_est_reload_start(ipvs);
+ }
+ mutex_unlock(&ipvs->est_mutex);
+ }
+ return ret;
+}
+
/*
* IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
* Do not change order or insert new entries without
@@ -2017,7 +2256,19 @@ static struct ctl_table vs_vars[] = {
.procname = "run_estimation",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = ipvs_proc_run_estimation,
+ },
+ {
+ .procname = "est_cpulist",
+ .maxlen = NR_CPUS, /* unused */
+ .mode = 0644,
+ .proc_handler = ipvs_proc_est_cpulist,
+ },
+ {
+ .procname = "est_nice",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipvs_proc_est_nice,
},
#ifdef CONFIG_IP_VS_DEBUG
{
@@ -2255,7 +2506,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
seq_puts(seq,
" Conns Packets Packets Bytes Bytes\n");
- ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
+ ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s);
seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
(unsigned long long)show.conns,
(unsigned long long)show.inpkts,
@@ -2279,7 +2530,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_single_net(seq);
- struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
+ struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s;
struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
struct ip_vs_kstats kstats;
int i;
@@ -2296,13 +2547,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
u64 conns, inpkts, outpkts, inbytes, outbytes;
do {
- start = u64_stats_fetch_begin_irq(&u->syncp);
- conns = u->cnt.conns;
- inpkts = u->cnt.inpkts;
- outpkts = u->cnt.outpkts;
- inbytes = u->cnt.inbytes;
- outbytes = u->cnt.outbytes;
- } while (u64_stats_fetch_retry_irq(&u->syncp, start));
+ start = u64_stats_fetch_begin(&u->syncp);
+ conns = u64_stats_read(&u->cnt.conns);
+ inpkts = u64_stats_read(&u->cnt.inpkts);
+ outpkts = u64_stats_read(&u->cnt.outpkts);
+ inbytes = u64_stats_read(&u->cnt.inbytes);
+ outbytes = u64_stats_read(&u->cnt.outbytes);
+ } while (u64_stats_fetch_retry(&u->syncp, start));
seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
i, (u64)conns, (u64)inpkts,
@@ -4027,13 +4278,17 @@ static void ip_vs_genl_unregister(void)
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
struct net *net = ipvs->net;
- int idx;
struct ctl_table *tbl;
+ int idx, ret;
atomic_set(&ipvs->dropentry, 0);
spin_lock_init(&ipvs->dropentry_lock);
spin_lock_init(&ipvs->droppacket_lock);
spin_lock_init(&ipvs->securetcp_lock);
+ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+ INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
+ expire_nodest_conn_handler);
+ ipvs->est_stopped = 0;
if (!net_eq(net, &init_net)) {
tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
@@ -4094,31 +4349,44 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
ipvs->sysctl_run_estimation = 1;
+ tbl[idx].extra2 = ipvs;
tbl[idx++].data = &ipvs->sysctl_run_estimation;
+
+ ipvs->est_cpulist_valid = 0;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_est_cpulist;
+
+ ipvs->sysctl_est_nice = IPVS_EST_NICE;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_est_nice;
+
#ifdef CONFIG_IP_VS_DEBUG
/* Global sysctls must be ro in non-init netns */
if (!net_eq(net, &init_net))
tbl[idx++].mode = 0444;
#endif
+ ret = -ENOMEM;
ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
- if (ipvs->sysctl_hdr == NULL) {
- if (!net_eq(net, &init_net))
- kfree(tbl);
- return -ENOMEM;
- }
- ip_vs_start_estimator(ipvs, &ipvs->tot_stats);
+ if (!ipvs->sysctl_hdr)
+ goto err;
ipvs->sysctl_tbl = tbl;
+
+ ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
+ if (ret < 0)
+ goto err;
+
/* Schedule defense work */
- INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
queue_delayed_work(system_long_wq, &ipvs->defense_work,
DEFENSE_TIMER_PERIOD);
- /* Init delayed work for expiring no dest conn */
- INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
- expire_nodest_conn_handler);
-
return 0;
+
+err:
+ unregister_net_sysctl_table(ipvs->sysctl_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(tbl);
+ return ret;
}
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
@@ -4129,7 +4397,10 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
cancel_delayed_work_sync(&ipvs->defense_work);
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
- ip_vs_stop_estimator(ipvs, &ipvs->tot_stats);
+ ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+
+ if (ipvs->est_cpulist_valid)
+ free_cpumask_var(ipvs->sysctl_est_cpulist);
if (!net_eq(net, &init_net))
kfree(ipvs->sysctl_tbl);
@@ -4151,7 +4422,8 @@ static struct notifier_block ip_vs_dst_notifier = {
int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
- int i, idx;
+ int ret = -ENOMEM;
+ int idx;
/* Initialize rs_table */
for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
@@ -4164,18 +4436,14 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
atomic_set(&ipvs->nullsvc_counter, 0);
atomic_set(&ipvs->conn_out_counter, 0);
- /* procfs stats */
- ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!ipvs->tot_stats.cpustats)
- return -ENOMEM;
-
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *ipvs_tot_stats;
- ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
- u64_stats_init(&ipvs_tot_stats->syncp);
- }
+ INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
- spin_lock_init(&ipvs->tot_stats.lock);
+ /* procfs stats */
+ ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL);
+ if (!ipvs->tot_stats)
+ goto out;
+ if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
+ goto err_tot_stats;
#ifdef CONFIG_PROC_FS
if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
@@ -4190,7 +4458,8 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
goto err_percpu;
#endif
- if (ip_vs_control_net_init_sysctl(ipvs))
+ ret = ip_vs_control_net_init_sysctl(ipvs);
+ if (ret < 0)
goto err;
return 0;
@@ -4207,20 +4476,26 @@ err_stats:
err_vs:
#endif
- free_percpu(ipvs->tot_stats.cpustats);
- return -ENOMEM;
+ ip_vs_stats_release(&ipvs->tot_stats->s);
+
+err_tot_stats:
+ kfree(ipvs->tot_stats);
+
+out:
+ return ret;
}
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
ip_vs_trash_cleanup(ipvs);
ip_vs_control_net_cleanup_sysctl(ipvs);
+ cancel_delayed_work_sync(&ipvs->est_reload_work);
#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
remove_proc_entry("ip_vs", ipvs->net->proc_net);
#endif
- free_percpu(ipvs->tot_stats.cpustats);
+ call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
}
int __init ip_vs_register_nl_ioctl(void)
@@ -4280,5 +4555,6 @@ void ip_vs_control_cleanup(void)
{
EnterFunction(2);
unregister_netdevice_notifier(&ip_vs_dst_notifier);
+ /* relying on common rcu_barrier() in ip_vs_cleanup() */
LeaveFunction(2);
}
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 9a1a7af6a186..ce2a1549b304 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -30,9 +30,6 @@
long interval, it is easy to implement a user level daemon which
periodically reads those statistical counters and measure rate.
- Currently, the measurement is activated by slow timer handler. Hope
- this measurement will not introduce too much load.
-
We measure rate during the last 8 seconds every 2 seconds:
avgrate = avgrate*(1-W) + rate*W
@@ -47,68 +44,79 @@
to 32-bit values for conns, packets, bps, cps and pps.
* A lot of code is taken from net/core/gen_estimator.c
- */
-
-/*
- * Make a summary from each cpu
+ KEY POINTS:
+ - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled
+ - kthreads read the cpustats to update the estimators (svcs, dests, total)
+ - the states of estimators can be read (get stats) or modified (zero stats)
+ from processes
+
+ KTHREADS:
+ - estimators are added initially to est_temp_list and later kthread 0
+ distributes them to one or many kthreads for estimation
+ - kthread contexts are created and attached to array
+ - the kthread tasks are started when first service is added, before that
+ the total stats are not estimated
+ - when configuration (cpulist/nice) is changed, the tasks are restarted
+ by work (est_reload_work)
+ - kthread tasks are stopped while the cpulist is empty
+ - the kthread context holds lists with estimators (chains) which are
+ processed every 2 seconds
+ - as estimators can be added dynamically and in bursts, we try to spread
+ them to multiple chains which are estimated at different time
+ - on start, kthread 0 enters calculation phase to determine the chain limits
+ and the limit of estimators per kthread
+ - est_add_ktid: ktid where to add new ests, can point to empty slot where
+ we should add kt data
*/
-static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum,
- struct ip_vs_cpu_stats __percpu *stats)
-{
- int i;
- bool add = false;
-
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
- unsigned int start;
- u64 conns, inpkts, outpkts, inbytes, outbytes;
- if (add) {
- do {
- start = u64_stats_fetch_begin(&s->syncp);
- conns = s->cnt.conns;
- inpkts = s->cnt.inpkts;
- outpkts = s->cnt.outpkts;
- inbytes = s->cnt.inbytes;
- outbytes = s->cnt.outbytes;
- } while (u64_stats_fetch_retry(&s->syncp, start));
- sum->conns += conns;
- sum->inpkts += inpkts;
- sum->outpkts += outpkts;
- sum->inbytes += inbytes;
- sum->outbytes += outbytes;
- } else {
- add = true;
- do {
- start = u64_stats_fetch_begin(&s->syncp);
- sum->conns = s->cnt.conns;
- sum->inpkts = s->cnt.inpkts;
- sum->outpkts = s->cnt.outpkts;
- sum->inbytes = s->cnt.inbytes;
- sum->outbytes = s->cnt.outbytes;
- } while (u64_stats_fetch_retry(&s->syncp, start));
- }
- }
-}
+static struct lock_class_key __ipvs_est_key;
+static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
+static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);
-static void estimation_timer(struct timer_list *t)
+static void ip_vs_chain_estimation(struct hlist_head *chain)
{
struct ip_vs_estimator *e;
+ struct ip_vs_cpu_stats *c;
struct ip_vs_stats *s;
u64 rate;
- struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer);
- if (!sysctl_run_estimation(ipvs))
- goto skip;
+ hlist_for_each_entry_rcu(e, chain, list) {
+ u64 conns, inpkts, outpkts, inbytes, outbytes;
+ u64 kconns = 0, kinpkts = 0, koutpkts = 0;
+ u64 kinbytes = 0, koutbytes = 0;
+ unsigned int start;
+ int i;
+
+ if (kthread_should_stop())
+ break;
- spin_lock(&ipvs->est_lock);
- list_for_each_entry(e, &ipvs->est_list, list) {
s = container_of(e, struct ip_vs_stats, est);
+ for_each_possible_cpu(i) {
+ c = per_cpu_ptr(s->cpustats, i);
+ do {
+ start = u64_stats_fetch_begin(&c->syncp);
+ conns = u64_stats_read(&c->cnt.conns);
+ inpkts = u64_stats_read(&c->cnt.inpkts);
+ outpkts = u64_stats_read(&c->cnt.outpkts);
+ inbytes = u64_stats_read(&c->cnt.inbytes);
+ outbytes = u64_stats_read(&c->cnt.outbytes);
+ } while (u64_stats_fetch_retry(&c->syncp, start));
+ kconns += conns;
+ kinpkts += inpkts;
+ koutpkts += outpkts;
+ kinbytes += inbytes;
+ koutbytes += outbytes;
+ }
spin_lock(&s->lock);
- ip_vs_read_cpu_stats(&s->kstats, s->cpustats);
+
+ s->kstats.conns = kconns;
+ s->kstats.inpkts = kinpkts;
+ s->kstats.outpkts = koutpkts;
+ s->kstats.inbytes = kinbytes;
+ s->kstats.outbytes = koutbytes;
/* scaled by 2^10, but divided 2 seconds */
rate = (s->kstats.conns - e->last_conns) << 9;
@@ -133,30 +141,758 @@ static void estimation_timer(struct timer_list *t)
e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
spin_unlock(&s->lock);
}
- spin_unlock(&ipvs->est_lock);
+}
-skip:
- mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
+static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
+{
+ struct ip_vs_est_tick_data *td;
+ int cid;
+
+ rcu_read_lock();
+ td = rcu_dereference(kd->ticks[row]);
+ if (!td)
+ goto out;
+ for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
+ if (kthread_should_stop())
+ break;
+ ip_vs_chain_estimation(&td->chains[cid]);
+ cond_resched_rcu();
+ td = rcu_dereference(kd->ticks[row]);
+ if (!td)
+ break;
+ }
+
+out:
+ rcu_read_unlock();
}
-void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
+static int ip_vs_estimation_kthread(void *data)
{
- struct ip_vs_estimator *est = &stats->est;
+ struct ip_vs_est_kt_data *kd = data;
+ struct netns_ipvs *ipvs = kd->ipvs;
+ int row = kd->est_row;
+ unsigned long now;
+ int id = kd->id;
+ long gap;
+
+ if (id > 0) {
+ if (!ipvs->est_chain_max)
+ return 0;
+ } else {
+ if (!ipvs->est_chain_max) {
+ ipvs->est_calc_phase = 1;
+ /* commit est_calc_phase before reading est_genid */
+ smp_mb();
+ }
+
+ /* kthread 0 will handle the calc phase */
+ if (ipvs->est_calc_phase)
+ ip_vs_est_calc_phase(ipvs);
+ }
+
+ while (1) {
+ if (!id && !hlist_empty(&ipvs->est_temp_list))
+ ip_vs_est_drain_temp_list(ipvs);
+ set_current_state(TASK_IDLE);
+ if (kthread_should_stop())
+ break;
+
+ /* before estimation, check if we should sleep */
+ now = jiffies;
+ gap = kd->est_timer - now;
+ if (gap > 0) {
+ if (gap > IPVS_EST_TICK) {
+ kd->est_timer = now - IPVS_EST_TICK;
+ gap = IPVS_EST_TICK;
+ }
+ schedule_timeout(gap);
+ } else {
+ __set_current_state(TASK_RUNNING);
+ if (gap < -8 * IPVS_EST_TICK)
+ kd->est_timer = now;
+ }
+
+ if (kd->tick_len[row])
+ ip_vs_tick_estimation(kd, row);
+
+ row++;
+ if (row >= IPVS_EST_NTICKS)
+ row = 0;
+ WRITE_ONCE(kd->est_row, row);
+ kd->est_timer += IPVS_EST_TICK;
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return 0;
+}
+
+/* Schedule stop/start for kthread tasks */
+void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
+{
+ /* Ignore reloads before first service is added */
+ if (!ipvs->enable)
+ return;
+ ip_vs_est_stopped_recalc(ipvs);
+ /* Bump the kthread configuration genid */
+ atomic_inc(&ipvs->est_genid);
+ queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
+}
+
+/* Start kthread task with current configuration */
+int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
+ struct ip_vs_est_kt_data *kd)
+{
+ unsigned long now;
+ int ret = 0;
+ long gap;
+
+ lockdep_assert_held(&ipvs->est_mutex);
+
+ if (kd->task)
+ goto out;
+ now = jiffies;
+ gap = kd->est_timer - now;
+ /* Sync est_timer if task is starting later */
+ if (abs(gap) > 4 * IPVS_EST_TICK)
+ kd->est_timer = now;
+ kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
+ ipvs->gen, kd->id);
+ if (IS_ERR(kd->task)) {
+ ret = PTR_ERR(kd->task);
+ kd->task = NULL;
+ goto out;
+ }
+
+ set_user_nice(kd->task, sysctl_est_nice(ipvs));
+ set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));
+
+ pr_info("starting estimator thread %d...\n", kd->id);
+ wake_up_process(kd->task);
+
+out:
+ return ret;
+}
+
+void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
+{
+ if (kd->task) {
+ pr_info("stopping estimator thread %d...\n", kd->id);
+ kthread_stop(kd->task);
+ kd->task = NULL;
+ }
+}
+
+/* Apply parameters to kthread */
+static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
+ struct ip_vs_est_kt_data *kd)
+{
+ kd->chain_max = ipvs->est_chain_max;
+ /* We are using single chain on RCU preemption */
+ if (IPVS_EST_TICK_CHAINS == 1)
+ kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
+ kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
+ kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
+}
+
+/* Create and start estimation kthread in a free or new array slot */
+static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_est_kt_data *kd = NULL;
+ int id = ipvs->est_kt_count;
+ int ret = -ENOMEM;
+ void *arr = NULL;
+ int i;
+
+ if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
+ ipvs->enable && ipvs->est_max_threads)
+ return -EINVAL;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ for (i = 0; i < id; i++) {
+ if (!ipvs->est_kt_arr[i])
+ break;
+ }
+ if (i >= id) {
+ arr = krealloc_array(ipvs->est_kt_arr, id + 1,
+ sizeof(struct ip_vs_est_kt_data *),
+ GFP_KERNEL);
+ if (!arr)
+ goto out;
+ ipvs->est_kt_arr = arr;
+ } else {
+ id = i;
+ }
- INIT_LIST_HEAD(&est->list);
+ kd = kzalloc(sizeof(*kd), GFP_KERNEL);
+ if (!kd)
+ goto out;
+ kd->ipvs = ipvs;
+ bitmap_fill(kd->avail, IPVS_EST_NTICKS);
+ kd->est_timer = jiffies;
+ kd->id = id;
+ ip_vs_est_set_params(ipvs, kd);
+
+ /* Pre-allocate stats used in calc phase */
+ if (!id && !kd->calc_stats) {
+ kd->calc_stats = ip_vs_stats_alloc();
+ if (!kd->calc_stats)
+ goto out;
+ }
+
+ /* Start kthread tasks only when services are present */
+ if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
+ ret = ip_vs_est_kthread_start(ipvs, kd);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (arr)
+ ipvs->est_kt_count++;
+ ipvs->est_kt_arr[id] = kd;
+ kd = NULL;
+ /* Use most recent kthread for new ests */
+ ipvs->est_add_ktid = id;
+ ret = 0;
+
+out:
+ mutex_unlock(&ipvs->est_mutex);
+ if (kd) {
+ ip_vs_stats_free(kd->calc_stats);
+ kfree(kd);
+ }
- spin_lock_bh(&ipvs->est_lock);
- list_add(&est->list, &ipvs->est_list);
- spin_unlock_bh(&ipvs->est_lock);
+ return ret;
}
+/* Select ktid where to add new ests: available, unused or new slot */
+static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
+{
+ int ktid, best = ipvs->est_kt_count;
+ struct ip_vs_est_kt_data *kd;
+
+ for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
+ kd = ipvs->est_kt_arr[ktid];
+ if (kd) {
+ if (kd->est_count < kd->est_max_count) {
+ best = ktid;
+ break;
+ }
+ } else if (ktid < best) {
+ best = ktid;
+ }
+ }
+ ipvs->est_add_ktid = best;
+}
+
+/* Add estimator to current kthread (est_add_ktid) */
+static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
+ struct ip_vs_estimator *est)
+{
+ struct ip_vs_est_kt_data *kd = NULL;
+ struct ip_vs_est_tick_data *td;
+ int ktid, row, crow, cid, ret;
+ int delay = est->ktrow;
+
+ BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
+ "Too many chains for ktcid");
+
+ if (ipvs->est_add_ktid < ipvs->est_kt_count) {
+ kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
+ if (kd)
+ goto add_est;
+ }
+
+ ret = ip_vs_est_add_kthread(ipvs);
+ if (ret < 0)
+ goto out;
+ kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
+
+add_est:
+ ktid = kd->id;
+ /* For small number of estimators prefer to use few ticks,
+ * otherwise try to add into the last estimated row.
+ * est_row and add_row point after the row we should use
+ */
+ if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
+ crow = READ_ONCE(kd->est_row);
+ else
+ crow = kd->add_row;
+ crow += delay;
+ if (crow >= IPVS_EST_NTICKS)
+ crow -= IPVS_EST_NTICKS;
+ /* Assume initial delay ? */
+ if (delay >= IPVS_EST_NTICKS - 1) {
+ /* Preserve initial delay or decrease it if no space in tick */
+ row = crow;
+ if (crow < IPVS_EST_NTICKS - 1) {
+ crow++;
+ row = find_last_bit(kd->avail, crow);
+ }
+ if (row >= crow)
+ row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
+ } else {
+ /* Preserve delay or increase it if no space in tick */
+ row = IPVS_EST_NTICKS;
+ if (crow > 0)
+ row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
+ if (row >= IPVS_EST_NTICKS)
+ row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
+ }
+
+ td = rcu_dereference_protected(kd->ticks[row], 1);
+ if (!td) {
+ td = kzalloc(sizeof(*td), GFP_KERNEL);
+ if (!td) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ rcu_assign_pointer(kd->ticks[row], td);
+ }
+
+ cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);
+
+ kd->est_count++;
+ kd->tick_len[row]++;
+ if (!td->chain_len[cid])
+ __set_bit(cid, td->present);
+ td->chain_len[cid]++;
+ est->ktid = ktid;
+ est->ktrow = row;
+ est->ktcid = cid;
+ hlist_add_head_rcu(&est->list, &td->chains[cid]);
+
+ if (td->chain_len[cid] >= kd->chain_max) {
+ __set_bit(cid, td->full);
+ if (kd->tick_len[row] >= kd->tick_max)
+ __clear_bit(row, kd->avail);
+ }
+
+ /* Update est_add_ktid to point to first available/empty kt slot */
+ if (kd->est_count == kd->est_max_count)
+ ip_vs_est_update_ktid(ipvs);
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/* Start estimation for stats */
+int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *est = &stats->est;
+ int ret;
+
+ if (!ipvs->est_max_threads && ipvs->enable)
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+
+ est->ktid = -1;
+ est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */
+
+ /* We prefer this code to be short, kthread 0 will requeue the
+ * estimator to available chain. If tasks are disabled, we
+ * will not allocate much memory, just for kt 0.
+ */
+ ret = 0;
+ if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
+ ret = ip_vs_est_add_kthread(ipvs);
+ if (ret >= 0)
+ hlist_add_head(&est->list, &ipvs->est_temp_list);
+ else
+ INIT_HLIST_NODE(&est->list);
+ return ret;
+}
+
+static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
+{
+ if (kd) {
+ if (kd->task) {
+ pr_info("stop unused estimator thread %d...\n", kd->id);
+ kthread_stop(kd->task);
+ }
+ ip_vs_stats_free(kd->calc_stats);
+ kfree(kd);
+ }
+}
+
+/* Unlink estimator from chain */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
struct ip_vs_estimator *est = &stats->est;
+ struct ip_vs_est_tick_data *td;
+ struct ip_vs_est_kt_data *kd;
+ int ktid = est->ktid;
+ int row = est->ktrow;
+ int cid = est->ktcid;
+
+ /* Failed to add to chain ? */
+ if (hlist_unhashed(&est->list))
+ return;
+
+ /* On return, estimator can be freed, dequeue it now */
+
+ /* In est_temp_list ? */
+ if (ktid < 0) {
+ hlist_del(&est->list);
+ goto end_kt0;
+ }
+
+ hlist_del_rcu(&est->list);
+ kd = ipvs->est_kt_arr[ktid];
+ td = rcu_dereference_protected(kd->ticks[row], 1);
+ __clear_bit(cid, td->full);
+ td->chain_len[cid]--;
+ if (!td->chain_len[cid])
+ __clear_bit(cid, td->present);
+ kd->tick_len[row]--;
+ __set_bit(row, kd->avail);
+ if (!kd->tick_len[row]) {
+ RCU_INIT_POINTER(kd->ticks[row], NULL);
+ kfree_rcu(td);
+ }
+ kd->est_count--;
+ if (kd->est_count) {
+ /* This kt slot can become available just now, prefer it */
+ if (ktid < ipvs->est_add_ktid)
+ ipvs->est_add_ktid = ktid;
+ return;
+ }
- spin_lock_bh(&ipvs->est_lock);
- list_del(&est->list);
- spin_unlock_bh(&ipvs->est_lock);
+ if (ktid > 0) {
+ mutex_lock(&ipvs->est_mutex);
+ ip_vs_est_kthread_destroy(kd);
+ ipvs->est_kt_arr[ktid] = NULL;
+ if (ktid == ipvs->est_kt_count - 1) {
+ ipvs->est_kt_count--;
+ while (ipvs->est_kt_count > 1 &&
+ !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
+ ipvs->est_kt_count--;
+ }
+ mutex_unlock(&ipvs->est_mutex);
+
+ /* This slot is now empty, prefer another available kt slot */
+ if (ktid == ipvs->est_add_ktid)
+ ip_vs_est_update_ktid(ipvs);
+ }
+
+end_kt0:
+ /* kt 0 is freed after all other kthreads and chains are empty */
+ if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
+ kd = ipvs->est_kt_arr[0];
+ if (!kd || !kd->est_count) {
+ mutex_lock(&ipvs->est_mutex);
+ if (kd) {
+ ip_vs_est_kthread_destroy(kd);
+ ipvs->est_kt_arr[0] = NULL;
+ }
+ ipvs->est_kt_count--;
+ mutex_unlock(&ipvs->est_mutex);
+ ipvs->est_add_ktid = 0;
+ }
+ }
+}
+
+/* Register all ests from est_temp_list to kthreads */
+static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_estimator *est;
+
+ while (1) {
+ int max = 16;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ while (max-- > 0) {
+ est = hlist_entry_safe(ipvs->est_temp_list.first,
+ struct ip_vs_estimator, list);
+ if (est) {
+ if (kthread_should_stop())
+ goto unlock;
+ hlist_del_init(&est->list);
+ if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
+ continue;
+ est->ktid = -1;
+ hlist_add_head(&est->list,
+ &ipvs->est_temp_list);
+ /* Abort, some entries will not be estimated
+ * until next attempt
+ */
+ }
+ goto unlock;
+ }
+ mutex_unlock(&__ip_vs_mutex);
+ cond_resched();
+ }
+
+unlock:
+ mutex_unlock(&__ip_vs_mutex);
+}
+
+/* Calculate limits for all kthreads */
+static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
+{
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ struct ip_vs_est_kt_data *kd;
+ struct hlist_head chain;
+ struct ip_vs_stats *s;
+ int cache_factor = 4;
+ int i, loops, ntest;
+ s32 min_est = 0;
+ ktime_t t1, t2;
+ int max = 8;
+ int ret = 1;
+ s64 diff;
+ u64 val;
+
+ INIT_HLIST_HEAD(&chain);
+ mutex_lock(&__ip_vs_mutex);
+ kd = ipvs->est_kt_arr[0];
+ mutex_unlock(&__ip_vs_mutex);
+ s = kd ? kd->calc_stats : NULL;
+ if (!s)
+ goto out;
+ hlist_add_head(&s->est.list, &chain);
+
+ loops = 1;
+ /* Get best result from many tests */
+ for (ntest = 0; ntest < 12; ntest++) {
+ if (!(ntest & 3)) {
+ /* Wait for cpufreq frequency transition */
+ wait_event_idle_timeout(wq, kthread_should_stop(),
+ HZ / 50);
+ if (!ipvs->enable || kthread_should_stop())
+ goto stop;
+ }
+
+ local_bh_disable();
+ rcu_read_lock();
+
+ /* Put stats in cache */
+ ip_vs_chain_estimation(&chain);
+
+ t1 = ktime_get();
+ for (i = loops * cache_factor; i > 0; i--)
+ ip_vs_chain_estimation(&chain);
+ t2 = ktime_get();
+
+ rcu_read_unlock();
+ local_bh_enable();
+
+ if (!ipvs->enable || kthread_should_stop())
+ goto stop;
+ cond_resched();
+
+ diff = ktime_to_ns(ktime_sub(t2, t1));
+ if (diff <= 1 * NSEC_PER_USEC) {
+ /* Do more loops on low time resolution */
+ loops *= 2;
+ continue;
+ }
+ if (diff >= NSEC_PER_SEC)
+ continue;
+ val = diff;
+ do_div(val, loops);
+ if (!min_est || val < min_est) {
+ min_est = val;
+ /* goal: 95usec per chain */
+ val = 95 * NSEC_PER_USEC;
+ if (val >= min_est) {
+ do_div(val, min_est);
+ max = (int)val;
+ } else {
+ max = 1;
+ }
+ }
+ }
+
+out:
+ if (s)
+ hlist_del_init(&s->est.list);
+ *chain_max = max;
+ return ret;
+
+stop:
+ ret = 0;
+ goto out;
+}
+
+/* Calculate the parameters and apply them in context of kt #0
+ * ECP: est_calc_phase
+ * ECM: est_chain_max
+ * ECP ECM Insert Chain enable Description
+ * ---------------------------------------------------------------------------
+ * 0 0 est_temp_list 0 create kt #0 context
+ * 0 0 est_temp_list 0->1 service added, start kthread #0 task
+ * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase
+ * 1 0 est_temp_list 1 kt #0: determine est_chain_max,
+ * stop tasks, move ests to est_temp_list
+ * and free kd for kthreads 1..last
+ * 1->0 0->N kt chains 1 ests can go to kthreads
+ * 0 N kt chains 1 drain est_temp_list, create new kthread
+ * contexts, start tasks, estimate
+ */
+static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
+{
+ int genid = atomic_read(&ipvs->est_genid);
+ struct ip_vs_est_tick_data *td;
+ struct ip_vs_est_kt_data *kd;
+ struct ip_vs_estimator *est;
+ struct ip_vs_stats *stats;
+ int id, row, cid, delay;
+ bool last, last_td;
+ int chain_max;
+ int step;
+
+ if (!ip_vs_est_calc_limits(ipvs, &chain_max))
+ return;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ /* Stop all other tasks, so that we can immediately move the
+ * estimators to est_temp_list without RCU grace period
+ */
+ mutex_lock(&ipvs->est_mutex);
+ for (id = 1; id < ipvs->est_kt_count; id++) {
+ /* netns clean up started, abort */
+ if (!ipvs->enable)
+ goto unlock2;
+ kd = ipvs->est_kt_arr[id];
+ if (!kd)
+ continue;
+ ip_vs_est_kthread_stop(kd);
+ }
+ mutex_unlock(&ipvs->est_mutex);
+
+ /* Move all estimators to est_temp_list but carefully,
+ * all estimators and kthread data can be released while
+ * we reschedule. Even for kthread 0.
+ */
+ step = 0;
+
+ /* Order entries in est_temp_list in ascending delay, so now
+ * walk delay(desc), id(desc), cid(asc)
+ */
+ delay = IPVS_EST_NTICKS;
+
+next_delay:
+ delay--;
+ if (delay < 0)
+ goto end_dequeue;
+
+last_kt:
+ /* Destroy contexts backwards */
+ id = ipvs->est_kt_count;
+
+next_kt:
+ if (!ipvs->enable || kthread_should_stop())
+ goto unlock;
+ id--;
+ if (id < 0)
+ goto next_delay;
+ kd = ipvs->est_kt_arr[id];
+ if (!kd)
+ goto next_kt;
+ /* kt 0 can exist with empty chains */
+ if (!id && kd->est_count <= 1)
+ goto next_delay;
+
+ row = kd->est_row + delay;
+ if (row >= IPVS_EST_NTICKS)
+ row -= IPVS_EST_NTICKS;
+ td = rcu_dereference_protected(kd->ticks[row], 1);
+ if (!td)
+ goto next_kt;
+
+ cid = 0;
+
+walk_chain:
+ if (kthread_should_stop())
+ goto unlock;
+ step++;
+ if (!(step & 63)) {
+ /* Give chance estimators to be added (to est_temp_list)
+ * and deleted (releasing kthread contexts)
+ */
+ mutex_unlock(&__ip_vs_mutex);
+ cond_resched();
+ mutex_lock(&__ip_vs_mutex);
+
+ /* Current kt released ? */
+ if (id >= ipvs->est_kt_count)
+ goto last_kt;
+ if (kd != ipvs->est_kt_arr[id])
+ goto next_kt;
+ /* Current td released ? */
+ if (td != rcu_dereference_protected(kd->ticks[row], 1))
+ goto next_kt;
+ /* No fatal changes on the current kd and td */
+ }
+ est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
+ list);
+ if (!est) {
+ cid++;
+ if (cid >= IPVS_EST_TICK_CHAINS)
+ goto next_kt;
+ goto walk_chain;
+ }
+ /* We can cheat and increase est_count to protect kt 0 context
+ * from release but we prefer to keep the last estimator
+ */
+ last = kd->est_count <= 1;
+ /* Do not free kt #0 data */
+ if (!id && last)
+ goto next_delay;
+ last_td = kd->tick_len[row] <= 1;
+ stats = container_of(est, struct ip_vs_stats, est);
+ ip_vs_stop_estimator(ipvs, stats);
+ /* Tasks are stopped, move without RCU grace period */
+ est->ktid = -1;
+ est->ktrow = row - kd->est_row;
+ if (est->ktrow < 0)
+ est->ktrow += IPVS_EST_NTICKS;
+ hlist_add_head(&est->list, &ipvs->est_temp_list);
+ /* kd freed ? */
+ if (last)
+ goto next_kt;
+ /* td freed ? */
+ if (last_td)
+ goto next_kt;
+ goto walk_chain;
+
+end_dequeue:
+ /* All estimators removed while calculating ? */
+ if (!ipvs->est_kt_count)
+ goto unlock;
+ kd = ipvs->est_kt_arr[0];
+ if (!kd)
+ goto unlock;
+ kd->add_row = kd->est_row;
+ ipvs->est_chain_max = chain_max;
+ ip_vs_est_set_params(ipvs, kd);
+
+ pr_info("using max %d ests per chain, %d per kthread\n",
+ kd->chain_max, kd->est_max_count);
+
+ /* Try to keep tot_stats in kt0, enqueue it early */
+ if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
+ ipvs->tot_stats->s.est.ktid == -1) {
+ hlist_del(&ipvs->tot_stats->s.est.list);
+ hlist_add_head(&ipvs->tot_stats->s.est.list,
+ &ipvs->est_temp_list);
+ }
+
+ mutex_lock(&ipvs->est_mutex);
+
+ /* We completed the calc phase, new calc phase not requested */
+ if (genid == atomic_read(&ipvs->est_genid))
+ ipvs->est_calc_phase = 0;
+
+unlock2:
+ mutex_unlock(&ipvs->est_mutex);
+
+unlock:
+ mutex_unlock(&__ip_vs_mutex);
}
void ip_vs_zero_estimator(struct ip_vs_stats *stats)
@@ -191,14 +927,25 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
- INIT_LIST_HEAD(&ipvs->est_list);
- spin_lock_init(&ipvs->est_lock);
- timer_setup(&ipvs->est_timer, estimation_timer, 0);
- mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
+ INIT_HLIST_HEAD(&ipvs->est_temp_list);
+ ipvs->est_kt_arr = NULL;
+ ipvs->est_max_threads = 0;
+ ipvs->est_calc_phase = 0;
+ ipvs->est_chain_max = 0;
+ ipvs->est_kt_count = 0;
+ ipvs->est_add_ktid = 0;
+ atomic_set(&ipvs->est_genid, 0);
+ atomic_set(&ipvs->est_genid_done, 0);
+ __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
return 0;
}
void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
- del_timer_sync(&ipvs->est_timer);
+ int i;
+
+ for (i = 0; i < ipvs->est_kt_count; i++)
+ ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
+ kfree(ipvs->est_kt_arr);
+ mutex_destroy(&ipvs->est_mutex);
}
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 8639e7efd0e2..24002bc61e07 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -191,19 +191,16 @@ BTF_ID(struct, nf_conn___init)
/* Check writes into `struct nf_conn` */
static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
- const struct btf *btf,
- const struct btf_type *t, int off,
- int size, enum bpf_access_type atype,
- u32 *next_btf_id,
- enum bpf_type_flag *flag)
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype,
+ u32 *next_btf_id, enum bpf_type_flag *flag)
{
- const struct btf_type *ncit;
- const struct btf_type *nct;
+ const struct btf_type *ncit, *nct, *t;
size_t end;
- ncit = btf_type_by_id(btf, btf_nf_conn_ids[1]);
- nct = btf_type_by_id(btf, btf_nf_conn_ids[0]);
-
+ ncit = btf_type_by_id(reg->btf, btf_nf_conn_ids[1]);
+ nct = btf_type_by_id(reg->btf, btf_nf_conn_ids[0]);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
if (t != nct && t != ncit) {
bpf_log(log, "only read is supported\n");
return -EACCES;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 8006ca862551..496c4920505b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -211,28 +211,24 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
unsigned int zoneid,
const struct net *net)
{
- struct {
- struct nf_conntrack_man src;
- union nf_inet_addr dst_addr;
- unsigned int zone;
- u32 net_mix;
- u16 dport;
- u16 proto;
- } __aligned(SIPHASH_ALIGNMENT) combined;
+ u64 a, b, c, d;
get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
- memset(&combined, 0, sizeof(combined));
+ /* The direction must be ignored, handle usable tuplehash members manually */
+ a = (u64)tuple->src.u3.all[0] << 32 | tuple->src.u3.all[3];
+ b = (u64)tuple->dst.u3.all[0] << 32 | tuple->dst.u3.all[3];
- /* The direction must be ignored, so handle usable members manually. */
- combined.src = tuple->src;
- combined.dst_addr = tuple->dst.u3;
- combined.zone = zoneid;
- combined.net_mix = net_hash_mix(net);
- combined.dport = (__force __u16)tuple->dst.u.all;
- combined.proto = tuple->dst.protonum;
+ c = (__force u64)tuple->src.u.all << 32 | (__force u64)tuple->dst.u.all << 16;
+ c |= tuple->dst.protonum;
- return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd);
+ d = (u64)zoneid << 32 | net_hash_mix(net);
+
+ /* IPv4: u3.all[1,2,3] == 0 */
+ c ^= (u64)tuple->src.u3.all[1] << 32 | tuple->src.u3.all[2];
+ d += (u64)tuple->dst.u3.all[1] << 32 | tuple->dst.u3.all[2];
+
+ return (u32)siphash_4u64(a, b, c, d, &nf_conntrack_hash_rnd);
}
static u32 scale_hash(u32 hash)
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index ff737a76052e..48ea6d0264b5 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -26,7 +26,9 @@
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_log.h>
+#include <net/ip.h>
static DEFINE_MUTEX(nf_ct_helper_mutex);
struct hlist_head *nf_ct_helper_hash __read_mostly;
@@ -240,6 +242,104 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
}
EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
+/* 'skb' should already be pulled to nh_ofs. */
+int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, u16 proto)
+{
+ const struct nf_conntrack_helper *helper;
+ const struct nf_conn_help *help;
+ unsigned int protoff;
+ int err;
+
+ if (ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ if (helper->tuple.src.l3num != NFPROTO_UNSPEC &&
+ helper->tuple.src.l3num != proto)
+ return NF_ACCEPT;
+
+ switch (proto) {
+ case NFPROTO_IPV4:
+ protoff = ip_hdrlen(skb);
+ proto = ip_hdr(skb)->protocol;
+ break;
+ case NFPROTO_IPV6: {
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ __be16 frag_off;
+ int ofs;
+
+ ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("proto header not found\n");
+ return NF_ACCEPT;
+ }
+ protoff = ofs;
+ proto = nexthdr;
+ break;
+ }
+ default:
+ WARN_ONCE(1, "helper invoked on non-IP family!");
+ return NF_DROP;
+ }
+
+ if (helper->tuple.dst.protonum != proto)
+ return NF_ACCEPT;
+
+ err = helper->help(skb, protoff, ct, ctinfo);
+ if (err != NF_ACCEPT)
+ return err;
+
+ /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
+ * FTP with NAT) adusting the TCP payload size when mangling IP
+ * addresses and/or port numbers in the text-based control connection.
+ */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+ return NF_DROP;
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper);
+
+int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
+ u8 proto, bool nat, struct nf_conntrack_helper **hp)
+{
+ struct nf_conntrack_helper *helper;
+ struct nf_conn_help *help;
+ int ret = 0;
+
+ helper = nf_conntrack_helper_try_module_get(name, family, proto);
+ if (!helper)
+ return -EINVAL;
+
+ help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
+ if (!help) {
+ nf_conntrack_helper_put(helper);
+ return -ENOMEM;
+ }
+#if IS_ENABLED(CONFIG_NF_NAT)
+ if (nat) {
+ ret = nf_nat_helper_try_module_get(name, family, proto);
+ if (ret) {
+ nf_conntrack_helper_put(helper);
+ return ret;
+ }
+ }
+#endif
+ rcu_assign_pointer(help->helper, helper);
+ *hp = helper;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_add_helper);
+
/* appropriate ct lock protecting must be taken by caller */
static int unhelp(struct nf_conn *ct, void *me)
{
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 895b09cbd7cf..99323fb12d0f 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -121,17 +121,61 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
};
EXPORT_SYMBOL_GPL(nf_ct_l4proto_find);
-unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+static bool in_vrf_postrouting(const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (state->hook == NF_INET_POST_ROUTING &&
+ netif_is_l3_master(state->out))
+ return true;
+#endif
+ return false;
+}
+
+unsigned int nf_confirm(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
const struct nf_conn_help *help;
+ enum ip_conntrack_info ctinfo;
+ unsigned int protoff;
+ struct nf_conn *ct;
+ bool seqadj_needed;
+ __be16 frag_off;
+ u8 pnum;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || in_vrf_postrouting(state))
+ return NF_ACCEPT;
help = nfct_help(ct);
+
+ seqadj_needed = test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb);
+ if (!help && !seqadj_needed)
+ return nf_conntrack_confirm(skb);
+
+ /* helper->help() do not expect ICMP packets */
+ if (ctinfo == IP_CT_RELATED_REPLY)
+ return nf_conntrack_confirm(skb);
+
+ switch (nf_ct_l3num(ct)) {
+ case NFPROTO_IPV4:
+ protoff = skb_network_offset(skb) + ip_hdrlen(skb);
+ break;
+ case NFPROTO_IPV6:
+ pnum = ipv6_hdr(skb)->nexthdr;
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
+ return nf_conntrack_confirm(skb);
+ break;
+ default:
+ return nf_conntrack_confirm(skb);
+ }
+
if (help) {
const struct nf_conntrack_helper *helper;
int ret;
- /* rcu_read_lock()ed by nf_hook_thresh */
+ /* rcu_read_lock()ed by nf_hook */
helper = rcu_dereference(help->helper);
if (helper) {
ret = helper->help(skb,
@@ -142,12 +186,10 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
}
}
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_is_loopback_packet(skb)) {
- if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
- NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
- return NF_DROP;
- }
+ if (seqadj_needed &&
+ !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
+ NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+ return NF_DROP;
}
/* We've seen it coming out the other side: confirm it */
@@ -155,35 +197,6 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
}
EXPORT_SYMBOL_GPL(nf_confirm);
-static bool in_vrf_postrouting(const struct nf_hook_state *state)
-{
-#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
- if (state->hook == NF_INET_POST_ROUTING &&
- netif_is_l3_master(state->out))
- return true;
-#endif
- return false;
-}
-
-static unsigned int ipv4_confirm(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- return nf_conntrack_confirm(skb);
-
- if (in_vrf_postrouting(state))
- return NF_ACCEPT;
-
- return nf_confirm(skb,
- skb_network_offset(skb) + ip_hdrlen(skb),
- ct, ctinfo);
-}
-
static unsigned int ipv4_conntrack_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -230,13 +243,13 @@ static const struct nf_hook_ops ipv4_conntrack_ops[] = {
.priority = NF_IP_PRI_CONNTRACK,
},
{
- .hook = ipv4_confirm,
+ .hook = nf_confirm,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
- .hook = ipv4_confirm,
+ .hook = nf_confirm,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
@@ -373,33 +386,6 @@ static struct nf_sockopt_ops so_getorigdst6 = {
.owner = THIS_MODULE,
};
-static unsigned int ipv6_confirm(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- unsigned char pnum = ipv6_hdr(skb)->nexthdr;
- __be16 frag_off;
- int protoff;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- return nf_conntrack_confirm(skb);
-
- if (in_vrf_postrouting(state))
- return NF_ACCEPT;
-
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
- &frag_off);
- if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("proto header not found\n");
- return nf_conntrack_confirm(skb);
- }
-
- return nf_confirm(skb, protoff, ct, ctinfo);
-}
-
static unsigned int ipv6_conntrack_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -428,13 +414,13 @@ static const struct nf_hook_ops ipv6_conntrack_ops[] = {
.priority = NF_IP6_PRI_CONNTRACK,
},
{
- .hook = ipv6_confirm,
+ .hook = nf_confirm,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP6_PRI_LAST,
},
{
- .hook = ipv6_confirm,
+ .hook = nf_confirm,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_LAST - 1,
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 61e3b05cf02c..1020d67600a9 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -129,6 +129,56 @@ static void icmpv6_error_log(const struct sk_buff *skb,
nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg);
}
+static noinline_for_stack int
+nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct nf_hook_state *state)
+{
+ u8 hl = ipv6_hdr(skb)->hop_limit;
+ union nf_inet_addr outer_daddr;
+ union {
+ struct nd_opt_hdr nd_opt;
+ struct rd_msg rd_msg;
+ } tmp;
+ const struct nd_opt_hdr *nd_opt;
+ const struct rd_msg *rd_msg;
+
+ rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg);
+ if (!rd_msg) {
+ icmpv6_error_log(skb, state, "short redirect");
+ return -NF_ACCEPT;
+ }
+
+ if (rd_msg->icmph.icmp6_code != 0)
+ return NF_ACCEPT;
+
+ if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
+ icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect");
+ return -NF_ACCEPT;
+ }
+
+ dataoff += sizeof(*rd_msg);
+
+ /* warning: rd_msg no longer usable after this call */
+ nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt);
+ if (!nd_opt || nd_opt->nd_opt_len == 0) {
+ icmpv6_error_log(skb, state, "redirect without options");
+ return -NF_ACCEPT;
+ }
+
+ /* We could call ndisc_parse_options(), but it would need
+ * skb_linearize() and a bit more work.
+ */
+ if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR)
+ return NF_ACCEPT;
+
+ memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr,
+ sizeof(outer_daddr.ip6));
+ dataoff += 8;
+ return nf_conntrack_inet_error(tmpl, skb, dataoff, state,
+ IPPROTO_ICMPV6, &outer_daddr);
+}
+
int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
struct sk_buff *skb,
unsigned int dataoff,
@@ -159,6 +209,9 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
return NF_ACCEPT;
}
+ if (icmp6h->icmp6_type == NDISC_REDIRECT)
+ return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state);
+
/* is not error message ? */
if (icmp6h->icmp6_type >= 128)
return NF_ACCEPT;
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 5a936334b517..d88b92a8ffca 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -60,6 +60,7 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS,
[SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS,
[SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS,
+ [SCTP_CONNTRACK_DATA_SENT] = 30 SECS,
};
#define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1
@@ -74,6 +75,7 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
#define sHS SCTP_CONNTRACK_HEARTBEAT_SENT
#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED
+#define sDS SCTP_CONNTRACK_DATA_SENT
#define sIV SCTP_CONNTRACK_MAX
/*
@@ -90,15 +92,16 @@ COOKIE WAIT - We have seen an INIT chunk in the original direction, or als
COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction.
ESTABLISHED - We have seen a COOKIE_ACK in the reply direction.
SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction.
-SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin.
+SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction.
SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
to that of the SHUTDOWN chunk.
CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
the SHUTDOWN chunk. Connection is closed.
HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow.
-HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to
- that of the HEARTBEAT chunk. Secondary connection is
- established.
+HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK/DATA/SACK in the direction
+ opposite to that of the HEARTBEAT/DATA chunk. Secondary connection
+ is established.
+DATA_SENT - We have seen a DATA/SACK in a new flow.
*/
/* TODO
@@ -112,36 +115,38 @@ cookie echoed to closed.
*/
/* SCTP conntrack state transitions */
-static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
+static const u8 sctp_conntracks[2][12][SCTP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
-/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA},
-/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},
-/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS},
-/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA},
-/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/
-/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */
-/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */
-/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA},
-/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
-/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */
+/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA, sCW},
+/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},
+/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS, sCL},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA, sSA},
+/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't have Stale cookie*/
+/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* 5.2.4 - Big TODO */
+/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA, sCL},
+/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS},
+/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS},
+/* data/sack */ {sDS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS}
},
{
/* REPLY */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
-/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */
-/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},
-/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL},
-/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR},
-/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA},
-/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA},
-/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */
-/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA},
-/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA},
-/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
-/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */
+/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* INIT in sCL Big TODO */
+/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},
+/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL, sIV},
+/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR, sIV},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA, sIV},
+/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA, sIV},
+/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* Can't come in reply dir */
+/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA, sIV},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA, sIV},
+/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sHA},
+/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA},
+/* data/sack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA},
}
};
@@ -253,6 +258,11 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
pr_debug("SCTP_CID_HEARTBEAT_ACK");
i = 10;
break;
+ case SCTP_CID_DATA:
+ case SCTP_CID_SACK:
+ pr_debug("SCTP_CID_DATA/SACK");
+ i = 11;
+ break;
default:
/* Other chunks like DATA or SACK do not change the state */
pr_debug("Unknown chunk type, Will stay in %s\n",
@@ -306,7 +316,9 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
ih->init_tag);
ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag;
- } else if (sch->type == SCTP_CID_HEARTBEAT) {
+ } else if (sch->type == SCTP_CID_HEARTBEAT ||
+ sch->type == SCTP_CID_DATA ||
+ sch->type == SCTP_CID_SACK) {
pr_debug("Setting vtag %x for secondary conntrack\n",
sh->vtag);
ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
@@ -392,19 +404,19 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
if (!sctp_new(ct, skb, sh, dataoff))
return -NF_ACCEPT;
- }
-
- /* Check the verification tag (Sec 8.5) */
- if (!test_bit(SCTP_CID_INIT, map) &&
- !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
- !test_bit(SCTP_CID_COOKIE_ECHO, map) &&
- !test_bit(SCTP_CID_ABORT, map) &&
- !test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
- !test_bit(SCTP_CID_HEARTBEAT, map) &&
- !test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
- sh->vtag != ct->proto.sctp.vtag[dir]) {
- pr_debug("Verification tag check failed\n");
- goto out;
+ } else {
+ /* Check the verification tag (Sec 8.5) */
+ if (!test_bit(SCTP_CID_INIT, map) &&
+ !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
+ !test_bit(SCTP_CID_COOKIE_ECHO, map) &&
+ !test_bit(SCTP_CID_ABORT, map) &&
+ !test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
+ !test_bit(SCTP_CID_HEARTBEAT, map) &&
+ !test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
+ sh->vtag != ct->proto.sctp.vtag[dir]) {
+ pr_debug("Verification tag check failed\n");
+ goto out;
+ }
}
old_state = new_state = SCTP_CONNTRACK_NONE;
@@ -464,6 +476,11 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
} else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
}
+ } else if (sch->type == SCTP_CID_DATA || sch->type == SCTP_CID_SACK) {
+ if (ct->proto.sctp.vtag[dir] == 0) {
+ pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir);
+ ct->proto.sctp.vtag[dir] = sh->vtag;
+ }
}
old_state = ct->proto.sctp.state;
@@ -684,6 +701,7 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
[CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 },
[CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 },
[CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_DATA_SENT] = { .type = NLA_U32 },
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index bca839ab1ae8..0250725e38a4 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -602,6 +602,7 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT,
NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT,
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST,
@@ -892,6 +893,12 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT] = {
+ .procname = "nf_conntrack_sctp_timeout_data_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = {
@@ -1036,6 +1043,7 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net,
XASSIGN(SHUTDOWN_ACK_SENT, sn);
XASSIGN(HEARTBEAT_SENT, sn);
XASSIGN(HEARTBEAT_ACKED, sn);
+ XASSIGN(DATA_SENT, sn);
#undef XASSIGN
#endif
}
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index b350fe9d00b0..19efba1e51ef 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -421,6 +421,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
if (ret == NF_DROP)
flow_offload_teardown(flow);
break;
+ default:
+ WARN_ON_ONCE(1);
+ ret = NF_DROP;
+ break;
}
return ret;
@@ -682,6 +686,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
if (ret == NF_DROP)
flow_offload_teardown(flow);
break;
+ default:
+ WARN_ON_ONCE(1);
+ ret = NF_DROP;
+ break;
}
return ret;
diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c
new file mode 100644
index 000000000000..551abd2da614
--- /dev/null
+++ b/net/netfilter/nf_nat_ovs.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Support nat functions for openvswitch and used by OVS and TC conntrack. */
+
+#include <net/netfilter/nf_nat.h>
+
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int nf_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, int *action,
+ const struct nf_nat_range2 *range,
+ enum nf_nat_manip_type maniptype)
+{
+ __be16 proto = skb_protocol(skb, true);
+ int hooknum, err = NF_ACCEPT;
+
+ /* See HOOK2MANIP(). */
+ if (maniptype == NF_NAT_MANIP_SRC)
+ hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+ else
+ hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ if (proto == htons(ETH_P_IP) &&
+ ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ hooknum))
+ err = NF_DROP;
+ goto out;
+ } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) {
+ __be16 frag_off;
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ int hdrlen = ipv6_skip_exthdr(skb,
+ sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
+
+ if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+ if (!nf_nat_icmpv6_reply_translation(skb, ct,
+ ctinfo,
+ hooknum,
+ hdrlen))
+ err = NF_DROP;
+ goto out;
+ }
+ }
+ /* Non-ICMP, fall thru to initialize if needed. */
+ fallthrough;
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ /* Initialize according to the NAT action. */
+ err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+ /* Action is set up to establish a new
+ * mapping.
+ */
+ ? nf_nat_setup_info(ct, range, maniptype)
+ : nf_nat_alloc_null_binding(ct, hooknum);
+ if (err != NF_ACCEPT)
+ goto out;
+ }
+ break;
+
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ break;
+
+ default:
+ err = NF_DROP;
+ goto out;
+ }
+
+ err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+ if (err == NF_ACCEPT)
+ *action |= BIT(maniptype);
+out:
+ return err;
+}
+
+int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, int *action,
+ const struct nf_nat_range2 *range, bool commit)
+{
+ enum nf_nat_manip_type maniptype;
+ int err, ct_action = *action;
+
+ *action = 0;
+
+ /* Add NAT extension if not confirmed yet. */
+ if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+ return NF_DROP; /* Can't NAT. */
+
+ if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) &&
+ (ctinfo != IP_CT_RELATED || commit)) {
+ /* NAT an established or related connection like before. */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+ /* This is the REPLY direction for a connection
+ * for which NAT was applied in the forward
+ * direction. Do the reverse NAT.
+ */
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+ else
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+ } else if (ct_action & BIT(NF_NAT_MANIP_SRC)) {
+ maniptype = NF_NAT_MANIP_SRC;
+ } else if (ct_action & BIT(NF_NAT_MANIP_DST)) {
+ maniptype = NF_NAT_MANIP_DST;
+ } else {
+ return NF_ACCEPT;
+ }
+
+ err = nf_ct_nat_execute(skb, ct, ctinfo, action, range, maniptype);
+ if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) {
+ if (ct->status & IPS_SRC_NAT) {
+ if (maniptype == NF_NAT_MANIP_SRC)
+ maniptype = NF_NAT_MANIP_DST;
+ else
+ maniptype = NF_NAT_MANIP_SRC;
+
+ err = nf_ct_nat_execute(skb, ct, ctinfo, action, range,
+ maniptype);
+ } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ err = nf_ct_nat_execute(skb, ct, ctinfo, action, NULL,
+ NF_NAT_MANIP_SRC);
+ }
+ }
+ return err;
+}
+EXPORT_SYMBOL_GPL(nf_ct_nat);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7a09421f19e1..832b881f7c17 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1534,10 +1534,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
for_each_possible_cpu(cpu) {
cpu_stats = per_cpu_ptr(stats, cpu);
do {
- seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ seq = u64_stats_fetch_begin(&cpu_stats->syncp);
pkts = cpu_stats->pkts;
bytes = cpu_stats->bytes;
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, seq));
total.pkts += pkts;
total.bytes += bytes;
}
@@ -2759,7 +2759,7 @@ static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
};
static int nf_tables_fill_expr_info(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
goto nla_put_failure;
@@ -2769,7 +2769,7 @@ static int nf_tables_fill_expr_info(struct sk_buff *skb,
NFTA_EXPR_DATA);
if (data == NULL)
goto nla_put_failure;
- if (expr->ops->dump(skb, expr) < 0)
+ if (expr->ops->dump(skb, expr, reset) < 0)
goto nla_put_failure;
nla_nest_end(skb, data);
}
@@ -2781,14 +2781,14 @@ nla_put_failure:
};
int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
struct nlattr *nest;
nest = nla_nest_start_noflag(skb, attr);
if (!nest)
goto nla_put_failure;
- if (nf_tables_fill_expr_info(skb, expr) < 0)
+ if (nf_tables_fill_expr_info(skb, expr, reset) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
return 0;
@@ -2857,6 +2857,43 @@ err1:
return err;
}
+int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
+ struct nft_expr_info *info)
+{
+ struct nlattr *tb[NFTA_EXPR_MAX + 1];
+ const struct nft_expr_type *type;
+ int err;
+
+ err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla,
+ nft_expr_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_EXPR_DATA])
+ return -EINVAL;
+
+ type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]);
+ if (!type)
+ return -ENOENT;
+
+ if (!type->inner_ops)
+ return -EOPNOTSUPP;
+
+ err = nla_parse_nested_deprecated(info->tb, type->maxattr,
+ tb[NFTA_EXPR_DATA],
+ type->policy, NULL);
+ if (err < 0)
+ goto err_nla_parse;
+
+ info->attr = nla;
+ info->ops = type->inner_ops;
+
+ return 0;
+
+err_nla_parse:
+ return err;
+}
+
static int nf_tables_newexpr(const struct nft_ctx *ctx,
const struct nft_expr_info *expr_info,
struct nft_expr *expr)
@@ -2997,7 +3034,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
u32 flags, int family,
const struct nft_table *table,
const struct nft_chain *chain,
- const struct nft_rule *rule, u64 handle)
+ const struct nft_rule *rule, u64 handle,
+ bool reset)
{
struct nlmsghdr *nlh;
const struct nft_expr *expr, *next;
@@ -3030,7 +3068,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
if (list == NULL)
goto nla_put_failure;
nft_rule_for_each_expr(expr, next, rule) {
- if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0)
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, list);
@@ -3081,7 +3119,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx,
err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq,
event, flags, ctx->family, ctx->table,
- ctx->chain, rule, handle);
+ ctx->chain, rule, handle, false);
if (err < 0) {
kfree_skb(skb);
goto err;
@@ -3102,7 +3140,8 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
unsigned int *idx,
struct netlink_callback *cb,
const struct nft_table *table,
- const struct nft_chain *chain)
+ const struct nft_chain *chain,
+ bool reset)
{
struct net *net = sock_net(skb->sk);
const struct nft_rule *rule, *prule;
@@ -3129,7 +3168,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
NFT_MSG_NEWRULE,
NLM_F_MULTI | NLM_F_APPEND,
table->family,
- table, chain, rule, handle) < 0)
+ table, chain, rule, handle, reset) < 0)
return 1;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -3152,6 +3191,10 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nftables_pernet *nft_net;
+ bool reset = false;
+
+ if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET)
+ reset = true;
rcu_read_lock();
nft_net = nft_pernet(net);
@@ -3176,14 +3219,15 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
if (!nft_is_active(net, chain))
continue;
__nf_tables_dump_rules(skb, &idx,
- cb, table, chain);
+ cb, table, chain, reset);
break;
}
goto done;
}
list_for_each_entry_rcu(chain, &table->chains, list) {
- if (__nf_tables_dump_rules(skb, &idx, cb, table, chain))
+ if (__nf_tables_dump_rules(skb, &idx,
+ cb, table, chain, reset))
goto done;
}
@@ -3254,6 +3298,7 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
struct net *net = info->net;
struct nft_table *table;
struct sk_buff *skb2;
+ bool reset = false;
int err;
if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
@@ -3290,9 +3335,12 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
if (!skb2)
return -ENOMEM;
+ if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET)
+ reset = true;
+
err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid,
info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
- family, table, chain, rule, 0);
+ family, table, chain, rule, 0, reset);
if (err < 0)
goto err_fill_rule_info;
@@ -4067,7 +4115,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (set->num_exprs == 1) {
nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR);
- if (nf_tables_fill_expr_info(skb, set->exprs[0]) < 0)
+ if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
@@ -4078,7 +4126,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
for (i = 0; i < set->num_exprs; i++) {
if (nft_expr_dump(skb, NFTA_LIST_ELEM,
- set->exprs[i]) < 0)
+ set->exprs[i], false) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, nest);
@@ -4909,7 +4957,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb,
if (num_exprs == 1) {
expr = nft_setelem_expr_at(elem_expr, 0);
- if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr) < 0)
+ if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, false) < 0)
return -1;
return 0;
@@ -4920,7 +4968,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb,
nft_setelem_expr_foreach(expr, elem_expr, size) {
expr = nft_setelem_expr_at(elem_expr, size);
- if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0)
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, false) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, nest);
@@ -8276,6 +8324,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
+ [NFT_MSG_GETRULE_RESET] = {
+ .call = nf_tables_getrule,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
[NFT_MSG_DELRULE] = {
.call = nf_tables_delrule,
.type = NFNL_CB_BATCH,
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index cee3e4e905ec..709a736c301c 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -340,6 +340,8 @@ static struct nft_expr_type *nft_basic_types[] = {
&nft_exthdr_type,
&nft_last_type,
&nft_counter_type,
+ &nft_objref_type,
+ &nft_inner_type,
};
static struct nft_object_type *nft_basic_objects[] = {
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index e6e402b247d0..84eae7cabc67 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -232,7 +232,8 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb,
return 0;
}
-static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_bitwise_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_bitwise *priv = nft_expr_priv(expr);
int err = 0;
@@ -393,7 +394,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx,
}
static int
-nft_bitwise_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+nft_bitwise_fast_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
struct nft_data data;
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index f952a80275a8..b66647a5a171 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -148,7 +148,8 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
priv->len);
}
-static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_byteorder_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_byteorder *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 963cf831799c..6eb21a4f5698 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -92,7 +92,8 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return 0;
}
-static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_cmp_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_cmp_expr *priv = nft_expr_priv(expr);
@@ -253,7 +254,8 @@ static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx,
return __nft_cmp_offload(ctx, flow, &cmp);
}
-static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_cmp_fast_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
@@ -347,7 +349,8 @@ static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx,
return __nft_cmp_offload(ctx, flow, &cmp);
}
-static int nft_cmp16_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_cmp16_fast_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index c16172427622..5284cd2ad532 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -324,7 +324,8 @@ static int nft_extension_dump_info(struct sk_buff *skb, int attr,
return 0;
}
-static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_target_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct xt_target *target = expr->ops->data;
void *info = nft_expr_priv(expr);
@@ -572,12 +573,14 @@ nla_put_failure:
return -1;
}
-static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_match_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
return __nft_match_dump(skb, expr, nft_expr_priv(expr));
}
-static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e)
+static int nft_match_large_dump(struct sk_buff *skb,
+ const struct nft_expr *e, bool reset)
{
struct nft_xt_match_priv *priv = nft_expr_priv(e);
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index d657f999a11b..de9d1980df69 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -185,7 +185,8 @@ static void nft_connlimit_eval(const struct nft_expr *expr,
nft_connlimit_do_eval(priv, regs, pkt, NULL);
}
-static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_connlimit_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_connlimit *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index f4d3573e8782..dccc68a5135a 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -201,11 +201,12 @@ void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs,
nft_counter_do_eval(priv, regs, pkt);
}
-static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_counter_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- return nft_counter_do_dump(skb, priv, false);
+ return nft_counter_do_dump(skb, priv, reset);
}
static int nft_counter_init(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 641dc21f92b4..c68e2151defe 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -641,7 +641,8 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
nf_ct_netns_put(ctx->net, ctx->family);
}
-static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ct_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ct *priv = nft_expr_priv(expr);
@@ -703,7 +704,8 @@ static bool nft_ct_get_reduce(struct nft_regs_track *track,
return nft_expr_reduce_bitwise(track, expr);
}
-static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ct_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ct *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c
index 63507402716d..e5739a59ebf1 100644
--- a/net/netfilter/nft_dup_netdev.c
+++ b/net/netfilter/nft_dup_netdev.c
@@ -44,7 +44,8 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx,
sizeof(int));
}
-static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_dup_netdev_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_dup_netdev *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 6983e6ddeef9..274579b1696e 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -357,7 +357,8 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx,
nf_tables_destroy_set(ctx, priv->set);
}
-static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_dynset_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_dynset *priv = nft_expr_priv(expr);
u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0;
@@ -379,7 +380,7 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (priv->set->num_exprs == 0) {
if (priv->num_exprs == 1) {
if (nft_expr_dump(skb, NFTA_DYNSET_EXPR,
- priv->expr_array[0]))
+ priv->expr_array[0], reset))
goto nla_put_failure;
} else if (priv->num_exprs > 1) {
struct nlattr *nest;
@@ -390,7 +391,7 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
for (i = 0; i < priv->num_exprs; i++) {
if (nft_expr_dump(skb, NFTA_LIST_ELEM,
- priv->expr_array[i]))
+ priv->expr_array[i], reset))
goto nla_put_failure;
}
nla_nest_end(skb, nest);
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a67ea9c3ae57..a54a7f772cec 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -13,7 +13,6 @@
#include <linux/sctp.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
-#include <net/sctp/sctp.h>
#include <net/tcp.h>
struct nft_exthdr {
@@ -576,7 +575,8 @@ nla_put_failure:
return -1;
}
-static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_exthdr_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_exthdr *priv = nft_expr_priv(expr);
@@ -586,7 +586,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
-static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_exthdr_dump_set(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_exthdr *priv = nft_expr_priv(expr);
@@ -596,7 +597,8 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
-static int nft_exthdr_dump_strip(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_exthdr_dump_strip(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_exthdr *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index 1f12d7ade606..6e049fd48760 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -118,7 +118,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
}
EXPORT_SYMBOL_GPL(nft_fib_init);
-int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset)
{
const struct nft_fib *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index a25c88bc8b75..e860d8fe0e5e 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -433,7 +433,8 @@ static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
nf_ct_netns_put(ctx->net, ctx->family);
}
-static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_flow_offload_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_flow_offload *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 7c5876dc9ff2..7b9d4d1bd17c 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -56,7 +56,8 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
sizeof(int));
}
-static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_fwd_netdev_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
@@ -186,7 +187,8 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
addr_len);
}
-static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_fwd_neigh_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_fwd_neigh *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index e5631e88b285..ee8d487b69c0 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -139,7 +139,7 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
}
static int nft_jhash_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_jhash *priv = nft_expr_priv(expr);
@@ -176,7 +176,7 @@ static bool nft_jhash_reduce(struct nft_regs_track *track,
}
static int nft_symhash_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_symhash *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 5f28b21abc7d..c9d2f7c29f53 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -147,7 +147,8 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx,
}
}
-static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_immediate_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
new file mode 100644
index 000000000000..28e2873ba24e
--- /dev/null
+++ b/net/netfilter/nft_inner.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022 Pablo Neira Ayuso <pablo@netfilter.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_meta.h>
+#include <net/netfilter/nf_tables_offload.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <net/gre.h>
+#include <net/geneve.h>
+#include <net/ip.h>
+#include <linux/icmpv6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx);
+
+/* Same layout as nft_expr but it embeds the private expression data area. */
+struct __nft_expr {
+ const struct nft_expr_ops *ops;
+ union {
+ struct nft_payload payload;
+ struct nft_meta meta;
+ } __attribute__((aligned(__alignof__(u64))));
+};
+
+enum {
+ NFT_INNER_EXPR_PAYLOAD,
+ NFT_INNER_EXPR_META,
+};
+
+struct nft_inner {
+ u8 flags;
+ u8 hdrsize;
+ u8 type;
+ u8 expr_type;
+
+ struct __nft_expr expr;
+};
+
+static int nft_inner_parse_l2l3(const struct nft_inner *priv,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *ctx, u32 off)
+{
+ __be16 llproto, outer_llproto;
+ u32 nhoff, thoff;
+
+ if (priv->flags & NFT_INNER_LL) {
+ struct vlan_ethhdr *veth, _veth;
+ struct ethhdr *eth, _eth;
+ u32 hdrsize;
+
+ eth = skb_header_pointer(pkt->skb, off, sizeof(_eth), &_eth);
+ if (!eth)
+ return -1;
+
+ switch (eth->h_proto) {
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ llproto = eth->h_proto;
+ hdrsize = sizeof(_eth);
+ break;
+ case htons(ETH_P_8021Q):
+ veth = skb_header_pointer(pkt->skb, off, sizeof(_veth), &_veth);
+ if (!veth)
+ return -1;
+
+ outer_llproto = veth->h_vlan_encapsulated_proto;
+ llproto = veth->h_vlan_proto;
+ hdrsize = sizeof(_veth);
+ break;
+ default:
+ return -1;
+ }
+
+ ctx->inner_lloff = off;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_LL;
+ off += hdrsize;
+ } else {
+ struct iphdr *iph;
+ u32 _version;
+
+ iph = skb_header_pointer(pkt->skb, off, sizeof(_version), &_version);
+ if (!iph)
+ return -1;
+
+ switch (iph->version) {
+ case 4:
+ llproto = htons(ETH_P_IP);
+ break;
+ case 6:
+ llproto = htons(ETH_P_IPV6);
+ break;
+ default:
+ return -1;
+ }
+ }
+
+ ctx->llproto = llproto;
+ if (llproto == htons(ETH_P_8021Q))
+ llproto = outer_llproto;
+
+ nhoff = off;
+
+ switch (llproto) {
+ case htons(ETH_P_IP): {
+ struct iphdr *iph, _iph;
+
+ iph = skb_header_pointer(pkt->skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
+ return -1;
+
+ if (iph->ihl < 5 || iph->version != 4)
+ return -1;
+
+ ctx->inner_nhoff = nhoff;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH;
+
+ thoff = nhoff + (iph->ihl * 4);
+ if ((ntohs(iph->frag_off) & IP_OFFSET) == 0) {
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ ctx->inner_thoff = thoff;
+ ctx->l4proto = iph->protocol;
+ }
+ }
+ break;
+ case htons(ETH_P_IPV6): {
+ struct ipv6hdr *ip6h, _ip6h;
+ int fh_flags = IP6_FH_F_AUTH;
+ unsigned short fragoff;
+ int l4proto;
+
+ ip6h = skb_header_pointer(pkt->skb, nhoff, sizeof(_ip6h), &_ip6h);
+ if (!ip6h)
+ return -1;
+
+ if (ip6h->version != 6)
+ return -1;
+
+ ctx->inner_nhoff = nhoff;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH;
+
+ thoff = nhoff;
+ l4proto = ipv6_find_hdr(pkt->skb, &thoff, -1, &fragoff, &fh_flags);
+ if (l4proto < 0 || thoff > U16_MAX)
+ return -1;
+
+ if (fragoff == 0) {
+ thoff = nhoff + sizeof(_ip6h);
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ ctx->inner_thoff = thoff;
+ ctx->l4proto = l4proto;
+ }
+ }
+ break;
+ default:
+ return -1;
+ }
+
+ return 0;
+}
+
+static int nft_inner_parse_tunhdr(const struct nft_inner *priv,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *ctx, u32 *off)
+{
+ if (pkt->tprot == IPPROTO_GRE) {
+ ctx->inner_tunoff = pkt->thoff;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN;
+ return 0;
+ }
+
+ if (pkt->tprot != IPPROTO_UDP)
+ return -1;
+
+ ctx->inner_tunoff = *off;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN;
+ *off += priv->hdrsize;
+
+ switch (priv->type) {
+ case NFT_INNER_GENEVE: {
+ struct genevehdr *gnvh, _gnvh;
+
+ gnvh = skb_header_pointer(pkt->skb, pkt->inneroff,
+ sizeof(_gnvh), &_gnvh);
+ if (!gnvh)
+ return -1;
+
+ *off += gnvh->opt_len * 4;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int nft_inner_parse(const struct nft_inner *priv,
+ struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ struct nft_inner_tun_ctx ctx = {};
+ u32 off = pkt->inneroff;
+
+ if (priv->flags & NFT_INNER_HDRSIZE &&
+ nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
+ return -1;
+
+ if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
+ if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
+ return -1;
+ } else if (priv->flags & NFT_INNER_TH) {
+ ctx.inner_thoff = off;
+ ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ }
+
+ *tun_ctx = ctx;
+ tun_ctx->type = priv->type;
+ pkt->flags |= NFT_PKTINFO_INNER_FULL;
+
+ return 0;
+}
+
+static bool nft_inner_parse_needed(const struct nft_inner *priv,
+ const struct nft_pktinfo *pkt,
+ const struct nft_inner_tun_ctx *tun_ctx)
+{
+ if (!(pkt->flags & NFT_PKTINFO_INNER_FULL))
+ return true;
+
+ if (priv->type != tun_ctx->type)
+ return true;
+
+ return false;
+}
+
+static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_inner_tun_ctx *tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+ const struct nft_inner *priv = nft_expr_priv(expr);
+
+ if (nft_payload_inner_offset(pkt) < 0)
+ goto err;
+
+ if (nft_inner_parse_needed(priv, pkt, tun_ctx) &&
+ nft_inner_parse(priv, (struct nft_pktinfo *)pkt, tun_ctx) < 0)
+ goto err;
+
+ switch (priv->expr_type) {
+ case NFT_INNER_EXPR_PAYLOAD:
+ nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx);
+ break;
+ case NFT_INNER_EXPR_META:
+ nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_inner_policy[NFTA_INNER_MAX + 1] = {
+ [NFTA_INNER_NUM] = { .type = NLA_U32 },
+ [NFTA_INNER_FLAGS] = { .type = NLA_U32 },
+ [NFTA_INNER_HDRSIZE] = { .type = NLA_U32 },
+ [NFTA_INNER_TYPE] = { .type = NLA_U32 },
+ [NFTA_INNER_EXPR] = { .type = NLA_NESTED },
+};
+
+struct nft_expr_info {
+ const struct nft_expr_ops *ops;
+ const struct nlattr *attr;
+ struct nlattr *tb[NFT_EXPR_MAXATTR + 1];
+};
+
+static int nft_inner_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_inner *priv = nft_expr_priv(expr);
+ u32 flags, hdrsize, type, num;
+ struct nft_expr_info expr_info;
+ int err;
+
+ if (!tb[NFTA_INNER_FLAGS] ||
+ !tb[NFTA_INNER_HDRSIZE] ||
+ !tb[NFTA_INNER_TYPE] ||
+ !tb[NFTA_INNER_EXPR])
+ return -EINVAL;
+
+ flags = ntohl(nla_get_be32(tb[NFTA_INNER_FLAGS]));
+ if (flags & ~NFT_INNER_MASK)
+ return -EOPNOTSUPP;
+
+ num = ntohl(nla_get_be32(tb[NFTA_INNER_NUM]));
+ if (num != 0)
+ return -EOPNOTSUPP;
+
+ hdrsize = ntohl(nla_get_be32(tb[NFTA_INNER_HDRSIZE]));
+ type = ntohl(nla_get_be32(tb[NFTA_INNER_TYPE]));
+
+ if (type > U8_MAX)
+ return -EINVAL;
+
+ if (flags & NFT_INNER_HDRSIZE) {
+ if (hdrsize == 0 || hdrsize > 64)
+ return -EOPNOTSUPP;
+ }
+
+ priv->flags = flags;
+ priv->hdrsize = hdrsize;
+ priv->type = type;
+
+ err = nft_expr_inner_parse(ctx, tb[NFTA_INNER_EXPR], &expr_info);
+ if (err < 0)
+ return err;
+
+ priv->expr.ops = expr_info.ops;
+
+ if (!strcmp(expr_info.ops->type->name, "payload"))
+ priv->expr_type = NFT_INNER_EXPR_PAYLOAD;
+ else if (!strcmp(expr_info.ops->type->name, "meta"))
+ priv->expr_type = NFT_INNER_EXPR_META;
+ else
+ return -EINVAL;
+
+ err = expr_info.ops->init(ctx, (struct nft_expr *)&priv->expr,
+ (const struct nlattr * const*)expr_info.tb);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static int nft_inner_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ const struct nft_inner *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_INNER_NUM, htonl(0)) ||
+ nla_put_be32(skb, NFTA_INNER_TYPE, htonl(priv->type)) ||
+ nla_put_be32(skb, NFTA_INNER_FLAGS, htonl(priv->flags)) ||
+ nla_put_be32(skb, NFTA_INNER_HDRSIZE, htonl(priv->hdrsize)))
+ goto nla_put_failure;
+
+ if (nft_expr_dump(skb, NFTA_INNER_EXPR,
+ (struct nft_expr *)&priv->expr, reset) < 0)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nft_expr_ops nft_inner_ops = {
+ .type = &nft_inner_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_inner)),
+ .eval = nft_inner_eval,
+ .init = nft_inner_init,
+ .dump = nft_inner_dump,
+};
+
+struct nft_expr_type nft_inner_type __read_mostly = {
+ .name = "inner",
+ .ops = &nft_inner_ops,
+ .policy = nft_inner_policy,
+ .maxattr = NFTA_INNER_MAX,
+ .owner = THIS_MODULE,
+};
diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
index bb15a55dad5c..7f2bda6641bd 100644
--- a/net/netfilter/nft_last.c
+++ b/net/netfilter/nft_last.c
@@ -65,7 +65,8 @@ static void nft_last_eval(const struct nft_expr *expr,
WRITE_ONCE(last->set, 1);
}
-static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_last_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_last_priv *priv = nft_expr_priv(expr);
struct nft_last *last = priv->last;
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 981addb2d051..145dc62c6247 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -193,7 +193,8 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_limit_pkts_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
@@ -251,7 +252,7 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx,
}
static int nft_limit_bytes_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_limit_priv *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 0e13c003f0c1..5defe6e4fd98 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -241,7 +241,8 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
nf_logger_put(ctx->family, li->type);
}
-static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_log_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_log *priv = nft_expr_priv(expr);
const struct nf_loginfo *li = &priv->loginfo;
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index dfae12759c7c..cae5a6724163 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -178,7 +178,8 @@ static void nft_lookup_destroy(const struct nft_ctx *ctx,
nf_tables_destroy_set(ctx, priv->set);
}
-static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_lookup_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
u32 flags = priv->invert ? NFT_LOOKUP_F_INV : 0;
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 2a0adc497bbb..e55e455275c4 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -73,7 +73,8 @@ static int nft_masq_init(const struct nft_ctx *ctx,
return nf_ct_netns_get(ctx->net, ctx->family);
}
-static int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_masq_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_masq *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 55d2d49c3425..e384e0de7a54 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -669,7 +669,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
EXPORT_SYMBOL_GPL(nft_meta_set_init);
int nft_meta_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -684,7 +684,8 @@ nla_put_failure:
}
EXPORT_SYMBOL_GPL(nft_meta_get_dump);
-int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_meta_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -831,9 +832,71 @@ nft_meta_select_ops(const struct nft_ctx *ctx,
return ERR_PTR(-EINVAL);
}
+static int nft_meta_inner_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int len;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_PROTOCOL:
+ len = sizeof(u16);
+ break;
+ case NFT_META_L4PROTO:
+ len = sizeof(u32);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ priv->len = len;
+
+ return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
+}
+
+void nft_meta_inner_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+
+ switch (priv->key) {
+ case NFT_META_PROTOCOL:
+ nft_reg_store16(dest, (__force u16)tun_ctx->llproto);
+ break;
+ case NFT_META_L4PROTO:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH))
+ goto err;
+
+ nft_reg_store8(dest, tun_ctx->l4proto);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+ return;
+
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+EXPORT_SYMBOL_GPL(nft_meta_inner_eval);
+
+static const struct nft_expr_ops nft_meta_inner_ops = {
+ .type = &nft_meta_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .init = nft_meta_inner_init,
+ .dump = nft_meta_get_dump,
+ /* direct call to nft_meta_inner_eval(). */
+};
+
struct nft_expr_type nft_meta_type __read_mostly = {
.name = "meta",
.select_ops = nft_meta_select_ops,
+ .inner_ops = &nft_meta_inner_ops,
.policy = nft_meta_policy,
.maxattr = NFTA_META_MAX,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index e5fd6995e4bf..047999150390 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -255,7 +255,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return nf_ct_netns_get(ctx->net, family);
}
-static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_nat_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_nat *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 45d3dc9e96f2..7d29db7c2ac0 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -112,7 +112,8 @@ nla_put_failure:
return -1;
}
-static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ng_inc_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ng_inc *priv = nft_expr_priv(expr);
@@ -168,7 +169,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
NULL, NFT_DATA_VALUE, sizeof(u32));
}
-static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ng_random_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ng_random *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 5d8d91b3904d..7b01aa2ef653 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -47,7 +47,8 @@ static int nft_objref_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_objref_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_object *obj = nft_objref_priv(expr);
@@ -82,7 +83,6 @@ static void nft_objref_activate(const struct nft_ctx *ctx,
obj->use++;
}
-static struct nft_expr_type nft_objref_type;
static const struct nft_expr_ops nft_objref_ops = {
.type = &nft_objref_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_object *)),
@@ -156,7 +156,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_objref_map_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_objref_map *priv = nft_expr_priv(expr);
@@ -195,7 +196,6 @@ static void nft_objref_map_destroy(const struct nft_ctx *ctx,
nf_tables_destroy_set(ctx, priv->set);
}
-static struct nft_expr_type nft_objref_type;
static const struct nft_expr_ops nft_objref_map_ops = {
.type = &nft_objref_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
@@ -233,28 +233,10 @@ static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = {
[NFTA_OBJREF_SET_ID] = { .type = NLA_U32 },
};
-static struct nft_expr_type nft_objref_type __read_mostly = {
+struct nft_expr_type nft_objref_type __read_mostly = {
.name = "objref",
.select_ops = nft_objref_select_ops,
.policy = nft_objref_policy,
.maxattr = NFTA_OBJREF_MAX,
.owner = THIS_MODULE,
};
-
-static int __init nft_objref_module_init(void)
-{
- return nft_register_expr(&nft_objref_type);
-}
-
-static void __exit nft_objref_module_exit(void)
-{
- nft_unregister_expr(&nft_objref_type);
-}
-
-module_init(nft_objref_module_init);
-module_exit(nft_objref_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_ALIAS_NFT_EXPR("objref");
-MODULE_DESCRIPTION("nftables stateful object reference module");
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index adacf95b6e2b..70820c66b591 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -92,7 +92,8 @@ static int nft_osf_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_osf_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_osf *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 4edd899aeb9b..17b418a5a593 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -19,6 +19,7 @@
/* For layer 4 checksum field offset. */
#include <linux/tcp.h>
#include <linux/udp.h>
+#include <net/gre.h>
#include <linux/icmpv6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
@@ -100,6 +101,41 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt)
pkt->inneroff = thoff + __tcp_hdrlen(th);
}
break;
+ case IPPROTO_GRE: {
+ u32 offset = sizeof(struct gre_base_hdr);
+ struct gre_base_hdr *gre, _gre;
+ __be16 version;
+
+ gre = skb_header_pointer(pkt->skb, thoff, sizeof(_gre), &_gre);
+ if (!gre)
+ return -1;
+
+ version = gre->flags & GRE_VERSION;
+ switch (version) {
+ case GRE_VERSION_0:
+ if (gre->flags & GRE_ROUTING)
+ return -1;
+
+ if (gre->flags & GRE_CSUM) {
+ offset += sizeof_field(struct gre_full_hdr, csum) +
+ sizeof_field(struct gre_full_hdr, reserved1);
+ }
+ if (gre->flags & GRE_KEY)
+ offset += sizeof_field(struct gre_full_hdr, key);
+
+ if (gre->flags & GRE_SEQ)
+ offset += sizeof_field(struct gre_full_hdr, seq);
+ break;
+ default:
+ return -1;
+ }
+
+ pkt->inneroff = thoff + offset;
+ }
+ break;
+ case IPPROTO_IPIP:
+ pkt->inneroff = thoff;
+ break;
default:
return -1;
}
@@ -109,7 +145,7 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt)
return 0;
}
-static int nft_payload_inner_offset(const struct nft_pktinfo *pkt)
+int nft_payload_inner_offset(const struct nft_pktinfo *pkt)
{
if (!(pkt->flags & NFT_PKTINFO_INNER) &&
__nft_payload_inner_offset((struct nft_pktinfo *)pkt) < 0)
@@ -195,7 +231,8 @@ static int nft_payload_init(const struct nft_ctx *ctx,
priv->len);
}
-static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_payload_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_payload *priv = nft_expr_priv(expr);
@@ -552,6 +589,92 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.offload = nft_payload_offload,
};
+void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ u32 *dest = &regs->data[priv->dreg];
+ int offset;
+
+ if (priv->len % NFT_REG32_SIZE)
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+
+ switch (priv->base) {
+ case NFT_PAYLOAD_TUN_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TUN))
+ goto err;
+
+ offset = tun_ctx->inner_tunoff;
+ break;
+ case NFT_PAYLOAD_LL_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_LL))
+ goto err;
+
+ offset = tun_ctx->inner_lloff;
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_NH))
+ goto err;
+
+ offset = tun_ctx->inner_nhoff;
+ break;
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH))
+ goto err;
+
+ offset = tun_ctx->inner_thoff;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+ offset += priv->offset;
+
+ if (skb_copy_bits(skb, offset, dest, priv->len) < 0)
+ goto err;
+
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_payload_inner_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_payload *priv = nft_expr_priv(expr);
+ u32 base;
+
+ base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
+ switch (base) {
+ case NFT_PAYLOAD_TUN_HEADER:
+ case NFT_PAYLOAD_LL_HEADER:
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->base = base;
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
+ priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+
+ return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
+}
+
+static const struct nft_expr_ops nft_payload_inner_ops = {
+ .type = &nft_payload_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
+ .init = nft_payload_inner_init,
+ .dump = nft_payload_dump,
+ /* direct call to nft_payload_inner_eval(). */
+};
+
static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
{
*sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
@@ -665,6 +788,16 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
return 0;
}
+struct nft_payload_set {
+ enum nft_payload_bases base:8;
+ u8 offset;
+ u8 len;
+ u8 sreg;
+ u8 csum_type;
+ u8 csum_offset;
+ u8 csum_flags;
+};
+
static void nft_payload_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -787,7 +920,8 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
priv->len);
}
-static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_payload_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_payload_set *priv = nft_expr_priv(expr);
@@ -885,6 +1019,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
struct nft_expr_type nft_payload_type __read_mostly = {
.name = "payload",
.select_ops = nft_payload_select_ops,
+ .inner_ops = &nft_payload_inner_ops,
.policy = nft_payload_policy,
.maxattr = NFTA_PAYLOAD_MAX,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index da29e92c03e2..b2b8127c8d43 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -152,7 +152,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_queue_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_queue *priv = nft_expr_priv(expr);
@@ -168,7 +169,8 @@ nla_put_failure:
}
static int
-nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr)
+nft_queue_sreg_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_queue *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index e6b0df68feea..123578e28917 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -217,11 +217,12 @@ static int nft_quota_init(const struct nft_ctx *ctx,
return nft_quota_do_init(tb, priv);
}
-static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_quota_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_quota *priv = nft_expr_priv(expr);
- return nft_quota_do_dump(skb, priv, false);
+ return nft_quota_do_dump(skb, priv, reset);
}
static void nft_quota_destroy(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 832f0d725a9e..0566d6aaf1e5 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -111,7 +111,8 @@ err1:
return err;
}
-static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_range_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_range_expr *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 5086adfe731c..5f7739987559 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -75,7 +75,8 @@ static int nft_redir_init(const struct nft_ctx *ctx,
return nf_ct_netns_get(ctx->net, ctx->family);
}
-static int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_redir_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_redir *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 927ff8459bd9..f2addc844dd2 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -69,7 +69,8 @@ int nft_reject_init(const struct nft_ctx *ctx,
}
EXPORT_SYMBOL_GPL(nft_reject_init);
-int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_reject_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_reject *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 71931ec91721..5990fdd7b3cc 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -146,7 +146,7 @@ static int nft_rt_get_init(const struct nft_ctx *ctx,
}
static int nft_rt_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_rt *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 49a5348a6a14..85f8df87efda 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -199,7 +199,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
}
static int nft_socket_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_socket *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
index 6cf9a04fbfe2..13da882669a4 100644
--- a/net/netfilter/nft_synproxy.c
+++ b/net/netfilter/nft_synproxy.c
@@ -272,7 +272,8 @@ static void nft_synproxy_destroy(const struct nft_ctx *ctx,
nft_synproxy_do_destroy(ctx);
}
-static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_synproxy_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_synproxy *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index 62da25ad264b..ea83f661417e 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -294,7 +294,7 @@ static void nft_tproxy_destroy(const struct nft_ctx *ctx,
}
static int nft_tproxy_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_tproxy *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 983ade4be3b3..b059aa541798 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -108,7 +108,7 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx,
}
static int nft_tunnel_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_tunnel *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
index 1c5343c936a8..c88fd078a9ae 100644
--- a/net/netfilter/nft_xfrm.c
+++ b/net/netfilter/nft_xfrm.c
@@ -212,7 +212,7 @@ static void nft_xfrm_get_eval(const struct nft_expr *expr,
}
static int nft_xfrm_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_xfrm *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index 680015ba7cb6..e8961094a282 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -4,7 +4,6 @@
#include <linux/skbuff.h>
#include <net/ip.h>
#include <net/ipv6.h>
-#include <net/sctp/sctp.h>
#include <linux/sctp.h>
#include <linux/netfilter/x_tables.h>