summaryrefslogtreecommitdiffstats
path: root/net/netfilter
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter')
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c170
-rw-r--r--net/netfilter/nf_conntrack_standalone.c16
-rw-r--r--net/netfilter/nft_set_rbtree.c332
3 files changed, 276 insertions, 242 deletions
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index d88b92a8ffca..945dd40e7077 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -27,22 +27,16 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_timeout.h>
-/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
- closely. They're more complex. --RR
-
- And so for me for SCTP :D -Kiran */
-
static const char *const sctp_conntrack_names[] = {
- "NONE",
- "CLOSED",
- "COOKIE_WAIT",
- "COOKIE_ECHOED",
- "ESTABLISHED",
- "SHUTDOWN_SENT",
- "SHUTDOWN_RECD",
- "SHUTDOWN_ACK_SENT",
- "HEARTBEAT_SENT",
- "HEARTBEAT_ACKED",
+ [SCTP_CONNTRACK_NONE] = "NONE",
+ [SCTP_CONNTRACK_CLOSED] = "CLOSED",
+ [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT",
+ [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED",
+ [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED",
+ [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT",
+ [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD",
+ [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT",
+ [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT",
};
#define SECS * HZ
@@ -54,13 +48,11 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
[SCTP_CONNTRACK_CLOSED] = 10 SECS,
[SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS,
[SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS,
- [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS,
+ [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS,
[SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000,
[SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000,
[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS,
[SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS,
- [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS,
- [SCTP_CONNTRACK_DATA_SENT] = 30 SECS,
};
#define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1
@@ -74,8 +66,6 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
#define sHS SCTP_CONNTRACK_HEARTBEAT_SENT
-#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED
-#define sDS SCTP_CONNTRACK_DATA_SENT
#define sIV SCTP_CONNTRACK_MAX
/*
@@ -98,10 +88,6 @@ SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
the SHUTDOWN chunk. Connection is closed.
HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow.
-HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK/DATA/SACK in the direction
- opposite to that of the HEARTBEAT/DATA chunk. Secondary connection
- is established.
-DATA_SENT - We have seen a DATA/SACK in a new flow.
*/
/* TODO
@@ -115,38 +101,36 @@ cookie echoed to closed.
*/
/* SCTP conntrack state transitions */
-static const u8 sctp_conntracks[2][12][SCTP_CONNTRACK_MAX] = {
+static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */
-/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA, sCW},
-/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},
-/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS, sCL},
-/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA, sSA},
-/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't have Stale cookie*/
-/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* 5.2.4 - Big TODO */
-/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't come in orig dir */
-/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA, sCL},
-/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS},
-/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS},
-/* data/sack */ {sDS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */
+/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW},
+/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},
+/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA},
+/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/
+/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */
+/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL},
+/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
+/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
},
{
/* REPLY */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */
-/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* INIT in sCL Big TODO */
-/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},
-/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL, sIV},
-/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR, sIV},
-/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA, sIV},
-/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA, sIV},
-/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* Can't come in reply dir */
-/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA, sIV},
-/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA, sIV},
-/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sHA},
-/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA},
-/* data/sack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA},
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */
+/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */
+/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV},
+/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV},
+/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV},
+/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV},
+/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */
+/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV},
+/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
+/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES},
}
};
@@ -160,8 +144,8 @@ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \
for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \
- (offset) < (skb)->len && \
- ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \
+ ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))) && \
+ (sch)->length; \
(offset) += (ntohs((sch)->length) + 3) & ~3, (count)++)
/* Some validity checks to make sure the chunks are fine */
@@ -258,11 +242,6 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
pr_debug("SCTP_CID_HEARTBEAT_ACK");
i = 10;
break;
- case SCTP_CID_DATA:
- case SCTP_CID_SACK:
- pr_debug("SCTP_CID_DATA/SACK");
- i = 11;
- break;
default:
/* Other chunks like DATA or SACK do not change the state */
pr_debug("Unknown chunk type, Will stay in %s\n",
@@ -316,9 +295,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
ih->init_tag);
ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag;
- } else if (sch->type == SCTP_CID_HEARTBEAT ||
- sch->type == SCTP_CID_DATA ||
- sch->type == SCTP_CID_SACK) {
+ } else if (sch->type == SCTP_CID_HEARTBEAT) {
pr_debug("Setting vtag %x for secondary conntrack\n",
sh->vtag);
ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
@@ -404,19 +381,19 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
if (!sctp_new(ct, skb, sh, dataoff))
return -NF_ACCEPT;
- } else {
- /* Check the verification tag (Sec 8.5) */
- if (!test_bit(SCTP_CID_INIT, map) &&
- !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
- !test_bit(SCTP_CID_COOKIE_ECHO, map) &&
- !test_bit(SCTP_CID_ABORT, map) &&
- !test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
- !test_bit(SCTP_CID_HEARTBEAT, map) &&
- !test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
- sh->vtag != ct->proto.sctp.vtag[dir]) {
- pr_debug("Verification tag check failed\n");
- goto out;
- }
+ }
+
+ /* Check the verification tag (Sec 8.5) */
+ if (!test_bit(SCTP_CID_INIT, map) &&
+ !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
+ !test_bit(SCTP_CID_COOKIE_ECHO, map) &&
+ !test_bit(SCTP_CID_ABORT, map) &&
+ !test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
+ !test_bit(SCTP_CID_HEARTBEAT, map) &&
+ !test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
+ sh->vtag != ct->proto.sctp.vtag[dir]) {
+ pr_debug("Verification tag check failed\n");
+ goto out;
}
old_state = new_state = SCTP_CONNTRACK_NONE;
@@ -424,22 +401,29 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
/* Special cases of Verification tag check (Sec 8.5.1) */
if (sch->type == SCTP_CID_INIT) {
- /* Sec 8.5.1 (A) */
+ /* (A) vtag MUST be zero */
if (sh->vtag != 0)
goto out_unlock;
} else if (sch->type == SCTP_CID_ABORT) {
- /* Sec 8.5.1 (B) */
- if (sh->vtag != ct->proto.sctp.vtag[dir] &&
- sh->vtag != ct->proto.sctp.vtag[!dir])
+ /* (B) vtag MUST match own vtag if T flag is unset OR
+ * MUST match peer's vtag if T flag is set
+ */
+ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) &&
+ sh->vtag != ct->proto.sctp.vtag[dir]) ||
+ ((sch->flags & SCTP_CHUNK_FLAG_T) &&
+ sh->vtag != ct->proto.sctp.vtag[!dir]))
goto out_unlock;
} else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
- /* Sec 8.5.1 (C) */
- if (sh->vtag != ct->proto.sctp.vtag[dir] &&
- sh->vtag != ct->proto.sctp.vtag[!dir] &&
- sch->flags & SCTP_CHUNK_FLAG_T)
+ /* (C) vtag MUST match own vtag if T flag is unset OR
+ * MUST match peer's vtag if T flag is set
+ */
+ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) &&
+ sh->vtag != ct->proto.sctp.vtag[dir]) ||
+ ((sch->flags & SCTP_CHUNK_FLAG_T) &&
+ sh->vtag != ct->proto.sctp.vtag[!dir]))
goto out_unlock;
} else if (sch->type == SCTP_CID_COOKIE_ECHO) {
- /* Sec 8.5.1 (D) */
+ /* (D) vtag must be same as init_vtag as found in INIT_ACK */
if (sh->vtag != ct->proto.sctp.vtag[dir])
goto out_unlock;
} else if (sch->type == SCTP_CID_HEARTBEAT) {
@@ -476,11 +460,6 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
} else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
}
- } else if (sch->type == SCTP_CID_DATA || sch->type == SCTP_CID_SACK) {
- if (ct->proto.sctp.vtag[dir] == 0) {
- pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir);
- ct->proto.sctp.vtag[dir] = sh->vtag;
- }
}
old_state = ct->proto.sctp.state;
@@ -518,8 +497,12 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
}
ct->proto.sctp.state = new_state;
- if (old_state != new_state)
+ if (old_state != new_state) {
nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+ if (new_state == SCTP_CONNTRACK_ESTABLISHED &&
+ !test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
+ }
}
spin_unlock_bh(&ct->lock);
@@ -533,14 +516,6 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
- if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&
- dir == IP_CT_DIR_REPLY &&
- new_state == SCTP_CONNTRACK_ESTABLISHED) {
- pr_debug("Setting assured bit\n");
- set_bit(IPS_ASSURED_BIT, &ct->status);
- nf_conntrack_event_cache(IPCT_ASSURED, ct);
- }
-
return NF_ACCEPT;
out_unlock:
@@ -701,7 +676,6 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
[CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 },
[CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 },
[CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 },
- [CTA_TIMEOUT_SCTP_DATA_SENT] = { .type = NLA_U32 },
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 0250725e38a4..460294bd4b60 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -601,8 +601,6 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD,
NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT,
- NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED,
- NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT,
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST,
@@ -887,18 +885,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED] = {
- .procname = "nf_conntrack_sctp_timeout_heartbeat_acked",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT] = {
- .procname = "nf_conntrack_sctp_timeout_data_sent",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = {
@@ -1042,8 +1028,6 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net,
XASSIGN(SHUTDOWN_RECD, sn);
XASSIGN(SHUTDOWN_ACK_SENT, sn);
XASSIGN(HEARTBEAT_SENT, sn);
- XASSIGN(HEARTBEAT_ACKED, sn);
- XASSIGN(DATA_SENT, sn);
#undef XASSIGN
#endif
}
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 7325bee7d144..19ea4d3c3553 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -38,10 +38,12 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
return !nft_rbtree_interval_end(rbe);
}
-static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
- const struct nft_rbtree_elem *interval)
+static int nft_rbtree_cmp(const struct nft_set *set,
+ const struct nft_rbtree_elem *e1,
+ const struct nft_rbtree_elem *e2)
{
- return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
+ return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext),
+ set->klen);
}
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
@@ -52,7 +54,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
const struct nft_rbtree_elem *rbe, *interval = NULL;
u8 genmask = nft_genmask_cur(net);
const struct rb_node *parent;
- const void *this;
int d;
parent = rcu_dereference_raw(priv->root.rb_node);
@@ -62,12 +63,11 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
- this = nft_set_ext_key(&rbe->ext);
- d = memcmp(this, key, set->klen);
+ d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
if (d < 0) {
parent = rcu_dereference_raw(parent->rb_left);
if (interval &&
- nft_rbtree_equal(set, this, interval) &&
+ !nft_rbtree_cmp(set, rbe, interval) &&
nft_rbtree_interval_end(rbe) &&
nft_rbtree_interval_start(interval))
continue;
@@ -215,154 +215,216 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
return rbe;
}
+static int nft_rbtree_gc_elem(const struct nft_set *__set,
+ struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
+{
+ struct nft_set *set = (struct nft_set *)__set;
+ struct rb_node *prev = rb_prev(&rbe->node);
+ struct nft_rbtree_elem *rbe_prev;
+ struct nft_set_gc_batch *gcb;
+
+ gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC);
+ if (!gcb)
+ return -ENOMEM;
+
+ /* search for expired end interval coming before this element. */
+ do {
+ rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
+ if (nft_rbtree_interval_end(rbe_prev))
+ break;
+
+ prev = rb_prev(prev);
+ } while (prev != NULL);
+
+ rb_erase(&rbe_prev->node, &priv->root);
+ rb_erase(&rbe->node, &priv->root);
+ atomic_sub(2, &set->nelems);
+
+ nft_set_gc_batch_add(gcb, rbe);
+ nft_set_gc_batch_complete(gcb);
+
+ return 0;
+}
+
+static bool nft_rbtree_update_first(const struct nft_set *set,
+ struct nft_rbtree_elem *rbe,
+ struct rb_node *first)
+{
+ struct nft_rbtree_elem *first_elem;
+
+ first_elem = rb_entry(first, struct nft_rbtree_elem, node);
+ /* this element is closest to where the new element is to be inserted:
+ * update the first element for the node list path.
+ */
+ if (nft_rbtree_cmp(set, rbe, first_elem) < 0)
+ return true;
+
+ return false;
+}
+
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
struct nft_rbtree_elem *new,
struct nft_set_ext **ext)
{
- bool overlap = false, dup_end_left = false, dup_end_right = false;
+ struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
+ struct rb_node *node, *parent, **p, *first = NULL;
struct nft_rbtree *priv = nft_set_priv(set);
u8 genmask = nft_genmask_next(net);
- struct nft_rbtree_elem *rbe;
- struct rb_node *parent, **p;
- int d;
+ int d, err;
- /* Detect overlaps as we descend the tree. Set the flag in these cases:
- *
- * a1. _ _ __>| ?_ _ __| (insert end before existing end)
- * a2. _ _ ___| ?_ _ _>| (insert end after existing end)
- * a3. _ _ ___? >|_ _ __| (insert start before existing end)
- *
- * and clear it later on, as we eventually reach the points indicated by
- * '?' above, in the cases described below. We'll always meet these
- * later, locally, due to tree ordering, and overlaps for the intervals
- * that are the closest together are always evaluated last.
- *
- * b1. _ _ __>| !_ _ __| (insert end before existing start)
- * b2. _ _ ___| !_ _ _>| (insert end after existing start)
- * b3. _ _ ___! >|_ _ __| (insert start after existing end, as a leaf)
- * '--' no nodes falling in this range
- * b4. >|_ _ ! (insert start before existing start)
- *
- * Case a3. resolves to b3.:
- * - if the inserted start element is the leftmost, because the '0'
- * element in the tree serves as end element
- * - otherwise, if an existing end is found immediately to the left. If
- * there are existing nodes in between, we need to further descend the
- * tree before we can conclude the new start isn't causing an overlap
- *
- * or to b4., which, preceded by a3., means we already traversed one or
- * more existing intervals entirely, from the right.
- *
- * For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
- * in that order.
- *
- * The flag is also cleared in two special cases:
- *
- * b5. |__ _ _!|<_ _ _ (insert start right before existing end)
- * b6. |__ _ >|!__ _ _ (insert end right after existing start)
- *
- * which always happen as last step and imply that no further
- * overlapping is possible.
- *
- * Another special case comes from the fact that start elements matching
- * an already existing start element are allowed: insertion is not
- * performed but we return -EEXIST in that case, and the error will be
- * cleared by the caller if NLM_F_EXCL is not present in the request.
- * This way, request for insertion of an exact overlap isn't reported as
- * error to userspace if not desired.
- *
- * However, if the existing start matches a pre-existing start, but the
- * end element doesn't match the corresponding pre-existing end element,
- * we need to report a partial overlap. This is a local condition that
- * can be noticed without need for a tracking flag, by checking for a
- * local duplicated end for a corresponding start, from left and right,
- * separately.
+ /* Descend the tree to search for an existing element greater than the
+ * key value to insert that is greater than the new element. This is the
+ * first element to walk the ordered elements to find possible overlap.
*/
-
parent = NULL;
p = &priv->root.rb_node;
while (*p != NULL) {
parent = *p;
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
- d = memcmp(nft_set_ext_key(&rbe->ext),
- nft_set_ext_key(&new->ext),
- set->klen);
+ d = nft_rbtree_cmp(set, rbe, new);
+
if (d < 0) {
p = &parent->rb_left;
-
- if (nft_rbtree_interval_start(new)) {
- if (nft_rbtree_interval_end(rbe) &&
- nft_set_elem_active(&rbe->ext, genmask) &&
- !nft_set_elem_expired(&rbe->ext) && !*p)
- overlap = false;
- } else {
- if (dup_end_left && !*p)
- return -ENOTEMPTY;
-
- overlap = nft_rbtree_interval_end(rbe) &&
- nft_set_elem_active(&rbe->ext,
- genmask) &&
- !nft_set_elem_expired(&rbe->ext);
-
- if (overlap) {
- dup_end_right = true;
- continue;
- }
- }
} else if (d > 0) {
- p = &parent->rb_right;
+ if (!first ||
+ nft_rbtree_update_first(set, rbe, first))
+ first = &rbe->node;
- if (nft_rbtree_interval_end(new)) {
- if (dup_end_right && !*p)
- return -ENOTEMPTY;
-
- overlap = nft_rbtree_interval_end(rbe) &&
- nft_set_elem_active(&rbe->ext,
- genmask) &&
- !nft_set_elem_expired(&rbe->ext);
-
- if (overlap) {
- dup_end_left = true;
- continue;
- }
- } else if (nft_set_elem_active(&rbe->ext, genmask) &&
- !nft_set_elem_expired(&rbe->ext)) {
- overlap = nft_rbtree_interval_end(rbe);
- }
+ p = &parent->rb_right;
} else {
- if (nft_rbtree_interval_end(rbe) &&
- nft_rbtree_interval_start(new)) {
+ if (nft_rbtree_interval_end(rbe))
p = &parent->rb_left;
-
- if (nft_set_elem_active(&rbe->ext, genmask) &&
- !nft_set_elem_expired(&rbe->ext))
- overlap = false;
- } else if (nft_rbtree_interval_start(rbe) &&
- nft_rbtree_interval_end(new)) {
+ else
p = &parent->rb_right;
+ }
+ }
+
+ if (!first)
+ first = rb_first(&priv->root);
+
+ /* Detect overlap by going through the list of valid tree nodes.
+ * Values stored in the tree are in reversed order, starting from
+ * highest to lowest value.
+ */
+ for (node = first; node != NULL; node = rb_next(node)) {
+ rbe = rb_entry(node, struct nft_rbtree_elem, node);
- if (nft_set_elem_active(&rbe->ext, genmask) &&
- !nft_set_elem_expired(&rbe->ext))
- overlap = false;
- } else if (nft_set_elem_active(&rbe->ext, genmask) &&
- !nft_set_elem_expired(&rbe->ext)) {
- *ext = &rbe->ext;
- return -EEXIST;
- } else {
- overlap = false;
- if (nft_rbtree_interval_end(rbe))
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
+ if (!nft_set_elem_active(&rbe->ext, genmask))
+ continue;
+
+ /* perform garbage collection to avoid bogus overlap reports. */
+ if (nft_set_elem_expired(&rbe->ext)) {
+ err = nft_rbtree_gc_elem(set, priv, rbe);
+ if (err < 0)
+ return err;
+
+ continue;
+ }
+
+ d = nft_rbtree_cmp(set, rbe, new);
+ if (d == 0) {
+ /* Matching end element: no need to look for an
+ * overlapping greater or equal element.
+ */
+ if (nft_rbtree_interval_end(rbe)) {
+ rbe_le = rbe;
+ break;
+ }
+
+ /* first element that is greater or equal to key value. */
+ if (!rbe_ge) {
+ rbe_ge = rbe;
+ continue;
+ }
+
+ /* this is a closer more or equal element, update it. */
+ if (nft_rbtree_cmp(set, rbe_ge, new) != 0) {
+ rbe_ge = rbe;
+ continue;
+ }
+
+ /* element is equal to key value, make sure flags are
+ * the same, an existing more or equal start element
+ * must not be replaced by more or equal end element.
+ */
+ if ((nft_rbtree_interval_start(new) &&
+ nft_rbtree_interval_start(rbe_ge)) ||
+ (nft_rbtree_interval_end(new) &&
+ nft_rbtree_interval_end(rbe_ge))) {
+ rbe_ge = rbe;
+ continue;
}
+ } else if (d > 0) {
+ /* annotate element greater than the new element. */
+ rbe_ge = rbe;
+ continue;
+ } else if (d < 0) {
+ /* annotate element less than the new element. */
+ rbe_le = rbe;
+ break;
}
+ }
- dup_end_left = dup_end_right = false;
+ /* - new start element matching existing start element: full overlap
+ * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
+ */
+ if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
+ nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
+ *ext = &rbe_ge->ext;
+ return -EEXIST;
}
- if (overlap)
+ /* - new end element matching existing end element: full overlap
+ * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
+ */
+ if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
+ nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
+ *ext = &rbe_le->ext;
+ return -EEXIST;
+ }
+
+ /* - new start element with existing closest, less or equal key value
+ * being a start element: partial overlap, reported as -ENOTEMPTY.
+ * Anonymous sets allow for two consecutive start element since they
+ * are constant, skip them to avoid bogus overlap reports.
+ */
+ if (!nft_set_is_anonymous(set) && rbe_le &&
+ nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
+ return -ENOTEMPTY;
+
+ /* - new end element with existing closest, less or equal key value
+ * being a end element: partial overlap, reported as -ENOTEMPTY.
+ */
+ if (rbe_le &&
+ nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new))
return -ENOTEMPTY;
+ /* - new end element with existing closest, greater or equal key value
+ * being an end element: partial overlap, reported as -ENOTEMPTY
+ */
+ if (rbe_ge &&
+ nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
+ return -ENOTEMPTY;
+
+ /* Accepted element: pick insertion point depending on key value */
+ parent = NULL;
+ p = &priv->root.rb_node;
+ while (*p != NULL) {
+ parent = *p;
+ rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+ d = nft_rbtree_cmp(set, rbe, new);
+
+ if (d < 0)
+ p = &parent->rb_left;
+ else if (d > 0)
+ p = &parent->rb_right;
+ else if (nft_rbtree_interval_end(rbe))
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+ }
+
rb_link_node_rcu(&new->node, parent, p);
rb_insert_color(&new->node, &priv->root);
return 0;
@@ -501,23 +563,37 @@ static void nft_rbtree_gc(struct work_struct *work)
struct nft_rbtree *priv;
struct rb_node *node;
struct nft_set *set;
+ struct net *net;
+ u8 genmask;
priv = container_of(work, struct nft_rbtree, gc_work.work);
set = nft_set_container_of(priv);
+ net = read_pnet(&set->net);
+ genmask = nft_genmask_cur(net);
write_lock_bh(&priv->lock);
write_seqcount_begin(&priv->count);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ if (!nft_set_elem_active(&rbe->ext, genmask))
+ continue;
+
+ /* elements are reversed in the rbtree for historical reasons,
+ * from highest to lowest value, that is why end element is
+ * always visited before the start element.
+ */
if (nft_rbtree_interval_end(rbe)) {
rbe_end = rbe;
continue;
}
if (!nft_set_elem_expired(&rbe->ext))
continue;
- if (nft_set_elem_mark_busy(&rbe->ext))
+
+ if (nft_set_elem_mark_busy(&rbe->ext)) {
+ rbe_end = NULL;
continue;
+ }
if (rbe_prev) {
rb_erase(&rbe_prev->node, &priv->root);