From bec94d430f2c97159e21e38c0f1fa4da3710d5e1 Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemming@brocade.com>
Date: Sat, 27 Dec 2014 10:01:42 -0800
Subject: gre: allow live address change

The GRE tap device supports Ethernet over GRE, but doesn't
care about the source address of the tunnel, therefore it
can be changed without bring device down.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 4f4bf5b99686..942576e27df1 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -673,6 +673,7 @@ static bool ipgre_netlink_encap_parms(struct nlattr *data[],
 static int gre_tap_init(struct net_device *dev)
 {
 	__gre_tunnel_init(dev);
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
 
 	return ip_tunnel_init(dev);
 }
-- 
cgit v1.2.3


From 8274a97aa4c694ad0d7b31b283a89dcca140e62b Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:55:29 -0800
Subject: fib_trie: Update usage stats to be percpu instead of global variables

The trie usage stats were currently being shared by all threads that were
calling fib_table_lookup.  As a result when multiple threads were
performing lookups simultaneously the trie would begin to cache bounce
between those threads.

In order to prevent this I have updated the usage stats to use a set of
percpu variables.  By doing this we should be able to avoid the cache
bouncing and still make use of these stats.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c |  2 +-
 net/ipv4/fib_trie.c     | 68 ++++++++++++++++++++++++++++++++++---------------
 2 files changed, 49 insertions(+), 21 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 23104a3f2924..66890209208f 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -67,7 +67,7 @@ static int __net_init fib4_rules_init(struct net *net)
 	return 0;
 
 fail:
-	kfree(local_table);
+	fib_free_table(local_table);
 	return -ENOMEM;
 }
 #else
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 18bcaf2ff2fd..d3dbb4821ae8 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -153,7 +153,7 @@ struct trie_stat {
 struct trie {
 	struct rt_trie_node __rcu *trie;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-	struct trie_use_stats stats;
+	struct trie_use_stats __percpu *stats;
 #endif
 };
 
@@ -631,7 +631,7 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 		if (IS_ERR(tn)) {
 			tn = old_tn;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-			t->stats.resize_node_skipped++;
+			this_cpu_inc(t->stats->resize_node_skipped);
 #endif
 			break;
 		}
@@ -658,7 +658,7 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 		if (IS_ERR(tn)) {
 			tn = old_tn;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-			t->stats.resize_node_skipped++;
+			this_cpu_inc(t->stats->resize_node_skipped);
 #endif
 			break;
 		}
@@ -1357,7 +1357,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
 			err = fib_props[fa->fa_type].error;
 			if (err) {
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-				t->stats.semantic_match_passed++;
+				this_cpu_inc(t->stats->semantic_match_passed);
 #endif
 				return err;
 			}
@@ -1372,7 +1372,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
 					continue;
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-				t->stats.semantic_match_passed++;
+				this_cpu_inc(t->stats->semantic_match_passed);
 #endif
 				res->prefixlen = li->plen;
 				res->nh_sel = nhsel;
@@ -1388,7 +1388,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
 		}
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-		t->stats.semantic_match_miss++;
+		this_cpu_inc(t->stats->semantic_match_miss);
 #endif
 	}
 
@@ -1399,6 +1399,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		     struct fib_result *res, int fib_flags)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	struct trie_use_stats __percpu *stats = t->stats;
+#endif
 	int ret;
 	struct rt_trie_node *n;
 	struct tnode *pn;
@@ -1417,7 +1420,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		goto failed;
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-	t->stats.gets++;
+	this_cpu_inc(stats->gets);
 #endif
 
 	/* Just a leaf? */
@@ -1441,7 +1444,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 
 		if (n == NULL) {
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-			t->stats.null_node_hit++;
+			this_cpu_inc(stats->null_node_hit);
 #endif
 			goto backtrace;
 		}
@@ -1576,7 +1579,7 @@ backtrace:
 			chopped_off = 0;
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-			t->stats.backtrack++;
+			this_cpu_inc(stats->backtrack);
 #endif
 			goto backtrace;
 		}
@@ -1830,6 +1833,11 @@ int fib_table_flush(struct fib_table *tb)
 
 void fib_free_table(struct fib_table *tb)
 {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	struct trie *t = (struct trie *)tb->tb_data;
+
+	free_percpu(t->stats);
+#endif /* CONFIG_IP_FIB_TRIE_STATS */
 	kfree(tb);
 }
 
@@ -1973,7 +1981,14 @@ struct fib_table *fib_trie_table(u32 id)
 	tb->tb_num_default = 0;
 
 	t = (struct trie *) tb->tb_data;
-	memset(t, 0, sizeof(*t));
+	RCU_INIT_POINTER(t->trie, NULL);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	t->stats = alloc_percpu(struct trie_use_stats);
+	if (!t->stats) {
+		kfree(tb);
+		tb = NULL;
+	}
+#endif
 
 	return tb;
 }
@@ -2139,18 +2154,31 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 static void trie_show_usage(struct seq_file *seq,
-			    const struct trie_use_stats *stats)
+			    const struct trie_use_stats __percpu *stats)
 {
+	struct trie_use_stats s = { 0 };
+	int cpu;
+
+	/* loop through all of the CPUs and gather up the stats */
+	for_each_possible_cpu(cpu) {
+		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
+
+		s.gets += pcpu->gets;
+		s.backtrack += pcpu->backtrack;
+		s.semantic_match_passed += pcpu->semantic_match_passed;
+		s.semantic_match_miss += pcpu->semantic_match_miss;
+		s.null_node_hit += pcpu->null_node_hit;
+		s.resize_node_skipped += pcpu->resize_node_skipped;
+	}
+
 	seq_printf(seq, "\nCounters:\n---------\n");
-	seq_printf(seq, "gets = %u\n", stats->gets);
-	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
+	seq_printf(seq, "gets = %u\n", s.gets);
+	seq_printf(seq, "backtracks = %u\n", s.backtrack);
 	seq_printf(seq, "semantic match passed = %u\n",
-		   stats->semantic_match_passed);
-	seq_printf(seq, "semantic match miss = %u\n",
-		   stats->semantic_match_miss);
-	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
-	seq_printf(seq, "skipped node resize = %u\n\n",
-		   stats->resize_node_skipped);
+		   s.semantic_match_passed);
+	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
+	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
+	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
 }
 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
 
@@ -2191,7 +2219,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
 			trie_collect_stats(t, &stat);
 			trie_show_stats(seq, &stat);
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-			trie_show_usage(seq, &t->stats);
+			trie_show_usage(seq, t->stats);
 #endif
 		}
 	}
-- 
cgit v1.2.3


From 64c9b6fb26ebcc82e36438d4084f2258f29dbadf Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:55:35 -0800
Subject: fib_trie: Make leaf and tnode more uniform

This change makes some fundamental changes to the way leaves and tnodes are
constructed.  The big differences are:
1.  Leaves now populate pos and bits indicating their full key size.
2.  Trie nodes now mask out their lower bits to be consistent with the leaf
3.  Both structures have been reordered so that rt_trie_node now consisists
    of a much larger region including the pos, bits, and rcu portions of
    the tnode structure.

On 32b systems this will result in the leaf being 4B larger as the pos and
bits values were added to a hole created by the key as it was only 4B in
length.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 192 ++++++++++++++++++++++------------------------------
 1 file changed, 82 insertions(+), 110 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d3dbb4821ae8..2b7220728b24 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -87,24 +87,38 @@
 
 typedef unsigned int t_key;
 
-#define T_TNODE 0
-#define T_LEAF  1
-#define NODE_TYPE_MASK	0x1UL
-#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
+#define IS_TNODE(n) ((n)->bits)
+#define IS_LEAF(n) (!(n)->bits)
 
-#define IS_TNODE(n) (!(n->parent & T_LEAF))
-#define IS_LEAF(n) (n->parent & T_LEAF)
+struct tnode {
+	t_key key;
+	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
+	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
+	struct tnode __rcu *parent;
+	union {
+		struct rcu_head rcu;
+		struct tnode *tnode_free;
+	};
+	unsigned int full_children;	/* KEYLENGTH bits needed */
+	unsigned int empty_children;	/* KEYLENGTH bits needed */
+	struct rt_trie_node __rcu *child[0];
+};
 
 struct rt_trie_node {
-	unsigned long parent;
 	t_key key;
+	unsigned char bits;
+	unsigned char pos;
+	struct tnode __rcu *parent;
+	struct rcu_head rcu;
 };
 
 struct leaf {
-	unsigned long parent;
 	t_key key;
-	struct hlist_head list;
+	unsigned char bits;
+	unsigned char pos;
+	struct tnode __rcu *parent;
 	struct rcu_head rcu;
+	struct hlist_head list;
 };
 
 struct leaf_info {
@@ -115,20 +129,6 @@ struct leaf_info {
 	struct rcu_head rcu;
 };
 
-struct tnode {
-	unsigned long parent;
-	t_key key;
-	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
-	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
-	unsigned int full_children;	/* KEYLENGTH bits needed */
-	unsigned int empty_children;	/* KEYLENGTH bits needed */
-	union {
-		struct rcu_head rcu;
-		struct tnode *tnode_free;
-	};
-	struct rt_trie_node __rcu *child[0];
-};
-
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 struct trie_use_stats {
 	unsigned int gets;
@@ -176,38 +176,27 @@ static const int sync_pages = 128;
 static struct kmem_cache *fn_alias_kmem __read_mostly;
 static struct kmem_cache *trie_leaf_kmem __read_mostly;
 
-/*
- * caller must hold RTNL
- */
-static inline struct tnode *node_parent(const struct rt_trie_node *node)
-{
-	unsigned long parent;
+/* caller must hold RTNL */
+#define node_parent(n) rtnl_dereference((n)->parent)
 
-	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
+/* caller must hold RCU read lock or RTNL */
+#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
 
-	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
-}
-
-/*
- * caller must hold RCU read lock or RTNL
- */
-static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
+/* wrapper for rcu_assign_pointer */
+static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
 {
-	unsigned long parent;
-
-	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
-							   lockdep_rtnl_is_held());
-
-	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
+	if (node)
+		rcu_assign_pointer(node->parent, ptr);
 }
 
-/* Same as rcu_assign_pointer
- * but that macro() assumes that value is a pointer.
+#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
+
+/* This provides us with the number of children in this node, in the case of a
+ * leaf this will return 0 meaning none of the children are accessible.
  */
-static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
+static inline int tnode_child_length(const struct tnode *tn)
 {
-	smp_wmb();
-	node->parent = (unsigned long)ptr | NODE_TYPE(node);
+	return (1ul << tn->bits) & ~(1ul);
 }
 
 /*
@@ -215,7 +204,7 @@ static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
  */
 static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
 {
-	BUG_ON(i >= 1U << tn->bits);
+	BUG_ON(i >= tnode_child_length(tn));
 
 	return rtnl_dereference(tn->child[i]);
 }
@@ -225,16 +214,11 @@ static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsig
  */
 static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
 {
-	BUG_ON(i >= 1U << tn->bits);
+	BUG_ON(i >= tnode_child_length(tn));
 
 	return rcu_dereference_rtnl(tn->child[i]);
 }
 
-static inline int tnode_child_length(const struct tnode *tn)
-{
-	return 1 << tn->bits;
-}
-
 static inline t_key mask_pfx(t_key k, unsigned int l)
 {
 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
@@ -336,11 +320,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b)
 
 */
 
-static inline void check_tnode(const struct tnode *tn)
-{
-	WARN_ON(tn && tn->pos+tn->bits > 32);
-}
-
 static const int halve_threshold = 25;
 static const int inflate_threshold = 50;
 static const int halve_threshold_root = 15;
@@ -426,11 +405,20 @@ static void tnode_free_flush(void)
 	}
 }
 
-static struct leaf *leaf_new(void)
+static struct leaf *leaf_new(t_key key)
 {
 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 	if (l) {
-		l->parent = T_LEAF;
+		l->parent = NULL;
+		/* set key and pos to reflect full key value
+		 * any trailing zeros in the key should be ignored
+		 * as the nodes are searched
+		 */
+		l->key = key;
+		l->pos = KEYLENGTH;
+		/* set bits to 0 indicating we are not a tnode */
+		l->bits = 0;
+
 		INIT_HLIST_HEAD(&l->list);
 	}
 	return l;
@@ -451,12 +439,16 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
 {
 	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
 	struct tnode *tn = tnode_alloc(sz);
+	unsigned int shift = pos + bits;
+
+	/* verify bits and pos their msb bits clear and values are valid */
+	BUG_ON(!bits || (shift > KEYLENGTH));
 
 	if (tn) {
-		tn->parent = T_TNODE;
+		tn->parent = NULL;
 		tn->pos = pos;
 		tn->bits = bits;
-		tn->key = key;
+		tn->key = mask_pfx(key, pos);
 		tn->full_children = 0;
 		tn->empty_children = 1<<bits;
 	}
@@ -473,10 +465,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
 
 static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
 {
-	if (n == NULL || IS_LEAF(n))
-		return 0;
-
-	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
+	return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
 }
 
 static inline void put_child(struct tnode *tn, int i,
@@ -514,8 +503,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *
 	else if (!wasfull && isfull)
 		tn->full_children++;
 
-	if (n)
-		node_set_parent(n, tn);
+	node_set_parent(n, tn);
 
 	rcu_assign_pointer(tn->child[i], n);
 }
@@ -523,7 +511,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *
 #define MAX_WORK 10
 static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 {
-	int i;
+	struct rt_trie_node *n = NULL;
 	struct tnode *old_tn;
 	int inflate_threshold_use;
 	int halve_threshold_use;
@@ -536,12 +524,11 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 		 tn, inflate_threshold, halve_threshold);
 
 	/* No children */
-	if (tn->empty_children == tnode_child_length(tn)) {
-		tnode_free_safe(tn);
-		return NULL;
-	}
+	if (tn->empty_children > (tnode_child_length(tn) - 1))
+		goto no_children;
+
 	/* One child */
-	if (tn->empty_children == tnode_child_length(tn) - 1)
+	if (tn->empty_children == (tnode_child_length(tn) - 1))
 		goto one_child;
 	/*
 	 * Double as long as the resulting node has a number of
@@ -607,11 +594,9 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 	 *
 	 */
 
-	check_tnode(tn);
-
 	/* Keep root node larger  */
 
-	if (!node_parent((struct rt_trie_node *)tn)) {
+	if (!node_parent(tn)) {
 		inflate_threshold_use = inflate_threshold_root;
 		halve_threshold_use = halve_threshold_root;
 	} else {
@@ -637,8 +622,6 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 		}
 	}
 
-	check_tnode(tn);
-
 	/* Return if at least one inflate is run */
 	if (max_work != MAX_WORK)
 		return (struct rt_trie_node *) tn;
@@ -666,21 +649,16 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 
 
 	/* Only one child remains */
-	if (tn->empty_children == tnode_child_length(tn) - 1) {
+	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
+		unsigned long i;
 one_child:
-		for (i = 0; i < tnode_child_length(tn); i++) {
-			struct rt_trie_node *n;
-
-			n = rtnl_dereference(tn->child[i]);
-			if (!n)
-				continue;
-
-			/* compress one level */
-
-			node_set_parent(n, NULL);
-			tnode_free_safe(tn);
-			return n;
-		}
+		for (i = tnode_child_length(tn); !n && i;)
+			n = tnode_get_child(tn, --i);
+no_children:
+		/* compress one level */
+		node_set_parent(n, NULL);
+		tnode_free_safe(tn);
+		return n;
 	}
 	return (struct rt_trie_node *) tn;
 }
@@ -760,8 +738,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 
 		/* A leaf or an internal node with skipped bits */
 
-		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
-		   tn->pos + tn->bits - 1) {
+		if (IS_LEAF(node) || (node->pos > (tn->pos + tn->bits - 1))) {
 			put_child(tn,
 				tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
 				node);
@@ -958,11 +935,9 @@ fib_find_node(struct trie *t, u32 key)
 	pos = 0;
 	n = rcu_dereference_rtnl(t->trie);
 
-	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
+	while (n && IS_TNODE(n)) {
 		tn = (struct tnode *) n;
 
-		check_tnode(tn);
-
 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 			pos = tn->pos + tn->bits;
 			n = tnode_get_child_rcu(tn,
@@ -988,7 +963,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 
 	key = tn->key;
 
-	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
+	while (tn != NULL && (tp = node_parent(tn)) != NULL) {
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
 		tn = (struct tnode *)resize(t, tn);
@@ -996,7 +971,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 		tnode_put_child_reorg(tp, cindex,
 				      (struct rt_trie_node *)tn, wasfull);
 
-		tp = node_parent((struct rt_trie_node *) tn);
+		tp = node_parent(tn);
 		if (!tp)
 			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
 
@@ -1048,11 +1023,9 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 	 * If it doesn't, we need to replace it with a T_TNODE.
 	 */
 
-	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
+	while (n && IS_TNODE(n)) {
 		tn = (struct tnode *) n;
 
-		check_tnode(tn);
-
 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 			tp = tn;
 			pos = tn->pos + tn->bits;
@@ -1087,12 +1060,11 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 		insert_leaf_info(&l->list, li);
 		goto done;
 	}
-	l = leaf_new();
+	l = leaf_new(key);
 
 	if (!l)
 		return NULL;
 
-	l->key = key;
 	li = leaf_info_new(plen);
 
 	if (!li) {
@@ -1569,7 +1541,7 @@ backtrace:
 		if (chopped_off <= pn->bits) {
 			cindex &= ~(1 << (chopped_off-1));
 		} else {
-			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
+			struct tnode *parent = node_parent_rcu(pn);
 			if (!parent)
 				goto failed;
 
@@ -1597,7 +1569,7 @@ EXPORT_SYMBOL_GPL(fib_table_lookup);
  */
 static void trie_leaf_remove(struct trie *t, struct leaf *l)
 {
-	struct tnode *tp = node_parent((struct rt_trie_node *) l);
+	struct tnode *tp = node_parent(l);
 
 	pr_debug("entering trie_leaf_remove(%p)\n", l);
 
@@ -2374,7 +2346,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
 
 	if (IS_TNODE(n)) {
 		struct tnode *tn = (struct tnode *) n;
-		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
+		__be32 prf = htonl(tn->key);
 
 		seq_indent(seq, iter->depth-1);
 		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
-- 
cgit v1.2.3


From 37fd30f2da573c2625ed29561e5b8eb7b0258860 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:55:41 -0800
Subject: fib_trie: Merge tnode_free and leaf_free into node_free

Both the leaf and the tnode had an rcu_head in them, but they had them in
slightly different places.  Since we now have them in the same spot and
know that any node with bits == 0 is a leaf and the rest are either vmalloc
or kmalloc tnodes depending on the value of bits it makes it easy to combine
the functions and reduce overhead.

In addition I have taken advantage of the rcu_head pointer to go ahead and
put together a simple linked list instead of using the tnode pointer as
this way we can merge either type of structure for freeing.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 90 ++++++++++++++++++++++++-----------------------------
 1 file changed, 40 insertions(+), 50 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2b7220728b24..d1a9907901cb 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -95,15 +95,17 @@ struct tnode {
 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
 	struct tnode __rcu *parent;
-	union {
-		struct rcu_head rcu;
-		struct tnode *tnode_free;
-	};
+	struct rcu_head rcu;
+	/* everything above this comment must be the same as rt_trie_node */
 	unsigned int full_children;	/* KEYLENGTH bits needed */
 	unsigned int empty_children;	/* KEYLENGTH bits needed */
 	struct rt_trie_node __rcu *child[0];
 };
 
+/* This struct represents the shared bits between tnode and leaf.  If any
+ * ordering is changed here is must also be updated in tnode and leaf as
+ * well.
+ */
 struct rt_trie_node {
 	t_key key;
 	unsigned char bits;
@@ -118,6 +120,7 @@ struct leaf {
 	unsigned char pos;
 	struct tnode __rcu *parent;
 	struct rcu_head rcu;
+	/* everything above this comment must be the same as rt_trie_node */
 	struct hlist_head list;
 };
 
@@ -163,7 +166,7 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
 static struct tnode *inflate(struct trie *t, struct tnode *tn);
 static struct tnode *halve(struct trie *t, struct tnode *tn);
 /* tnodes to free after resize(); protected by RTNL */
-static struct tnode *tnode_free_head;
+static struct callback_head *tnode_free_head;
 static size_t tnode_free_size;
 
 /*
@@ -336,17 +339,23 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
 	call_rcu(&fa->rcu, __alias_free_mem);
 }
 
-static void __leaf_free_rcu(struct rcu_head *head)
-{
-	struct leaf *l = container_of(head, struct leaf, rcu);
-	kmem_cache_free(trie_leaf_kmem, l);
-}
+#define TNODE_KMALLOC_MAX \
+	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct rt_trie_node *))
 
-static inline void free_leaf(struct leaf *l)
+static void __node_free_rcu(struct rcu_head *head)
 {
-	call_rcu(&l->rcu, __leaf_free_rcu);
+	struct rt_trie_node *n = container_of(head, struct rt_trie_node, rcu);
+
+	if (IS_LEAF(n))
+		kmem_cache_free(trie_leaf_kmem, n);
+	else if (n->bits <= TNODE_KMALLOC_MAX)
+		kfree(n);
+	else
+		vfree(n);
 }
 
+#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
+
 static inline void free_leaf_info(struct leaf_info *leaf)
 {
 	kfree_rcu(leaf, rcu);
@@ -360,43 +369,24 @@ static struct tnode *tnode_alloc(size_t size)
 		return vzalloc(size);
 }
 
-static void __tnode_free_rcu(struct rcu_head *head)
-{
-	struct tnode *tn = container_of(head, struct tnode, rcu);
-	size_t size = sizeof(struct tnode) +
-		      (sizeof(struct rt_trie_node *) << tn->bits);
-
-	if (size <= PAGE_SIZE)
-		kfree(tn);
-	else
-		vfree(tn);
-}
-
-static inline void tnode_free(struct tnode *tn)
-{
-	if (IS_LEAF(tn))
-		free_leaf((struct leaf *) tn);
-	else
-		call_rcu(&tn->rcu, __tnode_free_rcu);
-}
-
 static void tnode_free_safe(struct tnode *tn)
 {
 	BUG_ON(IS_LEAF(tn));
-	tn->tnode_free = tnode_free_head;
-	tnode_free_head = tn;
-	tnode_free_size += sizeof(struct tnode) +
-			   (sizeof(struct rt_trie_node *) << tn->bits);
+	tn->rcu.next = tnode_free_head;
+	tnode_free_head = &tn->rcu;
 }
 
 static void tnode_free_flush(void)
 {
-	struct tnode *tn;
+	struct callback_head *head;
+
+	while ((head = tnode_free_head)) {
+		struct tnode *tn = container_of(head, struct tnode, rcu);
+
+		tnode_free_head = head->next;
+		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
 
-	while ((tn = tnode_free_head)) {
-		tnode_free_head = tn->tnode_free;
-		tn->tnode_free = NULL;
-		tnode_free(tn);
+		node_free(tn);
 	}
 
 	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
@@ -437,7 +427,7 @@ static struct leaf_info *leaf_info_new(int plen)
 
 static struct tnode *tnode_new(t_key key, int pos, int bits)
 {
-	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
+	size_t sz = offsetof(struct tnode, child[1 << bits]);
 	struct tnode *tn = tnode_alloc(sz);
 	unsigned int shift = pos + bits;
 
@@ -666,15 +656,15 @@ no_children:
 
 static void tnode_clean_free(struct tnode *tn)
 {
+	struct rt_trie_node *tofree;
 	int i;
-	struct tnode *tofree;
 
 	for (i = 0; i < tnode_child_length(tn); i++) {
-		tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
+		tofree = rtnl_dereference(tn->child[i]);
 		if (tofree)
-			tnode_free(tofree);
+			node_free(tofree);
 	}
-	tnode_free(tn);
+	node_free(tn);
 }
 
 static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -717,7 +707,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 					  inode->bits - 1);
 
 			if (!right) {
-				tnode_free(left);
+				node_free(left);
 				goto nomem;
 			}
 
@@ -1068,7 +1058,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 	li = leaf_info_new(plen);
 
 	if (!li) {
-		free_leaf(l);
+		node_free(l);
 		return NULL;
 	}
 
@@ -1100,7 +1090,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 
 		if (!tn) {
 			free_leaf_info(li);
-			free_leaf(l);
+			node_free(l);
 			return NULL;
 		}
 
@@ -1580,7 +1570,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
 	} else
 		RCU_INIT_POINTER(t->trie, NULL);
 
-	free_leaf(l);
+	node_free(l);
 }
 
 /*
-- 
cgit v1.2.3


From adaf981685b7b17d773d17f4ee6a128a3a6b6dc8 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:55:47 -0800
Subject: fib_trie: Merge leaf into tnode

This change makes it so that leaf and tnode are the same struct.  As a
result there is no need for rt_trie_node anymore since everyting can be
merged into tnode.

On 32b systems this results in the leaf being 4 bytes larger, however I
don't know if that is really an issue as this and an eariler patch that
added bits & pos have increased the size from 20 to 28.  If I am not
mistaken slub/slab allocate on power of 2 sizes so 20 was likely being
rounded up to 32 anyway.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 322 +++++++++++++++++++++++-----------------------------
 1 file changed, 140 insertions(+), 182 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d1a9907901cb..94c929f6c752 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -96,32 +96,16 @@ struct tnode {
 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
 	struct tnode __rcu *parent;
 	struct rcu_head rcu;
-	/* everything above this comment must be the same as rt_trie_node */
-	unsigned int full_children;	/* KEYLENGTH bits needed */
-	unsigned int empty_children;	/* KEYLENGTH bits needed */
-	struct rt_trie_node __rcu *child[0];
-};
-
-/* This struct represents the shared bits between tnode and leaf.  If any
- * ordering is changed here is must also be updated in tnode and leaf as
- * well.
- */
-struct rt_trie_node {
-	t_key key;
-	unsigned char bits;
-	unsigned char pos;
-	struct tnode __rcu *parent;
-	struct rcu_head rcu;
-};
-
-struct leaf {
-	t_key key;
-	unsigned char bits;
-	unsigned char pos;
-	struct tnode __rcu *parent;
-	struct rcu_head rcu;
-	/* everything above this comment must be the same as rt_trie_node */
-	struct hlist_head list;
+	union {
+		/* The fields in this struct are valid if bits > 0 (TNODE) */
+		struct {
+			unsigned int full_children;  /* KEYLENGTH bits needed */
+			unsigned int empty_children; /* KEYLENGTH bits needed */
+			struct tnode __rcu *child[0];
+		};
+		/* This list pointer if valid if bits == 0 (LEAF) */
+		struct hlist_head list;
+	};
 };
 
 struct leaf_info {
@@ -154,15 +138,15 @@ struct trie_stat {
 };
 
 struct trie {
-	struct rt_trie_node __rcu *trie;
+	struct tnode __rcu *trie;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	struct trie_use_stats __percpu *stats;
 #endif
 };
 
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
 				  int wasfull);
-static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
+static struct tnode *resize(struct trie *t, struct tnode *tn);
 static struct tnode *inflate(struct trie *t, struct tnode *tn);
 static struct tnode *halve(struct trie *t, struct tnode *tn);
 /* tnodes to free after resize(); protected by RTNL */
@@ -186,10 +170,10 @@ static struct kmem_cache *trie_leaf_kmem __read_mostly;
 #define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
 
 /* wrapper for rcu_assign_pointer */
-static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
+static inline void node_set_parent(struct tnode *n, struct tnode *tp)
 {
-	if (node)
-		rcu_assign_pointer(node->parent, ptr);
+	if (n)
+		rcu_assign_pointer(n->parent, tp);
 }
 
 #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
@@ -205,7 +189,7 @@ static inline int tnode_child_length(const struct tnode *tn)
 /*
  * caller must hold RTNL
  */
-static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
+static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
 {
 	BUG_ON(i >= tnode_child_length(tn));
 
@@ -215,7 +199,7 @@ static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsig
 /*
  * caller must hold RCU read lock or RTNL
  */
-static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
+static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
 {
 	BUG_ON(i >= tnode_child_length(tn));
 
@@ -340,11 +324,11 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
 }
 
 #define TNODE_KMALLOC_MAX \
-	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct rt_trie_node *))
+	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
 
 static void __node_free_rcu(struct rcu_head *head)
 {
-	struct rt_trie_node *n = container_of(head, struct rt_trie_node, rcu);
+	struct tnode *n = container_of(head, struct tnode, rcu);
 
 	if (IS_LEAF(n))
 		kmem_cache_free(trie_leaf_kmem, n);
@@ -395,9 +379,9 @@ static void tnode_free_flush(void)
 	}
 }
 
-static struct leaf *leaf_new(t_key key)
+static struct tnode *leaf_new(t_key key)
 {
-	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
+	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 	if (l) {
 		l->parent = NULL;
 		/* set key and pos to reflect full key value
@@ -444,7 +428,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
 	}
 
 	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
-		 sizeof(struct rt_trie_node *) << bits);
+		 sizeof(struct tnode *) << bits);
 	return tn;
 }
 
@@ -453,13 +437,13 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
  * and no bits are skipped. See discussion in dyntree paper p. 6
  */
 
-static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
+static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
 {
 	return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
 }
 
 static inline void put_child(struct tnode *tn, int i,
-			     struct rt_trie_node *n)
+			     struct tnode *n)
 {
 	tnode_put_child_reorg(tn, i, n, -1);
 }
@@ -469,10 +453,10 @@ static inline void put_child(struct tnode *tn, int i,
   * Update the value of full_children and empty_children.
   */
 
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
 				  int wasfull)
 {
-	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
+	struct tnode *chi = rtnl_dereference(tn->child[i]);
 	int isfull;
 
 	BUG_ON(i >= 1<<tn->bits);
@@ -499,10 +483,9 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *
 }
 
 #define MAX_WORK 10
-static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
+static struct tnode *resize(struct trie *t, struct tnode *tn)
 {
-	struct rt_trie_node *n = NULL;
-	struct tnode *old_tn;
+	struct tnode *old_tn, *n = NULL;
 	int inflate_threshold_use;
 	int halve_threshold_use;
 	int max_work;
@@ -614,7 +597,7 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 
 	/* Return if at least one inflate is run */
 	if (max_work != MAX_WORK)
-		return (struct rt_trie_node *) tn;
+		return tn;
 
 	/*
 	 * Halve as long as the number of empty children in this
@@ -650,13 +633,13 @@ no_children:
 		tnode_free_safe(tn);
 		return n;
 	}
-	return (struct rt_trie_node *) tn;
+	return tn;
 }
 
 
 static void tnode_clean_free(struct tnode *tn)
 {
-	struct rt_trie_node *tofree;
+	struct tnode *tofree;
 	int i;
 
 	for (i = 0; i < tnode_child_length(tn); i++) {
@@ -667,10 +650,10 @@ static void tnode_clean_free(struct tnode *tn)
 	node_free(tn);
 }
 
-static struct tnode *inflate(struct trie *t, struct tnode *tn)
+static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
 {
-	struct tnode *oldtnode = tn;
-	int olen = tnode_child_length(tn);
+	int olen = tnode_child_length(oldtnode);
+	struct tnode *tn;
 	int i;
 
 	pr_debug("In inflate\n");
@@ -690,11 +673,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 	for (i = 0; i < olen; i++) {
 		struct tnode *inode;
 
-		inode = (struct tnode *) tnode_get_child(oldtnode, i);
-		if (inode &&
-		    IS_TNODE(inode) &&
-		    inode->pos == oldtnode->pos + oldtnode->bits &&
-		    inode->bits > 1) {
+		inode = tnode_get_child(oldtnode, i);
+		if (tnode_full(oldtnode, inode) && inode->bits > 1) {
 			struct tnode *left, *right;
 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 
@@ -711,33 +691,29 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 				goto nomem;
 			}
 
-			put_child(tn, 2*i, (struct rt_trie_node *) left);
-			put_child(tn, 2*i+1, (struct rt_trie_node *) right);
+			put_child(tn, 2*i, left);
+			put_child(tn, 2*i+1, right);
 		}
 	}
 
 	for (i = 0; i < olen; i++) {
-		struct tnode *inode;
-		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
+		struct tnode *inode = tnode_get_child(oldtnode, i);
 		struct tnode *left, *right;
 		int size, j;
 
 		/* An empty child */
-		if (node == NULL)
+		if (inode == NULL)
 			continue;
 
 		/* A leaf or an internal node with skipped bits */
-
-		if (IS_LEAF(node) || (node->pos > (tn->pos + tn->bits - 1))) {
+		if (!tnode_full(oldtnode, inode)) {
 			put_child(tn,
-				tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
-				node);
+				tkey_extract_bits(inode->key, tn->pos, tn->bits),
+				inode);
 			continue;
 		}
 
 		/* An internal node with two children */
-		inode = (struct tnode *) node;
-
 		if (inode->bits == 1) {
 			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
 			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
@@ -769,12 +745,12 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 		 *   bit to zero.
 		 */
 
-		left = (struct tnode *) tnode_get_child(tn, 2*i);
+		left = tnode_get_child(tn, 2*i);
 		put_child(tn, 2*i, NULL);
 
 		BUG_ON(!left);
 
-		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+		right = tnode_get_child(tn, 2*i+1);
 		put_child(tn, 2*i+1, NULL);
 
 		BUG_ON(!right);
@@ -796,12 +772,11 @@ nomem:
 	return ERR_PTR(-ENOMEM);
 }
 
-static struct tnode *halve(struct trie *t, struct tnode *tn)
+static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
 {
-	struct tnode *oldtnode = tn;
-	struct rt_trie_node *left, *right;
+	int olen = tnode_child_length(oldtnode);
+	struct tnode *tn, *left, *right;
 	int i;
-	int olen = tnode_child_length(tn);
 
 	pr_debug("In halve\n");
 
@@ -830,7 +805,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
 			if (!newn)
 				goto nomem;
 
-			put_child(tn, i/2, (struct rt_trie_node *)newn);
+			put_child(tn, i/2, newn);
 		}
 
 	}
@@ -855,7 +830,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
 		}
 
 		/* Two nonempty children */
-		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
+		newBinNode = tnode_get_child(tn, i/2);
 		put_child(tn, i/2, NULL);
 		put_child(newBinNode, 0, left);
 		put_child(newBinNode, 1, right);
@@ -871,7 +846,7 @@ nomem:
 /* readside must use rcu_read_lock currently dump routines
  via get_fa_head and dump */
 
-static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
+static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
 {
 	struct hlist_head *head = &l->list;
 	struct leaf_info *li;
@@ -883,7 +858,7 @@ static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 	return NULL;
 }
 
-static inline struct list_head *get_fa_head(struct leaf *l, int plen)
+static inline struct list_head *get_fa_head(struct tnode *l, int plen)
 {
 	struct leaf_info *li = find_leaf_info(l, plen);
 
@@ -915,32 +890,25 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 
 /* rcu_read_lock needs to be hold by caller from readside */
 
-static struct leaf *
-fib_find_node(struct trie *t, u32 key)
+static struct tnode *fib_find_node(struct trie *t, u32 key)
 {
-	int pos;
-	struct tnode *tn;
-	struct rt_trie_node *n;
-
-	pos = 0;
-	n = rcu_dereference_rtnl(t->trie);
+	struct tnode *n = rcu_dereference_rtnl(t->trie);
+	int pos = 0;
 
 	while (n && IS_TNODE(n)) {
-		tn = (struct tnode *) n;
-
-		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
-			pos = tn->pos + tn->bits;
-			n = tnode_get_child_rcu(tn,
+		if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
+			pos = n->pos + n->bits;
+			n = tnode_get_child_rcu(n,
 						tkey_extract_bits(key,
-								  tn->pos,
-								  tn->bits));
+								  n->pos,
+								  n->bits));
 		} else
 			break;
 	}
 	/* Case we have found a leaf. Compare prefixes */
 
 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
-		return (struct leaf *)n;
+		return n;
 
 	return NULL;
 }
@@ -956,14 +924,13 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 	while (tn != NULL && (tp = node_parent(tn)) != NULL) {
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
-		tn = (struct tnode *)resize(t, tn);
+		tn = resize(t, tn);
 
-		tnode_put_child_reorg(tp, cindex,
-				      (struct rt_trie_node *)tn, wasfull);
+		tnode_put_child_reorg(tp, cindex, tn, wasfull);
 
 		tp = node_parent(tn);
 		if (!tp)
-			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+			rcu_assign_pointer(t->trie, tn);
 
 		tnode_free_flush();
 		if (!tp)
@@ -973,9 +940,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 
 	/* Handle last (top) tnode */
 	if (IS_TNODE(tn))
-		tn = (struct tnode *)resize(t, tn);
+		tn = resize(t, tn);
 
-	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+	rcu_assign_pointer(t->trie, tn);
 	tnode_free_flush();
 }
 
@@ -985,8 +952,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 {
 	int pos, newpos;
 	struct tnode *tp = NULL, *tn = NULL;
-	struct rt_trie_node *n;
-	struct leaf *l;
+	struct tnode *n;
+	struct tnode *l;
 	int missbit;
 	struct list_head *fa_head = NULL;
 	struct leaf_info *li;
@@ -1014,17 +981,15 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 	 */
 
 	while (n && IS_TNODE(n)) {
-		tn = (struct tnode *) n;
-
-		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
-			tp = tn;
-			pos = tn->pos + tn->bits;
-			n = tnode_get_child(tn,
+		if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
+			tp = n;
+			pos = n->pos + n->bits;
+			n = tnode_get_child(n,
 					    tkey_extract_bits(key,
-							      tn->pos,
-							      tn->bits));
+							      n->pos,
+							      n->bits));
 
-			BUG_ON(n && node_parent(n) != tn);
+			BUG_ON(n && node_parent(n) != tp);
 		} else
 			break;
 	}
@@ -1040,14 +1005,13 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 	/* Case 1: n is a leaf. Compare prefixes */
 
 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
-		l = (struct leaf *) n;
 		li = leaf_info_new(plen);
 
 		if (!li)
 			return NULL;
 
 		fa_head = &li->falh;
-		insert_leaf_info(&l->list, li);
+		insert_leaf_info(&n->list, li);
 		goto done;
 	}
 	l = leaf_new(key);
@@ -1068,10 +1032,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 	if (t->trie && n == NULL) {
 		/* Case 2: n is NULL, and will just insert a new leaf */
 
-		node_set_parent((struct rt_trie_node *)l, tp);
+		node_set_parent(l, tp);
 
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-		put_child(tp, cindex, (struct rt_trie_node *)l);
+		put_child(tp, cindex, l);
 	} else {
 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
 		/*
@@ -1094,17 +1058,17 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 			return NULL;
 		}
 
-		node_set_parent((struct rt_trie_node *)tn, tp);
+		node_set_parent(tn, tp);
 
 		missbit = tkey_extract_bits(key, newpos, 1);
-		put_child(tn, missbit, (struct rt_trie_node *)l);
+		put_child(tn, missbit, l);
 		put_child(tn, 1-missbit, n);
 
 		if (tp) {
 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-			put_child(tp, cindex, (struct rt_trie_node *)tn);
+			put_child(tp, cindex, tn);
 		} else {
-			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+			rcu_assign_pointer(t->trie, tn);
 		}
 
 		tp = tn;
@@ -1134,7 +1098,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	u8 tos = cfg->fc_tos;
 	u32 key, mask;
 	int err;
-	struct leaf *l;
+	struct tnode *l;
 
 	if (plen > 32)
 		return -EINVAL;
@@ -1292,7 +1256,7 @@ err:
 }
 
 /* should be called with rcu_read_lock */
-static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
+static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
 		      t_key key,  const struct flowi4 *flp,
 		      struct fib_result *res, int fib_flags)
 {
@@ -1365,7 +1329,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 	struct trie_use_stats __percpu *stats = t->stats;
 #endif
 	int ret;
-	struct rt_trie_node *n;
+	struct tnode *n;
 	struct tnode *pn;
 	unsigned int pos, bits;
 	t_key key = ntohl(flp->daddr);
@@ -1387,11 +1351,11 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 
 	/* Just a leaf? */
 	if (IS_LEAF(n)) {
-		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
+		ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
 		goto found;
 	}
 
-	pn = (struct tnode *) n;
+	pn = n;
 	chopped_off = 0;
 
 	while (pn) {
@@ -1412,13 +1376,13 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		}
 
 		if (IS_LEAF(n)) {
-			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
+			ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
 			if (ret > 0)
 				goto backtrace;
 			goto found;
 		}
 
-		cn = (struct tnode *)n;
+		cn = n;
 
 		/*
 		 * It's a tnode, and we can do some extra checks here if we
@@ -1506,7 +1470,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 				current_prefix_length = mp;
 		}
 
-		pn = (struct tnode *)n; /* Descend */
+		pn = n; /* Descend */
 		chopped_off = 0;
 		continue;
 
@@ -1557,7 +1521,7 @@ EXPORT_SYMBOL_GPL(fib_table_lookup);
 /*
  * Remove the leaf and return parent.
  */
-static void trie_leaf_remove(struct trie *t, struct leaf *l)
+static void trie_leaf_remove(struct trie *t, struct tnode *l)
 {
 	struct tnode *tp = node_parent(l);
 
@@ -1584,7 +1548,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	u8 tos = cfg->fc_tos;
 	struct fib_alias *fa, *fa_to_delete;
 	struct list_head *fa_head;
-	struct leaf *l;
+	struct tnode *l;
 	struct leaf_info *li;
 
 	if (plen > 32)
@@ -1682,7 +1646,7 @@ static int trie_flush_list(struct list_head *head)
 	return found;
 }
 
-static int trie_flush_leaf(struct leaf *l)
+static int trie_flush_leaf(struct tnode *l)
 {
 	int found = 0;
 	struct hlist_head *lih = &l->list;
@@ -1704,7 +1668,7 @@ static int trie_flush_leaf(struct leaf *l)
  * Scan for the next right leaf starting at node p->child[idx]
  * Since we have back pointer, no recursion necessary.
  */
-static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
+static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
 {
 	do {
 		t_key idx;
@@ -1720,47 +1684,46 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
 				continue;
 
 			if (IS_LEAF(c))
-				return (struct leaf *) c;
+				return c;
 
 			/* Rescan start scanning in new node */
-			p = (struct tnode *) c;
+			p = c;
 			idx = 0;
 		}
 
 		/* Node empty, walk back up to parent */
-		c = (struct rt_trie_node *) p;
+		c = p;
 	} while ((p = node_parent_rcu(c)) != NULL);
 
 	return NULL; /* Root of trie */
 }
 
-static struct leaf *trie_firstleaf(struct trie *t)
+static struct tnode *trie_firstleaf(struct trie *t)
 {
-	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
+	struct tnode *n = rcu_dereference_rtnl(t->trie);
 
 	if (!n)
 		return NULL;
 
 	if (IS_LEAF(n))          /* trie is just a leaf */
-		return (struct leaf *) n;
+		return n;
 
 	return leaf_walk_rcu(n, NULL);
 }
 
-static struct leaf *trie_nextleaf(struct leaf *l)
+static struct tnode *trie_nextleaf(struct tnode *l)
 {
-	struct rt_trie_node *c = (struct rt_trie_node *) l;
-	struct tnode *p = node_parent_rcu(c);
+	struct tnode *p = node_parent_rcu(l);
 
 	if (!p)
 		return NULL;	/* trie with just one leaf */
 
-	return leaf_walk_rcu(p, c);
+	return leaf_walk_rcu(p, l);
 }
 
-static struct leaf *trie_leafindex(struct trie *t, int index)
+static struct tnode *trie_leafindex(struct trie *t, int index)
 {
-	struct leaf *l = trie_firstleaf(t);
+	struct tnode *l = trie_firstleaf(t);
 
 	while (l && index-- > 0)
 		l = trie_nextleaf(l);
@@ -1775,7 +1738,7 @@ static struct leaf *trie_leafindex(struct trie *t, int index)
 int fib_table_flush(struct fib_table *tb)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
-	struct leaf *l, *ll = NULL;
+	struct tnode *l, *ll = NULL;
 	int found = 0;
 
 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
@@ -1840,7 +1803,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
 	return skb->len;
 }
 
-static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
+static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
 			struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct leaf_info *li;
@@ -1876,7 +1839,7 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 		   struct netlink_callback *cb)
 {
-	struct leaf *l;
+	struct tnode *l;
 	struct trie *t = (struct trie *) tb->tb_data;
 	t_key key = cb->args[2];
 	int count = cb->args[3];
@@ -1922,7 +1885,7 @@ void __init fib_trie_init(void)
 					  0, SLAB_PANIC, NULL);
 
 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
-					   max(sizeof(struct leaf),
+					   max(sizeof(struct tnode),
 					       sizeof(struct leaf_info)),
 					   0, SLAB_PANIC, NULL);
 }
@@ -1965,7 +1928,7 @@ struct fib_trie_iter {
 	unsigned int depth;
 };
 
-static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
+static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
 {
 	struct tnode *tn = iter->tnode;
 	unsigned int cindex = iter->index;
@@ -1979,7 +1942,7 @@ static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
 		 iter->tnode, iter->index, iter->depth);
 rescan:
 	while (cindex < (1<<tn->bits)) {
-		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
+		struct tnode *n = tnode_get_child_rcu(tn, cindex);
 
 		if (n) {
 			if (IS_LEAF(n)) {
@@ -1987,7 +1950,7 @@ rescan:
 				iter->index = cindex + 1;
 			} else {
 				/* push down one level */
-				iter->tnode = (struct tnode *) n;
+				iter->tnode = n;
 				iter->index = 0;
 				++iter->depth;
 			}
@@ -1998,7 +1961,7 @@ rescan:
 	}
 
 	/* Current node exhausted, pop back up */
-	p = node_parent_rcu((struct rt_trie_node *)tn);
+	p = node_parent_rcu(tn);
 	if (p) {
 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
 		tn = p;
@@ -2010,10 +1973,10 @@ rescan:
 	return NULL;
 }
 
-static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
+static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
 				       struct trie *t)
 {
-	struct rt_trie_node *n;
+	struct tnode *n;
 
 	if (!t)
 		return NULL;
@@ -2023,7 +1986,7 @@ static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
 		return NULL;
 
 	if (IS_TNODE(n)) {
-		iter->tnode = (struct tnode *) n;
+		iter->tnode = n;
 		iter->index = 0;
 		iter->depth = 1;
 	} else {
@@ -2037,7 +2000,7 @@ static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
 
 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
 {
-	struct rt_trie_node *n;
+	struct tnode *n;
 	struct fib_trie_iter iter;
 
 	memset(s, 0, sizeof(*s));
@@ -2045,7 +2008,6 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
 	rcu_read_lock();
 	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
 		if (IS_LEAF(n)) {
-			struct leaf *l = (struct leaf *)n;
 			struct leaf_info *li;
 
 			s->leaves++;
@@ -2053,18 +2015,17 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
 			if (iter.depth > s->maxdepth)
 				s->maxdepth = iter.depth;
 
-			hlist_for_each_entry_rcu(li, &l->list, hlist)
+			hlist_for_each_entry_rcu(li, &n->list, hlist)
 				++s->prefixes;
 		} else {
-			const struct tnode *tn = (const struct tnode *) n;
 			int i;
 
 			s->tnodes++;
-			if (tn->bits < MAX_STAT_DEPTH)
-				s->nodesizes[tn->bits]++;
+			if (n->bits < MAX_STAT_DEPTH)
+				s->nodesizes[n->bits]++;
 
-			for (i = 0; i < (1<<tn->bits); i++)
-				if (!tn->child[i])
+			for (i = 0; i < tnode_child_length(n); i++)
+				if (!rcu_access_pointer(n->child[i]))
 					s->nullpointers++;
 		}
 	}
@@ -2088,7 +2049,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
 
 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
-	bytes = sizeof(struct leaf) * stat->leaves;
+	bytes = sizeof(struct tnode) * stat->leaves;
 
 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
 	bytes += sizeof(struct leaf_info) * stat->prefixes;
@@ -2109,7 +2070,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
 	seq_putc(seq, '\n');
 	seq_printf(seq, "\tPointers: %u\n", pointers);
 
-	bytes += sizeof(struct rt_trie_node *) * pointers;
+	bytes += sizeof(struct tnode *) * pointers;
 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
 }
@@ -2163,7 +2124,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq,
 		   "Basic info: size of leaf:"
 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
-		   sizeof(struct leaf), sizeof(struct tnode));
+		   sizeof(struct tnode), sizeof(struct tnode));
 
 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
@@ -2202,7 +2163,7 @@ static const struct file_operations fib_triestat_fops = {
 	.release = single_release_net,
 };
 
-static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
 {
 	struct fib_trie_iter *iter = seq->private;
 	struct net *net = seq_file_net(seq);
@@ -2214,7 +2175,7 @@ static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
 		struct fib_table *tb;
 
 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
-			struct rt_trie_node *n;
+			struct tnode *n;
 
 			for (n = fib_trie_get_first(iter,
 						    (struct trie *) tb->tb_data);
@@ -2243,7 +2204,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	struct fib_table *tb = iter->tb;
 	struct hlist_node *tb_node;
 	unsigned int h;
-	struct rt_trie_node *n;
+	struct tnode *n;
 
 	++*pos;
 	/* next node in same table */
@@ -2329,29 +2290,26 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
 static int fib_trie_seq_show(struct seq_file *seq, void *v)
 {
 	const struct fib_trie_iter *iter = seq->private;
-	struct rt_trie_node *n = v;
+	struct tnode *n = v;
 
 	if (!node_parent_rcu(n))
 		fib_table_print(seq, iter->tb);
 
 	if (IS_TNODE(n)) {
-		struct tnode *tn = (struct tnode *) n;
-		__be32 prf = htonl(tn->key);
+		__be32 prf = htonl(n->key);
 
-		seq_indent(seq, iter->depth-1);
+		seq_indent(seq, iter->depth - 1);
 		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
-			   &prf, tn->pos, tn->bits, tn->full_children,
-			   tn->empty_children);
-
+			   &prf, n->pos, n->bits, n->full_children,
+			   n->empty_children);
 	} else {
-		struct leaf *l = (struct leaf *) n;
 		struct leaf_info *li;
-		__be32 val = htonl(l->key);
+		__be32 val = htonl(n->key);
 
 		seq_indent(seq, iter->depth);
 		seq_printf(seq, "  |-- %pI4\n", &val);
 
-		hlist_for_each_entry_rcu(li, &l->list, hlist) {
+		hlist_for_each_entry_rcu(li, &n->list, hlist) {
 			struct fib_alias *fa;
 
 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
@@ -2401,9 +2359,9 @@ struct fib_route_iter {
 	t_key	key;
 };
 
-static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
+static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
 {
-	struct leaf *l = NULL;
+	struct tnode *l = NULL;
 	struct trie *t = iter->main_trie;
 
 	/* use cache location of last found key */
@@ -2448,7 +2406,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct fib_route_iter *iter = seq->private;
-	struct leaf *l = v;
+	struct tnode *l = v;
 
 	++*pos;
 	if (v == SEQ_START_TOKEN) {
@@ -2494,7 +2452,7 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info
  */
 static int fib_route_seq_show(struct seq_file *seq, void *v)
 {
-	struct leaf *l = v;
+	struct tnode *l = v;
 	struct leaf_info *li;
 
 	if (v == SEQ_START_TOKEN) {
-- 
cgit v1.2.3


From 9f9e636d4f89f788c5cf9c6a5357501c0d405fcb Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:55:54 -0800
Subject: fib_trie: Optimize fib_table_lookup to avoid wasting time on
 loops/variables

This patch is meant to reduce the complexity of fib_table_lookup by reducing
the number of variables to the bare minimum while still keeping the same if
not improved functionality versus the original.

Most of this change was started off by the desire to rid the function of
chopped_off and current_prefix_length as they actually added very little to
the function since they only applied when computing the cindex.  I was able
to replace them mostly with just a check for the prefix match.  As long as
the prefix between the key and the node being tested was the same we know
we can search the tnode fully versus just testing cindex 0.

The second portion of the change ended up being a massive reordering.
Originally the calls to check_leaf were up near the start of the loop, and
the backtracing and descending into lower levels of tnodes was later.  This
didn't make much sense as the structure of the tree means the leaves are
always the last thing to be tested.  As such I reordered things so that we
instead have a loop that will delve into the tree and only exit when we
have either found a leaf or we have exhausted the tree.  The advantage of
rearranging things like this is that we can fully inline check_leaf since
there is now only one reference to it in the function.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 250 +++++++++++++++++++---------------------------------
 1 file changed, 93 insertions(+), 157 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 94c929f6c752..3fe4dd917ce1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -90,6 +90,9 @@ typedef unsigned int t_key;
 #define IS_TNODE(n) ((n)->bits)
 #define IS_LEAF(n) (!(n)->bits)
 
+#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
+#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
+
 struct tnode {
 	t_key key;
 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
@@ -1281,7 +1284,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
 				continue;
 			fib_alias_accessed(fa);
 			err = fib_props[fa->fa_type].error;
-			if (err) {
+			if (unlikely(err < 0)) {
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 				this_cpu_inc(t->stats->semantic_match_passed);
 #endif
@@ -1303,7 +1306,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
 				res->prefixlen = li->plen;
 				res->nh_sel = nhsel;
 				res->type = fa->fa_type;
-				res->scope = fa->fa_info->fib_scope;
+				res->scope = fi->fib_scope;
 				res->fi = fi;
 				res->table = tb;
 				res->fa_head = &li->falh;
@@ -1321,23 +1324,24 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
 	return 1;
 }
 
+static inline t_key prefix_mismatch(t_key key, struct tnode *n)
+{
+	t_key prefix = n->key;
+
+	return (key ^ prefix) & (prefix | -prefix);
+}
+
 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		     struct fib_result *res, int fib_flags)
 {
-	struct trie *t = (struct trie *) tb->tb_data;
+	struct trie *t = (struct trie *)tb->tb_data;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	struct trie_use_stats __percpu *stats = t->stats;
 #endif
-	int ret;
-	struct tnode *n;
-	struct tnode *pn;
-	unsigned int pos, bits;
-	t_key key = ntohl(flp->daddr);
-	unsigned int chopped_off;
-	t_key cindex = 0;
-	unsigned int current_prefix_length = KEYLENGTH;
-	struct tnode *cn;
-	t_key pref_mismatch;
+	const t_key key = ntohl(flp->daddr);
+	struct tnode *n, *pn;
+	t_key cindex;
+	int ret = 1;
 
 	rcu_read_lock();
 
@@ -1349,170 +1353,102 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 	this_cpu_inc(stats->gets);
 #endif
 
-	/* Just a leaf? */
-	if (IS_LEAF(n)) {
-		ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
-		goto found;
-	}
-
 	pn = n;
-	chopped_off = 0;
-
-	while (pn) {
-		pos = pn->pos;
-		bits = pn->bits;
-
-		if (!chopped_off)
-			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
-						   pos, bits);
-
-		n = tnode_get_child_rcu(pn, cindex);
-
-		if (n == NULL) {
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-			this_cpu_inc(stats->null_node_hit);
-#endif
-			goto backtrace;
-		}
+	cindex = 0;
+
+	/* Step 1: Travel to the longest prefix match in the trie */
+	for (;;) {
+		unsigned long index = get_index(key, n);
+
+		/* This bit of code is a bit tricky but it combines multiple
+		 * checks into a single check.  The prefix consists of the
+		 * prefix plus zeros for the "bits" in the prefix. The index
+		 * is the difference between the key and this value.  From
+		 * this we can actually derive several pieces of data.
+		 *   if !(index >> bits)
+		 *     we know the value is child index
+		 *   else
+		 *     we have a mismatch in skip bits and failed
+		 */
+		if (index >> n->bits)
+			break;
 
-		if (IS_LEAF(n)) {
-			ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
-			if (ret > 0)
-				goto backtrace;
+		/* we have found a leaf. Prefixes have already been compared */
+		if (IS_LEAF(n))
 			goto found;
-		}
 
-		cn = n;
-
-		/*
-		 * It's a tnode, and we can do some extra checks here if we
-		 * like, to avoid descending into a dead-end branch.
-		 * This tnode is in the parent's child array at index
-		 * key[p_pos..p_pos+p_bits] but potentially with some bits
-		 * chopped off, so in reality the index may be just a
-		 * subprefix, padded with zero at the end.
-		 * We can also take a look at any skipped bits in this
-		 * tnode - everything up to p_pos is supposed to be ok,
-		 * and the non-chopped bits of the index (se previous
-		 * paragraph) are also guaranteed ok, but the rest is
-		 * considered unknown.
-		 *
-		 * The skipped bits are key[pos+bits..cn->pos].
-		 */
-
-		/* If current_prefix_length < pos+bits, we are already doing
-		 * actual prefix  matching, which means everything from
-		 * pos+(bits-chopped_off) onward must be zero along some
-		 * branch of this subtree - otherwise there is *no* valid
-		 * prefix present. Here we can only check the skipped
-		 * bits. Remember, since we have already indexed into the
-		 * parent's child array, we know that the bits we chopped of
-		 * *are* zero.
+		/* only record pn and cindex if we are going to be chopping
+		 * bits later.  Otherwise we are just wasting cycles.
 		 */
-
-		/* NOTA BENE: Checking only skipped bits
-		   for the new node here */
-
-		if (current_prefix_length < pos+bits) {
-			if (tkey_extract_bits(cn->key, current_prefix_length,
-						cn->pos - current_prefix_length)
-			    || !(cn->child[0]))
-				goto backtrace;
+		if (index) {
+			pn = n;
+			cindex = index;
 		}
 
-		/*
-		 * If chopped_off=0, the index is fully validated and we
-		 * only need to look at the skipped bits for this, the new,
-		 * tnode. What we actually want to do is to find out if
-		 * these skipped bits match our key perfectly, or if we will
-		 * have to count on finding a matching prefix further down,
-		 * because if we do, we would like to have some way of
-		 * verifying the existence of such a prefix at this point.
-		 */
+		n = rcu_dereference(n->child[index]);
+		if (unlikely(!n))
+			goto backtrace;
+	}
 
-		/* The only thing we can do at this point is to verify that
-		 * any such matching prefix can indeed be a prefix to our
-		 * key, and if the bits in the node we are inspecting that
-		 * do not match our key are not ZERO, this cannot be true.
-		 * Thus, find out where there is a mismatch (before cn->pos)
-		 * and verify that all the mismatching bits are zero in the
-		 * new tnode's key.
-		 */
+	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
+	for (;;) {
+		/* record the pointer where our next node pointer is stored */
+		struct tnode __rcu **cptr = n->child;
 
-		/*
-		 * Note: We aren't very concerned about the piece of
-		 * the key that precede pn->pos+pn->bits, since these
-		 * have already been checked. The bits after cn->pos
-		 * aren't checked since these are by definition
-		 * "unknown" at this point. Thus, what we want to see
-		 * is if we are about to enter the "prefix matching"
-		 * state, and in that case verify that the skipped
-		 * bits that will prevail throughout this subtree are
-		 * zero, as they have to be if we are to find a
-		 * matching prefix.
+		/* This test verifies that none of the bits that differ
+		 * between the key and the prefix exist in the region of
+		 * the lsb and higher in the prefix.
 		 */
+		if (unlikely(prefix_mismatch(key, n)))
+			goto backtrace;
 
-		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
+		/* exit out and process leaf */
+		if (unlikely(IS_LEAF(n)))
+			break;
 
-		/*
-		 * In short: If skipped bits in this node do not match
-		 * the search key, enter the "prefix matching"
-		 * state.directly.
+		/* Don't bother recording parent info.  Since we are in
+		 * prefix match mode we will have to come back to wherever
+		 * we started this traversal anyway
 		 */
-		if (pref_mismatch) {
-			/* fls(x) = __fls(x) + 1 */
-			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
-
-			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
-				goto backtrace;
-
-			if (current_prefix_length >= cn->pos)
-				current_prefix_length = mp;
-		}
-
-		pn = n; /* Descend */
-		chopped_off = 0;
-		continue;
 
+		while ((n = rcu_dereference(*cptr)) == NULL) {
 backtrace:
-		chopped_off++;
-
-		/* As zero don't change the child key (cindex) */
-		while ((chopped_off <= pn->bits)
-		       && !(cindex & (1<<(chopped_off-1))))
-			chopped_off++;
-
-		/* Decrease current_... with bits chopped off */
-		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
-			current_prefix_length = pn->pos + pn->bits
-				- chopped_off;
-
-		/*
-		 * Either we do the actual chop off according or if we have
-		 * chopped off all bits in this tnode walk up to our parent.
-		 */
-
-		if (chopped_off <= pn->bits) {
-			cindex &= ~(1 << (chopped_off-1));
-		} else {
-			struct tnode *parent = node_parent_rcu(pn);
-			if (!parent)
-				goto failed;
-
-			/* Get Child's index */
-			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
-			pn = parent;
-			chopped_off = 0;
-
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-			this_cpu_inc(stats->backtrack);
+			if (!n)
+				this_cpu_inc(stats->null_node_hit);
 #endif
-			goto backtrace;
+			/* If we are at cindex 0 there are no more bits for
+			 * us to strip at this level so we must ascend back
+			 * up one level to see if there are any more bits to
+			 * be stripped there.
+			 */
+			while (!cindex) {
+				t_key pkey = pn->key;
+
+				pn = node_parent_rcu(pn);
+				if (unlikely(!pn))
+					goto failed;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+				this_cpu_inc(stats->backtrack);
+#endif
+				/* Get Child's index */
+				cindex = get_index(pkey, pn);
+			}
+
+			/* strip the least significant bit from the cindex */
+			cindex &= cindex - 1;
+
+			/* grab pointer for next child node */
+			cptr = &pn->child[cindex];
 		}
 	}
-failed:
-	ret = 1;
+
 found:
+	/* Step 3: Process the leaf, if that fails fall back to backtracing */
+	ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
+	if (unlikely(ret > 0))
+		goto backtrace;
+failed:
 	rcu_read_unlock();
 	return ret;
 }
-- 
cgit v1.2.3


From 939afb0657dd8c8f9486d172d6bb62fc902e2f23 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:00 -0800
Subject: fib_trie: Optimize fib_find_node

This patch makes use of the same features I made use of for
fib_table_lookup to streamline fib_find_node.  The resultant code should be
smaller and run faster than the original.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3fe4dd917ce1..ac04f31a632e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -892,28 +892,34 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 }
 
 /* rcu_read_lock needs to be hold by caller from readside */
-
 static struct tnode *fib_find_node(struct trie *t, u32 key)
 {
 	struct tnode *n = rcu_dereference_rtnl(t->trie);
-	int pos = 0;
 
-	while (n && IS_TNODE(n)) {
-		if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
-			pos = n->pos + n->bits;
-			n = tnode_get_child_rcu(n,
-						tkey_extract_bits(key,
-								  n->pos,
-								  n->bits));
-		} else
+	while (n) {
+		unsigned long index = get_index(key, n);
+
+		/* This bit of code is a bit tricky but it combines multiple
+		 * checks into a single check.  The prefix consists of the
+		 * prefix plus zeros for the bits in the cindex. The index
+		 * is the difference between the key and this value.  From
+		 * this we can actually derive several pieces of data.
+		 *   if !(index >> bits)
+		 *     we know the value is cindex
+		 *   else
+		 *     we have a mismatch in skip bits and failed
+		 */
+		if (index >> n->bits)
+			return NULL;
+
+		/* we have found a leaf. Prefixes have already been compared */
+		if (IS_LEAF(n))
 			break;
-	}
-	/* Case we have found a leaf. Compare prefixes */
 
-	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
-		return n;
+		n = rcu_dereference_rtnl(n->child[index]);
+	}
 
-	return NULL;
+	return n;
 }
 
 static void trie_rebalance(struct trie *t, struct tnode *tn)
-- 
cgit v1.2.3


From 836a0123c98c91df71678ce63c0d42770996582a Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:06 -0800
Subject: fib_trie: Optimize fib_table_insert

This patch updates the fib_table_insert function to take advantage of the
changes made to improve the performance of fib_table_lookup.  As a result
the code should be smaller and run faster then the original.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 196 +++++++++++++++++++---------------------------------
 1 file changed, 71 insertions(+), 125 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ac04f31a632e..8a147c3baaba 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -222,31 +222,6 @@ static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int
 		return 0;
 }
 
-static inline int tkey_equals(t_key a, t_key b)
-{
-	return a == b;
-}
-
-static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
-{
-	if (bits == 0 || offset >= KEYLENGTH)
-		return 1;
-	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
-	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
-}
-
-static inline int tkey_mismatch(t_key a, int offset, t_key b)
-{
-	t_key diff = a ^ b;
-	int i = offset;
-
-	if (!diff)
-		return 0;
-	while ((diff << i) >> (KEYLENGTH-1) == 0)
-		i++;
-	return i;
-}
-
 /*
   To understand this stuff, an understanding of keys and all their bits is
   necessary. Every node in the trie has a key associated with it, but not
@@ -485,6 +460,15 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
 	rcu_assign_pointer(tn->child[i], n);
 }
 
+static void put_child_root(struct tnode *tp, struct trie *t,
+			   t_key key, struct tnode *n)
+{
+	if (tp)
+		put_child(tp, get_index(key, tp), n);
+	else
+		rcu_assign_pointer(t->trie, n);
+}
+
 #define MAX_WORK 10
 static struct tnode *resize(struct trie *t, struct tnode *tn)
 {
@@ -959,138 +943,100 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 
 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 {
-	int pos, newpos;
-	struct tnode *tp = NULL, *tn = NULL;
-	struct tnode *n;
-	struct tnode *l;
-	int missbit;
 	struct list_head *fa_head = NULL;
+	struct tnode *l, *n, *tp = NULL;
 	struct leaf_info *li;
-	t_key cindex;
 
-	pos = 0;
+	li = leaf_info_new(plen);
+	if (!li)
+		return NULL;
+	fa_head = &li->falh;
+
 	n = rtnl_dereference(t->trie);
 
 	/* If we point to NULL, stop. Either the tree is empty and we should
 	 * just put a new leaf in if, or we have reached an empty child slot,
 	 * and we should just put our new leaf in that.
-	 * If we point to a T_TNODE, check if it matches our key. Note that
-	 * a T_TNODE might be skipping any number of bits - its 'pos' need
-	 * not be the parent's 'pos'+'bits'!
-	 *
-	 * If it does match the current key, get pos/bits from it, extract
-	 * the index from our key, push the T_TNODE and walk the tree.
-	 *
-	 * If it doesn't, we have to replace it with a new T_TNODE.
 	 *
-	 * If we point to a T_LEAF, it might or might not have the same key
-	 * as we do. If it does, just change the value, update the T_LEAF's
-	 * value, and return it.
-	 * If it doesn't, we need to replace it with a T_TNODE.
+	 * If we hit a node with a key that does't match then we should stop
+	 * and create a new tnode to replace that node and insert ourselves
+	 * and the other node into the new tnode.
 	 */
+	while (n) {
+		unsigned long index = get_index(key, n);
 
-	while (n && IS_TNODE(n)) {
-		if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
-			tp = n;
-			pos = n->pos + n->bits;
-			n = tnode_get_child(n,
-					    tkey_extract_bits(key,
-							      n->pos,
-							      n->bits));
-
-			BUG_ON(n && node_parent(n) != tp);
-		} else
+		/* This bit of code is a bit tricky but it combines multiple
+		 * checks into a single check.  The prefix consists of the
+		 * prefix plus zeros for the "bits" in the prefix. The index
+		 * is the difference between the key and this value.  From
+		 * this we can actually derive several pieces of data.
+		 *   if !(index >> bits)
+		 *     we know the value is child index
+		 *   else
+		 *     we have a mismatch in skip bits and failed
+		 */
+		if (index >> n->bits)
 			break;
-	}
-
-	/*
-	 * n  ----> NULL, LEAF or TNODE
-	 *
-	 * tp is n's (parent) ----> NULL or TNODE
-	 */
-
-	BUG_ON(tp && IS_LEAF(tp));
-
-	/* Case 1: n is a leaf. Compare prefixes */
-
-	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
-		li = leaf_info_new(plen);
 
-		if (!li)
-			return NULL;
+		/* we have found a leaf. Prefixes have already been compared */
+		if (IS_LEAF(n)) {
+			/* Case 1: n is a leaf, and prefixes match*/
+			insert_leaf_info(&n->list, li);
+			return fa_head;
+		}
 
-		fa_head = &li->falh;
-		insert_leaf_info(&n->list, li);
-		goto done;
+		tp = n;
+		n = rcu_dereference_rtnl(n->child[index]);
 	}
-	l = leaf_new(key);
-
-	if (!l)
-		return NULL;
-
-	li = leaf_info_new(plen);
 
-	if (!li) {
-		node_free(l);
+	l = leaf_new(key);
+	if (!l) {
+		free_leaf_info(li);
 		return NULL;
 	}
 
-	fa_head = &li->falh;
 	insert_leaf_info(&l->list, li);
 
-	if (t->trie && n == NULL) {
-		/* Case 2: n is NULL, and will just insert a new leaf */
-
-		node_set_parent(l, tp);
-
-		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-		put_child(tp, cindex, l);
-	} else {
-		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
-		/*
-		 *  Add a new tnode here
-		 *  first tnode need some special handling
-		 */
+	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
+	 *
+	 *  Add a new tnode here
+	 *  first tnode need some special handling
+	 *  leaves us in position for handling as case 3
+	 */
+	if (n) {
+		struct tnode *tn;
+		int newpos;
 
-		if (n) {
-			pos = tp ? tp->pos+tp->bits : 0;
-			newpos = tkey_mismatch(key, pos, n->key);
-			tn = tnode_new(n->key, newpos, 1);
-		} else {
-			newpos = 0;
-			tn = tnode_new(key, newpos, 1); /* First tnode */
-		}
+		newpos = KEYLENGTH - __fls(n->key ^ key) - 1;
 
+		tn = tnode_new(key, newpos, 1);
 		if (!tn) {
 			free_leaf_info(li);
 			node_free(l);
 			return NULL;
 		}
 
-		node_set_parent(tn, tp);
-
-		missbit = tkey_extract_bits(key, newpos, 1);
-		put_child(tn, missbit, l);
-		put_child(tn, 1-missbit, n);
+		/* initialize routes out of node */
+		NODE_INIT_PARENT(tn, tp);
+		put_child(tn, get_index(key, tn) ^ 1, n);
 
-		if (tp) {
-			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-			put_child(tp, cindex, tn);
-		} else {
-			rcu_assign_pointer(t->trie, tn);
-		}
+		/* start adding routes into the node */
+		put_child_root(tp, t, key, tn);
+		node_set_parent(n, tn);
 
+		/* parent now has a NULL spot where the leaf can go */
 		tp = tn;
 	}
 
-	if (tp && tp->pos + tp->bits > 32)
-		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
-			tp, tp->pos, tp->bits, key, plen);
-
-	/* Rebalance the trie */
+	/* Case 3: n is NULL, and will just insert a new leaf */
+	if (tp) {
+		NODE_INIT_PARENT(l, tp);
+		put_child(tp, get_index(key, tp), l);
+		trie_rebalance(t, tp);
+	} else {
+		rcu_assign_pointer(t->trie, l);
+	}
 
-	trie_rebalance(t, tp);
-done:
 	return fa_head;
 }
 
@@ -1470,11 +1416,11 @@ static void trie_leaf_remove(struct trie *t, struct tnode *l)
 	pr_debug("entering trie_leaf_remove(%p)\n", l);
 
 	if (tp) {
-		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
-		put_child(tp, cindex, NULL);
+		put_child(tp, get_index(l->key, tp), NULL);
 		trie_rebalance(t, tp);
-	} else
+	} else {
 		RCU_INIT_POINTER(t->trie, NULL);
+	}
 
 	node_free(l);
 }
-- 
cgit v1.2.3


From e9b44019d41a5cf41de43e895a5ed38b19a59b5b Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:12 -0800
Subject: fib_trie: Update meaning of pos to represent unchecked bits

This change moves the pos value to the other side of the "bits" field.  By
doing this it actually simplifies a significant amount of code in the trie.

For example when halving a tree we know that the bit lost exists at
oldnode->pos, and if we inflate the tree the new bit being add is at
tn->pos.  Previously to find those bits you would have to subtract pos and
bits from the keylength or start with a value of (1 << 31) and then shift
that.

There are a number of spots throughout the code that benefit from this.  In
the case of the hot-path searches the main advantage is that we can drop 2
or more operations from the search path as we no longer need to compute the
value for the index to be shifted by and can instead just use the raw pos
value.

In addition the tkey_extract_bits is now defunct and can be replaced by
get_index since the two operations were doing the same thing, but now
get_index does it much more quickly as it is only an xor and shift versus a
pair of shifts and a subtraction.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 194 ++++++++++++++++++++++------------------------------
 1 file changed, 81 insertions(+), 113 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 8a147c3baaba..9d675486b38c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -90,8 +90,7 @@ typedef unsigned int t_key;
 #define IS_TNODE(n) ((n)->bits)
 #define IS_LEAF(n) (!(n)->bits)
 
-#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
-#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
+#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
 
 struct tnode {
 	t_key key;
@@ -209,81 +208,64 @@ static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned
 	return rcu_dereference_rtnl(tn->child[i]);
 }
 
-static inline t_key mask_pfx(t_key k, unsigned int l)
-{
-	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
-}
-
-static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
-{
-	if (offset < KEYLENGTH)
-		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
-	else
-		return 0;
-}
-
-/*
-  To understand this stuff, an understanding of keys and all their bits is
-  necessary. Every node in the trie has a key associated with it, but not
-  all of the bits in that key are significant.
-
-  Consider a node 'n' and its parent 'tp'.
-
-  If n is a leaf, every bit in its key is significant. Its presence is
-  necessitated by path compression, since during a tree traversal (when
-  searching for a leaf - unless we are doing an insertion) we will completely
-  ignore all skipped bits we encounter. Thus we need to verify, at the end of
-  a potentially successful search, that we have indeed been walking the
-  correct key path.
-
-  Note that we can never "miss" the correct key in the tree if present by
-  following the wrong path. Path compression ensures that segments of the key
-  that are the same for all keys with a given prefix are skipped, but the
-  skipped part *is* identical for each node in the subtrie below the skipped
-  bit! trie_insert() in this implementation takes care of that - note the
-  call to tkey_sub_equals() in trie_insert().
-
-  if n is an internal node - a 'tnode' here, the various parts of its key
-  have many different meanings.
-
-  Example:
-  _________________________________________________________________
-  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
-  -----------------------------------------------------------------
-    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
-
-  _________________________________________________________________
-  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
-  -----------------------------------------------------------------
-   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
-
-  tp->pos = 7
-  tp->bits = 3
-  n->pos = 15
-  n->bits = 4
-
-  First, let's just ignore the bits that come before the parent tp, that is
-  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
-  not use them for anything.
-
-  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
-  index into the parent's child array. That is, they will be used to find
-  'n' among tp's children.
-
-  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
-  for the node n.
-
-  All the bits we have seen so far are significant to the node n. The rest
-  of the bits are really not needed or indeed known in n->key.
-
-  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
-  n's child array, and will of course be different for each child.
-
-
-  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
-  at this point.
-
-*/
+/* To understand this stuff, an understanding of keys and all their bits is
+ * necessary. Every node in the trie has a key associated with it, but not
+ * all of the bits in that key are significant.
+ *
+ * Consider a node 'n' and its parent 'tp'.
+ *
+ * If n is a leaf, every bit in its key is significant. Its presence is
+ * necessitated by path compression, since during a tree traversal (when
+ * searching for a leaf - unless we are doing an insertion) we will completely
+ * ignore all skipped bits we encounter. Thus we need to verify, at the end of
+ * a potentially successful search, that we have indeed been walking the
+ * correct key path.
+ *
+ * Note that we can never "miss" the correct key in the tree if present by
+ * following the wrong path. Path compression ensures that segments of the key
+ * that are the same for all keys with a given prefix are skipped, but the
+ * skipped part *is* identical for each node in the subtrie below the skipped
+ * bit! trie_insert() in this implementation takes care of that.
+ *
+ * if n is an internal node - a 'tnode' here, the various parts of its key
+ * have many different meanings.
+ *
+ * Example:
+ * _________________________________________________________________
+ * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
+ * -----------------------------------------------------------------
+ *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
+ *
+ * _________________________________________________________________
+ * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
+ * -----------------------------------------------------------------
+ *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
+ *
+ * tp->pos = 22
+ * tp->bits = 3
+ * n->pos = 13
+ * n->bits = 4
+ *
+ * First, let's just ignore the bits that come before the parent tp, that is
+ * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
+ * point we do not use them for anything.
+ *
+ * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
+ * index into the parent's child array. That is, they will be used to find
+ * 'n' among tp's children.
+ *
+ * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
+ * for the node n.
+ *
+ * All the bits we have seen so far are significant to the node n. The rest
+ * of the bits are really not needed or indeed known in n->key.
+ *
+ * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
+ * n's child array, and will of course be different for each child.
+ *
+ * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
+ * at this point.
+ */
 
 static const int halve_threshold = 25;
 static const int inflate_threshold = 50;
@@ -367,7 +349,7 @@ static struct tnode *leaf_new(t_key key)
 		 * as the nodes are searched
 		 */
 		l->key = key;
-		l->pos = KEYLENGTH;
+		l->pos = 0;
 		/* set bits to 0 indicating we are not a tnode */
 		l->bits = 0;
 
@@ -400,7 +382,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
 		tn->parent = NULL;
 		tn->pos = pos;
 		tn->bits = bits;
-		tn->key = mask_pfx(key, pos);
+		tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
 		tn->full_children = 0;
 		tn->empty_children = 1<<bits;
 	}
@@ -410,14 +392,12 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
 	return tn;
 }
 
-/*
- * Check whether a tnode 'n' is "full", i.e. it is an internal node
+/* Check whether a tnode 'n' is "full", i.e. it is an internal node
  * and no bits are skipped. See discussion in dyntree paper p. 6
  */
-
 static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
 {
-	return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
+	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
 }
 
 static inline void put_child(struct tnode *tn, int i,
@@ -641,11 +621,12 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
 {
 	int olen = tnode_child_length(oldtnode);
 	struct tnode *tn;
+	t_key m;
 	int i;
 
 	pr_debug("In inflate\n");
 
-	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
+	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
 
 	if (!tn)
 		return ERR_PTR(-ENOMEM);
@@ -656,21 +637,18 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
 	 * fails. In case of failure we return the oldnode and  inflate
 	 * of tnode is ignored.
 	 */
+	for (i = 0, m = 1u << tn->pos; i < olen; i++) {
+		struct tnode *inode = tnode_get_child(oldtnode, i);
 
-	for (i = 0; i < olen; i++) {
-		struct tnode *inode;
-
-		inode = tnode_get_child(oldtnode, i);
-		if (tnode_full(oldtnode, inode) && inode->bits > 1) {
+		if (tnode_full(oldtnode, inode) && (inode->bits > 1)) {
 			struct tnode *left, *right;
-			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 
-			left = tnode_new(inode->key&(~m), inode->pos + 1,
+			left = tnode_new(inode->key & ~m, inode->pos,
 					 inode->bits - 1);
 			if (!left)
 				goto nomem;
 
-			right = tnode_new(inode->key|m, inode->pos + 1,
+			right = tnode_new(inode->key | m, inode->pos,
 					  inode->bits - 1);
 
 			if (!right) {
@@ -694,9 +672,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
 
 		/* A leaf or an internal node with skipped bits */
 		if (!tnode_full(oldtnode, inode)) {
-			put_child(tn,
-				tkey_extract_bits(inode->key, tn->pos, tn->bits),
-				inode);
+			put_child(tn, get_index(inode->key, tn), inode);
 			continue;
 		}
 
@@ -767,7 +743,7 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
 
 	pr_debug("In halve\n");
 
-	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
+	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
 
 	if (!tn)
 		return ERR_PTR(-ENOMEM);
@@ -787,7 +763,7 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
 		if (left && right) {
 			struct tnode *newn;
 
-			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
+			newn = tnode_new(left->key, oldtnode->pos, 1);
 
 			if (!newn)
 				goto nomem;
@@ -915,7 +891,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 	key = tn->key;
 
 	while (tn != NULL && (tp = node_parent(tn)) != NULL) {
-		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+		cindex = get_index(key, tp);
 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
 		tn = resize(t, tn);
 
@@ -1005,11 +981,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 	 */
 	if (n) {
 		struct tnode *tn;
-		int newpos;
-
-		newpos = KEYLENGTH - __fls(n->key ^ key) - 1;
 
-		tn = tnode_new(key, newpos, 1);
+		tn = tnode_new(key, __fls(key ^ n->key), 1);
 		if (!tn) {
 			free_leaf_info(li);
 			node_free(l);
@@ -1559,12 +1532,7 @@ static int trie_flush_leaf(struct tnode *l)
 static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
 {
 	do {
-		t_key idx;
-
-		if (c)
-			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
-		else
-			idx = 0;
+		t_key idx = c ? idx = get_index(c->key, p) + 1 : 0;
 
 		while (idx < 1u << p->bits) {
 			c = tnode_get_child_rcu(p, idx++);
@@ -1851,7 +1819,7 @@ rescan:
 	/* Current node exhausted, pop back up */
 	p = node_parent_rcu(tn);
 	if (p) {
-		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
+		cindex = get_index(tn->key, p) + 1;
 		tn = p;
 		--iter->depth;
 		goto rescan;
@@ -2186,10 +2154,10 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
 	if (IS_TNODE(n)) {
 		__be32 prf = htonl(n->key);
 
-		seq_indent(seq, iter->depth - 1);
-		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
-			   &prf, n->pos, n->bits, n->full_children,
-			   n->empty_children);
+		seq_indent(seq, iter->depth-1);
+		seq_printf(seq, "  +-- %pI4/%zu %u %u %u\n",
+			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
+			   n->full_children, n->empty_children);
 	} else {
 		struct leaf_info *li;
 		__be32 val = htonl(n->key);
-- 
cgit v1.2.3


From 98293e8d2f51976d5f790400859a517c0f7cb598 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:18 -0800
Subject: fib_trie: Use unsigned long for anything dealing with a shift by bits

This change makes it so that anything that can be shifted by, or compared
to a value shifted by bits is updated to be an unsigned long.  This is
mostly a precaution against an insanely huge address space that somehow
starts coming close to the 2^32 root node size which would require
something like 1.5 billion addresses.

I chose unsigned long instead of unsigned long long since I do not believe
it is possible to allocate a 32 bit tnode on a 32 bit system as the memory
consumed would be 16GB + 28B which exceeds the addressible space for any
one process.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 53 ++++++++++++++++++++++++++---------------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 9d675486b38c..28a3065470bc 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -146,8 +146,8 @@ struct trie {
 #endif
 };
 
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
-				  int wasfull);
+static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
+				  struct tnode *n, int wasfull);
 static struct tnode *resize(struct trie *t, struct tnode *tn);
 static struct tnode *inflate(struct trie *t, struct tnode *tn);
 static struct tnode *halve(struct trie *t, struct tnode *tn);
@@ -183,25 +183,23 @@ static inline void node_set_parent(struct tnode *n, struct tnode *tp)
 /* This provides us with the number of children in this node, in the case of a
  * leaf this will return 0 meaning none of the children are accessible.
  */
-static inline int tnode_child_length(const struct tnode *tn)
+static inline unsigned long tnode_child_length(const struct tnode *tn)
 {
 	return (1ul << tn->bits) & ~(1ul);
 }
 
-/*
- * caller must hold RTNL
- */
-static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
+/* caller must hold RTNL */
+static inline struct tnode *tnode_get_child(const struct tnode *tn,
+					    unsigned long i)
 {
 	BUG_ON(i >= tnode_child_length(tn));
 
 	return rtnl_dereference(tn->child[i]);
 }
 
-/*
- * caller must hold RCU read lock or RTNL
- */
-static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
+/* caller must hold RCU read lock or RTNL */
+static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
+						unsigned long i)
 {
 	BUG_ON(i >= tnode_child_length(tn));
 
@@ -400,7 +398,7 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
 	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
 }
 
-static inline void put_child(struct tnode *tn, int i,
+static inline void put_child(struct tnode *tn, unsigned long i,
 			     struct tnode *n)
 {
 	tnode_put_child_reorg(tn, i, n, -1);
@@ -411,13 +409,13 @@ static inline void put_child(struct tnode *tn, int i,
   * Update the value of full_children and empty_children.
   */
 
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
-				  int wasfull)
+static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
+				  struct tnode *n, int wasfull)
 {
 	struct tnode *chi = rtnl_dereference(tn->child[i]);
 	int isfull;
 
-	BUG_ON(i >= 1<<tn->bits);
+	BUG_ON(i >= tnode_child_length(tn));
 
 	/* update emptyChildren */
 	if (n == NULL && chi != NULL)
@@ -607,10 +605,10 @@ no_children:
 static void tnode_clean_free(struct tnode *tn)
 {
 	struct tnode *tofree;
-	int i;
+	unsigned long i;
 
 	for (i = 0; i < tnode_child_length(tn); i++) {
-		tofree = rtnl_dereference(tn->child[i]);
+		tofree = tnode_get_child(tn, i);
 		if (tofree)
 			node_free(tofree);
 	}
@@ -619,10 +617,10 @@ static void tnode_clean_free(struct tnode *tn)
 
 static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
 {
-	int olen = tnode_child_length(oldtnode);
+	unsigned long olen = tnode_child_length(oldtnode);
 	struct tnode *tn;
+	unsigned long i;
 	t_key m;
-	int i;
 
 	pr_debug("In inflate\n");
 
@@ -664,7 +662,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
 	for (i = 0; i < olen; i++) {
 		struct tnode *inode = tnode_get_child(oldtnode, i);
 		struct tnode *left, *right;
-		int size, j;
+		unsigned long size, j;
 
 		/* An empty child */
 		if (inode == NULL)
@@ -737,7 +735,7 @@ nomem:
 
 static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
 {
-	int olen = tnode_child_length(oldtnode);
+	unsigned long olen = tnode_child_length(oldtnode);
 	struct tnode *tn, *left, *right;
 	int i;
 
@@ -1532,9 +1530,9 @@ static int trie_flush_leaf(struct tnode *l)
 static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
 {
 	do {
-		t_key idx = c ? idx = get_index(c->key, p) + 1 : 0;
+		unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0;
 
-		while (idx < 1u << p->bits) {
+		while (idx < tnode_child_length(p)) {
 			c = tnode_get_child_rcu(p, idx++);
 			if (!c)
 				continue;
@@ -1786,8 +1784,8 @@ struct fib_trie_iter {
 
 static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
 {
+	unsigned long cindex = iter->index;
 	struct tnode *tn = iter->tnode;
-	unsigned int cindex = iter->index;
 	struct tnode *p;
 
 	/* A single entry routing table */
@@ -1797,7 +1795,7 @@ static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
 		 iter->tnode, iter->index, iter->depth);
 rescan:
-	while (cindex < (1<<tn->bits)) {
+	while (cindex < tnode_child_length(tn)) {
 		struct tnode *n = tnode_get_child_rcu(tn, cindex);
 
 		if (n) {
@@ -1874,15 +1872,16 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
 			hlist_for_each_entry_rcu(li, &n->list, hlist)
 				++s->prefixes;
 		} else {
-			int i;
+			unsigned long i;
 
 			s->tnodes++;
 			if (n->bits < MAX_STAT_DEPTH)
 				s->nodesizes[n->bits]++;
 
-			for (i = 0; i < tnode_child_length(n); i++)
+			for (i = 0; i < tnode_child_length(n); i++) {
 				if (!rcu_access_pointer(n->child[i]))
 					s->nullpointers++;
+			}
 		}
 	}
 	rcu_read_unlock();
-- 
cgit v1.2.3


From 345e9b54268ae065520a7252c182d22ef4591718 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:24 -0800
Subject: fib_trie: Push rcu_read_lock/unlock to callers

This change is to start cleaning up some of the rcu_read_lock/unlock
handling.  I realized while reviewing the code there are several spots that
I don't believe are being handled correctly or are masking warnings by
locally calling rcu_read_lock/unlock instead of calling them at the correct
level.

A common example is a call to fib_get_table followed by fib_table_lookup.
The rcu_read_lock/unlock ought to wrap both but there are several spots where
they were not wrapped.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  50 ++++++++++--------
 net/ipv4/fib_frontend.c |  27 +++++-----
 net/ipv4/fib_rules.c    |  22 ++++----
 net/ipv4/fib_trie.c     | 137 +++++++++++++++++++++---------------------------
 4 files changed, 114 insertions(+), 122 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 09a819ee2151..5bd120e4bc0a 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -222,16 +222,19 @@ static inline struct fib_table *fib_new_table(struct net *net, u32 id)
 static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
 			     struct fib_result *res)
 {
-	struct fib_table *table;
+	int err = -ENETUNREACH;
+
+	rcu_read_lock();
+
+	if (!fib_table_lookup(fib_get_table(net, RT_TABLE_LOCAL), flp, res,
+			      FIB_LOOKUP_NOREF) ||
+	    !fib_table_lookup(fib_get_table(net, RT_TABLE_MAIN), flp, res,
+			      FIB_LOOKUP_NOREF))
+		err = 0;
 
-	table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
-		return 0;
+	rcu_read_unlock();
 
-	table = fib_get_table(net, RT_TABLE_MAIN);
-	if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
-		return 0;
-	return -ENETUNREACH;
+	return err;
 }
 
 #else /* CONFIG_IP_MULTIPLE_TABLES */
@@ -247,20 +250,25 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp,
 			     struct fib_result *res)
 {
 	if (!net->ipv4.fib_has_custom_rules) {
+		int err = -ENETUNREACH;
+
+		rcu_read_lock();
+
 		res->tclassid = 0;
-		if (net->ipv4.fib_local &&
-		    !fib_table_lookup(net->ipv4.fib_local, flp, res,
-				      FIB_LOOKUP_NOREF))
-			return 0;
-		if (net->ipv4.fib_main &&
-		    !fib_table_lookup(net->ipv4.fib_main, flp, res,
-				      FIB_LOOKUP_NOREF))
-			return 0;
-		if (net->ipv4.fib_default &&
-		    !fib_table_lookup(net->ipv4.fib_default, flp, res,
-				      FIB_LOOKUP_NOREF))
-			return 0;
-		return -ENETUNREACH;
+		if ((net->ipv4.fib_local &&
+		     !fib_table_lookup(net->ipv4.fib_local, flp, res,
+				       FIB_LOOKUP_NOREF)) ||
+		    (net->ipv4.fib_main &&
+		     !fib_table_lookup(net->ipv4.fib_main, flp, res,
+				       FIB_LOOKUP_NOREF)) ||
+		    (net->ipv4.fib_default &&
+		     !fib_table_lookup(net->ipv4.fib_default, flp, res,
+				       FIB_LOOKUP_NOREF)))
+			err = 0;
+
+		rcu_read_unlock();
+
+		return err;
 	}
 	return __fib_lookup(net, flp, res);
 }
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 66890209208f..57be71dd6a9e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -109,6 +109,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
 	return tb;
 }
 
+/* caller must hold either rtnl or rcu read lock */
 struct fib_table *fib_get_table(struct net *net, u32 id)
 {
 	struct fib_table *tb;
@@ -119,15 +120,11 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
 		id = RT_TABLE_MAIN;
 	h = id & (FIB_TABLE_HASHSZ - 1);
 
-	rcu_read_lock();
 	head = &net->ipv4.fib_table_hash[h];
 	hlist_for_each_entry_rcu(tb, head, tb_hlist) {
-		if (tb->tb_id == id) {
-			rcu_read_unlock();
+		if (tb->tb_id == id)
 			return tb;
-		}
 	}
-	rcu_read_unlock();
 	return NULL;
 }
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
@@ -167,16 +164,18 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
 	if (ipv4_is_multicast(addr))
 		return RTN_MULTICAST;
 
+	rcu_read_lock();
+
 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
 	if (local_table) {
 		ret = RTN_UNICAST;
-		rcu_read_lock();
 		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
 			if (!dev || dev == res.fi->fib_dev)
 				ret = res.type;
 		}
-		rcu_read_unlock();
 	}
+
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -919,7 +918,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 #undef BRD1_OK
 }
 
-static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
+static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
 {
 
 	struct fib_result       res;
@@ -929,6 +928,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 		.flowi4_tos = frn->fl_tos,
 		.flowi4_scope = frn->fl_scope,
 	};
+	struct fib_table *tb;
+
+	rcu_read_lock();
+
+	tb = fib_get_table(net, frn->tb_id_in);
 
 	frn->err = -ENOENT;
 	if (tb) {
@@ -945,6 +949,8 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 		}
 		local_bh_enable();
 	}
+
+	rcu_read_unlock();
 }
 
 static void nl_fib_input(struct sk_buff *skb)
@@ -952,7 +958,6 @@ static void nl_fib_input(struct sk_buff *skb)
 	struct net *net;
 	struct fib_result_nl *frn;
 	struct nlmsghdr *nlh;
-	struct fib_table *tb;
 	u32 portid;
 
 	net = sock_net(skb->sk);
@@ -967,9 +972,7 @@ static void nl_fib_input(struct sk_buff *skb)
 	nlh = nlmsg_hdr(skb);
 
 	frn = (struct fib_result_nl *) nlmsg_data(nlh);
-	tb = fib_get_table(net, frn->tb_id_in);
-
-	nl_fib_lookup(frn, tb);
+	nl_fib_lookup(net, frn);
 
 	portid = NETLINK_CB(skb).portid;      /* netlink portid */
 	NETLINK_CB(skb).portid = 0;        /* from kernel */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 8f7bd56955b0..d3db718be51d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -81,27 +81,25 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
 		break;
 
 	case FR_ACT_UNREACHABLE:
-		err = -ENETUNREACH;
-		goto errout;
+		return -ENETUNREACH;
 
 	case FR_ACT_PROHIBIT:
-		err = -EACCES;
-		goto errout;
+		return -EACCES;
 
 	case FR_ACT_BLACKHOLE:
 	default:
-		err = -EINVAL;
-		goto errout;
+		return -EINVAL;
 	}
 
+	rcu_read_lock();
+
 	tbl = fib_get_table(rule->fr_net, rule->table);
-	if (!tbl)
-		goto errout;
+	if (tbl)
+		err = fib_table_lookup(tbl, &flp->u.ip4,
+				       (struct fib_result *)arg->result,
+				       arg->flags);
 
-	err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
-	if (err > 0)
-		err = -EAGAIN;
-errout:
+	rcu_read_unlock();
 	return err;
 }
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 28a3065470bc..987b06d1effe 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1181,72 +1181,6 @@ err:
 	return err;
 }
 
-/* should be called with rcu_read_lock */
-static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
-		      t_key key,  const struct flowi4 *flp,
-		      struct fib_result *res, int fib_flags)
-{
-	struct leaf_info *li;
-	struct hlist_head *hhead = &l->list;
-
-	hlist_for_each_entry_rcu(li, hhead, hlist) {
-		struct fib_alias *fa;
-
-		if (l->key != (key & li->mask_plen))
-			continue;
-
-		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
-			struct fib_info *fi = fa->fa_info;
-			int nhsel, err;
-
-			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
-				continue;
-			if (fi->fib_dead)
-				continue;
-			if (fa->fa_info->fib_scope < flp->flowi4_scope)
-				continue;
-			fib_alias_accessed(fa);
-			err = fib_props[fa->fa_type].error;
-			if (unlikely(err < 0)) {
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-				this_cpu_inc(t->stats->semantic_match_passed);
-#endif
-				return err;
-			}
-			if (fi->fib_flags & RTNH_F_DEAD)
-				continue;
-			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
-				const struct fib_nh *nh = &fi->fib_nh[nhsel];
-
-				if (nh->nh_flags & RTNH_F_DEAD)
-					continue;
-				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
-					continue;
-
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-				this_cpu_inc(t->stats->semantic_match_passed);
-#endif
-				res->prefixlen = li->plen;
-				res->nh_sel = nhsel;
-				res->type = fa->fa_type;
-				res->scope = fi->fib_scope;
-				res->fi = fi;
-				res->table = tb;
-				res->fa_head = &li->falh;
-				if (!(fib_flags & FIB_LOOKUP_NOREF))
-					atomic_inc(&fi->fib_clntref);
-				return 0;
-			}
-		}
-
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-		this_cpu_inc(t->stats->semantic_match_miss);
-#endif
-	}
-
-	return 1;
-}
-
 static inline t_key prefix_mismatch(t_key key, struct tnode *n)
 {
 	t_key prefix = n->key;
@@ -1254,6 +1188,7 @@ static inline t_key prefix_mismatch(t_key key, struct tnode *n)
 	return (key ^ prefix) & (prefix | -prefix);
 }
 
+/* should be called with rcu_read_lock */
 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		     struct fib_result *res, int fib_flags)
 {
@@ -1263,14 +1198,12 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 #endif
 	const t_key key = ntohl(flp->daddr);
 	struct tnode *n, *pn;
+	struct leaf_info *li;
 	t_key cindex;
-	int ret = 1;
-
-	rcu_read_lock();
 
 	n = rcu_dereference(t->trie);
 	if (!n)
-		goto failed;
+		return -EAGAIN;
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	this_cpu_inc(stats->gets);
@@ -1350,7 +1283,7 @@ backtrace:
 
 				pn = node_parent_rcu(pn);
 				if (unlikely(!pn))
-					goto failed;
+					return -EAGAIN;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 				this_cpu_inc(stats->backtrack);
 #endif
@@ -1368,12 +1301,62 @@ backtrace:
 
 found:
 	/* Step 3: Process the leaf, if that fails fall back to backtracing */
-	ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
-	if (unlikely(ret > 0))
-		goto backtrace;
-failed:
-	rcu_read_unlock();
-	return ret;
+	hlist_for_each_entry_rcu(li, &n->list, hlist) {
+		struct fib_alias *fa;
+
+		if ((key ^ n->key) & li->mask_plen)
+			continue;
+
+		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+			struct fib_info *fi = fa->fa_info;
+			int nhsel, err;
+
+			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+				continue;
+			if (fi->fib_dead)
+				continue;
+			if (fa->fa_info->fib_scope < flp->flowi4_scope)
+				continue;
+			fib_alias_accessed(fa);
+			err = fib_props[fa->fa_type].error;
+			if (unlikely(err < 0)) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+				this_cpu_inc(stats->semantic_match_passed);
+#endif
+				return err;
+			}
+			if (fi->fib_flags & RTNH_F_DEAD)
+				continue;
+			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+				const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+				if (nh->nh_flags & RTNH_F_DEAD)
+					continue;
+				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+					continue;
+
+				if (!(fib_flags & FIB_LOOKUP_NOREF))
+					atomic_inc(&fi->fib_clntref);
+
+				res->prefixlen = li->plen;
+				res->nh_sel = nhsel;
+				res->type = fa->fa_type;
+				res->scope = fi->fib_scope;
+				res->fi = fi;
+				res->table = tb;
+				res->fa_head = &li->falh;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+				this_cpu_inc(stats->semantic_match_passed);
+#endif
+				return err;
+			}
+		}
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+		this_cpu_inc(stats->semantic_match_miss);
+#endif
+	}
+	goto backtrace;
 }
 EXPORT_SYMBOL_GPL(fib_table_lookup);
 
-- 
cgit v1.2.3


From cf3637bb8f07fb3b6791708f1d5118afb5446061 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:31 -0800
Subject: fib_trie: Move resize to after inflate/halve

This change consists of a cut/paste of resize to behind inflate and halve
so that I could remove the two function prototypes.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 311 ++++++++++++++++++++++++++--------------------------
 1 file changed, 154 insertions(+), 157 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 987b06d1effe..d044fa355f69 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -149,8 +149,6 @@ struct trie {
 static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
 				  struct tnode *n, int wasfull);
 static struct tnode *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn);
-static struct tnode *halve(struct trie *t, struct tnode *tn);
 /* tnodes to free after resize(); protected by RTNL */
 static struct callback_head *tnode_free_head;
 static size_t tnode_free_size;
@@ -447,161 +445,6 @@ static void put_child_root(struct tnode *tp, struct trie *t,
 		rcu_assign_pointer(t->trie, n);
 }
 
-#define MAX_WORK 10
-static struct tnode *resize(struct trie *t, struct tnode *tn)
-{
-	struct tnode *old_tn, *n = NULL;
-	int inflate_threshold_use;
-	int halve_threshold_use;
-	int max_work;
-
-	if (!tn)
-		return NULL;
-
-	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
-		 tn, inflate_threshold, halve_threshold);
-
-	/* No children */
-	if (tn->empty_children > (tnode_child_length(tn) - 1))
-		goto no_children;
-
-	/* One child */
-	if (tn->empty_children == (tnode_child_length(tn) - 1))
-		goto one_child;
-	/*
-	 * Double as long as the resulting node has a number of
-	 * nonempty nodes that are above the threshold.
-	 */
-
-	/*
-	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
-	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
-	 * Telecommunications, page 6:
-	 * "A node is doubled if the ratio of non-empty children to all
-	 * children in the *doubled* node is at least 'high'."
-	 *
-	 * 'high' in this instance is the variable 'inflate_threshold'. It
-	 * is expressed as a percentage, so we multiply it with
-	 * tnode_child_length() and instead of multiplying by 2 (since the
-	 * child array will be doubled by inflate()) and multiplying
-	 * the left-hand side by 100 (to handle the percentage thing) we
-	 * multiply the left-hand side by 50.
-	 *
-	 * The left-hand side may look a bit weird: tnode_child_length(tn)
-	 * - tn->empty_children is of course the number of non-null children
-	 * in the current node. tn->full_children is the number of "full"
-	 * children, that is non-null tnodes with a skip value of 0.
-	 * All of those will be doubled in the resulting inflated tnode, so
-	 * we just count them one extra time here.
-	 *
-	 * A clearer way to write this would be:
-	 *
-	 * to_be_doubled = tn->full_children;
-	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
-	 *     tn->full_children;
-	 *
-	 * new_child_length = tnode_child_length(tn) * 2;
-	 *
-	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
-	 *      new_child_length;
-	 * if (new_fill_factor >= inflate_threshold)
-	 *
-	 * ...and so on, tho it would mess up the while () loop.
-	 *
-	 * anyway,
-	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
-	 *      inflate_threshold
-	 *
-	 * avoid a division:
-	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
-	 *      inflate_threshold * new_child_length
-	 *
-	 * expand not_to_be_doubled and to_be_doubled, and shorten:
-	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *    tn->full_children) >= inflate_threshold * new_child_length
-	 *
-	 * expand new_child_length:
-	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *    tn->full_children) >=
-	 *      inflate_threshold * tnode_child_length(tn) * 2
-	 *
-	 * shorten again:
-	 * 50 * (tn->full_children + tnode_child_length(tn) -
-	 *    tn->empty_children) >= inflate_threshold *
-	 *    tnode_child_length(tn)
-	 *
-	 */
-
-	/* Keep root node larger  */
-
-	if (!node_parent(tn)) {
-		inflate_threshold_use = inflate_threshold_root;
-		halve_threshold_use = halve_threshold_root;
-	} else {
-		inflate_threshold_use = inflate_threshold;
-		halve_threshold_use = halve_threshold;
-	}
-
-	max_work = MAX_WORK;
-	while ((tn->full_children > 0 &&  max_work-- &&
-		50 * (tn->full_children + tnode_child_length(tn)
-		      - tn->empty_children)
-		>= inflate_threshold_use * tnode_child_length(tn))) {
-
-		old_tn = tn;
-		tn = inflate(t, tn);
-
-		if (IS_ERR(tn)) {
-			tn = old_tn;
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-			this_cpu_inc(t->stats->resize_node_skipped);
-#endif
-			break;
-		}
-	}
-
-	/* Return if at least one inflate is run */
-	if (max_work != MAX_WORK)
-		return tn;
-
-	/*
-	 * Halve as long as the number of empty children in this
-	 * node is above threshold.
-	 */
-
-	max_work = MAX_WORK;
-	while (tn->bits > 1 &&  max_work-- &&
-	       100 * (tnode_child_length(tn) - tn->empty_children) <
-	       halve_threshold_use * tnode_child_length(tn)) {
-
-		old_tn = tn;
-		tn = halve(t, tn);
-		if (IS_ERR(tn)) {
-			tn = old_tn;
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-			this_cpu_inc(t->stats->resize_node_skipped);
-#endif
-			break;
-		}
-	}
-
-
-	/* Only one child remains */
-	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
-		unsigned long i;
-one_child:
-		for (i = tnode_child_length(tn); !n && i;)
-			n = tnode_get_child(tn, --i);
-no_children:
-		/* compress one level */
-		node_set_parent(n, NULL);
-		tnode_free_safe(tn);
-		return n;
-	}
-	return tn;
-}
-
-
 static void tnode_clean_free(struct tnode *tn)
 {
 	struct tnode *tofree;
@@ -804,6 +647,160 @@ nomem:
 	return ERR_PTR(-ENOMEM);
 }
 
+#define MAX_WORK 10
+static struct tnode *resize(struct trie *t, struct tnode *tn)
+{
+	struct tnode *old_tn, *n = NULL;
+	int inflate_threshold_use;
+	int halve_threshold_use;
+	int max_work;
+
+	if (!tn)
+		return NULL;
+
+	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+		 tn, inflate_threshold, halve_threshold);
+
+	/* No children */
+	if (tn->empty_children > (tnode_child_length(tn) - 1))
+		goto no_children;
+
+	/* One child */
+	if (tn->empty_children == (tnode_child_length(tn) - 1))
+		goto one_child;
+	/*
+	 * Double as long as the resulting node has a number of
+	 * nonempty nodes that are above the threshold.
+	 */
+
+	/*
+	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
+	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
+	 * Telecommunications, page 6:
+	 * "A node is doubled if the ratio of non-empty children to all
+	 * children in the *doubled* node is at least 'high'."
+	 *
+	 * 'high' in this instance is the variable 'inflate_threshold'. It
+	 * is expressed as a percentage, so we multiply it with
+	 * tnode_child_length() and instead of multiplying by 2 (since the
+	 * child array will be doubled by inflate()) and multiplying
+	 * the left-hand side by 100 (to handle the percentage thing) we
+	 * multiply the left-hand side by 50.
+	 *
+	 * The left-hand side may look a bit weird: tnode_child_length(tn)
+	 * - tn->empty_children is of course the number of non-null children
+	 * in the current node. tn->full_children is the number of "full"
+	 * children, that is non-null tnodes with a skip value of 0.
+	 * All of those will be doubled in the resulting inflated tnode, so
+	 * we just count them one extra time here.
+	 *
+	 * A clearer way to write this would be:
+	 *
+	 * to_be_doubled = tn->full_children;
+	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
+	 *     tn->full_children;
+	 *
+	 * new_child_length = tnode_child_length(tn) * 2;
+	 *
+	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
+	 *      new_child_length;
+	 * if (new_fill_factor >= inflate_threshold)
+	 *
+	 * ...and so on, tho it would mess up the while () loop.
+	 *
+	 * anyway,
+	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+	 *      inflate_threshold
+	 *
+	 * avoid a division:
+	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+	 *      inflate_threshold * new_child_length
+	 *
+	 * expand not_to_be_doubled and to_be_doubled, and shorten:
+	 * 100 * (tnode_child_length(tn) - tn->empty_children +
+	 *    tn->full_children) >= inflate_threshold * new_child_length
+	 *
+	 * expand new_child_length:
+	 * 100 * (tnode_child_length(tn) - tn->empty_children +
+	 *    tn->full_children) >=
+	 *      inflate_threshold * tnode_child_length(tn) * 2
+	 *
+	 * shorten again:
+	 * 50 * (tn->full_children + tnode_child_length(tn) -
+	 *    tn->empty_children) >= inflate_threshold *
+	 *    tnode_child_length(tn)
+	 *
+	 */
+
+	/* Keep root node larger  */
+
+	if (!node_parent(tn)) {
+		inflate_threshold_use = inflate_threshold_root;
+		halve_threshold_use = halve_threshold_root;
+	} else {
+		inflate_threshold_use = inflate_threshold;
+		halve_threshold_use = halve_threshold;
+	}
+
+	max_work = MAX_WORK;
+	while ((tn->full_children > 0 &&  max_work-- &&
+		50 * (tn->full_children + tnode_child_length(tn)
+		      - tn->empty_children)
+		>= inflate_threshold_use * tnode_child_length(tn))) {
+
+		old_tn = tn;
+		tn = inflate(t, tn);
+
+		if (IS_ERR(tn)) {
+			tn = old_tn;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			this_cpu_inc(t->stats->resize_node_skipped);
+#endif
+			break;
+		}
+	}
+
+	/* Return if at least one inflate is run */
+	if (max_work != MAX_WORK)
+		return tn;
+
+	/*
+	 * Halve as long as the number of empty children in this
+	 * node is above threshold.
+	 */
+
+	max_work = MAX_WORK;
+	while (tn->bits > 1 &&  max_work-- &&
+	       100 * (tnode_child_length(tn) - tn->empty_children) <
+	       halve_threshold_use * tnode_child_length(tn)) {
+
+		old_tn = tn;
+		tn = halve(t, tn);
+		if (IS_ERR(tn)) {
+			tn = old_tn;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			this_cpu_inc(t->stats->resize_node_skipped);
+#endif
+			break;
+		}
+	}
+
+
+	/* Only one child remains */
+	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
+		unsigned long i;
+one_child:
+		for (i = tnode_child_length(tn); !n && i;)
+			n = tnode_get_child(tn, --i);
+no_children:
+		/* compress one level */
+		node_set_parent(n, NULL);
+		tnode_free_safe(tn);
+		return n;
+	}
+	return tn;
+}
+
 /* readside must use rcu_read_lock currently dump routines
  via get_fa_head and dump */
 
-- 
cgit v1.2.3


From f05a48198bf742efd9d36553c51e98a648dbe807 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:37 -0800
Subject: fib_trie: Add functions should_inflate and should_halve

This change pulls the logic for if we should inflate/halve the nodes out
into separate functions.  It also addresses what I believe is a bug where 1
full node is all that is needed to keep a node from ever being halved.

Simple script to reproduce the issue:
	modprobe dummy;	ifconfig dummy0 up
	for i in `seq 0 255`; do ifconfig dummy0:$i 10.0.${i}.1/24 up; done
	ifconfig dummy0:256 10.0.255.33/16 up
	for i in `seq 0 254`; do ifconfig dummy0:$i down; done

Results from /proc/net/fib_triestat
Before:
	Local:
		Aver depth:     3.00
		Max depth:      4
		Leaves:         17
		Prefixes:       18
		Internal nodes: 11
		  1: 8  2: 2  10: 1
		Pointers: 1048
	Null ptrs: 1021
	Total size: 11  kB
After:
	Local:
		Aver depth:     3.41
		Max depth:      5
		Leaves:         17
		Prefixes:       18
		Internal nodes: 12
		  1: 8  2: 3  3: 1
		Pointers: 36
	Null ptrs: 8
	Total size: 3  kB

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 175 ++++++++++++++++++++++++++--------------------------
 1 file changed, 89 insertions(+), 86 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d044fa355f69..58e1224786fa 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -647,12 +647,94 @@ nomem:
 	return ERR_PTR(-ENOMEM);
 }
 
+/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
+ * the Helsinki University of Technology and Matti Tikkanen of Nokia
+ * Telecommunications, page 6:
+ * "A node is doubled if the ratio of non-empty children to all
+ * children in the *doubled* node is at least 'high'."
+ *
+ * 'high' in this instance is the variable 'inflate_threshold'. It
+ * is expressed as a percentage, so we multiply it with
+ * tnode_child_length() and instead of multiplying by 2 (since the
+ * child array will be doubled by inflate()) and multiplying
+ * the left-hand side by 100 (to handle the percentage thing) we
+ * multiply the left-hand side by 50.
+ *
+ * The left-hand side may look a bit weird: tnode_child_length(tn)
+ * - tn->empty_children is of course the number of non-null children
+ * in the current node. tn->full_children is the number of "full"
+ * children, that is non-null tnodes with a skip value of 0.
+ * All of those will be doubled in the resulting inflated tnode, so
+ * we just count them one extra time here.
+ *
+ * A clearer way to write this would be:
+ *
+ * to_be_doubled = tn->full_children;
+ * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
+ *     tn->full_children;
+ *
+ * new_child_length = tnode_child_length(tn) * 2;
+ *
+ * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
+ *      new_child_length;
+ * if (new_fill_factor >= inflate_threshold)
+ *
+ * ...and so on, tho it would mess up the while () loop.
+ *
+ * anyway,
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+ *      inflate_threshold
+ *
+ * avoid a division:
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+ *      inflate_threshold * new_child_length
+ *
+ * expand not_to_be_doubled and to_be_doubled, and shorten:
+ * 100 * (tnode_child_length(tn) - tn->empty_children +
+ *    tn->full_children) >= inflate_threshold * new_child_length
+ *
+ * expand new_child_length:
+ * 100 * (tnode_child_length(tn) - tn->empty_children +
+ *    tn->full_children) >=
+ *      inflate_threshold * tnode_child_length(tn) * 2
+ *
+ * shorten again:
+ * 50 * (tn->full_children + tnode_child_length(tn) -
+ *    tn->empty_children) >= inflate_threshold *
+ *    tnode_child_length(tn)
+ *
+ */
+static bool should_inflate(const struct tnode *tn)
+{
+	unsigned long used = tnode_child_length(tn);
+	unsigned long threshold = used;
+
+	/* Keep root node larger */
+	threshold *= node_parent(tn) ? inflate_threshold :
+				       inflate_threshold_root;
+	used += tn->full_children;
+	used -= tn->empty_children;
+
+	return tn->pos && ((50 * used) >= threshold);
+}
+
+static bool should_halve(const struct tnode *tn)
+{
+	unsigned long used = tnode_child_length(tn);
+	unsigned long threshold = used;
+
+	/* Keep root node larger */
+	threshold *= node_parent(tn) ? halve_threshold :
+				       halve_threshold_root;
+	used -= tn->empty_children;
+
+	return (tn->bits > 1) && ((100 * used) < threshold);
+}
+
 #define MAX_WORK 10
 static struct tnode *resize(struct trie *t, struct tnode *tn)
 {
 	struct tnode *old_tn, *n = NULL;
-	int inflate_threshold_use;
-	int halve_threshold_use;
 	int max_work;
 
 	if (!tn)
@@ -668,86 +750,12 @@ static struct tnode *resize(struct trie *t, struct tnode *tn)
 	/* One child */
 	if (tn->empty_children == (tnode_child_length(tn) - 1))
 		goto one_child;
-	/*
-	 * Double as long as the resulting node has a number of
-	 * nonempty nodes that are above the threshold.
-	 */
 
-	/*
-	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
-	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
-	 * Telecommunications, page 6:
-	 * "A node is doubled if the ratio of non-empty children to all
-	 * children in the *doubled* node is at least 'high'."
-	 *
-	 * 'high' in this instance is the variable 'inflate_threshold'. It
-	 * is expressed as a percentage, so we multiply it with
-	 * tnode_child_length() and instead of multiplying by 2 (since the
-	 * child array will be doubled by inflate()) and multiplying
-	 * the left-hand side by 100 (to handle the percentage thing) we
-	 * multiply the left-hand side by 50.
-	 *
-	 * The left-hand side may look a bit weird: tnode_child_length(tn)
-	 * - tn->empty_children is of course the number of non-null children
-	 * in the current node. tn->full_children is the number of "full"
-	 * children, that is non-null tnodes with a skip value of 0.
-	 * All of those will be doubled in the resulting inflated tnode, so
-	 * we just count them one extra time here.
-	 *
-	 * A clearer way to write this would be:
-	 *
-	 * to_be_doubled = tn->full_children;
-	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
-	 *     tn->full_children;
-	 *
-	 * new_child_length = tnode_child_length(tn) * 2;
-	 *
-	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
-	 *      new_child_length;
-	 * if (new_fill_factor >= inflate_threshold)
-	 *
-	 * ...and so on, tho it would mess up the while () loop.
-	 *
-	 * anyway,
-	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
-	 *      inflate_threshold
-	 *
-	 * avoid a division:
-	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
-	 *      inflate_threshold * new_child_length
-	 *
-	 * expand not_to_be_doubled and to_be_doubled, and shorten:
-	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *    tn->full_children) >= inflate_threshold * new_child_length
-	 *
-	 * expand new_child_length:
-	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *    tn->full_children) >=
-	 *      inflate_threshold * tnode_child_length(tn) * 2
-	 *
-	 * shorten again:
-	 * 50 * (tn->full_children + tnode_child_length(tn) -
-	 *    tn->empty_children) >= inflate_threshold *
-	 *    tnode_child_length(tn)
-	 *
+	/* Double as long as the resulting node has a number of
+	 * nonempty nodes that are above the threshold.
 	 */
-
-	/* Keep root node larger  */
-
-	if (!node_parent(tn)) {
-		inflate_threshold_use = inflate_threshold_root;
-		halve_threshold_use = halve_threshold_root;
-	} else {
-		inflate_threshold_use = inflate_threshold;
-		halve_threshold_use = halve_threshold;
-	}
-
 	max_work = MAX_WORK;
-	while ((tn->full_children > 0 &&  max_work-- &&
-		50 * (tn->full_children + tnode_child_length(tn)
-		      - tn->empty_children)
-		>= inflate_threshold_use * tnode_child_length(tn))) {
-
+	while (should_inflate(tn) && max_work--) {
 		old_tn = tn;
 		tn = inflate(t, tn);
 
@@ -764,16 +772,11 @@ static struct tnode *resize(struct trie *t, struct tnode *tn)
 	if (max_work != MAX_WORK)
 		return tn;
 
-	/*
-	 * Halve as long as the number of empty children in this
+	/* Halve as long as the number of empty children in this
 	 * node is above threshold.
 	 */
-
 	max_work = MAX_WORK;
-	while (tn->bits > 1 &&  max_work-- &&
-	       100 * (tnode_child_length(tn) - tn->empty_children) <
-	       halve_threshold_use * tnode_child_length(tn)) {
-
+	while (should_halve(tn) && max_work--) {
 		old_tn = tn;
 		tn = halve(t, tn);
 		if (IS_ERR(tn)) {
-- 
cgit v1.2.3


From ff181ed8768fa57fbda40fa97914f27fe275129b Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:43 -0800
Subject: fib_trie: Push assignment of child to parent down into inflate/halve

This change makes it so that the assignment of the tnode to the parent is
handled directly within whatever function is currently handling the node be
it inflate, halve, or resize.  By doing this we can avoid some of the need
to set NULL pointers in the tree while we are resizing the subnodes.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 149 +++++++++++++++++++++++-----------------------------
 1 file changed, 66 insertions(+), 83 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 58e1224786fa..7fbd2c5a76b1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -146,9 +146,7 @@ struct trie {
 #endif
 };
 
-static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
-				  struct tnode *n, int wasfull);
-static struct tnode *resize(struct trie *t, struct tnode *tn);
+static void resize(struct trie *t, struct tnode *tn);
 /* tnodes to free after resize(); protected by RTNL */
 static struct callback_head *tnode_free_head;
 static size_t tnode_free_size;
@@ -396,22 +394,13 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
 	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
 }
 
-static inline void put_child(struct tnode *tn, unsigned long i,
-			     struct tnode *n)
-{
-	tnode_put_child_reorg(tn, i, n, -1);
-}
-
- /*
-  * Add a child at position i overwriting the old value.
-  * Update the value of full_children and empty_children.
-  */
-
-static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
-				  struct tnode *n, int wasfull)
+/* Add a child at position i overwriting the old value.
+ * Update the value of full_children and empty_children.
+ */
+static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
 {
 	struct tnode *chi = rtnl_dereference(tn->child[i]);
-	int isfull;
+	int isfull, wasfull;
 
 	BUG_ON(i >= tnode_child_length(tn));
 
@@ -422,10 +411,9 @@ static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
 		tn->empty_children--;
 
 	/* update fullChildren */
-	if (wasfull == -1)
-		wasfull = tnode_full(tn, chi);
-
+	wasfull = tnode_full(tn, chi);
 	isfull = tnode_full(tn, n);
+
 	if (wasfull && !isfull)
 		tn->full_children--;
 	else if (!wasfull && isfull)
@@ -458,9 +446,10 @@ static void tnode_clean_free(struct tnode *tn)
 	node_free(tn);
 }
 
-static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
+static int inflate(struct trie *t, struct tnode *oldtnode)
 {
 	unsigned long olen = tnode_child_length(oldtnode);
+	struct tnode *tp = node_parent(oldtnode);
 	struct tnode *tn;
 	unsigned long i;
 	t_key m;
@@ -468,9 +457,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
 	pr_debug("In inflate\n");
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
-
 	if (!tn)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	/*
 	 * Preallocate and store tnodes before the actual work so we
@@ -564,30 +552,36 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
 			put_child(left, j, rtnl_dereference(inode->child[j]));
 			put_child(right, j, rtnl_dereference(inode->child[j + size]));
 		}
-		put_child(tn, 2*i, resize(t, left));
-		put_child(tn, 2*i+1, resize(t, right));
+
+		put_child(tn, 2 * i, left);
+		put_child(tn, 2 * i + 1, right);
 
 		tnode_free_safe(inode);
+
+		resize(t, left);
+		resize(t, right);
 	}
+
+	put_child_root(tp, t, tn->key, tn);
 	tnode_free_safe(oldtnode);
-	return tn;
+	return 0;
 nomem:
 	tnode_clean_free(tn);
-	return ERR_PTR(-ENOMEM);
+	return -ENOMEM;
 }
 
-static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
+static int halve(struct trie *t, struct tnode *oldtnode)
 {
 	unsigned long olen = tnode_child_length(oldtnode);
+	struct tnode *tp = node_parent(oldtnode);
 	struct tnode *tn, *left, *right;
 	int i;
 
 	pr_debug("In halve\n");
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
-
 	if (!tn)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	/*
 	 * Preallocate and store tnodes before the actual work so we
@@ -606,8 +600,10 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
 
 			newn = tnode_new(left->key, oldtnode->pos, 1);
 
-			if (!newn)
-				goto nomem;
+			if (!newn) {
+				tnode_clean_free(tn);
+				return -ENOMEM;
+			}
 
 			put_child(tn, i/2, newn);
 		}
@@ -635,16 +631,18 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
 
 		/* Two nonempty children */
 		newBinNode = tnode_get_child(tn, i/2);
-		put_child(tn, i/2, NULL);
 		put_child(newBinNode, 0, left);
 		put_child(newBinNode, 1, right);
-		put_child(tn, i/2, resize(t, newBinNode));
+
+		put_child(tn, i / 2, newBinNode);
+
+		resize(t, newBinNode);
 	}
+
+	put_child_root(tp, t, tn->key, tn);
 	tnode_free_safe(oldtnode);
-	return tn;
-nomem:
-	tnode_clean_free(tn);
-	return ERR_PTR(-ENOMEM);
+
+	return 0;
 }
 
 /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
@@ -704,45 +702,48 @@ nomem:
  *    tnode_child_length(tn)
  *
  */
-static bool should_inflate(const struct tnode *tn)
+static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
 {
 	unsigned long used = tnode_child_length(tn);
 	unsigned long threshold = used;
 
 	/* Keep root node larger */
-	threshold *= node_parent(tn) ? inflate_threshold :
-				       inflate_threshold_root;
+	threshold *= tp ? inflate_threshold : inflate_threshold_root;
 	used += tn->full_children;
 	used -= tn->empty_children;
 
 	return tn->pos && ((50 * used) >= threshold);
 }
 
-static bool should_halve(const struct tnode *tn)
+static bool should_halve(const struct tnode *tp, const struct tnode *tn)
 {
 	unsigned long used = tnode_child_length(tn);
 	unsigned long threshold = used;
 
 	/* Keep root node larger */
-	threshold *= node_parent(tn) ? halve_threshold :
-				       halve_threshold_root;
+	threshold *= tp ? halve_threshold : halve_threshold_root;
 	used -= tn->empty_children;
 
 	return (tn->bits > 1) && ((100 * used) < threshold);
 }
 
 #define MAX_WORK 10
-static struct tnode *resize(struct trie *t, struct tnode *tn)
+static void resize(struct trie *t, struct tnode *tn)
 {
-	struct tnode *old_tn, *n = NULL;
+	struct tnode *tp = node_parent(tn), *n = NULL;
+	struct tnode __rcu **cptr;
 	int max_work;
 
-	if (!tn)
-		return NULL;
-
 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 		 tn, inflate_threshold, halve_threshold);
 
+	/* track the tnode via the pointer from the parent instead of
+	 * doing it ourselves.  This way we can let RCU fully do its
+	 * thing without us interfering
+	 */
+	cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
+	BUG_ON(tn != rtnl_dereference(*cptr));
+
 	/* No children */
 	if (tn->empty_children > (tnode_child_length(tn) - 1))
 		goto no_children;
@@ -755,39 +756,35 @@ static struct tnode *resize(struct trie *t, struct tnode *tn)
 	 * nonempty nodes that are above the threshold.
 	 */
 	max_work = MAX_WORK;
-	while (should_inflate(tn) && max_work--) {
-		old_tn = tn;
-		tn = inflate(t, tn);
-
-		if (IS_ERR(tn)) {
-			tn = old_tn;
+	while (should_inflate(tp, tn) && max_work--) {
+		if (inflate(t, tn)) {
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(t->stats->resize_node_skipped);
 #endif
 			break;
 		}
+
+		tn = rtnl_dereference(*cptr);
 	}
 
 	/* Return if at least one inflate is run */
 	if (max_work != MAX_WORK)
-		return tn;
+		return;
 
 	/* Halve as long as the number of empty children in this
 	 * node is above threshold.
 	 */
 	max_work = MAX_WORK;
-	while (should_halve(tn) && max_work--) {
-		old_tn = tn;
-		tn = halve(t, tn);
-		if (IS_ERR(tn)) {
-			tn = old_tn;
+	while (should_halve(tp, tn) && max_work--) {
+		if (halve(t, tn)) {
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(t->stats->resize_node_skipped);
 #endif
 			break;
 		}
-	}
 
+		tn = rtnl_dereference(*cptr);
+	}
 
 	/* Only one child remains */
 	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
@@ -797,11 +794,12 @@ one_child:
 			n = tnode_get_child(tn, --i);
 no_children:
 		/* compress one level */
-		node_set_parent(n, NULL);
+		put_child_root(tp, t, tn->key, n);
+		node_set_parent(n, tp);
+
+		/* drop dead node */
 		tnode_free_safe(tn);
-		return n;
 	}
-	return tn;
 }
 
 /* readside must use rcu_read_lock currently dump routines
@@ -882,34 +880,19 @@ static struct tnode *fib_find_node(struct trie *t, u32 key)
 
 static void trie_rebalance(struct trie *t, struct tnode *tn)
 {
-	int wasfull;
-	t_key cindex, key;
 	struct tnode *tp;
 
-	key = tn->key;
-
-	while (tn != NULL && (tp = node_parent(tn)) != NULL) {
-		cindex = get_index(key, tp);
-		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
-		tn = resize(t, tn);
-
-		tnode_put_child_reorg(tp, cindex, tn, wasfull);
-
-		tp = node_parent(tn);
-		if (!tp)
-			rcu_assign_pointer(t->trie, tn);
+	while ((tp = node_parent(tn)) != NULL) {
+		resize(t, tn);
 
 		tnode_free_flush();
-		if (!tp)
-			break;
 		tn = tp;
 	}
 
 	/* Handle last (top) tnode */
 	if (IS_TNODE(tn))
-		tn = resize(t, tn);
+		resize(t, tn);
 
-	rcu_assign_pointer(t->trie, tn);
 	tnode_free_flush();
 }
 
-- 
cgit v1.2.3


From fc86a93b46d707c6e245b15127409340eaf6ae47 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:49 -0800
Subject: fib_trie: Push tnode flushing down to inflate/halve

This change pushes the tnode freeing down into the inflate and halve
functions.  It makes more sense here as we have a better grasp of what is
going on and when a given cluster of nodes is ready to be freed.

I believe this may address a bug in the freeing logic as well.  For some
reason if the freelist got to a certain size we would call
synchronize_rcu().  I'm assuming that what they meant to do is call
synchronize_rcu() after they had handed off that much memory via
call_rcu().  As such that is what I have updated the behavior to be.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 103 +++++++++++++++++++++++++---------------------------
 1 file changed, 50 insertions(+), 53 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 7fbd2c5a76b1..485983ef0444 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -147,8 +147,6 @@ struct trie {
 };
 
 static void resize(struct trie *t, struct tnode *tn);
-/* tnodes to free after resize(); protected by RTNL */
-static struct callback_head *tnode_free_head;
 static size_t tnode_free_size;
 
 /*
@@ -307,32 +305,6 @@ static struct tnode *tnode_alloc(size_t size)
 		return vzalloc(size);
 }
 
-static void tnode_free_safe(struct tnode *tn)
-{
-	BUG_ON(IS_LEAF(tn));
-	tn->rcu.next = tnode_free_head;
-	tnode_free_head = &tn->rcu;
-}
-
-static void tnode_free_flush(void)
-{
-	struct callback_head *head;
-
-	while ((head = tnode_free_head)) {
-		struct tnode *tn = container_of(head, struct tnode, rcu);
-
-		tnode_free_head = head->next;
-		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
-
-		node_free(tn);
-	}
-
-	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
-		tnode_free_size = 0;
-		synchronize_rcu();
-	}
-}
-
 static struct tnode *leaf_new(t_key key)
 {
 	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
@@ -433,17 +405,33 @@ static void put_child_root(struct tnode *tp, struct trie *t,
 		rcu_assign_pointer(t->trie, n);
 }
 
-static void tnode_clean_free(struct tnode *tn)
+static inline void tnode_free_init(struct tnode *tn)
 {
-	struct tnode *tofree;
-	unsigned long i;
+	tn->rcu.next = NULL;
+}
+
+static inline void tnode_free_append(struct tnode *tn, struct tnode *n)
+{
+	n->rcu.next = tn->rcu.next;
+	tn->rcu.next = &n->rcu;
+}
 
-	for (i = 0; i < tnode_child_length(tn); i++) {
-		tofree = tnode_get_child(tn, i);
-		if (tofree)
-			node_free(tofree);
+static void tnode_free(struct tnode *tn)
+{
+	struct callback_head *head = &tn->rcu;
+
+	while (head) {
+		head = head->next;
+		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
+		node_free(tn);
+
+		tn = container_of(head, struct tnode, rcu);
+	}
+
+	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
+		tnode_free_size = 0;
+		synchronize_rcu();
 	}
-	node_free(tn);
 }
 
 static int inflate(struct trie *t, struct tnode *oldtnode)
@@ -476,20 +464,23 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 					 inode->bits - 1);
 			if (!left)
 				goto nomem;
+			tnode_free_append(tn, left);
 
 			right = tnode_new(inode->key | m, inode->pos,
 					  inode->bits - 1);
 
-			if (!right) {
-				node_free(left);
+			if (!right)
 				goto nomem;
-			}
+			tnode_free_append(tn, right);
 
 			put_child(tn, 2*i, left);
 			put_child(tn, 2*i+1, right);
 		}
 	}
 
+	/* prepare oldtnode to be freed */
+	tnode_free_init(oldtnode);
+
 	for (i = 0; i < olen; i++) {
 		struct tnode *inode = tnode_get_child(oldtnode, i);
 		struct tnode *left, *right;
@@ -505,12 +496,13 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 			continue;
 		}
 
+		/* drop the node in the old tnode free list */
+		tnode_free_append(oldtnode, inode);
+
 		/* An internal node with two children */
 		if (inode->bits == 1) {
 			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
 			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
-
-			tnode_free_safe(inode);
 			continue;
 		}
 
@@ -556,17 +548,19 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 		put_child(tn, 2 * i, left);
 		put_child(tn, 2 * i + 1, right);
 
-		tnode_free_safe(inode);
-
+		/* resize child nodes */
 		resize(t, left);
 		resize(t, right);
 	}
 
 	put_child_root(tp, t, tn->key, tn);
-	tnode_free_safe(oldtnode);
+
+	/* we completed without error, prepare to free old node */
+	tnode_free(oldtnode);
 	return 0;
 nomem:
-	tnode_clean_free(tn);
+	/* all pointers should be clean so we are done */
+	tnode_free(tn);
 	return -ENOMEM;
 }
 
@@ -599,17 +593,20 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 			struct tnode *newn;
 
 			newn = tnode_new(left->key, oldtnode->pos, 1);
-
 			if (!newn) {
-				tnode_clean_free(tn);
+				tnode_free(tn);
 				return -ENOMEM;
 			}
+			tnode_free_append(tn, newn);
 
 			put_child(tn, i/2, newn);
 		}
 
 	}
 
+	/* prepare oldtnode to be freed */
+	tnode_free_init(oldtnode);
+
 	for (i = 0; i < olen; i += 2) {
 		struct tnode *newBinNode;
 
@@ -636,11 +633,14 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 
 		put_child(tn, i / 2, newBinNode);
 
+		/* resize child node */
 		resize(t, newBinNode);
 	}
 
 	put_child_root(tp, t, tn->key, tn);
-	tnode_free_safe(oldtnode);
+
+	/* all pointers should be clean so we are done */
+	tnode_free(oldtnode);
 
 	return 0;
 }
@@ -798,7 +798,8 @@ no_children:
 		node_set_parent(n, tp);
 
 		/* drop dead node */
-		tnode_free_safe(tn);
+		tnode_free_init(tn);
+		tnode_free(tn);
 	}
 }
 
@@ -884,16 +885,12 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 
 	while ((tp = node_parent(tn)) != NULL) {
 		resize(t, tn);
-
-		tnode_free_flush();
 		tn = tp;
 	}
 
 	/* Handle last (top) tnode */
 	if (IS_TNODE(tn))
 		resize(t, tn);
-
-	tnode_free_flush();
 }
 
 /* only used from updater-side */
-- 
cgit v1.2.3


From 12c081a5c82ef64a90906247cecfa5422962e387 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:55 -0800
Subject: fib_trie: inflate/halve nodes in a more RCU friendly way

This change pulls the node_set_parent functionality out of put_child_reorg
and instead leaves that to the function to take care of as well.  By doing
this we can fully construct the new cluster of tnodes and all of the
pointers out of it before we start routing pointers into it.

I am suspecting this will likely fix some concurency issues though I don't
have a good test to show as such.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 236 +++++++++++++++++++++++++---------------------------
 1 file changed, 115 insertions(+), 121 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 485983ef0444..0c88df09d25d 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -391,8 +391,6 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
 	else if (!wasfull && isfull)
 		tn->full_children++;
 
-	node_set_parent(n, tn);
-
 	rcu_assign_pointer(tn->child[i], n);
 }
 
@@ -436,10 +434,8 @@ static void tnode_free(struct tnode *tn)
 
 static int inflate(struct trie *t, struct tnode *oldtnode)
 {
-	unsigned long olen = tnode_child_length(oldtnode);
-	struct tnode *tp = node_parent(oldtnode);
-	struct tnode *tn;
-	unsigned long i;
+	struct tnode *inode, *node0, *node1, *tn, *tp;
+	unsigned long i, j, k;
 	t_key m;
 
 	pr_debug("In inflate\n");
@@ -448,43 +444,13 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 	if (!tn)
 		return -ENOMEM;
 
-	/*
-	 * Preallocate and store tnodes before the actual work so we
-	 * don't get into an inconsistent state if memory allocation
-	 * fails. In case of failure we return the oldnode and  inflate
-	 * of tnode is ignored.
+	/* Assemble all of the pointers in our cluster, in this case that
+	 * represents all of the pointers out of our allocated nodes that
+	 * point to existing tnodes and the links between our allocated
+	 * nodes.
 	 */
-	for (i = 0, m = 1u << tn->pos; i < olen; i++) {
-		struct tnode *inode = tnode_get_child(oldtnode, i);
-
-		if (tnode_full(oldtnode, inode) && (inode->bits > 1)) {
-			struct tnode *left, *right;
-
-			left = tnode_new(inode->key & ~m, inode->pos,
-					 inode->bits - 1);
-			if (!left)
-				goto nomem;
-			tnode_free_append(tn, left);
-
-			right = tnode_new(inode->key | m, inode->pos,
-					  inode->bits - 1);
-
-			if (!right)
-				goto nomem;
-			tnode_free_append(tn, right);
-
-			put_child(tn, 2*i, left);
-			put_child(tn, 2*i+1, right);
-		}
-	}
-
-	/* prepare oldtnode to be freed */
-	tnode_free_init(oldtnode);
-
-	for (i = 0; i < olen; i++) {
-		struct tnode *inode = tnode_get_child(oldtnode, i);
-		struct tnode *left, *right;
-		unsigned long size, j;
+	for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
+		inode = tnode_get_child(oldtnode, --i);
 
 		/* An empty child */
 		if (inode == NULL)
@@ -496,65 +462,99 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 			continue;
 		}
 
-		/* drop the node in the old tnode free list */
-		tnode_free_append(oldtnode, inode);
-
 		/* An internal node with two children */
 		if (inode->bits == 1) {
-			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
-			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
+			put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
+			put_child(tn, 2 * i, tnode_get_child(inode, 0));
 			continue;
 		}
 
-		/* An internal node with more than two children */
-
 		/* We will replace this node 'inode' with two new
-		 * ones, 'left' and 'right', each with half of the
+		 * ones, 'node0' and 'node1', each with half of the
 		 * original children. The two new nodes will have
 		 * a position one bit further down the key and this
 		 * means that the "significant" part of their keys
 		 * (see the discussion near the top of this file)
 		 * will differ by one bit, which will be "0" in
-		 * left's key and "1" in right's key. Since we are
+		 * node0's key and "1" in node1's key. Since we are
 		 * moving the key position by one step, the bit that
 		 * we are moving away from - the bit at position
-		 * (inode->pos) - is the one that will differ between
-		 * left and right. So... we synthesize that bit in the
-		 * two  new keys.
-		 * The mask 'm' below will be a single "one" bit at
-		 * the position (inode->pos)
+		 * (tn->pos) - is the one that will differ between
+		 * node0 and node1. So... we synthesize that bit in the
+		 * two new keys.
 		 */
+		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
+		if (!node1)
+			goto nomem;
+		tnode_free_append(tn, node1);
+
+		node0 = tnode_new(inode->key & ~m, inode->pos, inode->bits - 1);
+		if (!node0)
+			goto nomem;
+		tnode_free_append(tn, node0);
+
+		/* populate child pointers in new nodes */
+		for (k = tnode_child_length(inode), j = k / 2; j;) {
+			put_child(node1, --j, tnode_get_child(inode, --k));
+			put_child(node0, j, tnode_get_child(inode, j));
+			put_child(node1, --j, tnode_get_child(inode, --k));
+			put_child(node0, j, tnode_get_child(inode, j));
+		}
 
-		/* Use the old key, but set the new significant
-		 *   bit to zero.
-		 */
+		/* link new nodes to parent */
+		NODE_INIT_PARENT(node1, tn);
+		NODE_INIT_PARENT(node0, tn);
+
+		/* link parent to nodes */
+		put_child(tn, 2 * i + 1, node1);
+		put_child(tn, 2 * i, node0);
+	}
 
-		left = tnode_get_child(tn, 2*i);
-		put_child(tn, 2*i, NULL);
+	/* setup the parent pointer into and out of this node */
+	tp = node_parent(oldtnode);
+	NODE_INIT_PARENT(tn, tp);
+	put_child_root(tp, t, tn->key, tn);
+
+	/* prepare oldtnode to be freed */
+	tnode_free_init(oldtnode);
 
-		BUG_ON(!left);
+	/* update all child nodes parent pointers to route to us */
+	for (i = tnode_child_length(oldtnode); i;) {
+		inode = tnode_get_child(oldtnode, --i);
+
+		/* A leaf or an internal node with skipped bits */
+		if (!tnode_full(oldtnode, inode)) {
+			node_set_parent(inode, tn);
+			continue;
+		}
 
-		right = tnode_get_child(tn, 2*i+1);
-		put_child(tn, 2*i+1, NULL);
+		/* drop the node in the old tnode free list */
+		tnode_free_append(oldtnode, inode);
 
-		BUG_ON(!right);
+		/* fetch new nodes */
+		node1 = tnode_get_child(tn, 2 * i + 1);
+		node0 = tnode_get_child(tn, 2 * i);
 
-		size = tnode_child_length(left);
-		for (j = 0; j < size; j++) {
-			put_child(left, j, rtnl_dereference(inode->child[j]));
-			put_child(right, j, rtnl_dereference(inode->child[j + size]));
+		/* bits == 1 then node0 and node1 represent inode's children */
+		if (inode->bits == 1) {
+			node_set_parent(node1, tn);
+			node_set_parent(node0, tn);
+			continue;
 		}
 
-		put_child(tn, 2 * i, left);
-		put_child(tn, 2 * i + 1, right);
+		/* update parent pointers in child node's children */
+		for (k = tnode_child_length(inode), j = k / 2; j;) {
+			node_set_parent(tnode_get_child(inode, --k), node1);
+			node_set_parent(tnode_get_child(inode, --j), node0);
+			node_set_parent(tnode_get_child(inode, --k), node1);
+			node_set_parent(tnode_get_child(inode, --j), node0);
+		}
 
 		/* resize child nodes */
-		resize(t, left);
-		resize(t, right);
+		resize(t, node1);
+		resize(t, node0);
 	}
 
-	put_child_root(tp, t, tn->key, tn);
-
 	/* we completed without error, prepare to free old node */
 	tnode_free(oldtnode);
 	return 0;
@@ -566,10 +566,8 @@ nomem:
 
 static int halve(struct trie *t, struct tnode *oldtnode)
 {
-	unsigned long olen = tnode_child_length(oldtnode);
-	struct tnode *tp = node_parent(oldtnode);
-	struct tnode *tn, *left, *right;
-	int i;
+	struct tnode *tn, *tp, *inode, *node0, *node1;
+	unsigned long i;
 
 	pr_debug("In halve\n");
 
@@ -577,68 +575,64 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 	if (!tn)
 		return -ENOMEM;
 
-	/*
-	 * Preallocate and store tnodes before the actual work so we
-	 * don't get into an inconsistent state if memory allocation
-	 * fails. In case of failure we return the oldnode and halve
-	 * of tnode is ignored.
+	/* Assemble all of the pointers in our cluster, in this case that
+	 * represents all of the pointers out of our allocated nodes that
+	 * point to existing tnodes and the links between our allocated
+	 * nodes.
 	 */
+	for (i = tnode_child_length(oldtnode); i;) {
+		node1 = tnode_get_child(oldtnode, --i);
+		node0 = tnode_get_child(oldtnode, --i);
 
-	for (i = 0; i < olen; i += 2) {
-		left = tnode_get_child(oldtnode, i);
-		right = tnode_get_child(oldtnode, i+1);
+		/* At least one of the children is empty */
+		if (!node1 || !node0) {
+			put_child(tn, i / 2, node1 ? : node0);
+			continue;
+		}
 
 		/* Two nonempty children */
-		if (left && right) {
-			struct tnode *newn;
-
-			newn = tnode_new(left->key, oldtnode->pos, 1);
-			if (!newn) {
-				tnode_free(tn);
-				return -ENOMEM;
-			}
-			tnode_free_append(tn, newn);
-
-			put_child(tn, i/2, newn);
+		inode = tnode_new(node0->key, oldtnode->pos, 1);
+		if (!inode) {
+			tnode_free(tn);
+			return -ENOMEM;
 		}
+		tnode_free_append(tn, inode);
 
+		/* initialize pointers out of node */
+		put_child(inode, 1, node1);
+		put_child(inode, 0, node0);
+		NODE_INIT_PARENT(inode, tn);
+
+		/* link parent to node */
+		put_child(tn, i / 2, inode);
 	}
 
+	/* setup the parent pointer out of and back into this node */
+	tp = node_parent(oldtnode);
+	NODE_INIT_PARENT(tn, tp);
+	put_child_root(tp, t, tn->key, tn);
+
 	/* prepare oldtnode to be freed */
 	tnode_free_init(oldtnode);
 
-	for (i = 0; i < olen; i += 2) {
-		struct tnode *newBinNode;
-
-		left = tnode_get_child(oldtnode, i);
-		right = tnode_get_child(oldtnode, i+1);
-
-		/* At least one of the children is empty */
-		if (left == NULL) {
-			if (right == NULL)    /* Both are empty */
-				continue;
-			put_child(tn, i/2, right);
-			continue;
-		}
+	/* update all of the child parent pointers */
+	for (i = tnode_child_length(tn); i;) {
+		inode = tnode_get_child(tn, --i);
 
-		if (right == NULL) {
-			put_child(tn, i/2, left);
+		/* only new tnodes will be considered "full" nodes */
+		if (!tnode_full(tn, inode)) {
+			node_set_parent(inode, tn);
 			continue;
 		}
 
 		/* Two nonempty children */
-		newBinNode = tnode_get_child(tn, i/2);
-		put_child(newBinNode, 0, left);
-		put_child(newBinNode, 1, right);
-
-		put_child(tn, i / 2, newBinNode);
+		node_set_parent(tnode_get_child(inode, 1), inode);
+		node_set_parent(tnode_get_child(inode, 0), inode);
 
 		/* resize child node */
-		resize(t, newBinNode);
+		resize(t, inode);
 	}
 
-	put_child_root(tp, t, tn->key, tn);
-
 	/* all pointers should be clean so we are done */
 	tnode_free(oldtnode);
 
-- 
cgit v1.2.3


From 21d1f11db0e2f20a549ad8322879507850544670 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:57:02 -0800
Subject: fib_trie: Remove checks for index >= tnode_child_length from
 tnode_get_child

For some reason the compiler doesn't seem to understand that when we are in
a loop that runs from tnode_child_length - 1 to 0 we don't expect the value
of tn->bits to change.  As such every call to tnode_get_child was rerunning
tnode_chile_length which ended up consuming quite a bit of space in the
resultant assembly code.

I have gone though and verified that in all cases where tnode_get_child
is used we are either winding though a fixed loop from tnode_child_length -
1 to 0, or are in a fastpath case where we are verifying the value by
either checking for any remaining bits after shifting index by bits and
testing for leaf, or by using tnode_child_length.

size net/ipv4/fib_trie.o
Before:
   text	   data	    bss	    dec	    hex	filename
  15506	    376	      8	  15890	   3e12	net/ipv4/fib_trie.o

After:
   text	   data	    bss	    dec	    hex	filename
  14827	    376	      8	  15211	   3b6b	net/ipv4/fib_trie.o

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0c88df09d25d..87fc9a466fa2 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -186,8 +186,6 @@ static inline unsigned long tnode_child_length(const struct tnode *tn)
 static inline struct tnode *tnode_get_child(const struct tnode *tn,
 					    unsigned long i)
 {
-	BUG_ON(i >= tnode_child_length(tn));
-
 	return rtnl_dereference(tn->child[i]);
 }
 
@@ -195,8 +193,6 @@ static inline struct tnode *tnode_get_child(const struct tnode *tn,
 static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
 						unsigned long i)
 {
-	BUG_ON(i >= tnode_child_length(tn));
-
 	return rcu_dereference_rtnl(tn->child[i]);
 }
 
@@ -371,7 +367,7 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
  */
 static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
 {
-	struct tnode *chi = rtnl_dereference(tn->child[i]);
+	struct tnode *chi = tnode_get_child(tn, i);
 	int isfull, wasfull;
 
 	BUG_ON(i >= tnode_child_length(tn));
@@ -867,7 +863,7 @@ static struct tnode *fib_find_node(struct trie *t, u32 key)
 		if (IS_LEAF(n))
 			break;
 
-		n = rcu_dereference_rtnl(n->child[index]);
+		n = tnode_get_child_rcu(n, index);
 	}
 
 	return n;
@@ -934,7 +930,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 		}
 
 		tp = n;
-		n = rcu_dereference_rtnl(n->child[index]);
+		n = tnode_get_child_rcu(n, index);
 	}
 
 	l = leaf_new(key);
@@ -1215,7 +1211,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 			cindex = index;
 		}
 
-		n = rcu_dereference(n->child[index]);
+		n = tnode_get_child_rcu(n, index);
 		if (unlikely(!n))
 			goto backtrace;
 	}
@@ -1835,7 +1831,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
 			if (n->bits < MAX_STAT_DEPTH)
 				s->nodesizes[n->bits]++;
 
-			for (i = 0; i < tnode_child_length(n); i++) {
+			for (i = tnode_child_length(n); i--;) {
 				if (!rcu_access_pointer(n->child[i]))
 					s->nullpointers++;
 			}
-- 
cgit v1.2.3


From 5405afd1a30620de8601436bae739c67c0bc7324 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:57:08 -0800
Subject: fib_trie: Add tracking value for suffix length

This change adds a tracking value for the maximum suffix length of all
prefixes stored in any given tnode.  With this value we can determine if we
need to backtrace or not based on if the suffix is greater than the pos
value.

By doing this we can reduce the CPU overhead for lookups in the local table
as many of the prefixes there are 32b long and have a suffix length of 0
meaning we can immediately backtrace to the root node without needing to
test any of the nodes between it and where we ended up.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 116 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 87fc9a466fa2..281e5e00025f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -96,6 +96,7 @@ struct tnode {
 	t_key key;
 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
+	unsigned char slen;
 	struct tnode __rcu *parent;
 	struct rcu_head rcu;
 	union {
@@ -311,6 +312,7 @@ static struct tnode *leaf_new(t_key key)
 		 * as the nodes are searched
 		 */
 		l->key = key;
+		l->slen = 0;
 		l->pos = 0;
 		/* set bits to 0 indicating we are not a tnode */
 		l->bits = 0;
@@ -342,6 +344,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
 
 	if (tn) {
 		tn->parent = NULL;
+		tn->slen = pos;
 		tn->pos = pos;
 		tn->bits = bits;
 		tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
@@ -387,6 +390,9 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
 	else if (!wasfull && isfull)
 		tn->full_children++;
 
+	if (n && (tn->slen < n->slen))
+		tn->slen = n->slen;
+
 	rcu_assign_pointer(tn->child[i], n);
 }
 
@@ -635,6 +641,41 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 	return 0;
 }
 
+static unsigned char update_suffix(struct tnode *tn)
+{
+	unsigned char slen = tn->pos;
+	unsigned long stride, i;
+
+	/* search though the list of children looking for nodes that might
+	 * have a suffix greater than the one we currently have.  This is
+	 * why we start with a stride of 2 since a stride of 1 would
+	 * represent the nodes with suffix length equal to tn->pos
+	 */
+	for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) {
+		struct tnode *n = tnode_get_child(tn, i);
+
+		if (!n || (n->slen <= slen))
+			continue;
+
+		/* update stride and slen based on new value */
+		stride <<= (n->slen - slen);
+		slen = n->slen;
+		i &= ~(stride - 1);
+
+		/* if slen covers all but the last bit we can stop here
+		 * there will be nothing longer than that since only node
+		 * 0 and 1 << (bits - 1) could have that as their suffix
+		 * length.
+		 */
+		if ((slen + 1) >= (tn->pos + tn->bits))
+			break;
+	}
+
+	tn->slen = slen;
+
+	return slen;
+}
+
 /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
  * the Helsinki University of Technology and Matti Tikkanen of Nokia
  * Telecommunications, page 6:
@@ -790,6 +831,19 @@ no_children:
 		/* drop dead node */
 		tnode_free_init(tn);
 		tnode_free(tn);
+		return;
+	}
+
+	/* Return if at least one deflate was run */
+	if (max_work != MAX_WORK)
+		return;
+
+	/* push the suffix length to the parent node */
+	if (tn->slen > tn->pos) {
+		unsigned char slen = update_suffix(tn);
+
+		if (tp && (slen > tp->slen))
+			tp->slen = slen;
 	}
 }
 
@@ -818,8 +872,58 @@ static inline struct list_head *get_fa_head(struct tnode *l, int plen)
 	return &li->falh;
 }
 
-static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
+static void leaf_pull_suffix(struct tnode *l)
+{
+	struct tnode *tp = node_parent(l);
+
+	while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) {
+		if (update_suffix(tp) > l->slen)
+			break;
+		tp = node_parent(tp);
+	}
+}
+
+static void leaf_push_suffix(struct tnode *l)
 {
+	struct tnode *tn = node_parent(l);
+
+	/* if this is a new leaf then tn will be NULL and we can sort
+	 * out parent suffix lengths as a part of trie_rebalance
+	 */
+	while (tn && (tn->slen < l->slen)) {
+		tn->slen = l->slen;
+		tn = node_parent(tn);
+	}
+}
+
+static void remove_leaf_info(struct tnode *l, struct leaf_info *old)
+{
+	struct hlist_node *prev;
+
+	/* record the location of the pointer to this object */
+	prev = rtnl_dereference(hlist_pprev_rcu(&old->hlist));
+
+	/* remove the leaf info from the list */
+	hlist_del_rcu(&old->hlist);
+
+	/* if we emptied the list this leaf will be freed and we can sort
+	 * out parent suffix lengths as a part of trie_rebalance
+	 */
+	if (hlist_empty(&l->list))
+		return;
+
+	/* if we removed the tail then we need to update slen */
+	if (!rcu_access_pointer(hlist_next_rcu(prev))) {
+		struct leaf_info *li = hlist_entry(prev, typeof(*li), hlist);
+
+		l->slen = KEYLENGTH - li->plen;
+		leaf_pull_suffix(l);
+	}
+}
+
+static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
+{
+	struct hlist_head *head = &l->list;
 	struct leaf_info *li = NULL, *last = NULL;
 
 	if (hlist_empty(head)) {
@@ -836,6 +940,12 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 		else
 			hlist_add_before_rcu(&new->hlist, &li->hlist);
 	}
+
+	/* if we added to the tail node then we need to update slen */
+	if (!rcu_access_pointer(hlist_next_rcu(&new->hlist))) {
+		l->slen = KEYLENGTH - new->plen;
+		leaf_push_suffix(l);
+	}
 }
 
 /* rcu_read_lock needs to be hold by caller from readside */
@@ -925,7 +1035,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 		/* we have found a leaf. Prefixes have already been compared */
 		if (IS_LEAF(n)) {
 			/* Case 1: n is a leaf, and prefixes match*/
-			insert_leaf_info(&n->list, li);
+			insert_leaf_info(n, li);
 			return fa_head;
 		}
 
@@ -939,7 +1049,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 		return NULL;
 	}
 
-	insert_leaf_info(&l->list, li);
+	insert_leaf_info(l, li);
 
 	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
 	 *
@@ -1206,7 +1316,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		/* only record pn and cindex if we are going to be chopping
 		 * bits later.  Otherwise we are just wasting cycles.
 		 */
-		if (index) {
+		if (n->slen > n->pos) {
 			pn = n;
 			cindex = index;
 		}
@@ -1225,7 +1335,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		 * between the key and the prefix exist in the region of
 		 * the lsb and higher in the prefix.
 		 */
-		if (unlikely(prefix_mismatch(key, n)))
+		if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
 			goto backtrace;
 
 		/* exit out and process leaf */
@@ -1425,7 +1535,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 		tb->tb_num_default--;
 
 	if (list_empty(fa_head)) {
-		hlist_del_rcu(&li->hlist);
+		remove_leaf_info(l, li);
 		free_leaf_info(li);
 	}
 
-- 
cgit v1.2.3


From a4c9ea5e8fec680134d22aa99b54d1cd8c226ebd Mon Sep 17 00:00:00 2001
From: Joe Stringer <joestringer@nicira.com>
Date: Tue, 30 Dec 2014 19:10:16 -0800
Subject: geneve: Add Geneve GRO support

This results in an approximately 30% increase in throughput
when handling encapsulated bulk traffic.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/geneve.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 95 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 394a200f93c1..19e256e1dd92 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -149,6 +149,99 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 }
 EXPORT_SYMBOL_GPL(geneve_xmit_skb);
 
+static int geneve_hlen(struct genevehdr *gh)
+{
+	return sizeof(*gh) + gh->opt_len * 4;
+}
+
+static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
+					   struct sk_buff *skb)
+{
+	struct sk_buff *p, **pp = NULL;
+	struct genevehdr *gh, *gh2;
+	unsigned int hlen, gh_len, off_gnv;
+	const struct packet_offload *ptype;
+	__be16 type;
+	int flush = 1;
+
+	off_gnv = skb_gro_offset(skb);
+	hlen = off_gnv + sizeof(*gh);
+	gh = skb_gro_header_fast(skb, off_gnv);
+	if (skb_gro_header_hard(skb, hlen)) {
+		gh = skb_gro_header_slow(skb, hlen, off_gnv);
+		if (unlikely(!gh))
+			goto out;
+	}
+
+	if (gh->ver != GENEVE_VER || gh->oam)
+		goto out;
+	gh_len = geneve_hlen(gh);
+
+	hlen = off_gnv + gh_len;
+	if (skb_gro_header_hard(skb, hlen)) {
+		gh = skb_gro_header_slow(skb, hlen, off_gnv);
+		if (unlikely(!gh))
+			goto out;
+	}
+
+	flush = 0;
+
+	for (p = *head; p; p = p->next) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		gh2 = (struct genevehdr *)(p->data + off_gnv);
+		if (gh->opt_len != gh2->opt_len ||
+		    memcmp(gh, gh2, gh_len)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+	}
+
+	type = gh->proto_type;
+
+	rcu_read_lock();
+	ptype = gro_find_receive_by_type(type);
+	if (ptype == NULL) {
+		flush = 1;
+		goto out_unlock;
+	}
+
+	skb_gro_pull(skb, gh_len);
+	skb_gro_postpull_rcsum(skb, gh, gh_len);
+	pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+	rcu_read_unlock();
+out:
+	NAPI_GRO_CB(skb)->flush |= flush;
+
+	return pp;
+}
+
+static int geneve_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct genevehdr *gh;
+	struct packet_offload *ptype;
+	__be16 type;
+	int gh_len;
+	int err = -ENOSYS;
+
+	udp_tunnel_gro_complete(skb, nhoff);
+
+	gh = (struct genevehdr *)(skb->data + nhoff);
+	gh_len = geneve_hlen(gh);
+	type = gh->proto_type;
+
+	rcu_read_lock();
+	ptype = gro_find_complete_by_type(type);
+	if (ptype != NULL)
+		err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
+
+	rcu_read_unlock();
+	return err;
+}
+
 static void geneve_notify_add_rx_port(struct geneve_sock *gs)
 {
 	struct sock *sk = gs->sock->sk;
@@ -278,8 +371,8 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 
 	/* Initialize the geneve udp offloads structure */
 	gs->udp_offloads.port = port;
-	gs->udp_offloads.callbacks.gro_receive = NULL;
-	gs->udp_offloads.callbacks.gro_complete = NULL;
+	gs->udp_offloads.callbacks.gro_receive  = geneve_gro_receive;
+	gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete;
 
 	spin_lock(&gn->sock_lock);
 	hlist_add_head_rcu(&gs->hlist, gs_head(net, port));
-- 
cgit v1.2.3


From 61f3cade763dca46127146a52d829e30b8f48921 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Fri, 2 Jan 2015 18:26:02 -0800
Subject: geneve: Remove workqueue.

The work queue is used only to free the UDP socket upon destruction.
This is not necessary with Geneve and generally makes the code more
difficult to reason about. It also introduces nondeterministic
behavior such as when a socket is rapidly deleted and recreated, which
could fail as the the deletion happens asynchronously.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/geneve.h |  1 -
 net/ipv4/geneve.c    | 21 ++-------------------
 2 files changed, 2 insertions(+), 20 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/geneve.h b/include/net/geneve.h
index 112132cf8e2e..56c7e1ac216a 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -71,7 +71,6 @@ struct geneve_sock {
 	struct hlist_node	hlist;
 	geneve_rcv_t		*rcv;
 	void			*rcv_data;
-	struct work_struct	del_work;
 	struct socket		*sock;
 	struct rcu_head		rcu;
 	atomic_t		refcnt;
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 19e256e1dd92..136a829e8746 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -61,8 +61,6 @@ struct geneve_net {
 
 static int geneve_net_id;
 
-static struct workqueue_struct *geneve_wq;
-
 static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
 {
 	return (struct genevehdr *)(udp_hdr(skb) + 1);
@@ -307,15 +305,6 @@ error:
 	return 1;
 }
 
-static void geneve_del_work(struct work_struct *work)
-{
-	struct geneve_sock *gs = container_of(work, struct geneve_sock,
-					      del_work);
-
-	udp_tunnel_sock_release(gs->sock);
-	kfree_rcu(gs, rcu);
-}
-
 static struct socket *geneve_create_sock(struct net *net, bool ipv6,
 					 __be16 port)
 {
@@ -356,8 +345,6 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 	if (!gs)
 		return ERR_PTR(-ENOMEM);
 
-	INIT_WORK(&gs->del_work, geneve_del_work);
-
 	sock = geneve_create_sock(net, ipv6, port);
 	if (IS_ERR(sock)) {
 		kfree(gs);
@@ -430,7 +417,8 @@ void geneve_sock_release(struct geneve_sock *gs)
 	geneve_notify_del_rx_port(gs);
 	spin_unlock(&gn->sock_lock);
 
-	queue_work(geneve_wq, &gs->del_work);
+	udp_tunnel_sock_release(gs->sock);
+	kfree_rcu(gs, rcu);
 }
 EXPORT_SYMBOL_GPL(geneve_sock_release);
 
@@ -458,10 +446,6 @@ static int __init geneve_init_module(void)
 {
 	int rc;
 
-	geneve_wq = alloc_workqueue("geneve", 0, 0);
-	if (!geneve_wq)
-		return -ENOMEM;
-
 	rc = register_pernet_subsys(&geneve_net_ops);
 	if (rc)
 		return rc;
@@ -474,7 +458,6 @@ late_initcall(geneve_init_module);
 
 static void __exit geneve_cleanup_module(void)
 {
-	destroy_workqueue(geneve_wq);
 	unregister_pernet_subsys(&geneve_net_ops);
 }
 module_exit(geneve_cleanup_module);
-- 
cgit v1.2.3


From 829a3ada9cc7d4c30fa61f8033403fb6c8f8092a Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Fri, 2 Jan 2015 18:26:03 -0800
Subject: geneve: Simplify locking.

The existing Geneve locking scheme was pulled over directly from
VXLAN. However, VXLAN has a number of built in mechanisms which make
the locking more complex and are unlikely to be necessary with Geneve.
This simplifies the locking to use a basic scheme of a mutex
when doing updates plus RCU on receive.

In addition to making the code easier to read, this also avoids the
possibility of a race when creating or destroying sockets since
UDP sockets and the list of Geneve sockets are protected by different
locks. After this change, the entire operation is atomic.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/geneve.h |  2 +-
 net/ipv4/geneve.c    | 59 +++++++++++++++++++++++-----------------------------
 2 files changed, 27 insertions(+), 34 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/geneve.h b/include/net/geneve.h
index 56c7e1ac216a..b40f4affc4cb 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -73,7 +73,7 @@ struct geneve_sock {
 	void			*rcv_data;
 	struct socket		*sock;
 	struct rcu_head		rcu;
-	atomic_t		refcnt;
+	int			refcnt;
 	struct udp_offload	udp_offloads;
 };
 
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 136a829e8746..ad8dbae11d01 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -17,7 +17,7 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/skbuff.h>
-#include <linux/rculist.h>
+#include <linux/list.h>
 #include <linux/netdevice.h>
 #include <linux/in.h>
 #include <linux/ip.h>
@@ -28,6 +28,7 @@
 #include <linux/if_vlan.h>
 #include <linux/hash.h>
 #include <linux/ethtool.h>
+#include <linux/mutex.h>
 #include <net/arp.h>
 #include <net/ndisc.h>
 #include <net/ip.h>
@@ -50,13 +51,15 @@
 #include <net/ip6_checksum.h>
 #endif
 
+/* Protects sock_list and refcounts. */
+static DEFINE_MUTEX(geneve_mutex);
+
 #define PORT_HASH_BITS 8
 #define PORT_HASH_SIZE (1<<PORT_HASH_BITS)
 
 /* per-network namespace private data for this module */
 struct geneve_net {
 	struct hlist_head	sock_list[PORT_HASH_SIZE];
-	spinlock_t		sock_lock;   /* Protects sock_list */
 };
 
 static int geneve_net_id;
@@ -78,7 +81,7 @@ static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port)
 {
 	struct geneve_sock *gs;
 
-	hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) {
+	hlist_for_each_entry(gs, gs_head(net, port), hlist) {
 		if (inet_sk(gs->sock->sk)->inet_sport == port)
 			return gs;
 	}
@@ -336,7 +339,6 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 						geneve_rcv_t *rcv, void *data,
 						bool ipv6)
 {
-	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_sock *gs;
 	struct socket *sock;
 	struct udp_tunnel_sock_cfg tunnel_cfg;
@@ -352,7 +354,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 	}
 
 	gs->sock = sock;
-	atomic_set(&gs->refcnt, 1);
+	gs->refcnt = 1;
 	gs->rcv = rcv;
 	gs->rcv_data = data;
 
@@ -360,11 +362,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 	gs->udp_offloads.port = port;
 	gs->udp_offloads.callbacks.gro_receive  = geneve_gro_receive;
 	gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete;
-
-	spin_lock(&gn->sock_lock);
-	hlist_add_head_rcu(&gs->hlist, gs_head(net, port));
 	geneve_notify_add_rx_port(gs);
-	spin_unlock(&gn->sock_lock);
 
 	/* Mark socket as an encapsulation socket */
 	tunnel_cfg.sk_user_data = gs;
@@ -373,6 +371,8 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 	tunnel_cfg.encap_destroy = NULL;
 	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
 
+	hlist_add_head(&gs->hlist, gs_head(net, port));
+
 	return gs;
 }
 
@@ -380,25 +380,21 @@ struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
 				    geneve_rcv_t *rcv, void *data,
 				    bool no_share, bool ipv6)
 {
-	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_sock *gs;
 
-	gs = geneve_socket_create(net, port, rcv, data, ipv6);
-	if (!IS_ERR(gs))
-		return gs;
-
-	if (no_share)	/* Return error if sharing is not allowed. */
-		return ERR_PTR(-EINVAL);
+	mutex_lock(&geneve_mutex);
 
-	spin_lock(&gn->sock_lock);
 	gs = geneve_find_sock(net, port);
-	if (gs && ((gs->rcv != rcv) ||
-		   !atomic_add_unless(&gs->refcnt, 1, 0)))
+	if (gs) {
+		if (!no_share && gs->rcv == rcv)
+			gs->refcnt++;
+		else
 			gs = ERR_PTR(-EBUSY);
-	spin_unlock(&gn->sock_lock);
+	} else {
+		gs = geneve_socket_create(net, port, rcv, data, ipv6);
+	}
 
-	if (!gs)
-		gs = ERR_PTR(-EINVAL);
+	mutex_unlock(&geneve_mutex);
 
 	return gs;
 }
@@ -406,19 +402,18 @@ EXPORT_SYMBOL_GPL(geneve_sock_add);
 
 void geneve_sock_release(struct geneve_sock *gs)
 {
-	struct net *net = sock_net(gs->sock->sk);
-	struct geneve_net *gn = net_generic(net, geneve_net_id);
+	mutex_lock(&geneve_mutex);
 
-	if (!atomic_dec_and_test(&gs->refcnt))
-		return;
+	if (--gs->refcnt)
+		goto unlock;
 
-	spin_lock(&gn->sock_lock);
-	hlist_del_rcu(&gs->hlist);
+	hlist_del(&gs->hlist);
 	geneve_notify_del_rx_port(gs);
-	spin_unlock(&gn->sock_lock);
-
 	udp_tunnel_sock_release(gs->sock);
 	kfree_rcu(gs, rcu);
+
+unlock:
+	mutex_unlock(&geneve_mutex);
 }
 EXPORT_SYMBOL_GPL(geneve_sock_release);
 
@@ -427,8 +422,6 @@ static __net_init int geneve_init_net(struct net *net)
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	unsigned int h;
 
-	spin_lock_init(&gn->sock_lock);
-
 	for (h = 0; h < PORT_HASH_SIZE; ++h)
 		INIT_HLIST_HEAD(&gn->sock_list[h]);
 
@@ -454,7 +447,7 @@ static int __init geneve_init_module(void)
 
 	return 0;
 }
-late_initcall(geneve_init_module);
+module_init(geneve_init_module);
 
 static void __exit geneve_cleanup_module(void)
 {
-- 
cgit v1.2.3


From df5dba8e52be50e615e03ef73b34611d82587f42 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Fri, 2 Jan 2015 18:26:04 -0800
Subject: geneve: Remove socket hash table.

The hash table for open Geneve ports is used only on creation and
deletion time. It is not performance critical and is not likely to
grow to a large number of items. Therefore, this can be changed
to use a simple linked list.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/geneve.h |  2 +-
 net/ipv4/geneve.c    | 26 +++++++-------------------
 2 files changed, 8 insertions(+), 20 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/geneve.h b/include/net/geneve.h
index b40f4affc4cb..03aa2adb5bab 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -68,7 +68,7 @@ struct geneve_sock;
 typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb);
 
 struct geneve_sock {
-	struct hlist_node	hlist;
+	struct list_head	list;
 	geneve_rcv_t		*rcv;
 	void			*rcv_data;
 	struct socket		*sock;
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index ad8dbae11d01..4fe5a592821c 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -26,7 +26,6 @@
 #include <linux/etherdevice.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
-#include <linux/hash.h>
 #include <linux/ethtool.h>
 #include <linux/mutex.h>
 #include <net/arp.h>
@@ -54,12 +53,9 @@
 /* Protects sock_list and refcounts. */
 static DEFINE_MUTEX(geneve_mutex);
 
-#define PORT_HASH_BITS 8
-#define PORT_HASH_SIZE (1<<PORT_HASH_BITS)
-
 /* per-network namespace private data for this module */
 struct geneve_net {
-	struct hlist_head	sock_list[PORT_HASH_SIZE];
+	struct list_head	sock_list;
 };
 
 static int geneve_net_id;
@@ -69,19 +65,13 @@ static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
 	return (struct genevehdr *)(udp_hdr(skb) + 1);
 }
 
-static struct hlist_head *gs_head(struct net *net, __be16 port)
-{
-	struct geneve_net *gn = net_generic(net, geneve_net_id);
-
-	return &gn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
-}
-
 /* Find geneve socket based on network namespace and UDP port */
 static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port)
 {
+	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_sock *gs;
 
-	hlist_for_each_entry(gs, gs_head(net, port), hlist) {
+	list_for_each_entry(gs, &gn->sock_list, list) {
 		if (inet_sk(gs->sock->sk)->inet_sport == port)
 			return gs;
 	}
@@ -339,6 +329,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 						geneve_rcv_t *rcv, void *data,
 						bool ipv6)
 {
+	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_sock *gs;
 	struct socket *sock;
 	struct udp_tunnel_sock_cfg tunnel_cfg;
@@ -371,7 +362,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 	tunnel_cfg.encap_destroy = NULL;
 	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
 
-	hlist_add_head(&gs->hlist, gs_head(net, port));
+	list_add(&gs->list, &gn->sock_list);
 
 	return gs;
 }
@@ -407,7 +398,7 @@ void geneve_sock_release(struct geneve_sock *gs)
 	if (--gs->refcnt)
 		goto unlock;
 
-	hlist_del(&gs->hlist);
+	list_del(&gs->list);
 	geneve_notify_del_rx_port(gs);
 	udp_tunnel_sock_release(gs->sock);
 	kfree_rcu(gs, rcu);
@@ -420,17 +411,14 @@ EXPORT_SYMBOL_GPL(geneve_sock_release);
 static __net_init int geneve_init_net(struct net *net)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
-	unsigned int h;
 
-	for (h = 0; h < PORT_HASH_SIZE; ++h)
-		INIT_HLIST_HEAD(&gn->sock_list[h]);
+	INIT_LIST_HEAD(&gn->sock_list);
 
 	return 0;
 }
 
 static struct pernet_operations geneve_net_ops = {
 	.init = geneve_init_net,
-	.exit = NULL,
 	.id   = &geneve_net_id,
 	.size = sizeof(struct geneve_net),
 };
-- 
cgit v1.2.3


From 46b1e4f9115d48d17bb92f612e4fa45a521a0537 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Fri, 2 Jan 2015 18:26:05 -0800
Subject: geneve: Check family when reusing sockets.

When searching for an existing socket to reuse, the address family
is not taken into account - only port number. This means that an
IPv4 socket could be used for IPv6 traffic and vice versa, which
is sure to cause problems when passing packets.

It is not possible to trigger this problem currently because the
only user of Geneve creates just IPv4 sockets. However, that is
likely to change in the near future.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/geneve.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 4fe5a592821c..5b52046ec7a2 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -65,14 +65,15 @@ static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
 	return (struct genevehdr *)(udp_hdr(skb) + 1);
 }
 
-/* Find geneve socket based on network namespace and UDP port */
-static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port)
+static struct geneve_sock *geneve_find_sock(struct net *net,
+					    sa_family_t family, __be16 port)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_sock *gs;
 
 	list_for_each_entry(gs, &gn->sock_list, list) {
-		if (inet_sk(gs->sock->sk)->inet_sport == port)
+		if (inet_sk(gs->sock->sk)->inet_sport == port &&
+		    inet_sk(gs->sock->sk)->sk.sk_family == family)
 			return gs;
 	}
 
@@ -375,7 +376,7 @@ struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
 
 	mutex_lock(&geneve_mutex);
 
-	gs = geneve_find_sock(net, port);
+	gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port);
 	if (gs) {
 		if (!no_share && gs->rcv == rcv)
 			gs->refcnt++;
-- 
cgit v1.2.3


From 224d019c4fbba242041e9b25a926ba873b7da1e2 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Mon, 5 Jan 2015 13:56:14 -0800
Subject: ip: Move checksum convert defines to inet

Move convert_csum from udp_sock to inet_sock. This allows the
possibility that we can use convert checksum for different types
of sockets and also allows convert checksum to be enabled from
inet layer (what we'll want to do when enabling IP_CHECKSUM cmsg).

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h     | 16 +---------------
 include/net/inet_sock.h | 17 +++++++++++++++++
 net/ipv4/fou.c          |  2 +-
 net/ipv4/udp.c          |  2 +-
 net/ipv4/udp_tunnel.c   |  2 +-
 net/ipv6/udp.c          |  2 +-
 6 files changed, 22 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index ee3277593222..247cfdcc4b08 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -49,11 +49,7 @@ struct udp_sock {
 	unsigned int	 corkflag;	/* Cork is required */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 	unsigned char	 no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
-			 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
-			 convert_csum:1;/* On receive, convert checksum
-					 * unnecessary to checksum complete
-					 * if possible.
-					 */
+			 no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
@@ -102,16 +98,6 @@ static inline bool udp_get_no_check6_rx(struct sock *sk)
 	return udp_sk(sk)->no_check6_rx;
 }
 
-static inline void udp_set_convert_csum(struct sock *sk, bool val)
-{
-	udp_sk(sk)->convert_csum = val;
-}
-
-static inline bool udp_get_convert_csum(struct sock *sk)
-{
-	return udp_sk(sk)->convert_csum;
-}
-
 #define udp_portaddr_for_each_entry(__sk, node, list) \
 	hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node)
 
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index a829b77523cf..360b110b3e36 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -184,6 +184,7 @@ struct inet_sock {
 				mc_all:1,
 				nodefrag:1;
 	__u8			rcv_tos;
+	__u8			convert_csum;
 	int			uc_index;
 	int			mc_index;
 	__be32			mc_addr;
@@ -250,4 +251,20 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
 	return flags;
 }
 
+static inline void inet_inc_convert_csum(struct sock *sk)
+{
+	inet_sk(sk)->convert_csum++;
+}
+
+static inline void inet_dec_convert_csum(struct sock *sk)
+{
+	if (inet_sk(sk)->convert_csum > 0)
+		inet_sk(sk)->convert_csum--;
+}
+
+static inline bool inet_get_convert_csum(struct sock *sk)
+{
+	return !!inet_sk(sk)->convert_csum;
+}
+
 #endif	/* _INET_SOCK_H */
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index b986298a7ba3..2197c36f722f 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -490,7 +490,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
 	sk->sk_user_data = fou;
 	fou->sock = sock;
 
-	udp_set_convert_csum(sk, true);
+	inet_inc_convert_csum(sk);
 
 	sk->sk_allocation = GFP_ATOMIC;
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 13b4dcf86ef6..53358d88f110 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1806,7 +1806,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	if (sk != NULL) {
 		int ret;
 
-		if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk))
+		if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
 			skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
 						 inet_compute_pseudo);
 
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 1671263e5fa0..9996e63ed304 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -63,7 +63,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 	inet_sk(sk)->mc_loop = 0;
 
 	/* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */
-	udp_set_convert_csum(sk, true);
+	inet_inc_convert_csum(sk);
 
 	rcu_assign_sk_user_data(sk, cfg->sk_user_data);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 189dc4ae3eca..e41f017cd479 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -909,7 +909,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 			goto csum_error;
 		}
 
-		if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk))
+		if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
 			skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
 						 ip6_compute_pseudo);
 
-- 
cgit v1.2.3


From c44d13d6f341ca59f3d6646f2337d4d3c8a814a6 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Mon, 5 Jan 2015 13:56:15 -0800
Subject: ip: IP cmsg cleanup

Move the IP_CMSG_* constants from ip_sockglue.c to inet_sock.h so that
they can be referenced in other source files.

Restructure ip_cmsg_recv to not go through flags using shift, check
for flags by 'and'. This eliminates both the shift and a conditional
per flag check.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h | 11 ++++++++-
 net/ipv4/ip_sockglue.c  | 64 ++++++++++++++++++++++++++++---------------------
 2 files changed, 47 insertions(+), 28 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 360b110b3e36..605ca421d5ab 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -16,7 +16,7 @@
 #ifndef _INET_SOCK_H
 #define _INET_SOCK_H
 
-
+#include <linux/bitops.h>
 #include <linux/kmemcheck.h>
 #include <linux/string.h>
 #include <linux/types.h>
@@ -195,6 +195,15 @@ struct inet_sock {
 #define IPCORK_OPT	1	/* ip-options has been held in ipcork.opt */
 #define IPCORK_ALLFRAG	2	/* always fragment (for ipv6 for now) */
 
+/* cmsg flags for inet */
+#define IP_CMSG_PKTINFO		BIT(0)
+#define IP_CMSG_TTL		BIT(1)
+#define IP_CMSG_TOS		BIT(2)
+#define IP_CMSG_RECVOPTS	BIT(3)
+#define IP_CMSG_RETOPTS		BIT(4)
+#define IP_CMSG_PASSSEC		BIT(5)
+#define IP_CMSG_ORIGDSTADDR	BIT(6)
+
 static inline struct inet_sock *inet_sk(const struct sock *sk)
 {
 	return (struct inet_sock *)sk;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 8a89c738b7a3..80f78565b41b 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -45,14 +45,6 @@
 #include <linux/errqueue.h>
 #include <asm/uaccess.h>
 
-#define IP_CMSG_PKTINFO		1
-#define IP_CMSG_TTL		2
-#define IP_CMSG_TOS		4
-#define IP_CMSG_RECVOPTS	8
-#define IP_CMSG_RETOPTS		16
-#define IP_CMSG_PASSSEC		32
-#define IP_CMSG_ORIGDSTADDR     64
-
 /*
  *	SOL_IP control messages.
  */
@@ -150,37 +142,55 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 	unsigned int flags = inet->cmsg_flags;
 
 	/* Ordered by supposed usage frequency */
-	if (flags & 1)
+	if (flags & IP_CMSG_PKTINFO) {
 		ip_cmsg_recv_pktinfo(msg, skb);
-	if ((flags >>= 1) == 0)
-		return;
 
-	if (flags & 1)
+		flags &= ~IP_CMSG_PKTINFO;
+		if (!flags)
+			return;
+	}
+
+	if (flags & IP_CMSG_TTL) {
 		ip_cmsg_recv_ttl(msg, skb);
-	if ((flags >>= 1) == 0)
-		return;
 
-	if (flags & 1)
+		flags &= ~IP_CMSG_TTL;
+		if (!flags)
+			return;
+	}
+
+	if (flags & IP_CMSG_TOS) {
 		ip_cmsg_recv_tos(msg, skb);
-	if ((flags >>= 1) == 0)
-		return;
 
-	if (flags & 1)
+		flags &= ~IP_CMSG_TOS;
+		if (!flags)
+			return;
+	}
+
+	if (flags & IP_CMSG_RECVOPTS) {
 		ip_cmsg_recv_opts(msg, skb);
-	if ((flags >>= 1) == 0)
-		return;
 
-	if (flags & 1)
+		flags &= ~IP_CMSG_RECVOPTS;
+		if (!flags)
+			return;
+	}
+
+	if (flags & IP_CMSG_RETOPTS) {
 		ip_cmsg_recv_retopts(msg, skb);
-	if ((flags >>= 1) == 0)
-		return;
 
-	if (flags & 1)
+		flags &= ~IP_CMSG_RETOPTS;
+		if (!flags)
+			return;
+	}
+
+	if (flags & IP_CMSG_PASSSEC) {
 		ip_cmsg_recv_security(msg, skb);
 
-	if ((flags >>= 1) == 0)
-		return;
-	if (flags & 1)
+		flags &= ~IP_CMSG_PASSSEC;
+		if (!flags)
+			return;
+	}
+
+	if (flags & IP_CMSG_ORIGDSTADDR)
 		ip_cmsg_recv_dstaddr(msg, skb);
 
 }
-- 
cgit v1.2.3


From 5961de9f199bef6ff437d7d85fe69b6a1964739b Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Mon, 5 Jan 2015 13:56:16 -0800
Subject: ip: Add offset parameter to ip_cmsg_recv

Add ip_cmsg_recv_offset function which takes an offset argument
that indicates the starting offset in skb where data is being received
from. This will be useful in the case of UDP and provided checksum
to user space.

ip_cmsg_recv is an inline call to ip_cmsg_recv_offset with offset of
zero.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h       | 7 ++++++-
 net/ipv4/ip_sockglue.c | 5 +++--
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip.h b/include/net/ip.h
index 0bb620702929..0e5a0bae187f 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -537,7 +537,7 @@ int ip_options_rcv_srr(struct sk_buff *skb);
  */
 
 void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb);
-void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb);
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, int offset);
 int ip_cmsg_send(struct net *net, struct msghdr *msg,
 		 struct ipcm_cookie *ipc, bool allow_ipv6);
 int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
@@ -557,6 +557,11 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
 void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
 		    u32 info);
 
+static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
+{
+	ip_cmsg_recv_offset(msg, skb, 0);
+}
+
 bool icmp_global_allow(void);
 extern int sysctl_icmp_msgs_per_sec;
 extern int sysctl_icmp_msgs_burst;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 80f78565b41b..513d506ffebb 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -136,7 +136,8 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
 	put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
 }
 
-void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
+			 int offset)
 {
 	struct inet_sock *inet = inet_sk(skb->sk);
 	unsigned int flags = inet->cmsg_flags;
@@ -194,7 +195,7 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 		ip_cmsg_recv_dstaddr(msg, skb);
 
 }
-EXPORT_SYMBOL(ip_cmsg_recv);
+EXPORT_SYMBOL(ip_cmsg_recv_offset);
 
 int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
 		 bool allow_ipv6)
-- 
cgit v1.2.3


From ad6f939ab193750cc94a265f58e007fb598c97b7 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Mon, 5 Jan 2015 13:56:17 -0800
Subject: ip: Add offset parameter to ip_cmsg_recv

Add ip_cmsg_recv_offset function which takes an offset argument
that indicates the starting offset in skb where data is being received
from. This will be useful in the case of UDP and provided checksum
to user space.

ip_cmsg_recv is an inline call to ip_cmsg_recv_offset with offset of
zero.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h |  1 +
 include/uapi/linux/in.h |  1 +
 net/ipv4/ip_sockglue.c  | 41 ++++++++++++++++++++++++++++++++++++++++-
 net/ipv4/udp.c          |  2 +-
 4 files changed, 43 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 605ca421d5ab..eb16c7beed1e 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -203,6 +203,7 @@ struct inet_sock {
 #define IP_CMSG_RETOPTS		BIT(4)
 #define IP_CMSG_PASSSEC		BIT(5)
 #define IP_CMSG_ORIGDSTADDR	BIT(6)
+#define IP_CMSG_CHECKSUM	BIT(7)
 
 static inline struct inet_sock *inet_sk(const struct sock *sk)
 {
diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index c33a65e3d62c..589ced069e8a 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -109,6 +109,7 @@ struct in_addr {
 
 #define IP_MINTTL       21
 #define IP_NODEFRAG     22
+#define IP_CHECKSUM	23
 
 /* IP_MTU_DISCOVER values */
 #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 513d506ffebb..a317797b3cd0 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -37,6 +37,7 @@
 #include <net/route.h>
 #include <net/xfrm.h>
 #include <net/compat.h>
+#include <net/checksum.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/transp_v6.h>
 #endif
@@ -96,6 +97,20 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
 	put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
 }
 
+static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
+				  int offset)
+{
+	__wsum csum = skb->csum;
+
+	if (skb->ip_summed != CHECKSUM_COMPLETE)
+		return;
+
+	if (offset != 0)
+		csum = csum_sub(csum, csum_partial(skb->data, offset, 0));
+
+	put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
+}
+
 static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
 {
 	char *secdata;
@@ -191,9 +206,16 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
 			return;
 	}
 
-	if (flags & IP_CMSG_ORIGDSTADDR)
+	if (flags & IP_CMSG_ORIGDSTADDR) {
 		ip_cmsg_recv_dstaddr(msg, skb);
 
+		flags &= ~IP_CMSG_ORIGDSTADDR;
+		if (!flags)
+			return;
+	}
+
+	if (flags & IP_CMSG_CHECKSUM)
+		ip_cmsg_recv_checksum(msg, skb, offset);
 }
 EXPORT_SYMBOL(ip_cmsg_recv_offset);
 
@@ -533,6 +555,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	case IP_MULTICAST_ALL:
 	case IP_MULTICAST_LOOP:
 	case IP_RECVORIGDSTADDR:
+	case IP_CHECKSUM:
 		if (optlen >= sizeof(int)) {
 			if (get_user(val, (int __user *) optval))
 				return -EFAULT;
@@ -630,6 +653,19 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		else
 			inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
 		break;
+	case IP_CHECKSUM:
+		if (val) {
+			if (!(inet->cmsg_flags & IP_CMSG_CHECKSUM)) {
+				inet_inc_convert_csum(sk);
+				inet->cmsg_flags |= IP_CMSG_CHECKSUM;
+			}
+		} else {
+			if (inet->cmsg_flags & IP_CMSG_CHECKSUM) {
+				inet_dec_convert_csum(sk);
+				inet->cmsg_flags &= ~IP_CMSG_CHECKSUM;
+			}
+		}
+		break;
 	case IP_TOS:	/* This sets both TOS and Precedence */
 		if (sk->sk_type == SOCK_STREAM) {
 			val &= ~INET_ECN_MASK;
@@ -1233,6 +1269,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_RECVORIGDSTADDR:
 		val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
 		break;
+	case IP_CHECKSUM:
+		val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
+		break;
 	case IP_TOS:
 		val = inet->tos;
 		break;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 53358d88f110..97ef1f8b7be8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1329,7 +1329,7 @@ try_again:
 		*addr_len = sizeof(*sin);
 	}
 	if (inet->cmsg_flags)
-		ip_cmsg_recv(msg, skb);
+		ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr));
 
 	err = copied;
 	if (flags & MSG_TRUNC)
-- 
cgit v1.2.3


From 29ba4fffd396f4beefe34d24d7bcd86cb5c3e492 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 5 Jan 2015 23:57:45 +0100
Subject: net: tcp: refactor reinitialization of congestion control

We can just move this to an extra function and make the code
a bit more readable, no functional change.

Joint work with Florian Westphal.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_cong.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 27ead0dd16bc..38f2f8aa4ceb 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -107,6 +107,18 @@ void tcp_init_congestion_control(struct sock *sk)
 		icsk->icsk_ca_ops->init(sk);
 }
 
+static void tcp_reinit_congestion_control(struct sock *sk,
+					  const struct tcp_congestion_ops *ca)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	tcp_cleanup_congestion_control(sk);
+	icsk->icsk_ca_ops = ca;
+
+	if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
+		icsk->icsk_ca_ops->init(sk);
+}
+
 /* Manage refcounts on socket close. */
 void tcp_cleanup_congestion_control(struct sock *sk)
 {
@@ -262,21 +274,13 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
 #endif
 	if (!ca)
 		err = -ENOENT;
-
 	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
 		   ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
 		err = -EPERM;
-
 	else if (!try_module_get(ca->owner))
 		err = -EBUSY;
-
-	else {
-		tcp_cleanup_congestion_control(sk);
-		icsk->icsk_ca_ops = ca;
-
-		if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
-			icsk->icsk_ca_ops->init(sk);
-	}
+	else
+		tcp_reinit_congestion_control(sk, ca);
  out:
 	rcu_read_unlock();
 	return err;
-- 
cgit v1.2.3


From c5c6a8ab45ec0f18733afb4aaade0d4a139d80b3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 5 Jan 2015 23:57:46 +0100
Subject: net: tcp: add key management to congestion control

This patch adds necessary infrastructure to the congestion control
framework for later per route congestion control support.

For a per route congestion control possibility, our aim is to store
a unique u32 key identifier into dst metrics, which can then be
mapped into a tcp_congestion_ops struct. We argue that having a
RTAX key entry is the most simple, generic and easy way to manage,
and also keeps the memory footprint of dst entries lower on 64 bit
than with storing a pointer directly, for example. Having a unique
key id also allows for decoupling actual TCP congestion control
module management from the FIB layer, i.e. we don't have to care
about expensive module refcounting inside the FIB at this point.

We first thought of using an IDR store for the realization, which
takes over dynamic assignment of unused key space and also performs
the key to pointer mapping in RCU. While doing so, we stumbled upon
the issue that due to the nature of dynamic key distribution, it
just so happens, arguably in very rare occasions, that excessive
module loads and unloads can lead to a possible reuse of previously
used key space. Thus, previously stale keys in the dst metric are
now being reassigned to a different congestion control algorithm,
which might lead to unexpected behaviour. One way to resolve this
would have been to walk FIBs on the actually rare occasion of a
module unload and reset the metric keys for each FIB in each netns,
but that's just very costly.

Therefore, we argue a better solution is to reuse the unique
congestion control algorithm name member and map that into u32 key
space through jhash. For that, we split the flags attribute (as it
currently uses 2 bits only anyway) into two u32 attributes, flags
and key, so that we can keep the cacheline boundary of 2 cachelines
on x86_64 and cache the precalculated key at registration time for
the fast path. On average we might expect 2 - 4 modules being loaded
worst case perhaps 15, so a key collision possibility is extremely
low, and guaranteed collision-free on LE/BE for all in-tree modules.
Overall this results in much simpler code, and all without the
overhead of an IDR. Due to the deterministic nature, modules can
now be unloaded, the congestion control algorithm for a specific
but unloaded key will fall back to the default one, and on module
reload time it will switch back to the expected algorithm
transparently.

Joint work with Florian Westphal.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h |  3 +-
 include/net/tcp.h                  |  9 +++-
 net/ipv4/tcp_cong.c                | 97 +++++++++++++++++++++++++++++++-------
 3 files changed, 91 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 848e85cb5c61..5976bdecf58b 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -98,7 +98,8 @@ struct inet_connection_sock {
 	const struct tcp_congestion_ops *icsk_ca_ops;
 	const struct inet_connection_sock_af_ops *icsk_af_ops;
 	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
-	__u8			  icsk_ca_state;
+	__u8			  icsk_ca_state:7,
+				  icsk_ca_dst_locked:1;
 	__u8			  icsk_retransmits;
 	__u8			  icsk_pending;
 	__u8			  icsk_backoff;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f50f29faf76f..135b70c9a734 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -787,6 +787,8 @@ enum tcp_ca_ack_event_flags {
 #define TCP_CA_MAX	128
 #define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)
 
+#define TCP_CA_UNSPEC	0
+
 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
 #define TCP_CONG_NON_RESTRICTED 0x1
 /* Requires ECN/ECT set on all packets */
@@ -794,7 +796,8 @@ enum tcp_ca_ack_event_flags {
 
 struct tcp_congestion_ops {
 	struct list_head	list;
-	unsigned long flags;
+	u32 key;
+	u32 flags;
 
 	/* initialize private data (optional) */
 	void (*init)(struct sock *sk);
@@ -841,6 +844,10 @@ u32 tcp_reno_ssthresh(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
 
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
+u32 tcp_ca_get_key_by_name(const char *name);
+char *tcp_ca_get_name_by_key(u32 key, char *buffer);
+
 static inline bool tcp_ca_needs_ecn(const struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 38f2f8aa4ceb..63c29dba68a8 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -13,6 +13,7 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/gfp.h>
+#include <linux/jhash.h>
 #include <net/tcp.h>
 
 static DEFINE_SPINLOCK(tcp_cong_list_lock);
@@ -31,6 +32,34 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
 	return NULL;
 }
 
+/* Must be called with rcu lock held */
+static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
+{
+	const struct tcp_congestion_ops *ca = tcp_ca_find(name);
+#ifdef CONFIG_MODULES
+	if (!ca && capable(CAP_NET_ADMIN)) {
+		rcu_read_unlock();
+		request_module("tcp_%s", name);
+		rcu_read_lock();
+		ca = tcp_ca_find(name);
+	}
+#endif
+	return ca;
+}
+
+/* Simple linear search, not much in here. */
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
+{
+	struct tcp_congestion_ops *e;
+
+	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+		if (e->key == key)
+			return e;
+	}
+
+	return NULL;
+}
+
 /*
  * Attach new congestion control algorithm to the list
  * of available options.
@@ -45,9 +74,12 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
 		return -EINVAL;
 	}
 
+	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
 	spin_lock(&tcp_cong_list_lock);
-	if (tcp_ca_find(ca->name)) {
-		pr_notice("%s already registered\n", ca->name);
+	if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
+		pr_notice("%s already registered or non-unique key\n",
+			  ca->name);
 		ret = -EEXIST;
 	} else {
 		list_add_tail_rcu(&ca->list, &tcp_cong_list);
@@ -70,9 +102,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
 	spin_lock(&tcp_cong_list_lock);
 	list_del_rcu(&ca->list);
 	spin_unlock(&tcp_cong_list_lock);
+
+	/* Wait for outstanding readers to complete before the
+	 * module gets removed entirely.
+	 *
+	 * A try_module_get() should fail by now as our module is
+	 * in "going" state since no refs are held anymore and
+	 * module_exit() handler being called.
+	 */
+	synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 
+u32 tcp_ca_get_key_by_name(const char *name)
+{
+	const struct tcp_congestion_ops *ca;
+	u32 key;
+
+	might_sleep();
+
+	rcu_read_lock();
+	ca = __tcp_ca_find_autoload(name);
+	key = ca ? ca->key : TCP_CA_UNSPEC;
+	rcu_read_unlock();
+
+	return key;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);
+
+char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+	const struct tcp_congestion_ops *ca;
+	char *ret = NULL;
+
+	rcu_read_lock();
+	ca = tcp_ca_find_key(key);
+	if (ca)
+		ret = strncpy(buffer, ca->name,
+			      TCP_CA_NAME_MAX);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
+
 /* Assign choice of congestion control. */
 void tcp_assign_congestion_control(struct sock *sk)
 {
@@ -253,25 +326,17 @@ out:
 int tcp_set_congestion_control(struct sock *sk, const char *name)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_congestion_ops *ca;
+	const struct tcp_congestion_ops *ca;
 	int err = 0;
 
-	rcu_read_lock();
-	ca = tcp_ca_find(name);
+	if (icsk->icsk_ca_dst_locked)
+		return -EPERM;
 
-	/* no change asking for existing value */
+	rcu_read_lock();
+	ca = __tcp_ca_find_autoload(name);
+	/* No change asking for existing value */
 	if (ca == icsk->icsk_ca_ops)
 		goto out;
-
-#ifdef CONFIG_MODULES
-	/* not found attempt to autoload module */
-	if (!ca && capable(CAP_NET_ADMIN)) {
-		rcu_read_unlock();
-		request_module("tcp_%s", name);
-		rcu_read_lock();
-		ca = tcp_ca_find(name);
-	}
-#endif
 	if (!ca)
 		err = -ENOENT;
 	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
-- 
cgit v1.2.3


From ea697639992d96da98016b8934e68a73876a2264 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 5 Jan 2015 23:57:47 +0100
Subject: net: tcp: add RTAX_CC_ALGO fib handling

This patch adds the minimum necessary for the RTAX_CC_ALGO congestion
control metric to be set up and dumped back to user space.

While the internal representation of RTAX_CC_ALGO is handled as a u32
key, we avoided to expose this implementation detail to user space, thus
instead, we chose the netlink attribute that is being exchanged between
user space to be the actual congestion control algorithm name, similarly
as in the setsockopt(2) API in order to allow for maximum flexibility,
even for 3rd party modules.

It is a bit unfortunate that RTAX_QUICKACK used up a whole RTAX slot as
it should have been stored in RTAX_FEATURES instead, we first thought
about reusing it for the congestion control key, but it brings more
complications and/or confusion than worth it.

Joint work with Florian Westphal.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h              |  7 +++++++
 include/uapi/linux/rtnetlink.h |  2 ++
 net/core/rtnetlink.c           | 15 +++++++++++++--
 net/decnet/dn_fib.c            |  3 ++-
 net/decnet/dn_table.c          |  4 +++-
 net/ipv4/fib_semantics.c       | 14 ++++++++++++--
 net/ipv6/route.c               | 17 +++++++++++++++--
 7 files changed, 54 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 135b70c9a734..95bb237152e0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -846,7 +846,14 @@ extern struct tcp_congestion_ops tcp_reno;
 
 struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
 u32 tcp_ca_get_key_by_name(const char *name);
+#ifdef CONFIG_INET
 char *tcp_ca_get_name_by_key(u32 key, char *buffer);
+#else
+static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+	return NULL;
+}
+#endif
 
 static inline bool tcp_ca_needs_ecn(const struct sock *sk)
 {
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 9c9b8b4480cd..d81f22d5b390 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -389,6 +389,8 @@ enum {
 #define RTAX_INITRWND RTAX_INITRWND
 	RTAX_QUICKACK,
 #define RTAX_QUICKACK RTAX_QUICKACK
+	RTAX_CC_ALGO,
+#define RTAX_CC_ALGO RTAX_CC_ALGO
 	__RTAX_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index da983d4bac02..6a6cdade1676 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -50,6 +50,7 @@
 #include <net/arp.h>
 #include <net/route.h>
 #include <net/udp.h>
+#include <net/tcp.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 #include <net/fib_rules.h>
@@ -669,9 +670,19 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
 
 	for (i = 0; i < RTAX_MAX; i++) {
 		if (metrics[i]) {
+			if (i == RTAX_CC_ALGO - 1) {
+				char tmp[TCP_CA_NAME_MAX], *name;
+
+				name = tcp_ca_get_name_by_key(metrics[i], tmp);
+				if (!name)
+					continue;
+				if (nla_put_string(skb, i + 1, name))
+					goto nla_put_failure;
+			} else {
+				if (nla_put_u32(skb, i + 1, metrics[i]))
+					goto nla_put_failure;
+			}
 			valid++;
-			if (nla_put_u32(skb, i+1, metrics[i]))
-				goto nla_put_failure;
 		}
 	}
 
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index d332aefb0846..df4803437888 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -298,7 +298,8 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *att
 			int type = nla_type(attr);
 
 			if (type) {
-				if (type > RTAX_MAX || nla_len(attr) < 4)
+				if (type > RTAX_MAX || type == RTAX_CC_ALGO ||
+				    nla_len(attr) < 4)
 					goto err_inval;
 
 				fi->fib_metrics[type-1] = nla_get_u32(attr);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 86e3807052e9..3f19fcbf126d 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -29,6 +29,7 @@
 #include <linux/route.h> /* RTF_xxx */
 #include <net/neighbour.h>
 #include <net/netlink.h>
+#include <net/tcp.h>
 #include <net/dst.h>
 #include <net/flow.h>
 #include <net/fib_rules.h>
@@ -273,7 +274,8 @@ static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi)
 	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
 			 + nla_total_size(4) /* RTA_TABLE */
 			 + nla_total_size(2) /* RTA_DST */
-			 + nla_total_size(4); /* RTA_PRIORITY */
+			 + nla_total_size(4) /* RTA_PRIORITY */
+			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
 
 	/* space for nested metrics */
 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f99f41bd15b8..d2b7b5521b1b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -360,7 +360,8 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
 			 + nla_total_size(4) /* RTA_TABLE */
 			 + nla_total_size(4) /* RTA_DST */
 			 + nla_total_size(4) /* RTA_PRIORITY */
-			 + nla_total_size(4); /* RTA_PREFSRC */
+			 + nla_total_size(4) /* RTA_PREFSRC */
+			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
 
 	/* space for nested metrics */
 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
@@ -859,7 +860,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 
 				if (type > RTAX_MAX)
 					goto err_inval;
-				val = nla_get_u32(nla);
+				if (type == RTAX_CC_ALGO) {
+					char tmp[TCP_CA_NAME_MAX];
+
+					nla_strlcpy(tmp, nla, sizeof(tmp));
+					val = tcp_ca_get_key_by_name(tmp);
+					if (val == TCP_CA_UNSPEC)
+						goto err_inval;
+				} else {
+					val = nla_get_u32(nla);
+				}
 				if (type == RTAX_ADVMSS && val > 65535 - 40)
 					val = 65535 - 40;
 				if (type == RTAX_MTU && val > 65535 - 15)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 454771d20b21..34dcbb59df75 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1488,10 +1488,22 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
 		int type = nla_type(nla);
 
 		if (type) {
+			u32 val;
+
 			if (unlikely(type > RTAX_MAX))
 				goto err;
+			if (type == RTAX_CC_ALGO) {
+				char tmp[TCP_CA_NAME_MAX];
+
+				nla_strlcpy(tmp, nla, sizeof(tmp));
+				val = tcp_ca_get_key_by_name(tmp);
+				if (val == TCP_CA_UNSPEC)
+					goto err;
+			} else {
+				val = nla_get_u32(nla);
+			}
 
-			mp[type - 1] = nla_get_u32(nla);
+			mp[type - 1] = val;
 			__set_bit(type - 1, mxc->mx_valid);
 		}
 	}
@@ -2571,7 +2583,8 @@ static inline size_t rt6_nlmsg_size(void)
 	       + nla_total_size(4) /* RTA_OIF */
 	       + nla_total_size(4) /* RTA_PRIORITY */
 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
-	       + nla_total_size(sizeof(struct rta_cacheinfo));
+	       + nla_total_size(sizeof(struct rta_cacheinfo))
+	       + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
 }
 
 static int rt6_fill_node(struct net *net,
-- 
cgit v1.2.3


From 81164413ad096bafe8ad1068f3f095a7dd081d8b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 5 Jan 2015 23:57:48 +0100
Subject: net: tcp: add per route congestion control

This work adds the possibility to define a per route/destination
congestion control algorithm. Generally, this opens up the possibility
for a machine with different links to enforce specific congestion
control algorithms with optimal strategies for each of them based
on their network characteristics, even transparently for a single
application listening on all links.

For our specific use case, this additionally facilitates deployment
of DCTCP, for example, applications can easily serve internal
traffic/dsts in DCTCP and external one with CUBIC. Other scenarios
would also allow for utilizing e.g. long living, low priority
background flows for certain destinations/routes while still being
able for normal traffic to utilize the default congestion control
algorithm. We also thought about a per netns setting (where different
defaults are possible), but given its actually a link specific
property, we argue that a per route/destination setting is the most
natural and flexible.

The administrator can utilize this through ip-route(8) by appending
"congctl [lock] <name>", where <name> denotes the name of a
congestion control algorithm and the optional lock parameter allows
to enforce the given algorithm so that applications in user space
would not be allowed to overwrite that algorithm for that destination.

The dst metric lookups are being done when a dst entry is already
available in order to avoid a costly lookup and still before the
algorithms are being initialized, thus overhead is very low when the
feature is not being used. While the client side would need to drop
the current reference on the module, on server side this can actually
even be avoided as we just got a flat-copied socket clone.

Joint work with Florian Westphal.

Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h        |  6 ++++++
 net/ipv4/tcp_ipv4.c      |  2 ++
 net/ipv4/tcp_minisocks.c | 30 ++++++++++++++++++++++++++----
 net/ipv4/tcp_output.c    | 21 +++++++++++++++++++++
 net/ipv6/tcp_ipv6.c      |  2 ++
 5 files changed, 57 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 95bb237152e0..b8fdc6bab3f3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -448,6 +448,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_create_openreq_child(struct sock *sk,
 				      struct request_sock *req,
 				      struct sk_buff *skb);
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req,
 				  struct dst_entry *dst);
@@ -636,6 +637,11 @@ static inline u32 tcp_rto_min_us(struct sock *sk)
 	return jiffies_to_usecs(tcp_rto_min(sk));
 }
 
+static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
+{
+	return dst_metric_locked(dst, RTAX_CC_ALGO);
+}
+
 /* Compute the actual receive window we are currently advertising.
  * Rcv_nxt can be after the window if our peer push more data
  * than the offered window.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a3f72d7fc06c..ad3e65bdd368 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1340,6 +1340,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	}
 	sk_setup_caps(newsk, dst);
 
+	tcp_ca_openreq_child(newsk, dst);
+
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric_advmss(dst);
 	if (tcp_sk(sk)->rx_opt.user_mss &&
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 63d2680b65db..bc9216dc9de1 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -399,6 +399,32 @@ static void tcp_ecn_openreq_child(struct tcp_sock *tp,
 	tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
 }
 
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+	bool ca_got_dst = false;
+
+	if (ca_key != TCP_CA_UNSPEC) {
+		const struct tcp_congestion_ops *ca;
+
+		rcu_read_lock();
+		ca = tcp_ca_find_key(ca_key);
+		if (likely(ca && try_module_get(ca->owner))) {
+			icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+			icsk->icsk_ca_ops = ca;
+			ca_got_dst = true;
+		}
+		rcu_read_unlock();
+	}
+
+	if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner))
+		tcp_assign_congestion_control(sk);
+
+	tcp_set_ca_state(sk, TCP_CA_Open);
+}
+EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
+
 /* This is not only more efficient than what we used to do, it eliminates
  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
  *
@@ -451,10 +477,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->snd_cwnd = TCP_INIT_CWND;
 		newtp->snd_cwnd_cnt = 0;
 
-		if (!try_module_get(newicsk->icsk_ca_ops->owner))
-			tcp_assign_congestion_control(newsk);
-
-		tcp_set_ca_state(newsk, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
 		__skb_queue_head_init(&newtp->out_of_order_queue);
 		newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7f18262e2326..dc30cb563e4f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2939,6 +2939,25 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 }
 EXPORT_SYMBOL(tcp_make_synack);
 
+static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_congestion_ops *ca;
+	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+
+	if (ca_key == TCP_CA_UNSPEC)
+		return;
+
+	rcu_read_lock();
+	ca = tcp_ca_find_key(ca_key);
+	if (likely(ca && try_module_get(ca->owner))) {
+		module_put(icsk->icsk_ca_ops->owner);
+		icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+		icsk->icsk_ca_ops = ca;
+	}
+	rcu_read_unlock();
+}
+
 /* Do all connect socket setups that can be done AF independent. */
 static void tcp_connect_init(struct sock *sk)
 {
@@ -2964,6 +2983,8 @@ static void tcp_connect_init(struct sock *sk)
 	tcp_mtup_init(sk);
 	tcp_sync_mss(sk, dst_mtu(dst));
 
+	tcp_ca_dst_init(sk, dst);
+
 	if (!tp->window_clamp)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
 	tp->advmss = dst_metric_advmss(dst);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9c0b54e87b47..5d46832c6f72 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1199,6 +1199,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
 						     newnp->opt->opt_flen);
 
+	tcp_ca_openreq_child(newsk, dst);
+
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric_advmss(dst);
 	if (tcp_sk(sk)->rx_opt.user_mss &&
-- 
cgit v1.2.3


From 08abdffa1c16b9603ee9bb33f3226ff186d98fa1 Mon Sep 17 00:00:00 2001
From: Sébastien Barré <sebastien.barre@uclouvain.be>
Date: Mon, 12 Jan 2015 10:30:40 +0100
Subject: tcp: avoid reducing cwnd when ACK+DSACK is received
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With TLP, the peer may reply to a probe with an
ACK+D-SACK, with ack value set to tlp_high_seq. In the current code,
such ACK+DSACK will be missed and only at next, higher ack will the TLP
episode be considered done. Since the DSACK is not present anymore,
this will cost a cwnd reduction.

This patch ensures that this scenario does not cause a cwnd reduction, since
receiving an ACK+DSACK indicates that both the initial segment and the probe
have been received by the peer.

The following packetdrill test, from Neal Cardwell, validates this patch:

// Establish a connection.
0     socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0     setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0    bind(3, ..., ...) = 0
+0    listen(3, 1) = 0

+0    < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0    > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+.020 < . 1:1(0) ack 1 win 257
+0    accept(3, ..., ...) = 4

// Send 1 packet.
+0    write(4, ..., 1000) = 1000
+0    > P. 1:1001(1000) ack 1

// Loss probe retransmission.
// packets_out == 1 => schedule PTO in max(2*RTT, 1.5*RTT + 200ms)
// In this case, this means: 1.5*RTT + 200ms = 230ms
+.230 > P. 1:1001(1000) ack 1
+0    %{ assert tcpi_snd_cwnd == 10 }%

// Receiver ACKs at tlp_high_seq with a DSACK,
// indicating they received the original packet and probe.
+.020 < . 1:1(0) ack 1001 win 257 <sack 1:1001,nop,nop>
+0    %{ assert tcpi_snd_cwnd == 10 }%

// Send another packet.
+0    write(4, ..., 1000) = 1000
+0    > P. 1001:2001(1000) ack 1

// Receiver ACKs above tlp_high_seq, which should end the TLP episode
// if we haven't already. We should not reduce cwnd.
+.020 < . 1:1(0) ack 2001 win 257
+0    %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%

Credits:
-Gregory helped in finding that tcp_process_tlp_ack was where the cwnd
got reduced in our MPTCP tests.
-Neal wrote the packetdrill test above
-Yuchung reworked the patch to make it more readable.

Cc: Gregory Detal <gregory.detal@uclouvain.be>
Cc: Nandita Dukkipati <nanditad@google.com>
Tested-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Sébastien Barré <sebastien.barre@uclouvain.be>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 075ab4d5af5e..71fb37c70581 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3358,34 +3358,34 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
 }
 
 /* This routine deals with acks during a TLP episode.
+ * We mark the end of a TLP episode on receiving TLP dupack or when
+ * ack is after tlp_high_seq.
  * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
  */
 static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
-			     !(flag & (FLAG_SND_UNA_ADVANCED |
-				       FLAG_NOT_DUP | FLAG_DATA_SACKED));
 
-	/* Mark the end of TLP episode on receiving TLP dupack or when
-	 * ack is after tlp_high_seq.
-	 */
-	if (is_tlp_dupack) {
-		tp->tlp_high_seq = 0;
+	if (before(ack, tp->tlp_high_seq))
 		return;
-	}
 
-	if (after(ack, tp->tlp_high_seq)) {
+	if (flag & FLAG_DSACKING_ACK) {
+		/* This DSACK means original and TLP probe arrived; no loss */
+		tp->tlp_high_seq = 0;
+	} else if (after(ack, tp->tlp_high_seq)) {
+		/* ACK advances: there was a loss, so reduce cwnd. Reset
+		 * tlp_high_seq in tcp_init_cwnd_reduction()
+		 */
+		tcp_init_cwnd_reduction(sk);
+		tcp_set_ca_state(sk, TCP_CA_CWR);
+		tcp_end_cwnd_reduction(sk);
+		tcp_try_keep_open(sk);
+		NET_INC_STATS_BH(sock_net(sk),
+				 LINUX_MIB_TCPLOSSPROBERECOVERY);
+	} else if (!(flag & (FLAG_SND_UNA_ADVANCED |
+			     FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
+		/* Pure dupack: original and TLP probe arrived; no loss */
 		tp->tlp_high_seq = 0;
-		/* Don't reduce cwnd if DSACK arrives for TLP retrans. */
-		if (!(flag & FLAG_DSACKING_ACK)) {
-			tcp_init_cwnd_reduction(sk);
-			tcp_set_ca_state(sk, TCP_CA_CWR);
-			tcp_end_cwnd_reduction(sk);
-			tcp_try_keep_open(sk);
-			NET_INC_STATS_BH(sock_net(sk),
-					 LINUX_MIB_TCPLOSSPROBERECOVERY);
-		}
 	}
 }
 
-- 
cgit v1.2.3


From df8a39defad46b83694ea6dd868d332976d62cc0 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 13 Jan 2015 17:13:44 +0100
Subject: net: rename vlan_tx_* helpers since "tx" is misleading there

The same macros are used for rx as well. So rename it.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/filter.txt                  |  4 ++--
 drivers/infiniband/hw/nes/nes_nic.c                  | 13 +++++++------
 drivers/net/ethernet/3com/typhoon.c                  |  4 ++--
 drivers/net/ethernet/alteon/acenic.c                 |  8 ++++----
 drivers/net/ethernet/amd/amd8111e.c                  |  4 ++--
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c             |  8 ++++----
 drivers/net/ethernet/atheros/atl1c/atl1c_main.c      |  4 ++--
 drivers/net/ethernet/atheros/atl1e/atl1e_main.c      |  4 ++--
 drivers/net/ethernet/atheros/atlx/atl1.c             |  4 ++--
 drivers/net/ethernet/atheros/atlx/atl2.c             |  4 ++--
 drivers/net/ethernet/broadcom/bnx2.c                 |  4 ++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c      |  4 ++--
 drivers/net/ethernet/broadcom/tg3.c                  |  4 ++--
 drivers/net/ethernet/brocade/bna/bnad.c              |  4 ++--
 drivers/net/ethernet/chelsio/cxgb/sge.c              |  4 ++--
 drivers/net/ethernet/chelsio/cxgb3/sge.c             |  6 +++---
 drivers/net/ethernet/chelsio/cxgb4/sge.c             |  4 ++--
 drivers/net/ethernet/chelsio/cxgb4vf/sge.c           |  4 ++--
 drivers/net/ethernet/cisco/enic/enic_main.c          |  4 ++--
 drivers/net/ethernet/emulex/benet/be_main.c          | 12 ++++++------
 drivers/net/ethernet/freescale/gianfar.c             |  4 ++--
 drivers/net/ethernet/ibm/ehea/ehea_main.c            |  4 ++--
 drivers/net/ethernet/intel/e1000/e1000_main.c        |  5 +++--
 drivers/net/ethernet/intel/e1000e/netdev.c           |  9 +++++----
 drivers/net/ethernet/intel/fm10k/fm10k_main.c        |  4 ++--
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c      |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c          |  4 ++--
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c        |  4 ++--
 drivers/net/ethernet/intel/igb/igb_main.c            |  4 ++--
 drivers/net/ethernet/intel/igbvf/netdev.c            |  5 +++--
 drivers/net/ethernet/intel/ixgb/ixgb_main.c          |  4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c        |  4 ++--
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c    |  4 ++--
 drivers/net/ethernet/jme.c                           |  4 ++--
 drivers/net/ethernet/marvell/sky2.c                  |  6 +++---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c           | 12 ++++++------
 drivers/net/ethernet/natsemi/ns83820.c               |  4 ++--
 drivers/net/ethernet/neterion/s2io.c                 |  4 ++--
 drivers/net/ethernet/neterion/vxge/vxge-main.c       |  4 ++--
 drivers/net/ethernet/nvidia/forcedeth.c              |  4 ++--
 drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c |  4 ++--
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c       |  8 ++++----
 drivers/net/ethernet/qlogic/qlge/qlge_main.c         |  6 +++---
 drivers/net/ethernet/realtek/8139cp.c                |  4 ++--
 drivers/net/ethernet/realtek/r8169.c                 |  4 ++--
 drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c      |  2 +-
 drivers/net/ethernet/tehuti/tehuti.c                 |  4 ++--
 drivers/net/ethernet/via/via-rhine.c                 |  6 +++---
 drivers/net/ethernet/via/via-velocity.c              |  4 ++--
 drivers/net/macvtap.c                                |  6 +++---
 drivers/net/tun.c                                    |  4 ++--
 drivers/net/usb/r8152.c                              |  4 ++--
 drivers/net/vmxnet3/vmxnet3_drv.c                    |  4 ++--
 drivers/net/vxlan.c                                  |  4 ++--
 drivers/s390/net/qeth_l3_main.c                      |  8 ++++----
 drivers/vhost/net.c                                  |  2 +-
 include/linux/if_vlan.h                              | 16 ++++++++--------
 include/net/pkt_sched.h                              |  2 +-
 include/trace/events/net.h                           |  8 ++++----
 net/8021q/vlan_core.c                                |  2 +-
 net/bridge/br_netfilter.c                            | 12 ++++++------
 net/bridge/br_private.h                              |  4 ++--
 net/bridge/br_vlan.c                                 |  4 ++--
 net/bridge/netfilter/ebt_vlan.c                      |  4 ++--
 net/bridge/netfilter/ebtables.c                      |  2 +-
 net/core/dev.c                                       | 10 +++++-----
 net/core/netpoll.c                                   |  2 +-
 net/core/skbuff.c                                    |  8 ++++----
 net/ipv4/geneve.c                                    |  2 +-
 net/openvswitch/actions.c                            |  4 ++--
 net/openvswitch/datapath.c                           |  2 +-
 net/openvswitch/flow.c                               |  4 ++--
 net/openvswitch/vport-gre.c                          |  2 +-
 net/openvswitch/vport.c                              |  3 ++-
 net/packet/af_packet.c                               | 12 ++++++------
 net/sched/em_meta.c                                  |  2 +-
 net/wireless/util.c                                  |  4 ++--
 77 files changed, 195 insertions(+), 190 deletions(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 58d08f8d8d80..9930ecfbb465 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -279,8 +279,8 @@ Possible BPF extensions are shown in the following table:
   hatype                                skb->dev->type
   rxhash                                skb->hash
   cpu                                   raw_smp_processor_id()
-  vlan_tci                              vlan_tx_tag_get(skb)
-  vlan_pr                               vlan_tx_tag_present(skb)
+  vlan_tci                              skb_vlan_tag_get(skb)
+  vlan_pr                               skb_vlan_tag_present(skb)
   rand                                  prandom_u32()
 
 These extensions can also be prefixed with '#'.
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
index 49eb5111d2cd..70acda91eb2a 100644
--- a/drivers/infiniband/hw/nes/nes_nic.c
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@@ -373,11 +373,11 @@ static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev)
 	wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
 
 	/* setup the VLAN tag if present */
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n",
-				netdev->name, vlan_tx_tag_get(skb));
+				netdev->name, skb_vlan_tag_get(skb));
 		wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE;
-		wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb);
+		wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb);
 	} else
 		wqe_misc = 0;
 
@@ -576,11 +576,12 @@ tso_sq_no_longer_full:
 				wqe_fragment_length =
 						(__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
 				/* setup the VLAN tag if present */
-				if (vlan_tx_tag_present(skb)) {
+				if (skb_vlan_tag_present(skb)) {
 					nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n",
-							netdev->name, vlan_tx_tag_get(skb) );
+							netdev->name,
+						  skb_vlan_tag_get(skb));
 					wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE;
-					wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb);
+					wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb);
 				} else
 					wqe_misc = 0;
 
diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c
index dede43f4ce09..8f8418d2ac4a 100644
--- a/drivers/net/ethernet/3com/typhoon.c
+++ b/drivers/net/ethernet/3com/typhoon.c
@@ -769,11 +769,11 @@ typhoon_start_tx(struct sk_buff *skb, struct net_device *dev)
 		first_txd->processFlags |= TYPHOON_TX_PF_IP_CHKSUM;
 	}
 
-	if(vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		first_txd->processFlags |=
 		    TYPHOON_TX_PF_INSERT_VLAN | TYPHOON_TX_PF_VLAN_PRIORITY;
 		first_txd->processFlags |=
-		    cpu_to_le32(htons(vlan_tx_tag_get(skb)) <<
+		    cpu_to_le32(htons(skb_vlan_tag_get(skb)) <<
 				TYPHOON_TX_PF_VLAN_TAG_SHIFT);
 	}
 
diff --git a/drivers/net/ethernet/alteon/acenic.c b/drivers/net/ethernet/alteon/acenic.c
index b68074803de3..b90a26b13fdf 100644
--- a/drivers/net/ethernet/alteon/acenic.c
+++ b/drivers/net/ethernet/alteon/acenic.c
@@ -2429,9 +2429,9 @@ restart:
 		flagsize = (skb->len << 16) | (BD_FLG_END);
 		if (skb->ip_summed == CHECKSUM_PARTIAL)
 			flagsize |= BD_FLG_TCP_UDP_SUM;
-		if (vlan_tx_tag_present(skb)) {
+		if (skb_vlan_tag_present(skb)) {
 			flagsize |= BD_FLG_VLAN_TAG;
-			vlan_tag = vlan_tx_tag_get(skb);
+			vlan_tag = skb_vlan_tag_get(skb);
 		}
 		desc = ap->tx_ring + idx;
 		idx = (idx + 1) % ACE_TX_RING_ENTRIES(ap);
@@ -2450,9 +2450,9 @@ restart:
 		flagsize = (skb_headlen(skb) << 16);
 		if (skb->ip_summed == CHECKSUM_PARTIAL)
 			flagsize |= BD_FLG_TCP_UDP_SUM;
-		if (vlan_tx_tag_present(skb)) {
+		if (skb_vlan_tag_present(skb)) {
 			flagsize |= BD_FLG_VLAN_TAG;
-			vlan_tag = vlan_tx_tag_get(skb);
+			vlan_tag = skb_vlan_tag_get(skb);
 		}
 
 		ace_load_tx_bd(ap, ap->tx_ring + idx, mapping, flagsize, vlan_tag);
diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c
index 841e6558db68..4c2ae2221780 100644
--- a/drivers/net/ethernet/amd/amd8111e.c
+++ b/drivers/net/ethernet/amd/amd8111e.c
@@ -1299,11 +1299,11 @@ static netdev_tx_t amd8111e_start_xmit(struct sk_buff *skb,
 	lp->tx_ring[tx_index].tx_flags = 0;
 
 #if AMD8111E_VLAN_TAG_USED
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		lp->tx_ring[tx_index].tag_ctrl_cmd |=
 				cpu_to_le16(TCC_VLAN_INSERT);
 		lp->tx_ring[tx_index].tag_ctrl_info =
-				cpu_to_le16(vlan_tx_tag_get(skb));
+				cpu_to_le16(skb_vlan_tag_get(skb));
 
 	}
 #endif
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 7bb5f07dbeef..2ba1dd22ad64 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1165,8 +1165,8 @@ static void xgbe_prep_tx_tstamp(struct xgbe_prv_data *pdata,
 
 static void xgbe_prep_vlan(struct sk_buff *skb, struct xgbe_packet_data *packet)
 {
-	if (vlan_tx_tag_present(skb))
-		packet->vlan_ctag = vlan_tx_tag_get(skb);
+	if (skb_vlan_tag_present(skb))
+		packet->vlan_ctag = skb_vlan_tag_get(skb);
 }
 
 static int xgbe_prep_tso(struct sk_buff *skb, struct xgbe_packet_data *packet)
@@ -1247,9 +1247,9 @@ static void xgbe_packet_info(struct xgbe_prv_data *pdata,
 		XGMAC_SET_BITS(packet->attributes, TX_PACKET_ATTRIBUTES,
 			       CSUM_ENABLE, 1);
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		/* VLAN requires an extra descriptor if tag is different */
-		if (vlan_tx_tag_get(skb) != ring->tx.cur_vlan_ctag)
+		if (skb_vlan_tag_get(skb) != ring->tx.cur_vlan_ctag)
 			/* We can share with the TSO context descriptor */
 			if (!context_desc) {
 				context_desc = 1;
diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index c9946c6c119e..587f63e87588 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -2235,8 +2235,8 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb,
 		return NETDEV_TX_OK;
 	}
 
-	if (unlikely(vlan_tx_tag_present(skb))) {
-		u16 vlan = vlan_tx_tag_get(skb);
+	if (unlikely(skb_vlan_tag_present(skb))) {
+		u16 vlan = skb_vlan_tag_get(skb);
 		__le16 tag;
 
 		vlan = cpu_to_le16(vlan);
diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
index c88abf5b6415..59a03a193e83 100644
--- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
+++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
@@ -1892,8 +1892,8 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb,
 
 	tpd = atl1e_get_tpd(adapter);
 
-	if (vlan_tx_tag_present(skb)) {
-		u16 vlan_tag = vlan_tx_tag_get(skb);
+	if (skb_vlan_tag_present(skb)) {
+		u16 vlan_tag = skb_vlan_tag_get(skb);
 		u16 atl1e_vlan_tag;
 
 		tpd->word3 |= 1 << TPD_INS_VL_TAG_SHIFT;
diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c b/drivers/net/ethernet/atheros/atlx/atl1.c
index 2c8f398aeda9..eca1d113fee1 100644
--- a/drivers/net/ethernet/atheros/atlx/atl1.c
+++ b/drivers/net/ethernet/atheros/atlx/atl1.c
@@ -2415,8 +2415,8 @@ static netdev_tx_t atl1_xmit_frame(struct sk_buff *skb,
 		(u16) atomic_read(&tpd_ring->next_to_use));
 	memset(ptpd, 0, sizeof(struct tx_packet_desc));
 
-	if (vlan_tx_tag_present(skb)) {
-		vlan_tag = vlan_tx_tag_get(skb);
+	if (skb_vlan_tag_present(skb)) {
+		vlan_tag = skb_vlan_tag_get(skb);
 		vlan_tag = (vlan_tag << 4) | (vlan_tag >> 13) |
 			((vlan_tag >> 9) & 0x8);
 		ptpd->word3 |= 1 << TPD_INS_VL_TAG_SHIFT;
diff --git a/drivers/net/ethernet/atheros/atlx/atl2.c b/drivers/net/ethernet/atheros/atlx/atl2.c
index 482a7cabb0a1..46a535318c7a 100644
--- a/drivers/net/ethernet/atheros/atlx/atl2.c
+++ b/drivers/net/ethernet/atheros/atlx/atl2.c
@@ -887,8 +887,8 @@ static netdev_tx_t atl2_xmit_frame(struct sk_buff *skb,
 		offset = ((u32)(skb->len-copy_len + 3) & ~3);
 	}
 #ifdef NETIF_F_HW_VLAN_CTAG_TX
-	if (vlan_tx_tag_present(skb)) {
-		u16 vlan_tag = vlan_tx_tag_get(skb);
+	if (skb_vlan_tag_present(skb)) {
+		u16 vlan_tag = skb_vlan_tag_get(skb);
 		vlan_tag = (vlan_tag << 4) |
 			(vlan_tag >> 13) |
 			((vlan_tag >> 9) & 0x8);
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index 823d01c5684c..02bf0b86995b 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -6597,9 +6597,9 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		vlan_tag_flags |= TX_BD_FLAGS_TCP_UDP_CKSUM;
 	}
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		vlan_tag_flags |=
-			(TX_BD_FLAGS_VLAN_TAG | (vlan_tx_tag_get(skb) << 16));
+			(TX_BD_FLAGS_VLAN_TAG | (skb_vlan_tag_get(skb) << 16));
 	}
 
 	if ((mss = skb_shinfo(skb)->gso_size)) {
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 1d1147c93d59..b51a18a09d4d 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3865,9 +3865,9 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	   "sending pkt %u @%p  next_idx %u  bd %u @%p\n",
 	   pkt_prod, tx_buf, txdata->tx_pkt_prod, bd_prod, tx_start_bd);
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		tx_start_bd->vlan_or_ethertype =
-		    cpu_to_le16(vlan_tx_tag_get(skb));
+		    cpu_to_le16(skb_vlan_tag_get(skb));
 		tx_start_bd->bd_flags.as_bitfield |=
 		    (X_ETH_OUTBAND_VLAN << ETH_TX_BD_FLAGS_VLAN_MODE_SHIFT);
 	} else {
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 356bd5b022a5..4cf43bfbc955 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -8002,9 +8002,9 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	    !mss && skb->len > VLAN_ETH_FRAME_LEN)
 		base_flags |= TXD_FLAG_JMB_PKT;
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		base_flags |= TXD_FLAG_VLAN;
-		vlan = vlan_tx_tag_get(skb);
+		vlan = skb_vlan_tag_get(skb);
 	}
 
 	if ((unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) &&
diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c
index 323721838cf9..7714d7790089 100644
--- a/drivers/net/ethernet/brocade/bna/bnad.c
+++ b/drivers/net/ethernet/brocade/bna/bnad.c
@@ -2824,8 +2824,8 @@ bnad_txq_wi_prepare(struct bnad *bnad, struct bna_tcb *tcb,
 	u32 gso_size;
 	u16 vlan_tag = 0;
 
-	if (vlan_tx_tag_present(skb)) {
-		vlan_tag = (u16)vlan_tx_tag_get(skb);
+	if (skb_vlan_tag_present(skb)) {
+		vlan_tag = (u16)skb_vlan_tag_get(skb);
 		flags |= (BNA_TXQ_WI_CF_INS_PRIO | BNA_TXQ_WI_CF_INS_VLAN);
 	}
 	if (test_bit(BNAD_RF_CEE_RUNNING, &bnad->run_flags)) {
diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c
index babe2a915b00..526ea74e82d9 100644
--- a/drivers/net/ethernet/chelsio/cxgb/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb/sge.c
@@ -1860,9 +1860,9 @@ netdev_tx_t t1_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 	cpl->iff = dev->if_port;
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		cpl->vlan_valid = 1;
-		cpl->vlan = htons(vlan_tx_tag_get(skb));
+		cpl->vlan = htons(skb_vlan_tag_get(skb));
 		st->vlan_insert++;
 	} else
 		cpl->vlan_valid = 0;
diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c
index 3dfcf600fcc6..d6aa602f168d 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c
@@ -1148,8 +1148,8 @@ static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb,
 	cpl->len = htonl(skb->len);
 	cntrl = V_TXPKT_INTF(pi->port_id);
 
-	if (vlan_tx_tag_present(skb))
-		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(vlan_tx_tag_get(skb));
+	if (skb_vlan_tag_present(skb))
+		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(skb_vlan_tag_get(skb));
 
 	tso_info = V_LSO_MSS(skb_shinfo(skb)->gso_size);
 	if (tso_info) {
@@ -1282,7 +1282,7 @@ netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
 		qs->port_stats[SGE_PSTAT_TX_CSUM]++;
 	if (skb_shinfo(skb)->gso_size)
 		qs->port_stats[SGE_PSTAT_TSO]++;
-	if (vlan_tx_tag_present(skb))
+	if (skb_vlan_tag_present(skb))
 		qs->port_stats[SGE_PSTAT_VLANINS]++;
 
 	/*
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index ca42e2e9dec9..619156112b21 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -1154,9 +1154,9 @@ out_free:	dev_kfree_skb_any(skb);
 			cntrl = TXPKT_L4CSUM_DIS | TXPKT_IPCSUM_DIS;
 	}
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		q->vlan_ins++;
-		cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(vlan_tx_tag_get(skb));
+		cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(skb_vlan_tag_get(skb));
 	}
 
 	cpl->ctrl0 = htonl(TXPKT_OPCODE(CPL_TX_PKT_XT) |
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
index 4424277a7e4d..0545f0de1c52 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
@@ -1326,9 +1326,9 @@ int t4vf_eth_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * If there's a VLAN tag present, add that to the list of things to
 	 * do in this Work Request.
 	 */
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		txq->vlan_ins++;
-		cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(vlan_tx_tag_get(skb));
+		cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(skb_vlan_tag_get(skb));
 	}
 
 	/*
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 9a952df6606e..0535f6fbdc71 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -520,10 +520,10 @@ static inline void enic_queue_wq_skb(struct enic *enic,
 	int loopback = 0;
 	int err;
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		/* VLAN tag from trunking driver */
 		vlan_tag_insert = 1;
-		vlan_tag = vlan_tx_tag_get(skb);
+		vlan_tag = skb_vlan_tag_get(skb);
 	} else if (enic->loop_enable) {
 		vlan_tag = enic->loop_tag;
 		loopback = 1;
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 37a26b0b7e33..ed46610e5453 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -694,7 +694,7 @@ static inline u16 be_get_tx_vlan_tag(struct be_adapter *adapter,
 	u8 vlan_prio;
 	u16 vlan_tag;
 
-	vlan_tag = vlan_tx_tag_get(skb);
+	vlan_tag = skb_vlan_tag_get(skb);
 	vlan_prio = (vlan_tag & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
 	/* If vlan priority provided by OS is NOT in available bmap */
 	if (!(adapter->vlan_prio_bmap & (1 << vlan_prio)))
@@ -745,7 +745,7 @@ static void wrb_fill_hdr(struct be_adapter *adapter, struct be_eth_hdr_wrb *hdr,
 			SET_TX_WRB_HDR_BITS(udpcs, hdr, 1);
 	}
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		SET_TX_WRB_HDR_BITS(vlan, hdr, 1);
 		vlan_tag = be_get_tx_vlan_tag(adapter, skb);
 		SET_TX_WRB_HDR_BITS(vlan_tag, hdr, vlan_tag);
@@ -864,7 +864,7 @@ static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter,
 	if (unlikely(!skb))
 		return skb;
 
-	if (vlan_tx_tag_present(skb))
+	if (skb_vlan_tag_present(skb))
 		vlan_tag = be_get_tx_vlan_tag(adapter, skb);
 
 	if (qnq_async_evt_rcvd(adapter) && adapter->pvid) {
@@ -923,7 +923,7 @@ static bool be_ipv6_exthdr_check(struct sk_buff *skb)
 
 static int be_vlan_tag_tx_chk(struct be_adapter *adapter, struct sk_buff *skb)
 {
-	return vlan_tx_tag_present(skb) || adapter->pvid || adapter->qnq_vid;
+	return skb_vlan_tag_present(skb) || adapter->pvid || adapter->qnq_vid;
 }
 
 static int be_ipv6_tx_stall_chk(struct be_adapter *adapter, struct sk_buff *skb)
@@ -946,7 +946,7 @@ static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter,
 	eth_hdr_len = ntohs(skb->protocol) == ETH_P_8021Q ?
 						VLAN_ETH_HLEN : ETH_HLEN;
 	if (skb->len <= 60 &&
-	    (lancer_chip(adapter) || vlan_tx_tag_present(skb)) &&
+	    (lancer_chip(adapter) || skb_vlan_tag_present(skb)) &&
 	    is_ipv4_pkt(skb)) {
 		ip = (struct iphdr *)ip_hdr(skb);
 		pskb_trim(skb, eth_hdr_len + ntohs(ip->tot_len));
@@ -964,7 +964,7 @@ static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter,
 	 * Manually insert VLAN in pkt.
 	 */
 	if (skb->ip_summed != CHECKSUM_PARTIAL &&
-	    vlan_tx_tag_present(skb)) {
+	    skb_vlan_tag_present(skb)) {
 		skb = be_insert_vlan_in_pkt(adapter, skb, skip_hw_vlan);
 		if (unlikely(!skb))
 			goto err;
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index e54b1e39f9b4..93ff846e96f1 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -2170,7 +2170,7 @@ static inline void gfar_tx_checksum(struct sk_buff *skb, struct txfcb *fcb,
 void inline gfar_tx_vlan(struct sk_buff *skb, struct txfcb *fcb)
 {
 	fcb->flags |= TXFCB_VLN;
-	fcb->vlctl = vlan_tx_tag_get(skb);
+	fcb->vlctl = skb_vlan_tag_get(skb);
 }
 
 static inline struct txbd8 *skip_txbd(struct txbd8 *bdp, int stride,
@@ -2230,7 +2230,7 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	regs = tx_queue->grp->regs;
 
 	do_csum = (CHECKSUM_PARTIAL == skb->ip_summed);
-	do_vlan = vlan_tx_tag_present(skb);
+	do_vlan = skb_vlan_tag_present(skb);
 	do_tstamp = (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
 		    priv->hwts_tx_en;
 
diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index 566b17db135a..e8a1adb7a962 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -2064,9 +2064,9 @@ static int ehea_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	memset(swqe, 0, SWQE_HEADER_SIZE);
 	atomic_dec(&pr->swqe_avail);
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		swqe->tx_control |= EHEA_SWQE_VLAN_INSERT;
-		swqe->vlan_tag = vlan_tx_tag_get(skb);
+		swqe->vlan_tag = skb_vlan_tag_get(skb);
 	}
 
 	pr->tx_packets++;
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index 83140cbb5f01..9242982db3e0 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -3226,9 +3226,10 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 		return NETDEV_TX_BUSY;
 	}
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		tx_flags |= E1000_TX_FLAGS_VLAN;
-		tx_flags |= (vlan_tx_tag_get(skb) << E1000_TX_FLAGS_VLAN_SHIFT);
+		tx_flags |= (skb_vlan_tag_get(skb) <<
+			     E1000_TX_FLAGS_VLAN_SHIFT);
 	}
 
 	first = tx_ring->next_to_use;
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 332a298e95b5..38cb586b1bf4 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -5463,8 +5463,8 @@ static int e1000_transfer_dhcp_info(struct e1000_adapter *adapter,
 	struct e1000_hw *hw = &adapter->hw;
 	u16 length, offset;
 
-	if (vlan_tx_tag_present(skb) &&
-	    !((vlan_tx_tag_get(skb) == adapter->hw.mng_cookie.vlan_id) &&
+	if (skb_vlan_tag_present(skb) &&
+	    !((skb_vlan_tag_get(skb) == adapter->hw.mng_cookie.vlan_id) &&
 	      (adapter->hw.mng_cookie.status &
 	       E1000_MNG_DHCP_COOKIE_STATUS_VLAN)))
 		return 0;
@@ -5603,9 +5603,10 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 	if (e1000_maybe_stop_tx(tx_ring, count + 2))
 		return NETDEV_TX_BUSY;
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		tx_flags |= E1000_TX_FLAGS_VLAN;
-		tx_flags |= (vlan_tx_tag_get(skb) << E1000_TX_FLAGS_VLAN_SHIFT);
+		tx_flags |= (skb_vlan_tag_get(skb) <<
+			     E1000_TX_FLAGS_VLAN_SHIFT);
 	}
 
 	first = tx_ring->next_to_use;
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index eb088b129bc7..caa43f7c2931 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -965,8 +965,8 @@ static void fm10k_tx_map(struct fm10k_ring *tx_ring,
 	tx_desc = FM10K_TX_DESC(tx_ring, i);
 
 	/* add HW VLAN tag */
-	if (vlan_tx_tag_present(skb))
-		tx_desc->vlan = cpu_to_le16(vlan_tx_tag_get(skb));
+	if (skb_vlan_tag_present(skb))
+		tx_desc->vlan = cpu_to_le16(skb_vlan_tag_get(skb));
 	else
 		tx_desc->vlan = 0;
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index 8811364b91cb..945b35d31c71 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -609,7 +609,7 @@ static netdev_tx_t fm10k_xmit_frame(struct sk_buff *skb, struct net_device *dev)
 	int err;
 
 	if ((skb->protocol ==  htons(ETH_P_8021Q)) &&
-	    !vlan_tx_tag_present(skb)) {
+	    !skb_vlan_tag_present(skb)) {
 		/* FM10K only supports hardware tagging, any tags in frame
 		 * are considered 2nd level or "outer" tags
 		 */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 04b441460bbd..9f536dd8e1ec 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1772,8 +1772,8 @@ static int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
 	u32  tx_flags = 0;
 
 	/* if we have a HW VLAN tag being added, default to the HW one */
-	if (vlan_tx_tag_present(skb)) {
-		tx_flags |= vlan_tx_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT;
+	if (skb_vlan_tag_present(skb)) {
+		tx_flags |= skb_vlan_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT;
 		tx_flags |= I40E_TX_FLAGS_HW_VLAN;
 	/* else if it is a SW VLAN, check the next protocol and store the tag */
 	} else if (protocol == htons(ETH_P_8021Q)) {
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 04c7c1557a0c..82c3798fdd36 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1122,8 +1122,8 @@ static int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
 	u32  tx_flags = 0;
 
 	/* if we have a HW VLAN tag being added, default to the HW one */
-	if (vlan_tx_tag_present(skb)) {
-		tx_flags |= vlan_tx_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT;
+	if (skb_vlan_tag_present(skb)) {
+		tx_flags |= skb_vlan_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT;
 		tx_flags |= I40E_TX_FLAGS_HW_VLAN;
 	/* else if it is a SW VLAN, check the next protocol and store the tag */
 	} else if (protocol == htons(ETH_P_8021Q)) {
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index ff59897a9463..6c25ec314183 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -5035,9 +5035,9 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 
 	skb_tx_timestamp(skb);
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		tx_flags |= IGB_TX_FLAGS_VLAN;
-		tx_flags |= (vlan_tx_tag_get(skb) << IGB_TX_FLAGS_VLAN_SHIFT);
+		tx_flags |= (skb_vlan_tag_get(skb) << IGB_TX_FLAGS_VLAN_SHIFT);
 	}
 
 	/* record initial flags and protocol */
diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c
index 63c807c9b21c..ad2b4897b392 100644
--- a/drivers/net/ethernet/intel/igbvf/netdev.c
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c
@@ -2234,9 +2234,10 @@ static netdev_tx_t igbvf_xmit_frame_ring_adv(struct sk_buff *skb,
 		return NETDEV_TX_BUSY;
 	}
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		tx_flags |= IGBVF_TX_FLAGS_VLAN;
-		tx_flags |= (vlan_tx_tag_get(skb) << IGBVF_TX_FLAGS_VLAN_SHIFT);
+		tx_flags |= (skb_vlan_tag_get(skb) <<
+			     IGBVF_TX_FLAGS_VLAN_SHIFT);
 	}
 
 	if (skb->protocol == htons(ETH_P_IP))
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c
index aa87605b144a..11a1bdbe3fd9 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c
@@ -1532,9 +1532,9 @@ ixgb_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
                      DESC_NEEDED)))
 		return NETDEV_TX_BUSY;
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		tx_flags |= IXGB_TX_FLAGS_VLAN;
-		vlan_id = vlan_tx_tag_get(skb);
+		vlan_id = skb_vlan_tag_get(skb);
 	}
 
 	first = adapter->tx_ring.next_to_use;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 2ed2c7de2304..7bb421bfd84e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7217,8 +7217,8 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
 	first->gso_segs = 1;
 
 	/* if we have a HW VLAN tag being added default to the HW one */
-	if (vlan_tx_tag_present(skb)) {
-		tx_flags |= vlan_tx_tag_get(skb) << IXGBE_TX_FLAGS_VLAN_SHIFT;
+	if (skb_vlan_tag_present(skb)) {
+		tx_flags |= skb_vlan_tag_get(skb) << IXGBE_TX_FLAGS_VLAN_SHIFT;
 		tx_flags |= IXGBE_TX_FLAGS_HW_VLAN;
 	/* else if it is a SW VLAN check the next protocol and store the tag */
 	} else if (protocol == htons(ETH_P_8021Q)) {
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 62a0d8e0f17d..c9b49bfb51bb 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -3452,8 +3452,8 @@ static int ixgbevf_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 	first->bytecount = skb->len;
 	first->gso_segs = 1;
 
-	if (vlan_tx_tag_present(skb)) {
-		tx_flags |= vlan_tx_tag_get(skb);
+	if (skb_vlan_tag_present(skb)) {
+		tx_flags |= skb_vlan_tag_get(skb);
 		tx_flags <<= IXGBE_TX_FLAGS_VLAN_SHIFT;
 		tx_flags |= IXGBE_TX_FLAGS_VLAN;
 	}
diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c
index 44ce7d88f554..6e9a792097d3 100644
--- a/drivers/net/ethernet/jme.c
+++ b/drivers/net/ethernet/jme.c
@@ -2154,9 +2154,9 @@ jme_tx_csum(struct jme_adapter *jme, struct sk_buff *skb, u8 *flags)
 static inline void
 jme_tx_vlan(struct sk_buff *skb, __le16 *vlan, u8 *flags)
 {
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		*flags |= TXFLAG_TAGON;
-		*vlan = cpu_to_le16(vlan_tx_tag_get(skb));
+		*vlan = cpu_to_le16(skb_vlan_tag_get(skb));
 	}
 }
 
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index 867a6a3ef81f..d9f4498832a1 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -1895,14 +1895,14 @@ static netdev_tx_t sky2_xmit_frame(struct sk_buff *skb,
 	ctrl = 0;
 
 	/* Add VLAN tag, can piggyback on LRGLEN or ADDR64 */
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		if (!le) {
 			le = get_tx_le(sky2, &slot);
 			le->addr = 0;
 			le->opcode = OP_VLAN|HW_OWNER;
 		} else
 			le->opcode |= OP_VLAN;
-		le->length = cpu_to_be16(vlan_tx_tag_get(skb));
+		le->length = cpu_to_be16(skb_vlan_tag_get(skb));
 		ctrl |= INS_VLAN;
 	}
 
@@ -2594,7 +2594,7 @@ static struct sk_buff *sky2_receive(struct net_device *dev,
 	sky2->rx_next = (sky2->rx_next + 1) % sky2->rx_pending;
 	prefetch(sky2->rx_ring + sky2->rx_next);
 
-	if (vlan_tx_tag_present(re->skb))
+	if (skb_vlan_tag_present(re->skb))
 		count -= VLAN_HLEN;	/* Account for vlan tag */
 
 	/* This chip has hardware problems that generates bogus status.
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index e3357bf523df..359bb1286eb5 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -682,8 +682,8 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
 	if (dev->num_tc)
 		return skb_tx_hash(dev, skb);
 
-	if (vlan_tx_tag_present(skb))
-		up = vlan_tx_tag_get(skb) >> VLAN_PRIO_SHIFT;
+	if (skb_vlan_tag_present(skb))
+		up = skb_vlan_tag_get(skb) >> VLAN_PRIO_SHIFT;
 
 	return fallback(dev, skb) % rings_p_up + up * rings_p_up;
 }
@@ -742,8 +742,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto tx_drop;
 	}
 
-	if (vlan_tx_tag_present(skb))
-		vlan_tag = vlan_tx_tag_get(skb);
+	if (skb_vlan_tag_present(skb))
+		vlan_tag = skb_vlan_tag_get(skb);
 
 
 	netdev_txq_bql_enqueue_prefetchw(ring->tx_queue);
@@ -930,7 +930,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	real_size = (real_size / 16) & 0x3f;
 
 	if (ring->bf_enabled && desc_size <= MAX_BF && !bounce &&
-	    !vlan_tx_tag_present(skb) && send_doorbell) {
+	    !skb_vlan_tag_present(skb) && send_doorbell) {
 		tx_desc->ctrl.bf_qpn = ring->doorbell_qpn |
 				       cpu_to_be32(real_size);
 
@@ -952,7 +952,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	} else {
 		tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
 		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
-			!!vlan_tx_tag_present(skb);
+			!!skb_vlan_tag_present(skb);
 		tx_desc->ctrl.fence_size = real_size;
 
 		/* Ensure new descriptor hits memory
diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c
index 2552e550a78c..eb807b0dc72a 100644
--- a/drivers/net/ethernet/natsemi/ns83820.c
+++ b/drivers/net/ethernet/natsemi/ns83820.c
@@ -1122,12 +1122,12 @@ again:
 	}
 
 #ifdef NS83820_VLAN_ACCEL_SUPPORT
-	if(vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		/* fetch the vlan tag info out of the
 		 * ancillary data if the vlan code
 		 * is using hw vlan acceleration
 		 */
-		short tag = vlan_tx_tag_get(skb);
+		short tag = skb_vlan_tag_get(skb);
 		extsts |= (EXTSTS_VPKT | htons(tag));
 	}
 #endif
diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
index f5e4b820128b..0529cad75b10 100644
--- a/drivers/net/ethernet/neterion/s2io.c
+++ b/drivers/net/ethernet/neterion/s2io.c
@@ -4045,8 +4045,8 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	queue = 0;
-	if (vlan_tx_tag_present(skb))
-		vlan_tag = vlan_tx_tag_get(skb);
+	if (skb_vlan_tag_present(skb))
+		vlan_tag = skb_vlan_tag_get(skb);
 	if (sp->config.tx_steering_type == TX_DEFAULT_STEERING) {
 		if (skb->protocol == htons(ETH_P_IP)) {
 			struct iphdr *ip;
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index cc0485e3c621..50d5604833ed 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -890,8 +890,8 @@ vxge_xmit(struct sk_buff *skb, struct net_device *dev)
 		dev->name, __func__, __LINE__,
 		fifo_hw, dtr, dtr_priv);
 
-	if (vlan_tx_tag_present(skb)) {
-		u16 vlan_tag = vlan_tx_tag_get(skb);
+	if (skb_vlan_tag_present(skb)) {
+		u16 vlan_tag = skb_vlan_tag_get(skb);
 		vxge_hw_fifo_txdl_vlan_set(dtr, vlan_tag);
 	}
 
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index f39cae620f61..a41bb5e6b954 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -2462,9 +2462,9 @@ static netdev_tx_t nv_start_xmit_optimized(struct sk_buff *skb,
 			 NV_TX2_CHECKSUM_L3 | NV_TX2_CHECKSUM_L4 : 0;
 
 	/* vlan tag */
-	if (vlan_tx_tag_present(skb))
+	if (skb_vlan_tag_present(skb))
 		start_tx->txvlan = cpu_to_le32(NV_TX3_VLAN_TAG_PRESENT |
-					vlan_tx_tag_get(skb));
+					skb_vlan_tag_get(skb));
 	else
 		start_tx->txvlan = 0;
 
diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
index 613037584d08..a47fe67fdf58 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
@@ -1893,9 +1893,9 @@ netxen_tso_check(struct net_device *netdev,
 		protocol = vh->h_vlan_encapsulated_proto;
 		flags = FLAGS_VLAN_TAGGED;
 
-	} else if (vlan_tx_tag_present(skb)) {
+	} else if (skb_vlan_tag_present(skb)) {
 		flags = FLAGS_VLAN_OOB;
-		vid = vlan_tx_tag_get(skb);
+		vid = skb_vlan_tag_get(skb);
 		netxen_set_tx_vlan_tci(first_desc, vid);
 		vlan_oob = 1;
 	}
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
index d166e534925d..4d2496f28b85 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
@@ -321,8 +321,8 @@ static void qlcnic_send_filter(struct qlcnic_adapter *adapter,
 		if (protocol == ETH_P_8021Q) {
 			vh = (struct vlan_ethhdr *)skb->data;
 			vlan_id = ntohs(vh->h_vlan_TCI);
-		} else if (vlan_tx_tag_present(skb)) {
-			vlan_id = vlan_tx_tag_get(skb);
+		} else if (skb_vlan_tag_present(skb)) {
+			vlan_id = skb_vlan_tag_get(skb);
 		}
 	}
 
@@ -473,9 +473,9 @@ static int qlcnic_tx_pkt(struct qlcnic_adapter *adapter,
 		flags = QLCNIC_FLAGS_VLAN_TAGGED;
 		vlan_tci = ntohs(vh->h_vlan_TCI);
 		protocol = ntohs(vh->h_vlan_encapsulated_proto);
-	} else if (vlan_tx_tag_present(skb)) {
+	} else if (skb_vlan_tag_present(skb)) {
 		flags = QLCNIC_FLAGS_VLAN_OOB;
-		vlan_tci = vlan_tx_tag_get(skb);
+		vlan_tci = skb_vlan_tag_get(skb);
 	}
 	if (unlikely(adapter->tx_pvid)) {
 		if (vlan_tci && !(adapter->flags & QLCNIC_TAGGING_ENABLED))
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
index 6c904a6cad2a..dc0058f90370 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c
+++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
@@ -2660,11 +2660,11 @@ static netdev_tx_t qlge_send(struct sk_buff *skb, struct net_device *ndev)
 
 	mac_iocb_ptr->frame_len = cpu_to_le16((u16) skb->len);
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		netif_printk(qdev, tx_queued, KERN_DEBUG, qdev->ndev,
-			     "Adding a vlan tag %d.\n", vlan_tx_tag_get(skb));
+			     "Adding a vlan tag %d.\n", skb_vlan_tag_get(skb));
 		mac_iocb_ptr->flags3 |= OB_MAC_IOCB_V;
-		mac_iocb_ptr->vlan_tci = cpu_to_le16(vlan_tx_tag_get(skb));
+		mac_iocb_ptr->vlan_tci = cpu_to_le16(skb_vlan_tag_get(skb));
 	}
 	tso = ql_tso(skb, (struct ob_mac_tso_iocb_req *)mac_iocb_ptr);
 	if (tso < 0) {
diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c
index 9c31e46d1eee..d79e33b3c191 100644
--- a/drivers/net/ethernet/realtek/8139cp.c
+++ b/drivers/net/ethernet/realtek/8139cp.c
@@ -708,8 +708,8 @@ static void cp_tx (struct cp_private *cp)
 
 static inline u32 cp_tx_vlan_tag(struct sk_buff *skb)
 {
-	return vlan_tx_tag_present(skb) ?
-		TxVlanTag | swab16(vlan_tx_tag_get(skb)) : 0x00;
+	return skb_vlan_tag_present(skb) ?
+		TxVlanTag | swab16(skb_vlan_tag_get(skb)) : 0x00;
 }
 
 static void unwind_tx_frag_mapping(struct cp_private *cp, struct sk_buff *skb,
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 3a280598a15a..cd286b0356ab 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -2073,8 +2073,8 @@ static int rtl8169_set_features(struct net_device *dev,
 
 static inline u32 rtl8169_tx_vlan_tag(struct sk_buff *skb)
 {
-	return (vlan_tx_tag_present(skb)) ?
-		TxVlanTag | swab16(vlan_tx_tag_get(skb)) : 0x00;
+	return (skb_vlan_tag_present(skb)) ?
+		TxVlanTag | swab16(skb_vlan_tag_get(skb)) : 0x00;
 }
 
 static void rtl8169_rx_vlan_tag(struct RxDesc *desc, struct sk_buff *skb)
diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
index b6612d6090ac..23545e1e605a 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
@@ -1272,7 +1272,7 @@ static netdev_tx_t sxgbe_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (unlikely(skb_is_gso(skb) && tqueue->prev_mss != cur_mss))
 		ctxt_desc_req = 1;
 
-	if (unlikely(vlan_tx_tag_present(skb) ||
+	if (unlikely(skb_vlan_tag_present(skb) ||
 		     ((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
 		      tqueue->hwts_tx_en)))
 		ctxt_desc_req = 1;
diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c
index 6ab36d9ff2ab..a9cac8413e49 100644
--- a/drivers/net/ethernet/tehuti/tehuti.c
+++ b/drivers/net/ethernet/tehuti/tehuti.c
@@ -1650,9 +1650,9 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb,
 		    txd_mss);
 	}
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		/*Cut VLAN ID to 12 bits */
-		txd_vlan_id = vlan_tx_tag_get(skb) & BITS_MASK(12);
+		txd_vlan_id = skb_vlan_tag_get(skb) & BITS_MASK(12);
 		txd_vtag = 1;
 	}
 
diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c
index a191afc23b56..0ac76102b33d 100644
--- a/drivers/net/ethernet/via/via-rhine.c
+++ b/drivers/net/ethernet/via/via-rhine.c
@@ -1781,8 +1781,8 @@ static netdev_tx_t rhine_start_tx(struct sk_buff *skb,
 	rp->tx_ring[entry].desc_length =
 		cpu_to_le32(TXDESC | (skb->len >= ETH_ZLEN ? skb->len : ETH_ZLEN));
 
-	if (unlikely(vlan_tx_tag_present(skb))) {
-		u16 vid_pcp = vlan_tx_tag_get(skb);
+	if (unlikely(skb_vlan_tag_present(skb))) {
+		u16 vid_pcp = skb_vlan_tag_get(skb);
 
 		/* drop CFI/DEI bit, register needs VID and PCP */
 		vid_pcp = (vid_pcp & VLAN_VID_MASK) |
@@ -1803,7 +1803,7 @@ static netdev_tx_t rhine_start_tx(struct sk_buff *skb,
 
 	/* Non-x86 Todo: explicitly flush cache lines here. */
 
-	if (vlan_tx_tag_present(skb))
+	if (skb_vlan_tag_present(skb))
 		/* Tx queues are bits 7-0 (first Tx queue: bit 7) */
 		BYTE_REG_BITS_ON(1 << 7, ioaddr + TQWake);
 
diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c
index 282f83a63b67..c20206f83cc1 100644
--- a/drivers/net/ethernet/via/via-velocity.c
+++ b/drivers/net/ethernet/via/via-velocity.c
@@ -2611,8 +2611,8 @@ static netdev_tx_t velocity_xmit(struct sk_buff *skb,
 
 	td_ptr->tdesc1.cmd = TCPLS_NORMAL + (tdinfo->nskb_dma + 1) * 16;
 
-	if (vlan_tx_tag_present(skb)) {
-		td_ptr->tdesc1.vlan = cpu_to_le16(vlan_tx_tag_get(skb));
+	if (skb_vlan_tag_present(skb)) {
+		td_ptr->tdesc1.vlan = cpu_to_le16(skb_vlan_tag_get(skb));
 		td_ptr->tdesc1.TCR |= TCR0_VETAG;
 	}
 
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 7df221788cd4..d0ed5694dd7d 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -645,7 +645,7 @@ static void macvtap_skb_to_vnet_hdr(struct macvtap_queue *q,
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-		if (vlan_tx_tag_present(skb))
+		if (skb_vlan_tag_present(skb))
 			vnet_hdr->csum_start = cpu_to_macvtap16(q,
 				skb_checksum_start_offset(skb) + VLAN_HLEN);
 		else
@@ -821,13 +821,13 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 	total = vnet_hdr_len;
 	total += skb->len;
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		struct {
 			__be16 h_vlan_proto;
 			__be16 h_vlan_TCI;
 		} veth;
 		veth.h_vlan_proto = skb->vlan_proto;
-		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));
+		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
 		total += VLAN_HLEN;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 74fdf1158448..be196e89ab6c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1260,7 +1260,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	int vlan_hlen = 0;
 	int vnet_hdr_sz = 0;
 
-	if (vlan_tx_tag_present(skb))
+	if (skb_vlan_tag_present(skb))
 		vlan_hlen = VLAN_HLEN;
 
 	if (tun->flags & IFF_VNET_HDR)
@@ -1337,7 +1337,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 		} veth;
 
 		veth.h_vlan_proto = skb->vlan_proto;
-		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));
+		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
 
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index b23426e4952c..e519e6a269b9 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -1421,10 +1421,10 @@ static int msdn_giant_send_check(struct sk_buff *skb)
 
 static inline void rtl_tx_vlan_tag(struct tx_desc *desc, struct sk_buff *skb)
 {
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		u32 opts2;
 
-		opts2 = TX_VLAN_TAG | swab16(vlan_tx_tag_get(skb));
+		opts2 = TX_VLAN_TAG | swab16(skb_vlan_tag_get(skb));
 		desc->opts2 |= cpu_to_le32(opts2);
 	}
 }
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 31439818c27e..294214c15292 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -1038,9 +1038,9 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
 		le32_add_cpu(&tq->shared->txNumDeferred, 1);
 	}
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		gdesc->txd.ti = 1;
-		gdesc->txd.tci = vlan_tx_tag_get(skb);
+		gdesc->txd.tci = skb_vlan_tag_get(skb);
 	}
 
 	/* finally flips the GEN bit of the SOP desc. */
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 3a18d8ed89ca..985359dd6033 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1561,7 +1561,7 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs,
 
 	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
 			+ VXLAN_HLEN + sizeof(struct ipv6hdr)
-			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+			+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
 	/* Need space for new headers (invalidates iph ptr) */
 	err = skb_cow_head(skb, min_headroom);
@@ -1607,7 +1607,7 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 
 	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
 			+ VXLAN_HLEN + sizeof(struct iphdr)
-			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+			+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
 	/* Need space for new headers (invalidates iph ptr) */
 	err = skb_cow_head(skb, min_headroom);
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 625227ad16ee..dd4ab8d73d34 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2800,12 +2800,12 @@ static void qeth_l3_fill_header(struct qeth_card *card, struct qeth_hdr *hdr,
 	 * before we're going to overwrite this location with next hop ip.
 	 * v6 uses passthrough, v4 sets the tag in the QDIO header.
 	 */
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		if ((ipv == 4) || (card->info.type == QETH_CARD_TYPE_IQD))
 			hdr->hdr.l3.ext_flags = QETH_HDR_EXT_VLAN_FRAME;
 		else
 			hdr->hdr.l3.ext_flags = QETH_HDR_EXT_INCLUDE_VLAN_TAG;
-		hdr->hdr.l3.vlan_id = vlan_tx_tag_get(skb);
+		hdr->hdr.l3.vlan_id = skb_vlan_tag_get(skb);
 	}
 
 	hdr->hdr.l3.length = skb->len - sizeof(struct qeth_hdr);
@@ -2986,7 +2986,7 @@ static int qeth_l3_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			skb_pull(new_skb, ETH_HLEN);
 		}
 
-		if (ipv != 4 && vlan_tx_tag_present(new_skb)) {
+		if (ipv != 4 && skb_vlan_tag_present(new_skb)) {
 			skb_push(new_skb, VLAN_HLEN);
 			skb_copy_to_linear_data(new_skb, new_skb->data + 4, 4);
 			skb_copy_to_linear_data_offset(new_skb, 4,
@@ -2995,7 +2995,7 @@ static int qeth_l3_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
 				new_skb->data + 12, 4);
 			tag = (u16 *)(new_skb->data + 12);
 			*tag = __constant_htons(ETH_P_8021Q);
-			*(tag + 1) = htons(vlan_tx_tag_get(new_skb));
+			*(tag + 1) = htons(skb_vlan_tag_get(new_skb));
 		}
 	}
 
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 14419a8ccbb6..bcaf4cabb858 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -469,7 +469,7 @@ static int peek_head_len(struct sock *sk)
 	head = skb_peek(&sk->sk_receive_queue);
 	if (likely(head)) {
 		len = head->len;
-		if (vlan_tx_tag_present(head))
+		if (skb_vlan_tag_present(head))
 			len += VLAN_HLEN;
 	}
 
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 515a35e2a48a..bea465f24ebb 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -78,9 +78,9 @@ static inline bool is_vlan_dev(struct net_device *dev)
         return dev->priv_flags & IFF_802_1Q_VLAN;
 }
 
-#define vlan_tx_tag_present(__skb)	((__skb)->vlan_tci & VLAN_TAG_PRESENT)
-#define vlan_tx_tag_get(__skb)		((__skb)->vlan_tci & ~VLAN_TAG_PRESENT)
-#define vlan_tx_tag_get_id(__skb)	((__skb)->vlan_tci & VLAN_VID_MASK)
+#define skb_vlan_tag_present(__skb)	((__skb)->vlan_tci & VLAN_TAG_PRESENT)
+#define skb_vlan_tag_get(__skb)		((__skb)->vlan_tci & ~VLAN_TAG_PRESENT)
+#define skb_vlan_tag_get_id(__skb)	((__skb)->vlan_tci & VLAN_VID_MASK)
 
 /**
  *	struct vlan_pcpu_stats - VLAN percpu rx/tx stats
@@ -376,7 +376,7 @@ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb,
 static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb)
 {
 	skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
-					vlan_tx_tag_get(skb));
+					skb_vlan_tag_get(skb));
 	if (likely(skb))
 		skb->vlan_tci = 0;
 	return skb;
@@ -393,7 +393,7 @@ static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb)
  */
 static inline struct sk_buff *vlan_hwaccel_push_inside(struct sk_buff *skb)
 {
-	if (vlan_tx_tag_present(skb))
+	if (skb_vlan_tag_present(skb))
 		skb = __vlan_hwaccel_push_inside(skb);
 	return skb;
 }
@@ -442,8 +442,8 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
 static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb,
 					 u16 *vlan_tci)
 {
-	if (vlan_tx_tag_present(skb)) {
-		*vlan_tci = vlan_tx_tag_get(skb);
+	if (skb_vlan_tag_present(skb)) {
+		*vlan_tci = skb_vlan_tag_get(skb);
 		return 0;
 	} else {
 		*vlan_tci = 0;
@@ -480,7 +480,7 @@ static inline __be16 vlan_get_protocol(const struct sk_buff *skb)
 {
 	__be16 protocol = 0;
 
-	if (vlan_tx_tag_present(skb) ||
+	if (skb_vlan_tag_present(skb) ||
 	     skb->protocol != cpu_to_be16(ETH_P_8021Q))
 		protocol = skb->protocol;
 	else {
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index fe6e7aac3c56..2342bf12cb78 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -121,7 +121,7 @@ static inline __be16 tc_skb_protocol(const struct sk_buff *skb)
 	 * vlan accelerated path. In that case, use skb->vlan_proto
 	 * as the original vlan header was already stripped.
 	 */
-	if (vlan_tx_tag_present(skb))
+	if (skb_vlan_tag_present(skb))
 		return skb->vlan_proto;
 	return skb->protocol;
 }
diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 1de256b35807..49cc7c3de252 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -40,9 +40,9 @@ TRACE_EVENT(net_dev_start_xmit,
 		__assign_str(name, dev->name);
 		__entry->queue_mapping = skb->queue_mapping;
 		__entry->skbaddr = skb;
-		__entry->vlan_tagged = vlan_tx_tag_present(skb);
+		__entry->vlan_tagged = skb_vlan_tag_present(skb);
 		__entry->vlan_proto = ntohs(skb->vlan_proto);
-		__entry->vlan_tci = vlan_tx_tag_get(skb);
+		__entry->vlan_tci = skb_vlan_tag_get(skb);
 		__entry->protocol = ntohs(skb->protocol);
 		__entry->ip_summed = skb->ip_summed;
 		__entry->len = skb->len;
@@ -174,9 +174,9 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
 #endif
 		__entry->queue_mapping = skb->queue_mapping;
 		__entry->skbaddr = skb;
-		__entry->vlan_tagged = vlan_tx_tag_present(skb);
+		__entry->vlan_tagged = skb_vlan_tag_present(skb);
 		__entry->vlan_proto = ntohs(skb->vlan_proto);
-		__entry->vlan_tci = vlan_tx_tag_get(skb);
+		__entry->vlan_tci = skb_vlan_tag_get(skb);
 		__entry->protocol = ntohs(skb->protocol);
 		__entry->ip_summed = skb->ip_summed;
 		__entry->hash = skb->hash;
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 90cc2bdd4064..61bf2a06e85d 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -9,7 +9,7 @@ bool vlan_do_receive(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	__be16 vlan_proto = skb->vlan_proto;
-	u16 vlan_id = vlan_tx_tag_get_id(skb);
+	u16 vlan_id = skb_vlan_tag_get_id(skb);
 	struct net_device *vlan_dev;
 	struct vlan_pcpu_stats *rx_stats;
 
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index c190d22b6b3d..65728e0dc4ff 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -66,17 +66,17 @@ static int brnf_pass_vlan_indev __read_mostly = 0;
 #endif
 
 #define IS_IP(skb) \
-	(!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
+	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
 
 #define IS_IPV6(skb) \
-	(!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6))
+	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6))
 
 #define IS_ARP(skb) \
-	(!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_ARP))
+	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP))
 
 static inline __be16 vlan_proto(const struct sk_buff *skb)
 {
-	if (vlan_tx_tag_present(skb))
+	if (skb_vlan_tag_present(skb))
 		return skb->protocol;
 	else if (skb->protocol == htons(ETH_P_8021Q))
 		return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
@@ -436,11 +436,11 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct
 	struct net_device *vlan, *br;
 
 	br = bridge_parent(dev);
-	if (brnf_pass_vlan_indev == 0 || !vlan_tx_tag_present(skb))
+	if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
 		return br;
 
 	vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
-				    vlan_tx_tag_get(skb) & VLAN_VID_MASK);
+				    skb_vlan_tag_get(skb) & VLAN_VID_MASK);
 
 	return vlan ? vlan : br;
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index aea3d1339b3f..d808d766334d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -628,8 +628,8 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid)
 {
 	int err = 0;
 
-	if (vlan_tx_tag_present(skb))
-		*vid = vlan_tx_tag_get(skb) & VLAN_VID_MASK;
+	if (skb_vlan_tag_present(skb))
+		*vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK;
 	else {
 		*vid = 0;
 		err = -EINVAL;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 97b8ddf57363..13013fe8db24 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -187,7 +187,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
 	 * sent from vlan device on the bridge device, it does not have
 	 * HW accelerated vlan tag.
 	 */
-	if (unlikely(!vlan_tx_tag_present(skb) &&
+	if (unlikely(!skb_vlan_tag_present(skb) &&
 		     skb->protocol == proto)) {
 		skb = skb_vlan_untag(skb);
 		if (unlikely(!skb))
@@ -200,7 +200,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
 			/* Protocol-mismatch, empty out vlan_tci for new tag */
 			skb_push(skb, ETH_HLEN);
 			skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
-							vlan_tx_tag_get(skb));
+							skb_vlan_tag_get(skb));
 			if (unlikely(!skb))
 				return false;
 
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index 8d3f8c7651f0..618568888128 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -45,8 +45,8 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	/* VLAN encapsulated Type/Length field, given from orig frame */
 	__be16 encap;
 
-	if (vlan_tx_tag_present(skb)) {
-		TCI = vlan_tx_tag_get(skb);
+	if (skb_vlan_tag_present(skb)) {
+		TCI = skb_vlan_tag_get(skb);
 		encap = skb->protocol;
 	} else {
 		const struct vlan_hdr *fp;
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index d9a8c05d995d..91180a7fc943 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -133,7 +133,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
 	__be16 ethproto;
 	int verdict, i;
 
-	if (vlan_tx_tag_present(skb))
+	if (skb_vlan_tag_present(skb))
 		ethproto = htons(ETH_P_8021Q);
 	else
 		ethproto = h->h_proto;
diff --git a/net/core/dev.c b/net/core/dev.c
index 805456147c30..1e325adc4367 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2578,7 +2578,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
 	if (skb->encapsulation)
 		features &= dev->hw_enc_features;
 
-	if (!vlan_tx_tag_present(skb)) {
+	if (!skb_vlan_tag_present(skb)) {
 		if (unlikely(protocol == htons(ETH_P_8021Q) ||
 			     protocol == htons(ETH_P_8021AD))) {
 			struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
@@ -2659,7 +2659,7 @@ out:
 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 					  netdev_features_t features)
 {
-	if (vlan_tx_tag_present(skb) &&
+	if (skb_vlan_tag_present(skb) &&
 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 		skb = __vlan_hwaccel_push_inside(skb);
 	return skb;
@@ -3676,7 +3676,7 @@ ncls:
 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 		goto drop;
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		if (pt_prev) {
 			ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = NULL;
@@ -3708,8 +3708,8 @@ ncls:
 		}
 	}
 
-	if (unlikely(vlan_tx_tag_present(skb))) {
-		if (vlan_tx_tag_get_id(skb))
+	if (unlikely(skb_vlan_tag_present(skb))) {
+		if (skb_vlan_tag_get_id(skb))
 			skb->pkt_type = PACKET_OTHERHOST;
 		/* Note: we might in the future use prio bits
 		 * and set skb->priority like in vlan_do_receive()
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e0ad5d16c9c5..c126a878c47c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -77,7 +77,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
 
 	features = netif_skb_features(skb);
 
-	if (vlan_tx_tag_present(skb) &&
+	if (skb_vlan_tag_present(skb) &&
 	    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
 		skb = __vlan_hwaccel_push_inside(skb);
 		if (unlikely(!skb)) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5a2a2e887a12..56db472e9b86 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4197,7 +4197,7 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
 	struct vlan_hdr *vhdr;
 	u16 vlan_tci;
 
-	if (unlikely(vlan_tx_tag_present(skb))) {
+	if (unlikely(skb_vlan_tag_present(skb))) {
 		/* vlan_tci is already set-up so leave this for another time */
 		return skb;
 	}
@@ -4283,7 +4283,7 @@ int skb_vlan_pop(struct sk_buff *skb)
 	__be16 vlan_proto;
 	int err;
 
-	if (likely(vlan_tx_tag_present(skb))) {
+	if (likely(skb_vlan_tag_present(skb))) {
 		skb->vlan_tci = 0;
 	} else {
 		if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
@@ -4313,7 +4313,7 @@ EXPORT_SYMBOL(skb_vlan_pop);
 
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
 {
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		unsigned int offset = skb->data - skb_mac_header(skb);
 		int err;
 
@@ -4323,7 +4323,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
 		 */
 		__skb_push(skb, offset);
 		err = __vlan_insert_tag(skb, skb->vlan_proto,
-					vlan_tx_tag_get(skb));
+					skb_vlan_tag_get(skb));
 		if (err)
 			return err;
 		skb->protocol = skb->vlan_proto;
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 5b52046ec7a2..23744c7a9718 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -119,7 +119,7 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 
 	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
 			+ GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
-			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+			+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
 	err = skb_cow_head(skb, min_headroom);
 	if (unlikely(err)) {
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 770064c83711..b4cffe686126 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -212,7 +212,7 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
 	int err;
 
 	err = skb_vlan_pop(skb);
-	if (vlan_tx_tag_present(skb))
+	if (skb_vlan_tag_present(skb))
 		invalidate_flow_key(key);
 	else
 		key->eth.tci = 0;
@@ -222,7 +222,7 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
 static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
 		     const struct ovs_action_push_vlan *vlan)
 {
-	if (vlan_tx_tag_present(skb))
+	if (skb_vlan_tag_present(skb))
 		invalidate_flow_key(key);
 	else
 		key->eth.tci = vlan->vlan_tci;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 4e9a5f035cbc..54854e3ecd83 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -419,7 +419,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 	if (!dp_ifindex)
 		return -ENODEV;
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		nskb = skb_clone(skb, GFP_ATOMIC);
 		if (!nskb)
 			return -ENOMEM;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index da2fae0873a5..df334fe43d7f 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -70,7 +70,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
 {
 	struct flow_stats *stats;
 	int node = numa_node_id();
-	int len = skb->len + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+	int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
 	stats = rcu_dereference(flow->stats[node]);
 
@@ -472,7 +472,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 	 */
 
 	key->eth.tci = 0;
-	if (vlan_tx_tag_present(skb))
+	if (skb_vlan_tag_present(skb))
 		key->eth.tci = htons(skb->vlan_tci);
 	else if (eth->h_proto == htons(ETH_P_8021Q))
 		if (unlikely(parse_vlan(skb, key)))
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index d4168c442db5..e9aedb7c7106 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -166,7 +166,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
 
 	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
 			+ tunnel_hlen + sizeof(struct iphdr)
-			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+			+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
 		int head_delta = SKB_DATA_ALIGN(min_headroom -
 						skb_headroom(skb) +
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 2034c6d9cb5a..464739aac0f3 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -480,7 +480,8 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
 	stats = this_cpu_ptr(vport->percpu_stats);
 	u64_stats_update_begin(&stats->syncp);
 	stats->rx_packets++;
-	stats->rx_bytes += skb->len + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+	stats->rx_bytes += skb->len +
+			   (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 	u64_stats_update_end(&stats->syncp);
 
 	OVS_CB(skb)->input_vport = vport;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0f02668dc219..d37075b0d6d5 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -986,8 +986,8 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
 			struct tpacket3_hdr *ppd)
 {
-	if (vlan_tx_tag_present(pkc->skb)) {
-		ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
+	if (skb_vlan_tag_present(pkc->skb)) {
+		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
 		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
 		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
 	} else {
@@ -2000,8 +2000,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h2->tp_net = netoff;
 		h.h2->tp_sec = ts.tv_sec;
 		h.h2->tp_nsec = ts.tv_nsec;
-		if (vlan_tx_tag_present(skb)) {
-			h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
+		if (skb_vlan_tag_present(skb)) {
+			h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
 			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
 			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
 		} else {
@@ -3010,8 +3010,8 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		aux.tp_snaplen = skb->len;
 		aux.tp_mac = 0;
 		aux.tp_net = skb_network_offset(skb);
-		if (vlan_tx_tag_present(skb)) {
-			aux.tp_vlan_tci = vlan_tx_tag_get(skb);
+		if (skb_vlan_tag_present(skb)) {
+			aux.tp_vlan_tci = skb_vlan_tag_get(skb);
 			aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
 			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
 		} else {
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 2159981b604e..b5294ce20cd4 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -176,7 +176,7 @@ META_COLLECTOR(int_vlan_tag)
 {
 	unsigned short tag;
 
-	tag = vlan_tx_tag_get(skb);
+	tag = skb_vlan_tag_get(skb);
 	if (!tag && __vlan_get_tag(skb, &tag))
 		*err = -1;
 	else
diff --git a/net/wireless/util.c b/net/wireless/util.c
index d0ac795445b7..1d2fcfad06cc 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -708,8 +708,8 @@ unsigned int cfg80211_classify8021d(struct sk_buff *skb,
 	if (skb->priority >= 256 && skb->priority <= 263)
 		return skb->priority - 256;
 
-	if (vlan_tx_tag_present(skb)) {
-		vlan_priority = (vlan_tx_tag_get(skb) & VLAN_PRIO_MASK)
+	if (skb_vlan_tag_present(skb)) {
+		vlan_priority = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK)
 			>> VLAN_PRIO_SHIFT;
 		if (vlan_priority > 0)
 			return vlan_priority;
-- 
cgit v1.2.3


From a2b12f3c7ac1ea43ae646db74faf0b56c2bba563 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Mon, 12 Jan 2015 17:00:37 -0800
Subject: udp: pass udp_offload struct to UDP gro callbacks

This patch introduces udp_offload_callbacks which has the same
GRO functions (but not a GSO function) as offload_callbacks,
except there is an argument to a udp_offload struct passed to
gro_receive and gro_complete functions. This additional argument
can be used to retrieve the per port structure of the encapsulation
for use in gro processing (mostly by doing container_of on the
structure).

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c       |  7 +++++--
 include/linux/netdevice.h | 15 +++++++++++++--
 net/ipv4/fou.c            | 12 ++++++++----
 net/ipv4/geneve.c         |  6 ++++--
 net/ipv4/udp_offload.c    |  7 +++++--
 5 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 985359dd6033..5c56a3ff25aa 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -539,7 +539,9 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 	return 1;
 }
 
-static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
+					  struct sk_buff *skb,
+					  struct udp_offload *uoff)
 {
 	struct sk_buff *p, **pp = NULL;
 	struct vxlanhdr *vh, *vh2;
@@ -578,7 +580,8 @@ out:
 	return pp;
 }
 
-static int vxlan_gro_complete(struct sk_buff *skb, int nhoff)
+static int vxlan_gro_complete(struct sk_buff *skb, int nhoff,
+			      struct udp_offload *uoff)
 {
 	udp_tunnel_gro_complete(skb, nhoff);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 679e6e90aa4c..47921c291dd6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1969,7 +1969,7 @@ struct offload_callbacks {
 	struct sk_buff		*(*gso_segment)(struct sk_buff *skb,
 						netdev_features_t features);
 	struct sk_buff		**(*gro_receive)(struct sk_buff **head,
-					       struct sk_buff *skb);
+						 struct sk_buff *skb);
 	int			(*gro_complete)(struct sk_buff *skb, int nhoff);
 };
 
@@ -1979,10 +1979,21 @@ struct packet_offload {
 	struct list_head	 list;
 };
 
+struct udp_offload;
+
+struct udp_offload_callbacks {
+	struct sk_buff		**(*gro_receive)(struct sk_buff **head,
+						 struct sk_buff *skb,
+						 struct udp_offload *uoff);
+	int			(*gro_complete)(struct sk_buff *skb,
+						int nhoff,
+						struct udp_offload *uoff);
+};
+
 struct udp_offload {
 	__be16			 port;
 	u8			 ipproto;
-	struct offload_callbacks callbacks;
+	struct udp_offload_callbacks callbacks;
 };
 
 /* often modified stats are per cpu, other are shared (netdev->stats) */
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 2197c36f722f..3bc0cf07661c 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -174,7 +174,8 @@ drop:
 }
 
 static struct sk_buff **fou_gro_receive(struct sk_buff **head,
-					struct sk_buff *skb)
+					struct sk_buff *skb,
+					struct udp_offload *uoff)
 {
 	const struct net_offload *ops;
 	struct sk_buff **pp = NULL;
@@ -195,7 +196,8 @@ out_unlock:
 	return pp;
 }
 
-static int fou_gro_complete(struct sk_buff *skb, int nhoff)
+static int fou_gro_complete(struct sk_buff *skb, int nhoff,
+			    struct udp_offload *uoff)
 {
 	const struct net_offload *ops;
 	u8 proto = NAPI_GRO_CB(skb)->proto;
@@ -254,7 +256,8 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 }
 
 static struct sk_buff **gue_gro_receive(struct sk_buff **head,
-					struct sk_buff *skb)
+					struct sk_buff *skb,
+					struct udp_offload *uoff)
 {
 	const struct net_offload **offloads;
 	const struct net_offload *ops;
@@ -360,7 +363,8 @@ out:
 	return pp;
 }
 
-static int gue_gro_complete(struct sk_buff *skb, int nhoff)
+static int gue_gro_complete(struct sk_buff *skb, int nhoff,
+			    struct udp_offload *uoff)
 {
 	const struct net_offload **offloads;
 	struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 23744c7a9718..9568594ca2f1 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -147,7 +147,8 @@ static int geneve_hlen(struct genevehdr *gh)
 }
 
 static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
-					   struct sk_buff *skb)
+					   struct sk_buff *skb,
+					   struct udp_offload *uoff)
 {
 	struct sk_buff *p, **pp = NULL;
 	struct genevehdr *gh, *gh2;
@@ -211,7 +212,8 @@ out:
 	return pp;
 }
 
-static int geneve_gro_complete(struct sk_buff *skb, int nhoff)
+static int geneve_gro_complete(struct sk_buff *skb, int nhoff,
+			       struct udp_offload *uoff)
 {
 	struct genevehdr *gh;
 	struct packet_offload *ptype;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index d3e537ef6b7f..d10f6f4ead27 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -339,7 +339,8 @@ unflush:
 	skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
 	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
 	NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
-	pp = uo_priv->offload->callbacks.gro_receive(head, skb);
+	pp = uo_priv->offload->callbacks.gro_receive(head, skb,
+						     uo_priv->offload);
 
 out_unlock:
 	rcu_read_unlock();
@@ -395,7 +396,9 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
 
 	if (uo_priv != NULL) {
 		NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
-		err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr));
+		err = uo_priv->offload->callbacks.gro_complete(skb,
+				nhoff + sizeof(struct udphdr),
+				uo_priv->offload);
 	}
 
 	rcu_read_unlock();
-- 
cgit v1.2.3


From 5055c371bfd53fd369b895051b541318c2bad495 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 14 Jan 2015 15:17:06 -0800
Subject: ipv4: per cpu uncached list

RAW sockets with hdrinc suffer from contention on rt_uncached_lock
spinlock.

One solution is to use percpu lists, since most routes are destroyed
by the cpu that created them.

It is unclear why we even have to put these routes in uncached_list,
as all outgoing packets should be freed when a device is dismantled.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Fixes: caacf05e5ad1 ("ipv4: Properly purge netdev references on uncached routes.")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h |  2 ++
 net/ipv4/route.c    | 46 +++++++++++++++++++++++++++++++++-------------
 2 files changed, 35 insertions(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index b17cf28f996e..fe22d03afb6a 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -46,6 +46,7 @@
 
 struct fib_nh;
 struct fib_info;
+struct uncached_list;
 struct rtable {
 	struct dst_entry	dst;
 
@@ -64,6 +65,7 @@ struct rtable {
 	u32			rt_pmtu;
 
 	struct list_head	rt_uncached;
+	struct uncached_list	*rt_uncached_list;
 };
 
 static inline bool rt_is_input_route(const struct rtable *rt)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6a2155b02602..ce112d0f2698 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1325,14 +1325,22 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 	return ret;
 }
 
-static DEFINE_SPINLOCK(rt_uncached_lock);
-static LIST_HEAD(rt_uncached_list);
+struct uncached_list {
+	spinlock_t		lock;
+	struct list_head	head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
 
 static void rt_add_uncached_list(struct rtable *rt)
 {
-	spin_lock_bh(&rt_uncached_lock);
-	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
-	spin_unlock_bh(&rt_uncached_lock);
+	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
+
+	rt->rt_uncached_list = ul;
+
+	spin_lock_bh(&ul->lock);
+	list_add_tail(&rt->rt_uncached, &ul->head);
+	spin_unlock_bh(&ul->lock);
 }
 
 static void ipv4_dst_destroy(struct dst_entry *dst)
@@ -1340,27 +1348,32 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 	struct rtable *rt = (struct rtable *) dst;
 
 	if (!list_empty(&rt->rt_uncached)) {
-		spin_lock_bh(&rt_uncached_lock);
+		struct uncached_list *ul = rt->rt_uncached_list;
+
+		spin_lock_bh(&ul->lock);
 		list_del(&rt->rt_uncached);
-		spin_unlock_bh(&rt_uncached_lock);
+		spin_unlock_bh(&ul->lock);
 	}
 }
 
 void rt_flush_dev(struct net_device *dev)
 {
-	if (!list_empty(&rt_uncached_list)) {
-		struct net *net = dev_net(dev);
-		struct rtable *rt;
+	struct net *net = dev_net(dev);
+	struct rtable *rt;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
 
-		spin_lock_bh(&rt_uncached_lock);
-		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
+		spin_lock_bh(&ul->lock);
+		list_for_each_entry(rt, &ul->head, rt_uncached) {
 			if (rt->dst.dev != dev)
 				continue;
 			rt->dst.dev = net->loopback_dev;
 			dev_hold(rt->dst.dev);
 			dev_put(dev);
 		}
-		spin_unlock_bh(&rt_uncached_lock);
+		spin_unlock_bh(&ul->lock);
 	}
 }
 
@@ -2717,6 +2730,7 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
 int __init ip_rt_init(void)
 {
 	int rc = 0;
+	int cpu;
 
 	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
 	if (!ip_idents)
@@ -2724,6 +2738,12 @@ int __init ip_rt_init(void)
 
 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
 
+	for_each_possible_cpu(cpu) {
+		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+
+		INIT_LIST_HEAD(&ul->head);
+		spin_lock_init(&ul->lock);
+	}
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
 	if (!ip_rt_acct)
-- 
cgit v1.2.3


From 053c095a82cf773075e83d7233b5cc19a1f73ece Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 16 Jan 2015 22:09:00 +0100
Subject: netlink: make nlmsg_end() and genlmsg_end() void

Contrary to common expectations for an "int" return, these functions
return only a positive value -- if used correctly they cannot even
return 0 because the message header will necessarily be in the skb.

This makes the very common pattern of

  if (genlmsg_end(...) < 0) { ... }

be a whole bunch of dead code. Many places also simply do

  return nlmsg_end(...);

and the caller is expected to deal with it.

This also commonly (at least for me) causes errors, because it is very
common to write

  if (my_function(...))
    /* error condition */

and if my_function() does "return nlmsg_end()" this is of course wrong.

Additionally, there's not a single place in the kernel that actually
needs the message length returned, and if anyone needs it later then
it'll be very easy to just use skb->len there.

Remove this, and make the functions void. This removes a bunch of dead
code as described above. The patch adds lines because I did

-	return nlmsg_end(...);
+	nlmsg_end(...);
+	return 0;

I could have preserved all the function's return values by returning
skb->len, but instead I've audited all the places calling the affected
functions and found that none cared. A few places actually compared
the return value with <= 0 in dump functionality, but that could just
be changed to < 0 with no change in behaviour, so I opted for the more
efficient version.

One instance of the error I've made numerous times now is also present
in net/phonet/pn_netlink.c in the route_dumpit() function - it didn't
check for <0 or <=0 and thus broke out of the loop every single time.
I've preserved this since it will (I think) have caused the messages to
userspace to be formatted differently with just a single message for
every SKB returned to userspace. It's possible that this isn't needed
for the tools that actually use this, but I don't even know what they
are so couldn't test that changing this behaviour would be acceptable.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/acpi/event.c                  |  7 +------
 drivers/net/ethernet/rocker/rocker.c  |  3 ++-
 drivers/net/vxlan.c                   |  3 ++-
 drivers/net/wireless/mac80211_hwsim.c |  3 ++-
 drivers/scsi/pmcraid.c                |  8 +-------
 drivers/target/target_core_user.c     |  4 +---
 drivers/thermal/thermal_core.c        |  6 +-----
 fs/dlm/netlink.c                      |  7 +------
 include/net/genetlink.h               |  4 ++--
 include/net/netlink.h                 |  6 +-----
 kernel/taskstats.c                    | 13 ++-----------
 net/bridge/br_fdb.c                   |  3 ++-
 net/bridge/br_mdb.c                   |  3 ++-
 net/bridge/br_netlink.c               |  3 ++-
 net/can/gw.c                          |  3 ++-
 net/core/fib_rules.c                  |  3 ++-
 net/core/neighbour.c                  | 12 ++++++++----
 net/core/rtnetlink.c                  |  9 ++++++---
 net/decnet/dn_dev.c                   |  3 ++-
 net/decnet/dn_route.c                 |  3 ++-
 net/decnet/dn_table.c                 |  3 ++-
 net/ieee802154/netlink.c              | 12 ++----------
 net/ieee802154/nl-mac.c               |  3 ++-
 net/ieee802154/nl-phy.c               |  3 ++-
 net/ieee802154/nl802154.c             |  6 ++++--
 net/ipv4/devinet.c                    |  8 +++++---
 net/ipv4/fib_semantics.c              |  3 ++-
 net/ipv4/inet_diag.c                  |  9 ++++++---
 net/ipv4/ipmr.c                       |  3 ++-
 net/ipv4/route.c                      |  3 ++-
 net/ipv4/tcp_metrics.c                |  3 ++-
 net/ipv6/addrconf.c                   | 32 +++++++++++++++++++-------------
 net/ipv6/addrlabel.c                  |  5 +++--
 net/ipv6/ip6_fib.c                    |  1 -
 net/ipv6/ip6mr.c                      |  3 ++-
 net/ipv6/route.c                      |  3 ++-
 net/l2tp/l2tp_netlink.c               | 10 ++++++----
 net/netfilter/ipvs/ip_vs_ctl.c        |  9 ++++++---
 net/netfilter/nf_tables_api.c         | 18 ++++++++++++------
 net/netlabel/netlabel_cipso_v4.c      |  3 ++-
 net/netlabel/netlabel_mgmt.c          |  6 ++++--
 net/netlabel/netlabel_unlabeled.c     |  3 ++-
 net/netlink/diag.c                    |  3 ++-
 net/netlink/genetlink.c               |  6 ++++--
 net/nfc/netlink.c                     | 12 +++++++-----
 net/openvswitch/datapath.c            |  9 ++++++---
 net/packet/diag.c                     |  3 ++-
 net/phonet/pn_netlink.c               | 16 +++++++++++-----
 net/unix/diag.c                       |  3 ++-
 net/wireless/nl80211.c                | 27 ++++++++++++++++++---------
 net/xfrm/xfrm_user.c                  | 27 ++++++++++++++++++---------
 51 files changed, 203 insertions(+), 158 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c
index ef2d730734dc..e24ea4e796e4 100644
--- a/drivers/acpi/event.c
+++ b/drivers/acpi/event.c
@@ -100,7 +100,6 @@ int acpi_bus_generate_netlink_event(const char *device_class,
 	struct acpi_genl_event *event;
 	void *msg_header;
 	int size;
-	int result;
 
 	/* allocate memory */
 	size = nla_total_size(sizeof(struct acpi_genl_event)) +
@@ -137,11 +136,7 @@ int acpi_bus_generate_netlink_event(const char *device_class,
 	event->data = data;
 
 	/* send multicast genetlink message */
-	result = genlmsg_end(skb, msg_header);
-	if (result < 0) {
-		nlmsg_free(skb);
-		return result;
-	}
+	genlmsg_end(skb, msg_header);
 
 	genlmsg_multicast(&acpi_event_genl_family, skb, 0, 0, GFP_ATOMIC);
 	return 0;
diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 964d719b150f..d54781e71cd4 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -3674,7 +3674,8 @@ static int rocker_fdb_fill_info(struct sk_buff *skb,
 	if (vid && nla_put_u16(skb, NDA_VLAN, vid))
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 6b6b45622a0a..c5f79e7513a6 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -363,7 +363,8 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 494e7335aa64..4a4c6586a8d2 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2557,7 +2557,8 @@ static int mac80211_hwsim_get_radio(struct sk_buff *skb,
 	if (res < 0)
 		goto out_err;
 
-	return genlmsg_end(skb, hdr);
+	genlmsg_end(skb, hdr);
+	return 0;
 
 out_err:
 	genlmsg_cancel(skb, hdr);
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index 8c27b6a77ec4..cf222f46eac5 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -1473,13 +1473,7 @@ static int pmcraid_notify_aen(
 	}
 
 	/* send genetlink multicast message to notify appplications */
-	result = genlmsg_end(skb, msg_header);
-
-	if (result < 0) {
-		pmcraid_err("genlmsg_end failed\n");
-		nlmsg_free(skb);
-		return result;
-	}
+	genlmsg_end(skb, msg_header);
 
 	result = genlmsg_multicast(&pmcraid_event_family, skb,
 				   0, 0, GFP_ATOMIC);
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 1157b559683b..1a1bcf71ec9d 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -784,9 +784,7 @@ static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, int mino
 	if (ret < 0)
 		goto free_skb;
 
-	ret = genlmsg_end(skb, msg_header);
-	if (ret < 0)
-		goto free_skb;
+	genlmsg_end(skb, msg_header);
 
 	ret = genlmsg_multicast(&tcmu_genl_family, skb, 0,
 				TCMU_MCGRP_CONFIG, GFP_KERNEL);
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 87e0b0782023..48491d1a81d6 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1759,11 +1759,7 @@ int thermal_generate_netlink_event(struct thermal_zone_device *tz,
 	thermal_event->event = event;
 
 	/* send multicast genetlink message */
-	result = genlmsg_end(skb, msg_header);
-	if (result < 0) {
-		nlmsg_free(skb);
-		return result;
-	}
+	genlmsg_end(skb, msg_header);
 
 	result = genlmsg_multicast(&thermal_event_genl_family, skb, 0,
 				   0, GFP_ATOMIC);
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index e7cfbaf8d0e2..1e6e227134d7 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -56,13 +56,8 @@ static int send_data(struct sk_buff *skb)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
 	void *data = genlmsg_data(genlhdr);
-	int rv;
 
-	rv = genlmsg_end(skb, data);
-	if (rv < 0) {
-		nlmsg_free(skb);
-		return rv;
-	}
+	genlmsg_end(skb, data);
 
 	return genlmsg_unicast(&init_net, skb, listener_nlportid);
 }
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 84125088c309..f24aa83b80b6 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -245,9 +245,9 @@ static inline void *genlmsg_put_reply(struct sk_buff *skb,
  * @skb: socket buffer the message is stored in
  * @hdr: user specific header
  */
-static inline int genlmsg_end(struct sk_buff *skb, void *hdr)
+static inline void genlmsg_end(struct sk_buff *skb, void *hdr)
 {
-	return nlmsg_end(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN);
+	nlmsg_end(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN);
 }
 
 /**
diff --git a/include/net/netlink.h b/include/net/netlink.h
index d5869b90bfbb..e010ee8da41d 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -490,14 +490,10 @@ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags)
  * Corrects the netlink message header to include the appeneded
  * attributes. Only necessary if attributes have been added to
  * the message.
- *
- * Returns the total data length of the skb.
  */
-static inline int nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh)
+static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	nlh->nlmsg_len = skb_tail_pointer(skb) - (unsigned char *)nlh;
-
-	return skb->len;
 }
 
 /**
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 670fff88a961..21f82c29c914 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -111,13 +111,8 @@ static int send_reply(struct sk_buff *skb, struct genl_info *info)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
 	void *reply = genlmsg_data(genlhdr);
-	int rc;
 
-	rc = genlmsg_end(skb, reply);
-	if (rc < 0) {
-		nlmsg_free(skb);
-		return rc;
-	}
+	genlmsg_end(skb, reply);
 
 	return genlmsg_reply(skb, info);
 }
@@ -134,11 +129,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
 	void *reply = genlmsg_data(genlhdr);
 	int rc, delcount = 0;
 
-	rc = genlmsg_end(skb, reply);
-	if (rc < 0) {
-		nlmsg_free(skb);
-		return;
-	}
+	genlmsg_end(skb, reply);
 
 	rc = 0;
 	down_read(&listeners->sem);
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 03667e65cc29..08bf04bdac58 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -633,7 +633,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
 	if (fdb->vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id))
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index fed61c971b17..409608960899 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -190,7 +190,8 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
 
 	nla_nest_end(skb, nest2);
 	nla_nest_end(skb, nest);
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 end:
 	nla_nest_end(skb, nest);
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 163950b10d8c..528cf2790a5f 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -263,7 +263,8 @@ static int br_fill_ifinfo(struct sk_buff *skb,
 	}
 
 done:
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/can/gw.c b/net/can/gw.c
index 295f62e62eb3..a6f448e18ea8 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -575,7 +575,8 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
 			goto cancel;
 	}
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 cancel:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 185c341fafbd..44706e81b2e0 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -609,7 +609,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	if (ops->fill(rule, skb, frh) < 0)
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 8d614c93f86a..d36d564f149f 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1884,7 +1884,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
 		goto nla_put_failure;
 
 	read_unlock_bh(&tbl->lock);
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	read_unlock_bh(&tbl->lock);
@@ -1917,7 +1918,8 @@ static int neightbl_fill_param_info(struct sk_buff *skb,
 		goto errout;
 
 	read_unlock_bh(&tbl->lock);
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 errout:
 	read_unlock_bh(&tbl->lock);
 	nlmsg_cancel(skb, nlh);
@@ -2202,7 +2204,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
 	    nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
@@ -2232,7 +2235,8 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
 	if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index eadc5c0e2dfa..e13b9dbdf154 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1199,7 +1199,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 
 	nla_nest_end(skb, af_spec);
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
@@ -2326,7 +2327,8 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
 	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
@@ -2809,7 +2811,8 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 
 	nla_nest_end(skb, protinfo);
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
 	return -EMSGSIZE;
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 4400da7739da..b2c26b081134 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -702,7 +702,8 @@ static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa,
 	     nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
 	     nla_put_u32(skb, IFA_FLAGS, ifa_flags))
 		goto nla_put_failure;
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index daccc4a36d80..812e5e6e88fb 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1616,7 +1616,8 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
 	    nla_put_u32(skb, RTA_IIF, rt->fld.flowidn_iif) < 0)
 		goto errout;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 errout:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 3f19fcbf126d..1540b506e3e0 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -367,7 +367,8 @@ static int dn_fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		nla_nest_end(skb, mp_head);
 	}
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 errout:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index fa1464762d0d..c8133c07ceee 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -63,13 +63,9 @@ int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group)
 	struct nlmsghdr *nlh = nlmsg_hdr(msg);
 	void *hdr = genlmsg_data(nlmsg_data(nlh));
 
-	if (genlmsg_end(msg, hdr) < 0)
-		goto out;
+	genlmsg_end(msg, hdr);
 
 	return genlmsg_multicast(&nl802154_family, msg, 0, group, GFP_ATOMIC);
-out:
-	nlmsg_free(msg);
-	return -ENOBUFS;
 }
 
 struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info,
@@ -96,13 +92,9 @@ int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info)
 	struct nlmsghdr *nlh = nlmsg_hdr(msg);
 	void *hdr = genlmsg_data(nlmsg_data(nlh));
 
-	if (genlmsg_end(msg, hdr) < 0)
-		goto out;
+	genlmsg_end(msg, hdr);
 
 	return genlmsg_reply(msg, info);
-out:
-	nlmsg_free(msg);
-	return -ENOBUFS;
 }
 
 static const struct genl_ops ieee8021154_ops[] = {
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index 3c902e9516fb..9105265920fe 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -136,7 +136,8 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid,
 	}
 
 	wpan_phy_put(phy);
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
 nla_put_failure:
 	wpan_phy_put(phy);
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index 7baf98b14611..1b9d25f6e898 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -65,7 +65,8 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
 		goto nla_put_failure;
 	mutex_unlock(&phy->pib_lock);
 	kfree(buf);
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
 nla_put_failure:
 	mutex_unlock(&phy->pib_lock);
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index a25b9bbd077b..a4daf91b8d0a 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -306,7 +306,8 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev,
 		goto nla_put_failure;
 
 finish:
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -489,7 +490,8 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
 	if (nla_put_u8(msg, NL802154_ATTR_LBT_MODE, wpan_dev->lbt))
 		goto nla_put_failure;
 
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(msg, hdr);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 214882e7d6de..5f344eb3fc25 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1522,7 +1522,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
 			  preferred, valid))
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
@@ -1566,7 +1567,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 				if (inet_fill_ifaddr(skb, ifa,
 					     NETLINK_CB(cb->skb).portid,
 					     cb->nlh->nlmsg_seq,
-					     RTM_NEWADDR, NLM_F_MULTI) <= 0) {
+					     RTM_NEWADDR, NLM_F_MULTI) < 0) {
 					rcu_read_unlock();
 					goto done;
 				}
@@ -1749,7 +1750,8 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d2b7b5521b1b..265cb72b7c1b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1091,7 +1091,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		nla_nest_end(skb, mp);
 	}
 #endif
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e34dccbc4d70..81751f12645f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -203,7 +203,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		icsk->icsk_ca_ops->get_info(sk, ext, skb);
 
 out:
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 errout:
 	nlmsg_cancel(skb, nlh);
@@ -271,7 +272,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 	}
 #endif
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 }
 
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
@@ -758,7 +760,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	}
 #endif
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 }
 
 static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c8034587859d..9d78427652d2 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2290,7 +2290,8 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 	if (err < 0 && err != -ENOENT)
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ce112d0f2698..f6e43ca5e641 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2390,7 +2390,8 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index ed9c9a91851c..e5f41bd5ec1b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -886,7 +886,8 @@ static int tcp_metrics_dump_info(struct sk_buff *skb,
 	if (tcp_metrics_fill_info(skb, tm) < 0)
 		goto nla_put_failure;
 
-	return genlmsg_end(skb, hdr);
+	genlmsg_end(skb, hdr);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(skb, hdr);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f7c8bbeb27b7..8975d9501d50 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -489,7 +489,8 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0)
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
@@ -619,7 +620,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb,
 						       cb->nlh->nlmsg_seq,
 						       RTM_NEWNETCONF,
 						       NLM_F_MULTI,
-						       -1) <= 0) {
+						       -1) < 0) {
 				rcu_read_unlock();
 				goto done;
 			}
@@ -635,7 +636,7 @@ cont:
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq,
 					       RTM_NEWNETCONF, NLM_F_MULTI,
-					       -1) <= 0)
+					       -1) < 0)
 			goto done;
 		else
 			h++;
@@ -646,7 +647,7 @@ cont:
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq,
 					       RTM_NEWNETCONF, NLM_F_MULTI,
-					       -1) <= 0)
+					       -1) < 0)
 			goto done;
 		else
 			h++;
@@ -4047,7 +4048,8 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
 	if (nla_put_u32(skb, IFA_FLAGS, ifa->flags) < 0)
 		goto error;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 error:
 	nlmsg_cancel(skb, nlh);
@@ -4076,7 +4078,8 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
 		return -EMSGSIZE;
 	}
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 }
 
 static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
@@ -4101,7 +4104,8 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
 		return -EMSGSIZE;
 	}
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 }
 
 enum addr_type_t {
@@ -4134,7 +4138,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
 						cb->nlh->nlmsg_seq,
 						RTM_NEWADDR,
 						NLM_F_MULTI);
-			if (err <= 0)
+			if (err < 0)
 				break;
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 		}
@@ -4151,7 +4155,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
 						  cb->nlh->nlmsg_seq,
 						  RTM_GETMULTICAST,
 						  NLM_F_MULTI);
-			if (err <= 0)
+			if (err < 0)
 				break;
 		}
 		break;
@@ -4166,7 +4170,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
 						  cb->nlh->nlmsg_seq,
 						  RTM_GETANYCAST,
 						  NLM_F_MULTI);
-			if (err <= 0)
+			if (err < 0)
 				break;
 		}
 		break;
@@ -4638,7 +4642,8 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
 		goto nla_put_failure;
 
 	nla_nest_end(skb, protoinfo);
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
@@ -4670,7 +4675,7 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 			if (inet6_fill_ifinfo(skb, idev,
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
-					      RTM_NEWLINK, NLM_F_MULTI) <= 0)
+					      RTM_NEWLINK, NLM_F_MULTI) < 0)
 				goto out;
 cont:
 			idx++;
@@ -4747,7 +4752,8 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
 	ci.valid_time = ntohl(pinfo->valid);
 	if (nla_put(skb, PREFIX_CACHEINFO, sizeof(ci), &ci))
 		goto nla_put_failure;
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index fd0dc47f471d..e43e79d0a612 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -490,7 +490,8 @@ static int ip6addrlbl_fill(struct sk_buff *skb,
 		return -EMSGSIZE;
 	}
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 }
 
 static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
@@ -510,7 +511,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 					      cb->nlh->nlmsg_seq,
 					      RTM_NEWADDRLABEL,
 					      NLM_F_MULTI);
-			if (err <= 0)
+			if (err < 0)
 				break;
 		}
 		idx++;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 03c520a4ebeb..53775ee7d376 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -277,7 +277,6 @@ static int fib6_dump_node(struct fib6_walker *w)
 			w->leaf = rt;
 			return 1;
 		}
-		WARN_ON(res == 0);
 	}
 	w->leaf = NULL;
 	return 0;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 722669754bbf..34b682617f50 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2388,7 +2388,8 @@ static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
 	if (err < 0 && err != -ENOENT)
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 34dcbb59df75..c60f15775c53 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2725,7 +2725,8 @@ static int rt6_fill_node(struct net *net,
 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 6b16598f31d5..b4e923f77954 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -390,7 +390,8 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
 	}
 
 out:
-	return genlmsg_end(skb, hdr);
+	genlmsg_end(skb, hdr);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(skb, hdr);
@@ -451,7 +452,7 @@ static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback
 
 		if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).portid,
 					cb->nlh->nlmsg_seq, NLM_F_MULTI,
-					tunnel, L2TP_CMD_TUNNEL_GET) <= 0)
+					tunnel, L2TP_CMD_TUNNEL_GET) < 0)
 			goto out;
 
 		ti++;
@@ -752,7 +753,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
 		goto nla_put_failure;
 	nla_nest_end(skb, nest);
 
-	return genlmsg_end(skb, hdr);
+	genlmsg_end(skb, hdr);
+	return 0;
 
  nla_put_failure:
 	genlmsg_cancel(skb, hdr);
@@ -816,7 +818,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
 
 		if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid,
 					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
-					 session, L2TP_CMD_SESSION_GET) <= 0)
+					 session, L2TP_CMD_SESSION_GET) < 0)
 			break;
 
 		si++;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index b8295a430a56..e55759056361 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2887,7 +2887,8 @@ static int ip_vs_genl_dump_service(struct sk_buff *skb,
 	if (ip_vs_genl_fill_service(skb, svc) < 0)
 		goto nla_put_failure;
 
-	return genlmsg_end(skb, hdr);
+	genlmsg_end(skb, hdr);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(skb, hdr);
@@ -3079,7 +3080,8 @@ static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
 	if (ip_vs_genl_fill_dest(skb, dest) < 0)
 		goto nla_put_failure;
 
-	return genlmsg_end(skb, hdr);
+	genlmsg_end(skb, hdr);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(skb, hdr);
@@ -3215,7 +3217,8 @@ static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
 	if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
 		goto nla_put_failure;
 
-	return genlmsg_end(skb, hdr);
+	genlmsg_end(skb, hdr);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(skb, hdr);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3b3ddb4fb9ee..70f697827b9b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -427,7 +427,8 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
 	    nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)))
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_trim(skb, nlh);
@@ -971,7 +972,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 	if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_trim(skb, nlh);
@@ -1707,7 +1709,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
 	    nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule)))
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_trim(skb, nlh);
@@ -2361,7 +2364,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 		goto nla_put_failure;
 	nla_nest_end(skb, desc);
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_trim(skb, nlh);
@@ -3035,7 +3039,8 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
 
 	nla_nest_end(skb, nest);
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_trim(skb, nlh);
@@ -3324,7 +3329,8 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
 	if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)))
 		goto nla_put_failure;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_trim(skb, nlh);
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index c2f2a53a4879..179625353cac 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -641,7 +641,8 @@ static int netlbl_cipsov4_listall_cb(struct cipso_v4_doi *doi_def, void *arg)
 	if (ret_val != 0)
 		goto listall_cb_failure;
 
-	return genlmsg_end(cb_arg->skb, data);
+	genlmsg_end(cb_arg->skb, data);
+	return 0;
 
 listall_cb_failure:
 	genlmsg_cancel(cb_arg->skb, data);
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index e66e977ef2fa..8b3b789c43c2 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -456,7 +456,8 @@ static int netlbl_mgmt_listall_cb(struct netlbl_dom_map *entry, void *arg)
 		goto listall_cb_failure;
 
 	cb_arg->seq++;
-	return genlmsg_end(cb_arg->skb, data);
+	genlmsg_end(cb_arg->skb, data);
+	return 0;
 
 listall_cb_failure:
 	genlmsg_cancel(cb_arg->skb, data);
@@ -620,7 +621,8 @@ static int netlbl_mgmt_protocols_cb(struct sk_buff *skb,
 	if (ret_val != 0)
 		goto protocols_cb_failure;
 
-	return genlmsg_end(skb, data);
+	genlmsg_end(skb, data);
+	return 0;
 
 protocols_cb_failure:
 	genlmsg_cancel(skb, data);
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 78a63c18779e..aec7994f78cf 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1163,7 +1163,8 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd,
 		goto list_cb_failure;
 
 	cb_arg->seq++;
-	return genlmsg_end(cb_arg->skb, data);
+	genlmsg_end(cb_arg->skb, data);
+	return 0;
 
 list_cb_failure:
 	genlmsg_cancel(cb_arg->skb, data);
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index bb59a7ed0859..3ee63a3cff30 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -91,7 +91,8 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 	    sk_diag_put_rings_cfg(sk, skb))
 		goto out_nlmsg_trim;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 out_nlmsg_trim:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 2e11061ef885..f52a7d5734cd 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -756,7 +756,8 @@ static int ctrl_fill_info(struct genl_family *family, u32 portid, u32 seq,
 		nla_nest_end(skb, nla_grps);
 	}
 
-	return genlmsg_end(skb, hdr);
+	genlmsg_end(skb, hdr);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(skb, hdr);
@@ -796,7 +797,8 @@ static int ctrl_fill_mcgrp_info(struct genl_family *family,
 	nla_nest_end(skb, nest);
 	nla_nest_end(skb, nla_grps);
 
-	return genlmsg_end(skb, hdr);
+	genlmsg_end(skb, hdr);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(skb, hdr);
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index 44989fc8cddf..be387e6219a0 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -102,7 +102,8 @@ static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target,
 			goto nla_put_failure;
 	}
 
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -518,7 +519,8 @@ static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev,
 	    nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode))
 		goto nla_put_failure;
 
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -908,7 +910,8 @@ static int nfc_genl_send_params(struct sk_buff *msg,
 	    nla_put_u16(msg, NFC_ATTR_LLC_PARAM_MIUX, be16_to_cpu(local->miux)))
 		goto nla_put_failure;
 
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
 nla_put_failure:
 
@@ -1247,8 +1250,7 @@ static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev,
 		    nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type))
 			goto nla_put_failure;
 
-		if (genlmsg_end(msg, hdr) < 0)
-			goto nla_put_failure;
+		genlmsg_end(msg, hdr);
 	}
 
 	return 0;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 8bda3cc12344..f45f1bf4422c 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -799,7 +799,8 @@ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
 	if (err)
 		goto error;
 
-	return genlmsg_end(skb, ovs_header);
+	genlmsg_end(skb, ovs_header);
+	return 0;
 
 error:
 	genlmsg_cancel(skb, ovs_header);
@@ -1349,7 +1350,8 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
 	if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features))
 		goto nla_put_failure;
 
-	return genlmsg_end(skb, ovs_header);
+	genlmsg_end(skb, ovs_header);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(skb, ovs_header);
@@ -1723,7 +1725,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
 	if (err == -EMSGSIZE)
 		goto error;
 
-	return genlmsg_end(skb, ovs_header);
+	genlmsg_end(skb, ovs_header);
+	return 0;
 
 nla_put_failure:
 	err = -EMSGSIZE;
diff --git a/net/packet/diag.c b/net/packet/diag.c
index 92f2c7107eec..0ed68f0238bf 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -177,7 +177,8 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 				     PACKET_DIAG_FILTER))
 		goto out_nlmsg_trim;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 out_nlmsg_trim:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
index b64151ade6b3..54d766842c2b 100644
--- a/net/phonet/pn_netlink.c
+++ b/net/phonet/pn_netlink.c
@@ -121,7 +121,8 @@ static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr,
 	ifm->ifa_index = dev->ifindex;
 	if (nla_put_u8(skb, IFA_LOCAL, addr))
 		goto nla_put_failure;
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
@@ -190,7 +191,8 @@ static int fill_route(struct sk_buff *skb, struct net_device *dev, u8 dst,
 	if (nla_put_u8(skb, RTA_DST, dst) ||
 	    nla_put_u32(skb, RTA_OIF, dev->ifindex))
 		goto nla_put_failure;
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
@@ -282,9 +284,13 @@ static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 
 		if (addr_idx++ < addr_start_idx)
 			continue;
-		if (fill_route(skb, dev, addr << 2, NETLINK_CB(cb->skb).portid,
-				cb->nlh->nlmsg_seq, RTM_NEWROUTE))
-			goto out;
+		fill_route(skb, dev, addr << 2, NETLINK_CB(cb->skb).portid,
+			   cb->nlh->nlmsg_seq, RTM_NEWROUTE);
+		/* fill_route() used to return > 0 (or negative errors) but
+		 * never 0 - ignore the return value and just go out to
+		 * call dumpit again from outside to preserve the behavior
+		 */
+		goto out;
 	}
 
 out:
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 86fa0f3b2caf..ef542fbca9fe 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -155,7 +155,8 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
 	if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown))
 		goto out_nlmsg_trim;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 out_nlmsg_trim:
 	nlmsg_cancel(skb, nlh);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 380784378df8..4ed9039bd5f9 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1721,7 +1721,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		break;
 	}
  finish:
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
  nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -2404,7 +2405,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 			goto nla_put_failure;
 	}
 
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
  nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -3825,7 +3827,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 		    sinfo->assoc_req_ies))
 		goto nla_put_failure;
 
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
  nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -4555,7 +4558,8 @@ static int nl80211_send_mpath(struct sk_buff *msg, u32 portid, u32 seq,
 
 	nla_nest_end(msg, pinfoattr);
 
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
  nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -5507,7 +5511,8 @@ static int nl80211_send_regdom(struct sk_buff *msg, struct netlink_callback *cb,
 	    nla_put_flag(msg, NL80211_ATTR_WIPHY_SELF_MANAGED_REG))
 		goto nla_put_failure;
 
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
 nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -6577,7 +6582,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 
 	nla_nest_end(msg, bss);
 
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
  fail_unlock_rcu:
 	rcu_read_unlock();
@@ -6686,7 +6692,8 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq,
 
 	nla_nest_end(msg, infoattr);
 
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
  nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -11025,7 +11032,8 @@ static int nl80211_send_scan_msg(struct sk_buff *msg,
 	/* ignore errors and send incomplete event anyway */
 	nl80211_add_scan_req(msg, rdev);
 
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
  nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -11048,7 +11056,8 @@ nl80211_send_sched_scan_msg(struct sk_buff *msg,
 	    nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex))
 		goto nla_put_failure;
 
-	return genlmsg_end(msg, hdr);
+	genlmsg_end(msg, hdr);
+	return 0;
 
  nla_put_failure:
 	genlmsg_cancel(msg, hdr);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 8128594ab379..7de2ed9ec46d 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1019,7 +1019,8 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net,
 		return err;
 	}
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 }
 
 static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -1121,7 +1122,8 @@ static int build_sadinfo(struct sk_buff *skb, struct net *net,
 		return err;
 	}
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 }
 
 static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -1842,7 +1844,8 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct
 	if (err)
 		goto out_cancel;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 out_cancel:
 	nlmsg_cancel(skb, nlh);
@@ -2282,7 +2285,8 @@ static int build_migrate(struct sk_buff *skb, const struct xfrm_migrate *m,
 			goto out_cancel;
 	}
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 
 out_cancel:
 	nlmsg_cancel(skb, nlh);
@@ -2490,7 +2494,8 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct
 	if (err)
 		return err;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 }
 
 static int xfrm_exp_state_notify(struct xfrm_state *x, const struct km_event *c)
@@ -2712,7 +2717,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
 		return err;
 	}
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 }
 
 static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
@@ -2827,7 +2833,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
 	}
 	upe->hard = !!hard;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 }
 
 static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
@@ -2986,7 +2993,8 @@ static int build_report(struct sk_buff *skb, u8 proto,
 			return err;
 		}
 	}
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 }
 
 static int xfrm_send_report(struct net *net, u8 proto,
@@ -3031,7 +3039,8 @@ static int build_mapping(struct sk_buff *skb, struct xfrm_state *x,
 	um->old_sport = x->encap->encap_sport;
 	um->reqid = x->props.reqid;
 
-	return nlmsg_end(skb, nlh);
+	nlmsg_end(skb, nlh);
+	return 0;
 }
 
 static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
-- 
cgit v1.2.3


From 7b46a644a407f9f2f1c8f7b2af157c79af55b49e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 18 Jan 2015 23:36:08 -0500
Subject: netlink: Fix bugs in nlmsg_end() conversions.

Commit 053c095a82cf ("netlink: make nlmsg_end() and genlmsg_end()
void") didn't catch all of the cases where callers were breaking out
on the return value being equal to zero, which they no longer should
when zero means success.

Fix all such cases.

Reported-by: Marcel Holtmann <marcel@holtmann.org>
Reported-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c  | 8 ++++----
 net/core/rtnetlink.c  | 2 +-
 net/decnet/dn_route.c | 5 +----
 net/ipv4/devinet.c    | 6 +++---
 net/ipv4/route.c      | 2 +-
 net/ipv6/addrconf.c   | 2 +-
 6 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d36d564f149f..70fe9e10ac86 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2128,7 +2128,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
 
 		if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
 				       cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
-				       NLM_F_MULTI) <= 0)
+				       NLM_F_MULTI) < 0)
 			break;
 
 		nidx = 0;
@@ -2144,7 +2144,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
 						     NETLINK_CB(cb->skb).portid,
 						     cb->nlh->nlmsg_seq,
 						     RTM_NEWNEIGHTBL,
-						     NLM_F_MULTI) <= 0)
+						     NLM_F_MULTI) < 0)
 				goto out;
 		next:
 			nidx++;
@@ -2274,7 +2274,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
 					    cb->nlh->nlmsg_seq,
 					    RTM_NEWNEIGH,
-					    NLM_F_MULTI) <= 0) {
+					    NLM_F_MULTI) < 0) {
 				rc = -1;
 				goto out;
 			}
@@ -2311,7 +2311,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 			if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
 					    cb->nlh->nlmsg_seq,
 					    RTM_NEWNEIGH,
-					    NLM_F_MULTI, tbl) <= 0) {
+					    NLM_F_MULTI, tbl) < 0) {
 				read_unlock_bh(&tbl->lock);
 				rc = -1;
 				goto out;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e13b9dbdf154..0e26b9f66cad 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1327,7 +1327,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 			 */
 			WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
 
-			if (err <= 0)
+			if (err < 0)
 				goto out;
 
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 812e5e6e88fb..1d7c1256e845 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1710,9 +1710,6 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 		rt->rt_flags |= RTCF_NOTIFY;
 
 	err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0);
-
-	if (err == 0)
-		goto out_free;
 	if (err < 0) {
 		err = -EMSGSIZE;
 		goto out_free;
@@ -1763,7 +1760,7 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
 			skb_dst_set(skb, dst_clone(&rt->dst));
 			if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).portid,
 					cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-					1, NLM_F_MULTI) <= 0) {
+					1, NLM_F_MULTI) < 0) {
 				skb_dst_drop(skb);
 				rcu_read_unlock_bh();
 				goto done;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 5f344eb3fc25..59ebe16d06fc 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1883,7 +1883,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
 						      cb->nlh->nlmsg_seq,
 						      RTM_NEWNETCONF,
 						      NLM_F_MULTI,
-						      -1) <= 0) {
+						      -1) < 0) {
 				rcu_read_unlock();
 				goto done;
 			}
@@ -1899,7 +1899,7 @@ cont:
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
 					      RTM_NEWNETCONF, NLM_F_MULTI,
-					      -1) <= 0)
+					      -1) < 0)
 			goto done;
 		else
 			h++;
@@ -1910,7 +1910,7 @@ cont:
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
 					      RTM_NEWNETCONF, NLM_F_MULTI,
-					      -1) <= 0)
+					      -1) < 0)
 			goto done;
 		else
 			h++;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f6e43ca5e641..2000110c75f0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2483,7 +2483,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 	err = rt_fill_info(net, dst, src, &fl4, skb,
 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
 			   RTM_NEWROUTE, 0, 0);
-	if (err <= 0)
+	if (err < 0)
 		goto errout_free;
 
 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8975d9501d50..d6b4f5d08014 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4213,7 +4213,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
 				goto cont;
 
 			if (in6_dump_addrs(idev, skb, cb, type,
-					   s_ip_idx, &ip_idx) <= 0)
+					   s_ip_idx, &ip_idx) < 0)
 				goto done;
 cont:
 			idx++;
-- 
cgit v1.2.3


From 1728d4fabd1bc9965728de25dda0b694b8da6450 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 15 Jan 2015 15:11:17 +0100
Subject: tunnels: advertise link netns via netlink

Implement rtnl_link_ops->get_link_net() callback so that IFLA_LINK_NETNSID is
added to rtnetlink messages.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c      | 8 ++++++++
 include/net/ip6_tunnel.h | 1 +
 include/net/ip_tunnels.h | 1 +
 net/ipv4/ip_gre.c        | 2 ++
 net/ipv4/ip_tunnel.c     | 8 ++++++++
 net/ipv4/ip_vti.c        | 1 +
 net/ipv4/ipip.c          | 1 +
 net/ipv6/ip6_gre.c       | 1 +
 net/ipv6/ip6_tunnel.c    | 9 +++++++++
 net/ipv6/ip6_vti.c       | 1 +
 net/ipv6/sit.c           | 1 +
 11 files changed, 34 insertions(+)

(limited to 'net/ipv4')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index c5f79e7513a6..0346eaa6d236 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2923,6 +2923,13 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static struct net *vxlan_get_link_net(const struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	return vxlan->net;
+}
+
 static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
 	.kind		= "vxlan",
 	.maxtype	= IFLA_VXLAN_MAX,
@@ -2934,6 +2941,7 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
 	.dellink	= vxlan_dellink,
 	.get_size	= vxlan_get_size,
 	.fill_info	= vxlan_fill_info,
+	.get_link_net	= vxlan_get_link_net,
 };
 
 static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 9326c41c2d7f..76c091b53dae 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -70,6 +70,7 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr,
 __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw);
 __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr,
 			     const struct in6_addr *raddr);
+struct net *ip6_tnl_get_link_net(const struct net_device *dev);
 
 static inline void ip6tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index ce4db3cc5647..2c47061a6954 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -141,6 +141,7 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op,
 int ip_tunnel_init(struct net_device *dev);
 void ip_tunnel_uninit(struct net_device *dev);
 void  ip_tunnel_dellink(struct net_device *dev, struct list_head *head);
+struct net *ip_tunnel_get_link_net(const struct net_device *dev);
 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 		       struct rtnl_link_ops *ops, char *devname);
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 942576e27df1..6e7727f27393 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -829,6 +829,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
 	.dellink	= ip_tunnel_dellink,
 	.get_size	= ipgre_get_size,
 	.fill_info	= ipgre_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
@@ -843,6 +844,7 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
 	.dellink	= ip_tunnel_dellink,
 	.get_size	= ipgre_get_size,
 	.fill_info	= ipgre_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static int __net_init ipgre_tap_init_net(struct net *net)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d3e447936720..2cd08280c77b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -972,6 +972,14 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
 
+struct net *ip_tunnel_get_link_net(const struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	return tunnel->net;
+}
+EXPORT_SYMBOL(ip_tunnel_get_link_net);
+
 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 				  struct rtnl_link_ops *ops, char *devname)
 {
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 1a7e979e80ba..94efe148181c 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -531,6 +531,7 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
 	.dellink        = ip_tunnel_dellink,
 	.get_size	= vti_get_size,
 	.fill_info	= vti_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static int __init vti_init(void)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 40403114f00a..b58d6689874c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -498,6 +498,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = {
 	.dellink	= ip_tunnel_dellink,
 	.get_size	= ipip_get_size,
 	.fill_info	= ipip_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static struct xfrm_tunnel ipip_handler __read_mostly = {
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 13cda4c6313b..9306a5ff9149 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1662,6 +1662,7 @@ static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
 	.dellink	= ip6gre_dellink,
 	.get_size	= ip6gre_get_size,
 	.fill_info	= ip6gre_fill_info,
+	.get_link_net	= ip6_tnl_get_link_net,
 };
 
 static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = {
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 92b3da571980..266a264ec212 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1760,6 +1760,14 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+struct net *ip6_tnl_get_link_net(const struct net_device *dev)
+{
+	struct ip6_tnl *tunnel = netdev_priv(dev);
+
+	return tunnel->net;
+}
+EXPORT_SYMBOL(ip6_tnl_get_link_net);
+
 static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
 	[IFLA_IPTUN_LINK]		= { .type = NLA_U32 },
 	[IFLA_IPTUN_LOCAL]		= { .len = sizeof(struct in6_addr) },
@@ -1783,6 +1791,7 @@ static struct rtnl_link_ops ip6_link_ops __read_mostly = {
 	.dellink	= ip6_tnl_dellink,
 	.get_size	= ip6_tnl_get_size,
 	.fill_info	= ip6_tnl_fill_info,
+	.get_link_net	= ip6_tnl_get_link_net,
 };
 
 static struct xfrm6_tunnel ip4ip6_handler __read_mostly = {
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index ace10d0b3aac..5fb9e212eca8 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -1016,6 +1016,7 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = {
 	.changelink	= vti6_changelink,
 	.get_size	= vti6_get_size,
 	.fill_info	= vti6_fill_info,
+	.get_link_net	= ip6_tnl_get_link_net,
 };
 
 static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n)
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 213546bd6d5d..3cc197c72b59 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1763,6 +1763,7 @@ static struct rtnl_link_ops sit_link_ops __read_mostly = {
 	.get_size	= ipip6_get_size,
 	.fill_info	= ipip6_fill_info,
 	.dellink	= ipip6_dellink,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static struct xfrm_tunnel sit_handler __read_mostly = {
-- 
cgit v1.2.3


From 728c02089a0e3eefb02e9927bfae50490f40e72e Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 16 Jan 2015 09:56:01 -0800
Subject: net: ipv4: handle DSA enabled master network devices

The logic to configure a network interface for kernel IP
auto-configuration is very simplistic, and does not handle the case
where a device is stacked onto another such as with DSA. This causes the
kernel not to open and configure the master network device in a DSA
switch tree, and therefore slave network devices using this master
network devices as conduit device cannot be open.

This restriction comes from a check in net/dsa/slave.c, which is
basically checking the master netdev flags for IFF_UP and returns
-ENETDOWN if it is not the case.

Automatically bringing-up DSA master network devices allows DSA slave
network devices to be used as valid interfaces for e.g: NFS root booting
by allowing kernel IP autoconfiguration to succeed on these interfaces.

On the reverse path, make sure we do not attempt to close a DSA-enabled
device as this would implicitely prevent the slave DSA network device
from operating.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 7fa18bc7e47f..b26376ef87f6 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -209,9 +209,9 @@ static int __init ic_open_devs(void)
 	last = &ic_first_dev;
 	rtnl_lock();
 
-	/* bring loopback device up first */
+	/* bring loopback and DSA master network devices up first */
 	for_each_netdev(&init_net, dev) {
-		if (!(dev->flags & IFF_LOOPBACK))
+		if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev))
 			continue;
 		if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
 			pr_err("IP-Config: Failed to open %s\n", dev->name);
@@ -306,7 +306,7 @@ static void __init ic_close_devs(void)
 	while ((d = next)) {
 		next = d->next;
 		dev = d->dev;
-		if (dev != ic_dev) {
+		if (dev != ic_dev && !netdev_uses_dsa(dev)) {
 			DBG(("IP-Config: Downing %s\n", dev->name));
 			dev_change_flags(dev, d->flags);
 		}
-- 
cgit v1.2.3


From d998f8efa47221405ceae129aa93fa6d4ac8510d Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Tue, 20 Jan 2015 11:23:04 -0800
Subject: udp: Do not require sock in udp_tunnel_xmit_skb

The UDP tunnel transmit functions udp_tunnel_xmit_skb and
udp_tunnel6_xmit_skb include a socket argument. The socket being
passed to the functions (from VXLAN) is a UDP created for receive
side. The only thing that the socket is used for in the transmit
functions is to get the setting for checksum (enabled or zero).
This patch removes the argument and and adds a nocheck argument
for checksum setting. This eliminates the unnecessary dependency
on a UDP socket for UDP tunnel transmit.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c       | 10 ++++++----
 include/net/udp_tunnel.h  | 16 ++++++++--------
 net/ipv4/geneve.c         |  5 +++--
 net/ipv4/udp_tunnel.c     | 12 ++++++------
 net/ipv6/ip6_udp_tunnel.c | 12 ++++++------
 5 files changed, 29 insertions(+), 26 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 19d3664ab9dd..a288ceab502e 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1769,8 +1769,9 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs,
 
 	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
 
-	udp_tunnel6_xmit_skb(vs->sock, dst, skb, dev, saddr, daddr, prio,
-			     ttl, src_port, dst_port);
+	udp_tunnel6_xmit_skb(dst, skb, dev, saddr, daddr, prio,
+			     ttl, src_port, dst_port,
+			     udp_get_no_check6_tx(vs->sock->sk));
 	return 0;
 err:
 	dst_release(dst);
@@ -1848,8 +1849,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 
 	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
 
-	return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos,
-				   ttl, df, src_port, dst_port, xnet);
+	return udp_tunnel_xmit_skb(rt, skb, src, dst, tos,
+				   ttl, df, src_port, dst_port, xnet,
+				   vs->sock->sk->sk_no_check_tx);
 }
 EXPORT_SYMBOL_GPL(vxlan_xmit_skb);
 
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 2a50a70ef587..1a20d33d56bc 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -77,17 +77,17 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 			   struct udp_tunnel_sock_cfg *sock_cfg);
 
 /* Transmit the skb using UDP encapsulation. */
-int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt,
-			struct sk_buff *skb, __be32 src, __be32 dst,
-			__u8 tos, __u8 ttl, __be16 df, __be16 src_port,
-			__be16 dst_port, bool xnet);
+int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb,
+			__be32 src, __be32 dst, __u8 tos, __u8 ttl,
+			__be16 df, __be16 src_port, __be16 dst_port,
+			bool xnet, bool nocheck);
 
 #if IS_ENABLED(CONFIG_IPV6)
-int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst,
-			 struct sk_buff *skb, struct net_device *dev,
-			 struct in6_addr *saddr, struct in6_addr *daddr,
+int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sk_buff *skb,
+			 struct net_device *dev, struct in6_addr *saddr,
+			 struct in6_addr *daddr,
 			 __u8 prio, __u8 ttl, __be16 src_port,
-			 __be16 dst_port);
+			 __be16 dst_port, bool nocheck);
 #endif
 
 void udp_tunnel_sock_release(struct socket *sock);
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 9568594ca2f1..93e51199e44b 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -136,8 +136,9 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 
 	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
 
-	return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst,
-				   tos, ttl, df, src_port, dst_port, xnet);
+	return udp_tunnel_xmit_skb(rt, skb, src, dst,
+				   tos, ttl, df, src_port, dst_port, xnet,
+				   gs->sock->sk->sk_no_check_tx);
 }
 EXPORT_SYMBOL_GPL(geneve_xmit_skb);
 
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 9996e63ed304..c83b35485056 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -75,10 +75,10 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 }
 EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
 
-int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt,
-			struct sk_buff *skb, __be32 src, __be32 dst,
-			__u8 tos, __u8 ttl, __be16 df, __be16 src_port,
-			__be16 dst_port, bool xnet)
+int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb,
+			__be32 src, __be32 dst, __u8 tos, __u8 ttl,
+			__be16 df, __be16 src_port, __be16 dst_port,
+			bool xnet, bool nocheck)
 {
 	struct udphdr *uh;
 
@@ -90,9 +90,9 @@ int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt,
 	uh->source = src_port;
 	uh->len = htons(skb->len);
 
-	udp_set_csum(sock->sk->sk_no_check_tx, skb, src, dst, skb->len);
+	udp_set_csum(nocheck, skb, src, dst, skb->len);
 
-	return iptunnel_xmit(sock->sk, rt, skb, src, dst, IPPROTO_UDP,
+	return iptunnel_xmit(skb->sk, rt, skb, src, dst, IPPROTO_UDP,
 			     tos, ttl, df, xnet);
 }
 EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 8db6c98fe218..32d9b268e7d8 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -62,14 +62,14 @@ error:
 }
 EXPORT_SYMBOL_GPL(udp_sock_create6);
 
-int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst,
-			 struct sk_buff *skb, struct net_device *dev,
-			 struct in6_addr *saddr, struct in6_addr *daddr,
-			 __u8 prio, __u8 ttl, __be16 src_port, __be16 dst_port)
+int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sk_buff *skb,
+			 struct net_device *dev, struct in6_addr *saddr,
+			 struct in6_addr *daddr,
+			 __u8 prio, __u8 ttl, __be16 src_port,
+			 __be16 dst_port, bool nocheck)
 {
 	struct udphdr *uh;
 	struct ipv6hdr *ip6h;
-	struct sock *sk = sock->sk;
 
 	__skb_push(skb, sizeof(*uh));
 	skb_reset_transport_header(skb);
@@ -85,7 +85,7 @@ int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst,
 			    | IPSKB_REROUTED);
 	skb_dst_set(skb, dst);
 
-	udp6_set_csum(udp_get_no_check6_tx(sk), skb, saddr, daddr, skb->len);
+	udp6_set_csum(nocheck, skb, saddr, daddr, skb->len);
 
 	__skb_push(skb, sizeof(*ip6h));
 	skb_reset_network_header(skb);
-- 
cgit v1.2.3


From b3832117b4b61374fac08692f1b1a620088342dd Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:08 -0800
Subject: fib_trie: Use index & (~0ul << n->bits) instead of index >> n->bits

In doing performance testing and analysis of the changes I recently found
that by shifting the index I had created an unnecessary dependency.

I have updated the code so that we instead shift a mask by bits and then
just test against that as that should save us about 2 CPU cycles since we
can generate the mask while the key and pos are being processed.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 281e5e00025f..dea2f80e08c3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -961,12 +961,12 @@ static struct tnode *fib_find_node(struct trie *t, u32 key)
 		 * prefix plus zeros for the bits in the cindex. The index
 		 * is the difference between the key and this value.  From
 		 * this we can actually derive several pieces of data.
-		 *   if !(index >> bits)
-		 *     we know the value is cindex
-		 *   else
+		 *   if (index & (~0ul << bits))
 		 *     we have a mismatch in skip bits and failed
+		 *   else
+		 *     we know the value is cindex
 		 */
-		if (index >> n->bits)
+		if (index & (~0ul << n->bits))
 			return NULL;
 
 		/* we have found a leaf. Prefixes have already been compared */
@@ -1301,12 +1301,12 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		 * prefix plus zeros for the "bits" in the prefix. The index
 		 * is the difference between the key and this value.  From
 		 * this we can actually derive several pieces of data.
-		 *   if !(index >> bits)
-		 *     we know the value is child index
-		 *   else
+		 *   if (index & (~0ul << bits))
 		 *     we have a mismatch in skip bits and failed
+		 *   else
+		 *     we know the value is cindex
 		 */
-		if (index >> n->bits)
+		if (index & (~0ul << n->bits))
 			break;
 
 		/* we have found a leaf. Prefixes have already been compared */
-- 
cgit v1.2.3


From 69fa57b1e42c171599d53486839c3d58f7ed8eec Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:14 -0800
Subject: fib_trie: Fix RCU bug and merge similar bits of inflate/halve

This patch addresses two issues.

The first issue is the fact that I believe I had the RCU freeing sequence
slightly out of order.  As a result we could get into an issue if a caller
went into a child of a child of the new node, then backtraced into the to be
freed parent, and then attempted to access a child of a child that may have
been consumed in a resize of one of the new nodes children.  To resolve this I
have moved the resize after we have freed the oldtnode.  The only side effect
of this is that we will now be calling resize on more nodes in the case of
inflate due to the fact that we don't have a good way to test to see if a
full_tnode on the new node was there before or after the allocation.  This
should have minimal impact however since the node should already be
correctly size so it is just the cost of calling should_inflate that we
will be taking on the node which is only a couple of cycles.

The second issue is the fact that inflate and halve were essentially doing
the same thing after the new node was added to the trie replacing the old
one.  As such it wasn't really necessary to keep the code in both functions
so I have split it out into two other functions, called replace and
update_children.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 157 ++++++++++++++++++++++++----------------------------
 1 file changed, 73 insertions(+), 84 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index dea2f80e08c3..7e9031739f06 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -396,8 +396,30 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
 	rcu_assign_pointer(tn->child[i], n);
 }
 
-static void put_child_root(struct tnode *tp, struct trie *t,
-			   t_key key, struct tnode *n)
+static void update_children(struct tnode *tn)
+{
+	unsigned long i;
+
+	/* update all of the child parent pointers */
+	for (i = tnode_child_length(tn); i;) {
+		struct tnode *inode = tnode_get_child(tn, --i);
+
+		if (!inode)
+			continue;
+
+		/* Either update the children of a tnode that
+		 * already belongs to us or update the child
+		 * to point to ourselves.
+		 */
+		if (node_parent(inode) == tn)
+			update_children(inode);
+		else
+			node_set_parent(inode, tn);
+	}
+}
+
+static inline void put_child_root(struct tnode *tp, struct trie *t,
+				  t_key key, struct tnode *n)
 {
 	if (tp)
 		put_child(tp, get_index(key, tp), n);
@@ -434,10 +456,35 @@ static void tnode_free(struct tnode *tn)
 	}
 }
 
+static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
+{
+	struct tnode *tp = node_parent(oldtnode);
+	unsigned long i;
+
+	/* setup the parent pointer out of and back into this node */
+	NODE_INIT_PARENT(tn, tp);
+	put_child_root(tp, t, tn->key, tn);
+
+	/* update all of the child parent pointers */
+	update_children(tn);
+
+	/* all pointers should be clean so we are done */
+	tnode_free(oldtnode);
+
+	/* resize children now that oldtnode is freed */
+	for (i = tnode_child_length(tn); i;) {
+		struct tnode *inode = tnode_get_child(tn, --i);
+
+		/* resize child node */
+		if (tnode_full(tn, inode))
+			resize(t, inode);
+	}
+}
+
 static int inflate(struct trie *t, struct tnode *oldtnode)
 {
-	struct tnode *inode, *node0, *node1, *tn, *tp;
-	unsigned long i, j, k;
+	struct tnode *tn;
+	unsigned long i;
 	t_key m;
 
 	pr_debug("In inflate\n");
@@ -446,13 +493,18 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 	if (!tn)
 		return -ENOMEM;
 
+	/* prepare oldtnode to be freed */
+	tnode_free_init(oldtnode);
+
 	/* Assemble all of the pointers in our cluster, in this case that
 	 * represents all of the pointers out of our allocated nodes that
 	 * point to existing tnodes and the links between our allocated
 	 * nodes.
 	 */
 	for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
-		inode = tnode_get_child(oldtnode, --i);
+		struct tnode *inode = tnode_get_child(oldtnode, --i);
+		struct tnode *node0, *node1;
+		unsigned long j, k;
 
 		/* An empty child */
 		if (inode == NULL)
@@ -464,6 +516,9 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 			continue;
 		}
 
+		/* drop the node in the old tnode free list */
+		tnode_free_append(oldtnode, inode);
+
 		/* An internal node with two children */
 		if (inode->bits == 1) {
 			put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
@@ -488,9 +543,9 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
 		if (!node1)
 			goto nomem;
-		tnode_free_append(tn, node1);
+		node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
 
-		node0 = tnode_new(inode->key & ~m, inode->pos, inode->bits - 1);
+		tnode_free_append(tn, node1);
 		if (!node0)
 			goto nomem;
 		tnode_free_append(tn, node0);
@@ -512,53 +567,9 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 		put_child(tn, 2 * i, node0);
 	}
 
-	/* setup the parent pointer into and out of this node */
-	tp = node_parent(oldtnode);
-	NODE_INIT_PARENT(tn, tp);
-	put_child_root(tp, t, tn->key, tn);
-
-	/* prepare oldtnode to be freed */
-	tnode_free_init(oldtnode);
-
-	/* update all child nodes parent pointers to route to us */
-	for (i = tnode_child_length(oldtnode); i;) {
-		inode = tnode_get_child(oldtnode, --i);
-
-		/* A leaf or an internal node with skipped bits */
-		if (!tnode_full(oldtnode, inode)) {
-			node_set_parent(inode, tn);
-			continue;
-		}
-
-		/* drop the node in the old tnode free list */
-		tnode_free_append(oldtnode, inode);
-
-		/* fetch new nodes */
-		node1 = tnode_get_child(tn, 2 * i + 1);
-		node0 = tnode_get_child(tn, 2 * i);
+	/* setup the parent pointers into and out of this node */
+	replace(t, oldtnode, tn);
 
-		/* bits == 1 then node0 and node1 represent inode's children */
-		if (inode->bits == 1) {
-			node_set_parent(node1, tn);
-			node_set_parent(node0, tn);
-			continue;
-		}
-
-		/* update parent pointers in child node's children */
-		for (k = tnode_child_length(inode), j = k / 2; j;) {
-			node_set_parent(tnode_get_child(inode, --k), node1);
-			node_set_parent(tnode_get_child(inode, --j), node0);
-			node_set_parent(tnode_get_child(inode, --k), node1);
-			node_set_parent(tnode_get_child(inode, --j), node0);
-		}
-
-		/* resize child nodes */
-		resize(t, node1);
-		resize(t, node0);
-	}
-
-	/* we completed without error, prepare to free old node */
-	tnode_free(oldtnode);
 	return 0;
 nomem:
 	/* all pointers should be clean so we are done */
@@ -568,7 +579,7 @@ nomem:
 
 static int halve(struct trie *t, struct tnode *oldtnode)
 {
-	struct tnode *tn, *tp, *inode, *node0, *node1;
+	struct tnode *tn;
 	unsigned long i;
 
 	pr_debug("In halve\n");
@@ -577,14 +588,18 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 	if (!tn)
 		return -ENOMEM;
 
+	/* prepare oldtnode to be freed */
+	tnode_free_init(oldtnode);
+
 	/* Assemble all of the pointers in our cluster, in this case that
 	 * represents all of the pointers out of our allocated nodes that
 	 * point to existing tnodes and the links between our allocated
 	 * nodes.
 	 */
 	for (i = tnode_child_length(oldtnode); i;) {
-		node1 = tnode_get_child(oldtnode, --i);
-		node0 = tnode_get_child(oldtnode, --i);
+		struct tnode *node1 = tnode_get_child(oldtnode, --i);
+		struct tnode *node0 = tnode_get_child(oldtnode, --i);
+		struct tnode *inode;
 
 		/* At least one of the children is empty */
 		if (!node1 || !node0) {
@@ -609,34 +624,8 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 		put_child(tn, i / 2, inode);
 	}
 
-	/* setup the parent pointer out of and back into this node */
-	tp = node_parent(oldtnode);
-	NODE_INIT_PARENT(tn, tp);
-	put_child_root(tp, t, tn->key, tn);
-
-	/* prepare oldtnode to be freed */
-	tnode_free_init(oldtnode);
-
-	/* update all of the child parent pointers */
-	for (i = tnode_child_length(tn); i;) {
-		inode = tnode_get_child(tn, --i);
-
-		/* only new tnodes will be considered "full" nodes */
-		if (!tnode_full(tn, inode)) {
-			node_set_parent(inode, tn);
-			continue;
-		}
-
-		/* Two nonempty children */
-		node_set_parent(tnode_get_child(inode, 1), inode);
-		node_set_parent(tnode_get_child(inode, 0), inode);
-
-		/* resize child node */
-		resize(t, inode);
-	}
-
-	/* all pointers should be clean so we are done */
-	tnode_free(oldtnode);
+	/* setup the parent pointers into and out of this node */
+	replace(t, oldtnode, tn);
 
 	return 0;
 }
-- 
cgit v1.2.3


From a80e89d4c650a7c3ab74f0b2d133cc2ce9738994 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:20 -0800
Subject: fib_trie: Fall back to slen update on inflate/halve failure

This change corrects an issue where if inflate or halve fails we were
exiting the resize function without at least updating the slen for the
node.  To correct this I have moved the update of max_size into the while
loop so that it is only decremented on a successful call to either inflate
or halve.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 7e9031739f06..80892f565030 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -752,7 +752,7 @@ static void resize(struct trie *t, struct tnode *tn)
 {
 	struct tnode *tp = node_parent(tn), *n = NULL;
 	struct tnode __rcu **cptr;
-	int max_work;
+	int max_work = MAX_WORK;
 
 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 		 tn, inflate_threshold, halve_threshold);
@@ -775,8 +775,7 @@ static void resize(struct trie *t, struct tnode *tn)
 	/* Double as long as the resulting node has a number of
 	 * nonempty nodes that are above the threshold.
 	 */
-	max_work = MAX_WORK;
-	while (should_inflate(tp, tn) && max_work--) {
+	while (should_inflate(tp, tn) && max_work) {
 		if (inflate(t, tn)) {
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(t->stats->resize_node_skipped);
@@ -784,6 +783,7 @@ static void resize(struct trie *t, struct tnode *tn)
 			break;
 		}
 
+		max_work--;
 		tn = rtnl_dereference(*cptr);
 	}
 
@@ -794,8 +794,7 @@ static void resize(struct trie *t, struct tnode *tn)
 	/* Halve as long as the number of empty children in this
 	 * node is above threshold.
 	 */
-	max_work = MAX_WORK;
-	while (should_halve(tp, tn) && max_work--) {
+	while (should_halve(tp, tn) && max_work) {
 		if (halve(t, tn)) {
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(t->stats->resize_node_skipped);
@@ -803,6 +802,7 @@ static void resize(struct trie *t, struct tnode *tn)
 			break;
 		}
 
+		max_work--;
 		tn = rtnl_dereference(*cptr);
 	}
 
-- 
cgit v1.2.3


From 95f60ea3e99aef5967d1566979f4ade7be386e34 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:26 -0800
Subject: fib_trie: Add collapse() and should_collapse() to resize

This patch really does two things.

First it pulls the logic for determining if we should collapse one node out
of the tree and the actual code doing the collapse into a separate pair of
functions.  This helps to make the changes to these areas more readable.

Second it encodes the upper 32b of the empty_children value onto the
full_children value in the case of bits == KEYLENGTH.  By doing this we are
able to handle the case of a 32b node where empty_children would appear to
be 0 when it was actually 1ul << 32.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 100 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 65 insertions(+), 35 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 80892f565030..f874e1811eab 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -83,7 +83,8 @@
 
 #define MAX_STAT_DEPTH 32
 
-#define KEYLENGTH (8*sizeof(t_key))
+#define KEYLENGTH	(8*sizeof(t_key))
+#define KEY_MAX		((t_key)~0)
 
 typedef unsigned int t_key;
 
@@ -102,8 +103,8 @@ struct tnode {
 	union {
 		/* The fields in this struct are valid if bits > 0 (TNODE) */
 		struct {
-			unsigned int full_children;  /* KEYLENGTH bits needed */
-			unsigned int empty_children; /* KEYLENGTH bits needed */
+			t_key empty_children; /* KEYLENGTH bits needed */
+			t_key full_children;  /* KEYLENGTH bits needed */
 			struct tnode __rcu *child[0];
 		};
 		/* This list pointer if valid if bits == 0 (LEAF) */
@@ -302,6 +303,16 @@ static struct tnode *tnode_alloc(size_t size)
 		return vzalloc(size);
 }
 
+static inline void empty_child_inc(struct tnode *n)
+{
+	++n->empty_children ? : ++n->full_children;
+}
+
+static inline void empty_child_dec(struct tnode *n)
+{
+	n->empty_children-- ? : n->full_children--;
+}
+
 static struct tnode *leaf_new(t_key key)
 {
 	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
@@ -335,7 +346,7 @@ static struct leaf_info *leaf_info_new(int plen)
 
 static struct tnode *tnode_new(t_key key, int pos, int bits)
 {
-	size_t sz = offsetof(struct tnode, child[1 << bits]);
+	size_t sz = offsetof(struct tnode, child[1ul << bits]);
 	struct tnode *tn = tnode_alloc(sz);
 	unsigned int shift = pos + bits;
 
@@ -348,8 +359,10 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
 		tn->pos = pos;
 		tn->bits = bits;
 		tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
-		tn->full_children = 0;
-		tn->empty_children = 1<<bits;
+		if (bits == KEYLENGTH)
+			tn->full_children = 1;
+		else
+			tn->empty_children = 1ul << bits;
 	}
 
 	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
@@ -375,11 +388,11 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
 
 	BUG_ON(i >= tnode_child_length(tn));
 
-	/* update emptyChildren */
+	/* update emptyChildren, overflow into fullChildren */
 	if (n == NULL && chi != NULL)
-		tn->empty_children++;
-	else if (n != NULL && chi == NULL)
-		tn->empty_children--;
+		empty_child_inc(tn);
+	if (n != NULL && chi == NULL)
+		empty_child_dec(tn);
 
 	/* update fullChildren */
 	wasfull = tnode_full(tn, chi);
@@ -630,6 +643,24 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 	return 0;
 }
 
+static void collapse(struct trie *t, struct tnode *oldtnode)
+{
+	struct tnode *n, *tp;
+	unsigned long i;
+
+	/* scan the tnode looking for that one child that might still exist */
+	for (n = NULL, i = tnode_child_length(oldtnode); !n && i;)
+		n = tnode_get_child(oldtnode, --i);
+
+	/* compress one level */
+	tp = node_parent(oldtnode);
+	put_child_root(tp, t, oldtnode->key, n);
+	node_set_parent(n, tp);
+
+	/* drop dead node */
+	node_free(oldtnode);
+}
+
 static unsigned char update_suffix(struct tnode *tn)
 {
 	unsigned char slen = tn->pos;
@@ -729,10 +760,12 @@ static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
 
 	/* Keep root node larger */
 	threshold *= tp ? inflate_threshold : inflate_threshold_root;
-	used += tn->full_children;
 	used -= tn->empty_children;
+	used += tn->full_children;
 
-	return tn->pos && ((50 * used) >= threshold);
+	/* if bits == KEYLENGTH then pos = 0, and will fail below */
+
+	return (used > 1) && tn->pos && ((50 * used) >= threshold);
 }
 
 static bool should_halve(const struct tnode *tp, const struct tnode *tn)
@@ -744,13 +777,29 @@ static bool should_halve(const struct tnode *tp, const struct tnode *tn)
 	threshold *= tp ? halve_threshold : halve_threshold_root;
 	used -= tn->empty_children;
 
-	return (tn->bits > 1) && ((100 * used) < threshold);
+	/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
+
+	return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
+}
+
+static bool should_collapse(const struct tnode *tn)
+{
+	unsigned long used = tnode_child_length(tn);
+
+	used -= tn->empty_children;
+
+	/* account for bits == KEYLENGTH case */
+	if ((tn->bits == KEYLENGTH) && tn->full_children)
+		used -= KEY_MAX;
+
+	/* One child or none, time to drop us from the trie */
+	return used < 2;
 }
 
 #define MAX_WORK 10
 static void resize(struct trie *t, struct tnode *tn)
 {
-	struct tnode *tp = node_parent(tn), *n = NULL;
+	struct tnode *tp = node_parent(tn);
 	struct tnode __rcu **cptr;
 	int max_work = MAX_WORK;
 
@@ -764,14 +813,6 @@ static void resize(struct trie *t, struct tnode *tn)
 	cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
 	BUG_ON(tn != rtnl_dereference(*cptr));
 
-	/* No children */
-	if (tn->empty_children > (tnode_child_length(tn) - 1))
-		goto no_children;
-
-	/* One child */
-	if (tn->empty_children == (tnode_child_length(tn) - 1))
-		goto one_child;
-
 	/* Double as long as the resulting node has a number of
 	 * nonempty nodes that are above the threshold.
 	 */
@@ -807,19 +848,8 @@ static void resize(struct trie *t, struct tnode *tn)
 	}
 
 	/* Only one child remains */
-	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
-		unsigned long i;
-one_child:
-		for (i = tnode_child_length(tn); !n && i;)
-			n = tnode_get_child(tn, --i);
-no_children:
-		/* compress one level */
-		put_child_root(tp, t, tn->key, n);
-		node_set_parent(n, tp);
-
-		/* drop dead node */
-		tnode_free_init(tn);
-		tnode_free(tn);
+	if (should_collapse(tn)) {
+		collapse(t, tn);
 		return;
 	}
 
-- 
cgit v1.2.3


From 30cfe7c9c88d73440560d7e381bab12f5463a6cd Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:33 -0800
Subject: fib_trie: Use empty_children instead of counting empty nodes in stats
 collection

It doesn't make much sense to count the pointers ourselves when
empty_children already has a count for the number of NULL pointers stored
in the tnode.  As such save ourselves the cycles and just use
empty_children.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index f874e1811eab..90654bb64e21 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1954,16 +1954,10 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
 			hlist_for_each_entry_rcu(li, &n->list, hlist)
 				++s->prefixes;
 		} else {
-			unsigned long i;
-
 			s->tnodes++;
 			if (n->bits < MAX_STAT_DEPTH)
 				s->nodesizes[n->bits]++;
-
-			for (i = tnode_child_length(n); i--;) {
-				if (!rcu_access_pointer(n->child[i]))
-					s->nullpointers++;
-			}
+			s->nullpointers += n->empty_children;
 		}
 	}
 	rcu_read_unlock();
-- 
cgit v1.2.3


From 02525368f48c197bce6e4251ff7bde92fa6f026e Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:39 -0800
Subject: fib_trie: Move fib_find_alias to file where it is used

The function fib_find_alias is only accessed by functions in fib_trie.c as
such it makes sense to relocate it and cast it as static so that the
compiler can take advantage of optimizations it can do to it as a local
function.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_lookup.h    |  1 -
 net/ipv4/fib_semantics.c | 18 ------------------
 net/ipv4/fib_trie.c      | 20 ++++++++++++++++++++
 3 files changed, 20 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 1e4f6600b31d..825981b1049a 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -32,7 +32,6 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
 		  unsigned int);
 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
 	       u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
-struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
 
 static inline void fib_result_assign(struct fib_result *res,
 				     struct fib_info *fi)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 265cb72b7c1b..1e2090ea663e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -411,24 +411,6 @@ errout:
 		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
 }
 
-/* Return the first fib alias matching TOS with
- * priority less than or equal to PRIO.
- */
-struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
-{
-	if (fah) {
-		struct fib_alias *fa;
-		list_for_each_entry(fa, fah, fa_list) {
-			if (fa->fa_tos > tos)
-				continue;
-			if (fa->fa_info->fib_priority >= prio ||
-			    fa->fa_tos < tos)
-				return fa;
-		}
-	}
-	return NULL;
-}
-
 static int fib_detect_death(struct fib_info *fi, int order,
 			    struct fib_info **last_resort, int *last_idx,
 			    int dflt)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 90654bb64e21..7f342265968e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -998,6 +998,26 @@ static struct tnode *fib_find_node(struct trie *t, u32 key)
 	return n;
 }
 
+/* Return the first fib alias matching TOS with
+ * priority less than or equal to PRIO.
+ */
+static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
+{
+	struct fib_alias *fa;
+
+	if (!fah)
+		return NULL;
+
+	list_for_each_entry(fa, fah, fa_list) {
+		if (fa->fa_tos > tos)
+			continue;
+		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
+			return fa;
+	}
+
+	return NULL;
+}
+
 static void trie_rebalance(struct trie *t, struct tnode *tn)
 {
 	struct tnode *tp;
-- 
cgit v1.2.3


From 64c6272351a0eca55574f487b103770163d1dbce Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:45 -0800
Subject: fib_trie: Various clean-ups for handling slen

While doing further work on the fib_trie I noted a few items.

First I was using calls that were far more complicated than they needed to
be for determining when to push/pull the suffix length.  I have updated the
code to reflect the simplier logic.

The second issue is that I realised we weren't necessarily handling the
case of a leaf_info struct surviving a flush.  I have updated the logic so
that now we will call pull_suffix in the event of having a leaf info value
left in the leaf after flushing it.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 49 +++++++++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 20 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 7f342265968e..3daf0224ff2e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -917,27 +917,20 @@ static void leaf_push_suffix(struct tnode *l)
 
 static void remove_leaf_info(struct tnode *l, struct leaf_info *old)
 {
-	struct hlist_node *prev;
-
-	/* record the location of the pointer to this object */
-	prev = rtnl_dereference(hlist_pprev_rcu(&old->hlist));
+	/* record the location of the previous list_info entry */
+	struct hlist_node **pprev = old->hlist.pprev;
+	struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next);
 
 	/* remove the leaf info from the list */
 	hlist_del_rcu(&old->hlist);
 
-	/* if we emptied the list this leaf will be freed and we can sort
-	 * out parent suffix lengths as a part of trie_rebalance
-	 */
-	if (hlist_empty(&l->list))
+	/* only access li if it is pointing at the last valid hlist_node */
+	if (hlist_empty(&l->list) || (*pprev))
 		return;
 
-	/* if we removed the tail then we need to update slen */
-	if (!rcu_access_pointer(hlist_next_rcu(prev))) {
-		struct leaf_info *li = hlist_entry(prev, typeof(*li), hlist);
-
-		l->slen = KEYLENGTH - li->plen;
-		leaf_pull_suffix(l);
-	}
+	/* update the trie with the latest suffix length */
+	l->slen = KEYLENGTH - li->plen;
+	leaf_pull_suffix(l);
 }
 
 static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
@@ -961,7 +954,7 @@ static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
 	}
 
 	/* if we added to the tail node then we need to update slen */
-	if (!rcu_access_pointer(hlist_next_rcu(&new->hlist))) {
+	if (l->slen < (KEYLENGTH - new->plen)) {
 		l->slen = KEYLENGTH - new->plen;
 		leaf_push_suffix(l);
 	}
@@ -1613,6 +1606,7 @@ static int trie_flush_leaf(struct tnode *l)
 	struct hlist_head *lih = &l->list;
 	struct hlist_node *tmp;
 	struct leaf_info *li = NULL;
+	unsigned char plen = KEYLENGTH;
 
 	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
 		found += trie_flush_list(&li->falh);
@@ -1620,8 +1614,14 @@ static int trie_flush_leaf(struct tnode *l)
 		if (list_empty(&li->falh)) {
 			hlist_del_rcu(&li->hlist);
 			free_leaf_info(li);
+			continue;
 		}
+
+		plen = li->plen;
 	}
+
+	l->slen = KEYLENGTH - plen;
+
 	return found;
 }
 
@@ -1700,13 +1700,22 @@ int fib_table_flush(struct fib_table *tb)
 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
 		found += trie_flush_leaf(l);
 
-		if (ll && hlist_empty(&ll->list))
-			trie_leaf_remove(t, ll);
+		if (ll) {
+			if (hlist_empty(&ll->list))
+				trie_leaf_remove(t, ll);
+			else
+				leaf_pull_suffix(ll);
+		}
+
 		ll = l;
 	}
 
-	if (ll && hlist_empty(&ll->list))
-		trie_leaf_remove(t, ll);
+	if (ll) {
+		if (hlist_empty(&ll->list))
+			trie_leaf_remove(t, ll);
+		else
+			leaf_pull_suffix(ll);
+	}
 
 	pr_debug("trie_flush found=%d\n", found);
 	return found;
-- 
cgit v1.2.3


From b8693877ae016ac525d674d5d7a84ea0ea68ba60 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Wed, 28 Jan 2015 16:32:46 -0800
Subject: openvswitch: Add support for checksums on UDP tunnels.

Currently, it isn't possible to request checksums on the outer UDP
header of tunnels - the TUNNEL_CSUM flag is ignored. This adds
support for requesting that UDP checksums be computed on transmit
and properly reported if they are present on receive.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/geneve.h           | 2 +-
 net/ipv4/geneve.c              | 6 +++---
 net/openvswitch/vport-geneve.c | 2 +-
 net/openvswitch/vport-vxlan.c  | 7 +++++--
 4 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/geneve.h b/include/net/geneve.h
index 03aa2adb5bab..14fb8d3390b4 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -90,7 +90,7 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 		    struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
 		    __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
 		    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
-		    bool xnet);
+		    bool csum, bool xnet);
 #endif /*ifdef CONFIG_INET */
 
 #endif /*ifdef__NET_GENEVE_H */
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 93e51199e44b..5a4828ba05ad 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -107,13 +107,13 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 		    struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
 		    __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
 		    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
-		    bool xnet)
+		    bool csum, bool xnet)
 {
 	struct genevehdr *gnvh;
 	int min_headroom;
 	int err;
 
-	skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx);
+	skb = udp_tunnel_handle_offloads(skb, csum);
 	if (IS_ERR(skb))
 		return PTR_ERR(skb);
 
@@ -138,7 +138,7 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 
 	return udp_tunnel_xmit_skb(rt, skb, src, dst,
 				   tos, ttl, df, src_port, dst_port, xnet,
-				   gs->sock->sk->sk_no_check_tx);
+				   !csum);
 }
 EXPORT_SYMBOL_GPL(geneve_xmit_skb);
 
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 7ca3d454ff3b..bf02fd5808c9 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -212,7 +212,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
 			      tun_key->ipv4_dst, tun_key->ipv4_tos,
 			      tun_key->ipv4_ttl, df, sport, dport,
 			      tun_key->tun_flags, vni, opts_len, opts,
-			      false);
+			      !!(tun_key->tun_flags & TUNNEL_CSUM), false);
 	if (err < 0)
 		ip_rt_put(rt);
 	return err;
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 3cc983bf444a..ff07d4062d60 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -74,7 +74,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 	__be64 key;
 	__be16 flags;
 
-	flags = TUNNEL_KEY;
+	flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0);
 	vxlan_port = vxlan_vport(vport);
 	if (vxlan_port->exts & VXLAN_F_GBP)
 		flags |= TUNNEL_VXLAN_OPT;
@@ -230,6 +230,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 	__be16 src_port;
 	__be16 df;
 	int err;
+	u32 vxflags;
 
 	if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
 		err = -EINVAL;
@@ -251,11 +252,13 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 	src_port = udp_flow_src_port(net, skb, 0, 0, true);
 	md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
 	md.gbp = vxlan_ext_gbp(skb);
+	vxflags = vxlan_port->exts |
+		      (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0);
 
 	err = vxlan_xmit_skb(rt, skb, fl.saddr, tun_key->ipv4_dst,
 			     tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
 			     src_port, dst_port,
-			     &md, false, vxlan_port->exts);
+			     &md, false, vxflags);
 	if (err < 0)
 		ip_rt_put(rt);
 	return err;
-- 
cgit v1.2.3


From 207895fd388c7c4c48bc33055cd726d9e750298c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Thu, 29 Jan 2015 12:15:03 +0100
Subject: net: mark some potential candidates __read_mostly

They are all either written once or extremly rarely (e.g. from init
code), so we can move them to the .data..read_mostly section.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ipvlan/ipvlan_core.c | 2 +-
 net/bridge/br_netlink.c          | 2 +-
 net/ipv4/devinet.c               | 2 +-
 net/ipv6/addrconf.c              | 2 +-
 net/mpls/mpls_gso.c              | 4 ++--
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 2e195289ddf4..2a175006028b 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -9,7 +9,7 @@
 
 #include "ipvlan.h"
 
-static u32 ipvlan_jhash_secret;
+static u32 ipvlan_jhash_secret __read_mostly;
 
 void ipvlan_init_secret(void)
 {
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 3875ea51f6fe..e08b260f33fe 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -701,7 +701,7 @@ static size_t br_get_link_af_size(const struct net_device *dev)
 	return pv->num_vlans * nla_total_size(sizeof(struct bridge_vlan_info));
 }
 
-static struct rtnl_af_ops br_af_ops = {
+static struct rtnl_af_ops br_af_ops __read_mostly = {
 	.family			= AF_BRIDGE,
 	.get_link_af_size	= br_get_link_af_size,
 };
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 59ebe16d06fc..f0b4a31d7bd6 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2322,7 +2322,7 @@ static __net_initdata struct pernet_operations devinet_ops = {
 	.exit = devinet_exit_net,
 };
 
-static struct rtnl_af_ops inet_af_ops = {
+static struct rtnl_af_ops inet_af_ops __read_mostly = {
 	.family		  = AF_INET,
 	.fill_link_af	  = inet_fill_link_af,
 	.get_link_af_size = inet_get_link_af_size,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7dcc065e2160..8623118cb2bb 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5405,7 +5405,7 @@ static struct pernet_operations addrconf_ops = {
 	.exit = addrconf_exit_net,
 };
 
-static struct rtnl_af_ops inet6_ops = {
+static struct rtnl_af_ops inet6_ops __read_mostly = {
 	.family		  = AF_INET6,
 	.fill_link_af	  = inet6_fill_link_af,
 	.get_link_af_size = inet6_get_link_af_size,
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index 349295d21946..809df534a720 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -60,14 +60,14 @@ out:
 	return segs;
 }
 
-static struct packet_offload mpls_mc_offload = {
+static struct packet_offload mpls_mc_offload __read_mostly = {
 	.type = cpu_to_be16(ETH_P_MPLS_MC),
 	.callbacks = {
 		.gso_segment    =	mpls_gso_segment,
 	},
 };
 
-static struct packet_offload mpls_uc_offload = {
+static struct packet_offload mpls_uc_offload __read_mostly = {
 	.type = cpu_to_be16(ETH_P_MPLS_UC),
 	.callbacks = {
 		.gso_segment    =	mpls_gso_segment,
-- 
cgit v1.2.3


From 932eb7638ad7d9145620178992044b5e87356969 Mon Sep 17 00:00:00 2001
From: Kenneth Klette Jonassen <kennetkl@ifi.uio.no>
Date: Thu, 29 Jan 2015 20:08:03 +0100
Subject: tcp: use SACK RTTs for CC

Current behavior only passes RTTs from sequentially acked data to CC.

If sender gets a combined ACK for segment 1 and SACK for segment 3, then the
computed RTT for CC is the time between sending segment 1 and receiving SACK
for segment 3.

Pass the minimum computed RTT from any acked data to CC, i.e. time between
sending segment 3 and receiving SACK for segment 3.

Signed-off-by: Kenneth Klette Jonassen <kennetkl@ifi.uio.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 71fb37c70581..ed11931f340f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3183,8 +3183,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 		tp->fackets_out -= min(pkts_acked, tp->fackets_out);
 
-		if (ca_ops->pkts_acked)
-			ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us);
+		if (ca_ops->pkts_acked) {
+			long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us);
+			ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
+		}
 
 	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
 		   sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
-- 
cgit v1.2.3


From 349c9e3c7341bbab6efbea39acfadeba9ab19f61 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 29 Jan 2015 15:58:09 -0800
Subject: ipv4: icmp: use percpu allocation

Get rid of nr_cpu_ids and use modern percpu allocation.

Note that the sockets themselves are not yet allocated
using NUMA affinity.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h |  3 ++-
 net/ipv4/icmp.c          | 17 ++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 24945cefc4fd..7283f4d39ae2 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -48,7 +48,8 @@ struct netns_ipv4 {
 	struct hlist_head	*fib_table_hash;
 	struct sock		*fibnl;
 
-	struct sock		**icmp_sk;
+	struct sock  * __percpu	*icmp_sk;
+
 	struct inet_peer_base	*peers;
 	struct tcpm_hash_bucket	*tcp_metrics_hash;
 	unsigned int		tcp_metrics_hash_log;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 36f5584d93c5..5e564014a0b7 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -205,7 +205,7 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
  */
 static struct sock *icmp_sk(struct net *net)
 {
-	return net->ipv4.icmp_sk[smp_processor_id()];
+	return *this_cpu_ptr(net->ipv4.icmp_sk);
 }
 
 static inline struct sock *icmp_xmit_lock(struct net *net)
@@ -1140,8 +1140,8 @@ static void __net_exit icmp_sk_exit(struct net *net)
 	int i;
 
 	for_each_possible_cpu(i)
-		inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]);
-	kfree(net->ipv4.icmp_sk);
+		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
+	free_percpu(net->ipv4.icmp_sk);
 	net->ipv4.icmp_sk = NULL;
 }
 
@@ -1149,9 +1149,8 @@ static int __net_init icmp_sk_init(struct net *net)
 {
 	int i, err;
 
-	net->ipv4.icmp_sk =
-		kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL);
-	if (net->ipv4.icmp_sk == NULL)
+	net->ipv4.icmp_sk = alloc_percpu(struct sock *);
+	if (!net->ipv4.icmp_sk)
 		return -ENOMEM;
 
 	for_each_possible_cpu(i) {
@@ -1162,7 +1161,7 @@ static int __net_init icmp_sk_init(struct net *net)
 		if (err < 0)
 			goto fail;
 
-		net->ipv4.icmp_sk[i] = sk;
+		*per_cpu_ptr(net->ipv4.icmp_sk, i) = sk;
 
 		/* Enough space for 2 64K ICMP packets, including
 		 * sk_buff/skb_shared_info struct overhead.
@@ -1203,8 +1202,8 @@ static int __net_init icmp_sk_init(struct net *net)
 
 fail:
 	for_each_possible_cpu(i)
-		inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]);
-	kfree(net->ipv4.icmp_sk);
+		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
+	free_percpu(net->ipv4.icmp_sk);
 	return err;
 }
 
-- 
cgit v1.2.3


From 49ca0d8bfaf3bc46d5eef60ce67b00eb195bd392 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 30 Jan 2015 13:29:31 -0500
Subject: net-timestamp: no-payload option

Add timestamping option SOF_TIMESTAMPING_OPT_TSONLY. For transmit
timestamps, this loops timestamps on top of empty packets.

Doing so reduces the pressure on SO_RCVBUF. Payload inspection and
cmsg reception (aside from timestamps) are no longer possible. This
works together with a follow on patch that allows administrators to
only allow tx timestamping if it does not loop payload or metadata.

Signed-off-by: Willem de Bruijn <willemb@google.com>

----

Changes (rfc -> v1)
  - add documentation
  - remove unnecessary skb->len test (thanks to Richard Cochran)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/timestamping.txt | 21 +++++++++++++++++++++
 include/uapi/linux/net_tstamp.h           |  3 ++-
 net/core/skbuff.c                         | 19 ++++++++++++++-----
 net/ipv4/ip_sockglue.c                    |  7 ++++---
 net/ipv6/datagram.c                       |  5 ++---
 net/rxrpc/ar-error.c                      |  5 +++++
 6 files changed, 48 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt
index a5c784c89312..5f0922613f1a 100644
--- a/Documentation/networking/timestamping.txt
+++ b/Documentation/networking/timestamping.txt
@@ -162,6 +162,27 @@ SOF_TIMESTAMPING_OPT_CMSG:
   option IP_PKTINFO simultaneously.
 
 
+SOF_TIMESTAMPING_OPT_TSONLY:
+
+  Applies to transmit timestamps only. Makes the kernel return the
+  timestamp as a cmsg alongside an empty packet, as opposed to
+  alongside the original packet. This reduces the amount of memory
+  charged to the socket's receive budget (SO_RCVBUF) and delivers
+  the timestamp even if sysctl net.core.tstamp_allow_data is 0.
+  This option disables SOF_TIMESTAMPING_OPT_CMSG.
+
+
+New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to
+disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate
+regardless of the setting of sysctl net.core.tstamp_allow_data.
+
+An exception is when a process needs additional cmsg data, for
+instance SOL_IP/IP_PKTINFO to detect the egress network interface.
+Then pass option SOF_TIMESTAMPING_OPT_CMSG. This option depends on
+having access to the contents of the original packet, so cannot be
+combined with SOF_TIMESTAMPING_OPT_TSONLY.
+
+
 1.4 Bytestream Timestamps
 
 The SO_TIMESTAMPING interface supports timestamping of bytes in a
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index edbc888ceb51..6d1abea9746e 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -24,8 +24,9 @@ enum {
 	SOF_TIMESTAMPING_TX_SCHED = (1<<8),
 	SOF_TIMESTAMPING_TX_ACK = (1<<9),
 	SOF_TIMESTAMPING_OPT_CMSG = (1<<10),
+	SOF_TIMESTAMPING_OPT_TSONLY = (1<<11),
 
-	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_CMSG,
+	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TSONLY,
 	SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
 				 SOF_TIMESTAMPING_LAST
 };
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 56db472e9b86..65a3798f43e6 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3710,19 +3710,28 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 		     struct sock *sk, int tstype)
 {
 	struct sk_buff *skb;
+	bool tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
 
 	if (!sk)
 		return;
 
-	if (hwtstamps)
-		*skb_hwtstamps(orig_skb) = *hwtstamps;
+	if (tsonly)
+		skb = alloc_skb(0, GFP_ATOMIC);
 	else
-		orig_skb->tstamp = ktime_get_real();
-
-	skb = skb_clone(orig_skb, GFP_ATOMIC);
+		skb = skb_clone(orig_skb, GFP_ATOMIC);
 	if (!skb)
 		return;
 
+	if (tsonly) {
+		skb_shinfo(skb)->tx_flags = skb_shinfo(orig_skb)->tx_flags;
+		skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
+	}
+
+	if (hwtstamps)
+		*skb_hwtstamps(skb) = *hwtstamps;
+	else
+		skb->tstamp = ktime_get_real();
+
 	__skb_complete_tx_timestamp(skb, sk, tstype);
 }
 EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index db5e0f81ce0a..31d8c71986b4 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -483,7 +483,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 
 	serr = SKB_EXT_ERR(skb);
 
-	if (sin) {
+	if (sin && skb->len) {
 		sin->sin_family = AF_INET;
 		sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
 						   serr->addr_offset);
@@ -496,8 +496,9 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 	sin = &errhdr.offender;
 	memset(sin, 0, sizeof(*sin));
 
-	if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
-	    ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) {
+	if (skb->len &&
+	    (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+	     ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin))) {
 		sin->sin_family = AF_INET;
 		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
 		if (inet_sk(sk)->cmsg_flags)
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 49f5e73db122..c215be70cac0 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -369,7 +369,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 
 	serr = SKB_EXT_ERR(skb);
 
-	if (sin) {
+	if (sin && skb->len) {
 		const unsigned char *nh = skb_network_header(skb);
 		sin->sin6_family = AF_INET6;
 		sin->sin6_flowinfo = 0;
@@ -394,8 +394,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 	memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
 	sin = &errhdr.offender;
 	memset(sin, 0, sizeof(*sin));
-
-	if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) {
+	if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL && skb->len) {
 		sin->sin6_family = AF_INET6;
 		if (np->rxopt.all) {
 			if (serr->ee.ee_origin != SO_EE_ORIGIN_ICMP &&
diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c
index 74c0fcd36838..5394b6be46ec 100644
--- a/net/rxrpc/ar-error.c
+++ b/net/rxrpc/ar-error.c
@@ -42,6 +42,11 @@ void rxrpc_UDP_error_report(struct sock *sk)
 		_leave("UDP socket errqueue empty");
 		return;
 	}
+	if (!skb->len) {
+		_leave("UDP empty message");
+		kfree_skb(skb);
+		return;
+	}
 
 	rxrpc_new_skb(skb);
 
-- 
cgit v1.2.3


From 843c2fdf7a12951815d981fa431d7e04d8259332 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 30 Jan 2015 20:45:20 +0100
Subject: net: dctcp: loosen requirement to assert ECT(0) during 3WHS

One deployment requirement of DCTCP is to be able to run
in a DC setting along with TCP traffic. As Glenn Judd's
NSDI'15 paper "Attaining the Promise and Avoiding the Pitfalls
of TCP in the Datacenter" [1] (tba) explains, one way to
solve this on switch side is to split DCTCP and TCP traffic
in two queues per switch port based on the DSCP: one queue
soley intended for DCTCP traffic and one for non-DCTCP traffic.

For the DCTCP queue, there's the marking threshold K as
explained in commit e3118e8359bb ("net: tcp: add DCTCP congestion
control algorithm") for RED marking ECT(0) packets with CE.
For the non-DCTCP queue, there's f.e. a classic tail drop queue.
As already explained in e3118e8359bb, running DCTCP at scale
when not marking SYN/SYN-ACK packets with ECT(0) has severe
consequences as for non-ECT(0) packets, traversing the RED
marking DCTCP queue will result in a severe reduction of
connection probability.

This is due to the DCTCP queue being dominated by ECT(0) traffic
and switches handle non-ECT traffic in the RED marking queue
after passing K as drops, where K is usually a low watermark
in order to leave enough tailroom for bursts. Splitting DCTCP
traffic among several queues (ECN and non-ECN queue) is being
considered a terrible idea in the network community as it
splits single flows across multiple network paths.

Therefore, commit e3118e8359bb implements this on Linux as
ECT(0) marked traffic, as we argue that marking all packets
of a DCTCP flow is the only viable solution and also doesn't
speak against the draft.

However, recently, a DCTCP implementation for FreeBSD hit also
their mainline kernel [2]. In order to let them play well
together with Linux' DCTCP, we would need to loosen the
requirement that ECT(0) has to be asserted during the 3WHS as
not implemented in FreeBSD. This simplifies the ECN test and
lets DCTCP work together with FreeBSD.

Joint work with Daniel Borkmann.

  [1] https://www.usenix.org/conference/nsdi15/technical-sessions/presentation/judd
  [2] https://github.com/freebsd/freebsd/commit/8ad879445281027858a7fa706d13e458095b595f

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Glenn Judd <glenn.judd@morganstanley.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ed11931f340f..d3dfff78fa19 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5872,10 +5872,9 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
  * TCP ECN negotiation.
  *
  * Exception: tcp_ca wants ECN. This is required for DCTCP
- * congestion control; it requires setting ECT on all packets,
- * including SYN. We inverse the test in this case: If our
- * local socket wants ECN, but peer only set ece/cwr (but not
- * ECT in IP header) its probably a non-DCTCP aware sender.
+ * congestion control: Linux DCTCP asserts ECT on all packets,
+ * including SYN, which is most optimal solution; however,
+ * others, such as FreeBSD do not.
  */
 static void tcp_ecn_create_request(struct request_sock *req,
 				   const struct sk_buff *skb,
@@ -5885,18 +5884,15 @@ static void tcp_ecn_create_request(struct request_sock *req,
 	const struct tcphdr *th = tcp_hdr(skb);
 	const struct net *net = sock_net(listen_sk);
 	bool th_ecn = th->ece && th->cwr;
-	bool ect, need_ecn, ecn_ok;
+	bool ect, ecn_ok;
 
 	if (!th_ecn)
 		return;
 
 	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
-	need_ecn = tcp_ca_needs_ecn(listen_sk);
 	ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
 
-	if (!ect && !need_ecn && ecn_ok)
-		inet_rsk(req)->ecn_ok = 1;
-	else if (ect && need_ecn)
+	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
 		inet_rsk(req)->ecn_ok = 1;
 }
 
-- 
cgit v1.2.3


From 7ae9abfd9d6f3216500fc2874254e726cc30ca01 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 27 Nov 2014 19:30:51 -0500
Subject: ipv4: raw_send_hdrinc(): pass msghdr

Switch from passing msg->iov_iter.iov to passing msg itself

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/ipv4/raw.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 0bb68df5055d..2c9d252072a2 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -337,7 +337,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
 }
 
 static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
-			   void *from, size_t length,
+			   struct msghdr *msg, size_t length,
 			   struct rtable **rtp,
 			   unsigned int flags)
 {
@@ -382,7 +382,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 
 	skb->transport_header = skb->network_header;
 	err = -EFAULT;
-	if (memcpy_fromiovecend((void *)iph, from, 0, length))
+	if (memcpy_fromiovecend((void *)iph, msg->msg_iter.iov, 0, length))
 		goto error_free;
 
 	iphlen = iph->ihl * 4;
@@ -625,8 +625,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 back_from_confirm:
 
 	if (inet->hdrincl)
-		/* XXX: stripping const */
-		err = raw_send_hdrinc(sk, &fl4, (struct iovec *)msg->msg_iter.iov, len,
+		err = raw_send_hdrinc(sk, &fl4, msg, len,
 				      &rt, msg->msg_flags);
 
 	 else {
-- 
cgit v1.2.3


From cacdc7d2f9fa42e29b650e2879df42ea7d7833c1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 27 Nov 2014 20:34:16 -0500
Subject: ip: stash a pointer to msghdr in struct ping_fakehdr

... instead of storing its ->mgs_iter.iov there

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/net/ping.h | 2 +-
 net/ipv4/ping.c    | 7 +++----
 net/ipv6/ping.c    | 3 +--
 3 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ping.h b/include/net/ping.h
index f074060bc5de..cc16d413f681 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -59,7 +59,7 @@ extern struct pingv6_ops pingv6_ops;
 
 struct pingfakehdr {
 	struct icmphdr icmph;
-	struct iovec *iov;
+	struct msghdr *msg;
 	sa_family_t family;
 	__wsum wcheck;
 };
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 2a3720fb5a5f..9e15ba701401 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -602,14 +602,14 @@ int ping_getfrag(void *from, char *to,
 		if (fraglen < sizeof(struct icmphdr))
 			BUG();
 		if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr),
-			    pfh->iov, 0, fraglen - sizeof(struct icmphdr),
+			    pfh->msg->msg_iter.iov, 0, fraglen - sizeof(struct icmphdr),
 			    &pfh->wcheck))
 			return -EFAULT;
 	} else if (offset < sizeof(struct icmphdr)) {
 			BUG();
 	} else {
 		if (csum_partial_copy_fromiovecend
-				(to, pfh->iov, offset - sizeof(struct icmphdr),
+				(to, pfh->msg->msg_iter.iov, offset - sizeof(struct icmphdr),
 				 fraglen, &pfh->wcheck))
 			return -EFAULT;
 	}
@@ -811,8 +811,7 @@ back_from_confirm:
 	pfh.icmph.checksum = 0;
 	pfh.icmph.un.echo.id = inet->inet_sport;
 	pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
-	/* XXX: stripping const */
-	pfh.iov = (struct iovec *)msg->msg_iter.iov;
+	pfh.msg = msg;
 	pfh.wcheck = 0;
 	pfh.family = AF_INET;
 
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 2d3148378a1f..bd46f736f61d 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -163,8 +163,7 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	pfh.icmph.checksum = 0;
 	pfh.icmph.un.echo.id = inet->inet_sport;
 	pfh.icmph.un.echo.sequence = user_icmph.icmp6_sequence;
-	/* XXX: stripping const */
-	pfh.iov = (struct iovec *)msg->msg_iter.iov;
+	pfh.msg = msg;
 	pfh.wcheck = 0;
 	pfh.family = AF_INET6;
 
-- 
cgit v1.2.3


From 57be5bdad759b9dde8b0d0cc630782a1a4ac4b9f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 28 Nov 2014 13:40:20 -0500
Subject: ip: convert tcp_sendmsg() to iov_iter primitives

patch is actually smaller than it seems to be - most of it is unindenting
the inner loop body in tcp_sendmsg() itself...

the bit in tcp_input.c is going to get reverted very soon - that's what
memcpy_from_msg() will become, but not in this commit; let's keep it
reasonably contained...

There's one potentially subtle change here: in case of short copy from
userland, mainline tcp_send_syn_data() discards the skb it has allocated
and falls back to normal path, where we'll send as much as possible after
rereading the same data again.  This patch trims SYN+data skb instead -
that way we don't need to copy from the same place twice.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/net/sock.h    |  18 ++--
 net/ipv4/tcp.c        | 233 +++++++++++++++++++++++---------------------------
 net/ipv4/tcp_input.c  |   2 +-
 net/ipv4/tcp_output.c |  11 ++-
 4 files changed, 123 insertions(+), 141 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/sock.h b/include/net/sock.h
index 15341499786c..1e45e599a3ab 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1803,27 +1803,25 @@ static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags)
 }
 
 static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
-					   char __user *from, char *to,
+					   struct iov_iter *from, char *to,
 					   int copy, int offset)
 {
 	if (skb->ip_summed == CHECKSUM_NONE) {
-		int err = 0;
-		__wsum csum = csum_and_copy_from_user(from, to, copy, 0, &err);
-		if (err)
-			return err;
+		__wsum csum = 0;
+		if (csum_and_copy_from_iter(to, copy, &csum, from) != copy)
+			return -EFAULT;
 		skb->csum = csum_block_add(skb->csum, csum, offset);
 	} else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
-		if (!access_ok(VERIFY_READ, from, copy) ||
-		    __copy_from_user_nocache(to, from, copy))
+		if (copy_from_iter_nocache(to, copy, from) != copy)
 			return -EFAULT;
-	} else if (copy_from_user(to, from, copy))
+	} else if (copy_from_iter(to, copy, from) != copy)
 		return -EFAULT;
 
 	return 0;
 }
 
 static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb,
-				       char __user *from, int copy)
+				       struct iov_iter *from, int copy)
 {
 	int err, offset = skb->len;
 
@@ -1835,7 +1833,7 @@ static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb,
 	return err;
 }
 
-static inline int skb_copy_to_page_nocache(struct sock *sk, char __user *from,
+static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from,
 					   struct sk_buff *skb,
 					   struct page *page,
 					   int off, int copy)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3075723c729b..9d72a0fcd928 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1067,11 +1067,10 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t size)
 {
-	const struct iovec *iov;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	int iovlen, flags, err, copied = 0;
-	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
+	int flags, err, copied = 0;
+	int mss_now = 0, size_goal, copied_syn = 0;
 	bool sg;
 	long timeo;
 
@@ -1084,7 +1083,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			goto out;
 		else if (err)
 			goto out_err;
-		offset = copied_syn;
 	}
 
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -1118,8 +1116,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	mss_now = tcp_send_mss(sk, &size_goal, flags);
 
 	/* Ok commence sending. */
-	iovlen = msg->msg_iter.nr_segs;
-	iov = msg->msg_iter.iov;
 	copied = 0;
 
 	err = -EPIPE;
@@ -1128,151 +1124,134 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	sg = !!(sk->sk_route_caps & NETIF_F_SG);
 
-	while (--iovlen >= 0) {
-		size_t seglen = iov->iov_len;
-		unsigned char __user *from = iov->iov_base;
+	while (iov_iter_count(&msg->msg_iter)) {
+		int copy = 0;
+		int max = size_goal;
 
-		iov++;
-		if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */
-			if (offset >= seglen) {
-				offset -= seglen;
-				continue;
-			}
-			seglen -= offset;
-			from += offset;
-			offset = 0;
+		skb = tcp_write_queue_tail(sk);
+		if (tcp_send_head(sk)) {
+			if (skb->ip_summed == CHECKSUM_NONE)
+				max = mss_now;
+			copy = max - skb->len;
 		}
 
-		while (seglen > 0) {
-			int copy = 0;
-			int max = size_goal;
-
-			skb = tcp_write_queue_tail(sk);
-			if (tcp_send_head(sk)) {
-				if (skb->ip_summed == CHECKSUM_NONE)
-					max = mss_now;
-				copy = max - skb->len;
-			}
-
-			if (copy <= 0) {
+		if (copy <= 0) {
 new_segment:
-				/* Allocate new segment. If the interface is SG,
-				 * allocate skb fitting to single page.
-				 */
-				if (!sk_stream_memory_free(sk))
-					goto wait_for_sndbuf;
+			/* Allocate new segment. If the interface is SG,
+			 * allocate skb fitting to single page.
+			 */
+			if (!sk_stream_memory_free(sk))
+				goto wait_for_sndbuf;
 
-				skb = sk_stream_alloc_skb(sk,
-							  select_size(sk, sg),
-							  sk->sk_allocation);
-				if (!skb)
-					goto wait_for_memory;
+			skb = sk_stream_alloc_skb(sk,
+						  select_size(sk, sg),
+						  sk->sk_allocation);
+			if (!skb)
+				goto wait_for_memory;
 
-				/*
-				 * Check whether we can use HW checksum.
-				 */
-				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
-					skb->ip_summed = CHECKSUM_PARTIAL;
+			/*
+			 * Check whether we can use HW checksum.
+			 */
+			if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
+				skb->ip_summed = CHECKSUM_PARTIAL;
 
-				skb_entail(sk, skb);
-				copy = size_goal;
-				max = size_goal;
+			skb_entail(sk, skb);
+			copy = size_goal;
+			max = size_goal;
 
-				/* All packets are restored as if they have
-				 * already been sent. skb_mstamp isn't set to
-				 * avoid wrong rtt estimation.
-				 */
-				if (tp->repair)
-					TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
-			}
+			/* All packets are restored as if they have
+			 * already been sent. skb_mstamp isn't set to
+			 * avoid wrong rtt estimation.
+			 */
+			if (tp->repair)
+				TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
+		}
 
-			/* Try to append data to the end of skb. */
-			if (copy > seglen)
-				copy = seglen;
-
-			/* Where to copy to? */
-			if (skb_availroom(skb) > 0) {
-				/* We have some space in skb head. Superb! */
-				copy = min_t(int, copy, skb_availroom(skb));
-				err = skb_add_data_nocache(sk, skb, from, copy);
-				if (err)
-					goto do_fault;
-			} else {
-				bool merge = true;
-				int i = skb_shinfo(skb)->nr_frags;
-				struct page_frag *pfrag = sk_page_frag(sk);
-
-				if (!sk_page_frag_refill(sk, pfrag))
-					goto wait_for_memory;
-
-				if (!skb_can_coalesce(skb, i, pfrag->page,
-						      pfrag->offset)) {
-					if (i == MAX_SKB_FRAGS || !sg) {
-						tcp_mark_push(tp, skb);
-						goto new_segment;
-					}
-					merge = false;
-				}
+		/* Try to append data to the end of skb. */
+		if (copy > iov_iter_count(&msg->msg_iter))
+			copy = iov_iter_count(&msg->msg_iter);
+
+		/* Where to copy to? */
+		if (skb_availroom(skb) > 0) {
+			/* We have some space in skb head. Superb! */
+			copy = min_t(int, copy, skb_availroom(skb));
+			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
+			if (err)
+				goto do_fault;
+		} else {
+			bool merge = true;
+			int i = skb_shinfo(skb)->nr_frags;
+			struct page_frag *pfrag = sk_page_frag(sk);
+
+			if (!sk_page_frag_refill(sk, pfrag))
+				goto wait_for_memory;
 
-				copy = min_t(int, copy, pfrag->size - pfrag->offset);
-
-				if (!sk_wmem_schedule(sk, copy))
-					goto wait_for_memory;
-
-				err = skb_copy_to_page_nocache(sk, from, skb,
-							       pfrag->page,
-							       pfrag->offset,
-							       copy);
-				if (err)
-					goto do_error;
-
-				/* Update the skb. */
-				if (merge) {
-					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
-				} else {
-					skb_fill_page_desc(skb, i, pfrag->page,
-							   pfrag->offset, copy);
-					get_page(pfrag->page);
+			if (!skb_can_coalesce(skb, i, pfrag->page,
+					      pfrag->offset)) {
+				if (i == MAX_SKB_FRAGS || !sg) {
+					tcp_mark_push(tp, skb);
+					goto new_segment;
 				}
-				pfrag->offset += copy;
+				merge = false;
 			}
 
-			if (!copied)
-				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+			copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
-			tp->write_seq += copy;
-			TCP_SKB_CB(skb)->end_seq += copy;
-			tcp_skb_pcount_set(skb, 0);
+			if (!sk_wmem_schedule(sk, copy))
+				goto wait_for_memory;
 
-			from += copy;
-			copied += copy;
-			if ((seglen -= copy) == 0 && iovlen == 0) {
-				tcp_tx_timestamp(sk, skb);
-				goto out;
+			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+						       pfrag->page,
+						       pfrag->offset,
+						       copy);
+			if (err)
+				goto do_error;
+
+			/* Update the skb. */
+			if (merge) {
+				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+			} else {
+				skb_fill_page_desc(skb, i, pfrag->page,
+						   pfrag->offset, copy);
+				get_page(pfrag->page);
 			}
+			pfrag->offset += copy;
+		}
 
-			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
-				continue;
+		if (!copied)
+			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+
+		tp->write_seq += copy;
+		TCP_SKB_CB(skb)->end_seq += copy;
+		tcp_skb_pcount_set(skb, 0);
+
+		copied += copy;
+		if (!iov_iter_count(&msg->msg_iter)) {
+			tcp_tx_timestamp(sk, skb);
+			goto out;
+		}
 
-			if (forced_push(tp)) {
-				tcp_mark_push(tp, skb);
-				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
-			} else if (skb == tcp_send_head(sk))
-				tcp_push_one(sk, mss_now);
+		if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
 			continue;
 
+		if (forced_push(tp)) {
+			tcp_mark_push(tp, skb);
+			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+		} else if (skb == tcp_send_head(sk))
+			tcp_push_one(sk, mss_now);
+		continue;
+
 wait_for_sndbuf:
-			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
-			if (copied)
-				tcp_push(sk, flags & ~MSG_MORE, mss_now,
-					 TCP_NAGLE_PUSH, size_goal);
+		if (copied)
+			tcp_push(sk, flags & ~MSG_MORE, mss_now,
+				 TCP_NAGLE_PUSH, size_goal);
 
-			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
-				goto do_error;
+		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+			goto do_error;
 
-			mss_now = tcp_send_mss(sk, &size_goal, flags);
-		}
+		mss_now = tcp_send_mss(sk, &size_goal, flags);
 	}
 
 out:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 71fb37c70581..93c74829cbce 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4368,7 +4368,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 	if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
 		goto err_free;
 
-	if (memcpy_from_msg(skb_put(skb, size), msg, size))
+	if (copy_from_iter(skb_put(skb, size), size, &msg->msg_iter) != size)
 		goto err_free;
 
 	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 20ab06b228ac..722c8bceaf9a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3055,7 +3055,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int syn_loss = 0, space, err = 0;
+	int syn_loss = 0, space, err = 0, copied;
 	unsigned long last_syn_loss = 0;
 	struct sk_buff *syn_data;
 
@@ -3093,11 +3093,16 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 		goto fallback;
 	syn_data->ip_summed = CHECKSUM_PARTIAL;
 	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
-	if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space),
-					 fo->data->msg_iter.iov, 0, space))) {
+	copied = copy_from_iter(skb_put(syn_data, space), space,
+				&fo->data->msg_iter);
+	if (unlikely(!copied)) {
 		kfree_skb(syn_data);
 		goto fallback;
 	}
+	if (copied != space) {
+		skb_trim(syn_data, copied);
+		space = copied;
+	}
 
 	/* No more data pending in inet_wait_for_connect() */
 	if (space == fo->size)
-- 
cgit v1.2.3


From 21226abb4e9f14d88238964d89b279e461ddc30c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 28 Nov 2014 15:48:29 -0500
Subject: net: switch memcpy_fromiovec()/memcpy_fromiovecend() users to
 copy_from_iter()

That takes care of the majority of ->sendmsg() instances - most of them
via memcpy_to_msg() or assorted getfrag() callbacks.  One place where we
still keep memcpy_fromiovecend() is tipc - there we potentially read the
same data over and over; separate patch, that...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/skbuff.h |  3 +--
 include/net/udplite.h  |  3 +--
 net/ipv4/ip_output.c   |  6 ++----
 net/ipv4/ping.c        | 14 +++++++-------
 net/ipv4/raw.c         |  2 +-
 net/ipv4/tcp_input.c   |  2 +-
 net/ipv6/raw.c         |  2 +-
 7 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9a8bafee1b67..b349c96dc80a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2692,8 +2692,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
 
 static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
 {
-	/* XXX: stripping const */
-	return memcpy_fromiovec(data, (struct iovec *)msg->msg_iter.iov, len);
+	return copy_from_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT;
 }
 
 static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len)
diff --git a/include/net/udplite.h b/include/net/udplite.h
index ae7c8d1fbcad..80761938b9a7 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -20,8 +20,7 @@ static __inline__ int udplite_getfrag(void *from, char *to, int  offset,
 				      int len, int odd, struct sk_buff *skb)
 {
 	struct msghdr *msg = from;
-	/* XXX: stripping const */
-	return memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len);
+	return copy_from_iter(to, len, &msg->msg_iter) != len ? -EFAULT : 0;
 }
 
 /* Designate sk as UDP-Lite socket */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b50861b22b6b..f998bc87ae38 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -755,13 +755,11 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
 	struct msghdr *msg = from;
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		/* XXX: stripping const */
-		if (memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len) < 0)
+		if (copy_from_iter(to, len, &msg->msg_iter) != len)
 			return -EFAULT;
 	} else {
 		__wsum csum = 0;
-		/* XXX: stripping const */
-		if (csum_partial_copy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len, &csum) < 0)
+		if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len)
 			return -EFAULT;
 		skb->csum = csum_block_add(skb->csum, csum, odd);
 	}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 9e15ba701401..e9f66e1cda50 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -599,18 +599,18 @@ int ping_getfrag(void *from, char *to,
 	struct pingfakehdr *pfh = (struct pingfakehdr *)from;
 
 	if (offset == 0) {
-		if (fraglen < sizeof(struct icmphdr))
+		fraglen -= sizeof(struct icmphdr);
+		if (fraglen < 0)
 			BUG();
-		if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr),
-			    pfh->msg->msg_iter.iov, 0, fraglen - sizeof(struct icmphdr),
-			    &pfh->wcheck))
+		if (csum_and_copy_from_iter(to + sizeof(struct icmphdr),
+			    fraglen, &pfh->wcheck,
+			    &pfh->msg->msg_iter) != fraglen)
 			return -EFAULT;
 	} else if (offset < sizeof(struct icmphdr)) {
 			BUG();
 	} else {
-		if (csum_partial_copy_fromiovecend
-				(to, pfh->msg->msg_iter.iov, offset - sizeof(struct icmphdr),
-				 fraglen, &pfh->wcheck))
+		if (csum_and_copy_from_iter(to, fraglen, &pfh->wcheck,
+					    &pfh->msg->msg_iter) != fraglen)
 			return -EFAULT;
 	}
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2c9d252072a2..f027a708b7e0 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -382,7 +382,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 
 	skb->transport_header = skb->network_header;
 	err = -EFAULT;
-	if (memcpy_fromiovecend((void *)iph, msg->msg_iter.iov, 0, length))
+	if (memcpy_from_msg(iph, msg, length))
 		goto error_free;
 
 	iphlen = iph->ihl * 4;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 93c74829cbce..71fb37c70581 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4368,7 +4368,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 	if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
 		goto err_free;
 
-	if (copy_from_iter(skb_put(skb, size), size, &msg->msg_iter) != size)
+	if (memcpy_from_msg(skb_put(skb, size), msg, size))
 		goto err_free;
 
 	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 0dbb328fa688..dae7f1a1e464 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -648,7 +648,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
 	skb->ip_summed = CHECKSUM_NONE;
 
 	skb->transport_header = skb->network_header;
-	err = memcpy_fromiovecend((void *)iph, msg->msg_iter.iov, 0, length);
+	err = memcpy_from_msg(iph, msg, length);
 	if (err)
 		goto error_fault;
 
-- 
cgit v1.2.3


From dcdc8994697faa789669c3fdaca1a8bc27a8f356 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Mon, 2 Feb 2015 16:07:34 -0800
Subject: net: add skb functions to process remote checksum offload

This patch adds skb_remcsum_process and skb_gro_remcsum_process to
perform the appropriate adjustments to the skb when receiving
remote checksum offload.

Updated vxlan and gue to use these functions.

Tested: Ran TCP_RR and TCP_STREAM netperf for VXLAN and GUE, did
not see any change in performance.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c       | 18 ++----------------
 include/linux/netdevice.h | 15 +++++++++++++++
 include/linux/skbuff.h    | 21 +++++++++++++++++++++
 net/ipv4/fou.c            | 18 ++----------------
 4 files changed, 40 insertions(+), 32 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 31bac2a21ce3..c184717e8b28 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -558,7 +558,6 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 					  u32 data)
 {
 	size_t start, offset, plen;
-	__wsum delta;
 
 	if (skb->remcsum_offload)
 		return vh;
@@ -580,12 +579,7 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 			return NULL;
 	}
 
-	delta = remcsum_adjust((void *)vh + hdrlen,
-			       NAPI_GRO_CB(skb)->csum, start, offset);
-
-	/* Adjust skb->csum since we changed the packet */
-	skb->csum = csum_add(skb->csum, delta);
-	NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
+	skb_gro_remcsum_process(skb, (void *)vh + hdrlen, start, offset);
 
 	skb->remcsum_offload = 1;
 
@@ -1159,7 +1153,6 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 				      size_t hdrlen, u32 data)
 {
 	size_t start, offset, plen;
-	__wsum delta;
 
 	if (skb->remcsum_offload) {
 		/* Already processed in GRO path */
@@ -1179,14 +1172,7 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 
 	vh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
 
-	if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE))
-		__skb_checksum_complete(skb);
-
-	delta = remcsum_adjust((void *)vh + hdrlen,
-			       skb->csum, start, offset);
-
-	/* Adjust skb->csum since we changed the packet */
-	skb->csum = csum_add(skb->csum, delta);
+	skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset);
 
 	return vh;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 16251e96e6aa..1347ac50d2af 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2318,6 +2318,21 @@ do {									\
 					   compute_pseudo(skb, proto));	\
 } while (0)
 
+static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
+					   int start, int offset)
+{
+	__wsum delta;
+
+	BUG_ON(!NAPI_GRO_CB(skb)->csum_valid);
+
+	delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset);
+
+	/* Adjust skb->csum since we changed the packet */
+	skb->csum = csum_add(skb->csum, delta);
+	NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
+}
+
+
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
 				  const void *daddr, const void *saddr,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2748ff639144..5405dfe02572 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3099,6 +3099,27 @@ do {									\
 				       compute_pseudo(skb, proto));	\
 } while (0)
 
+/* Update skbuf and packet to reflect the remote checksum offload operation.
+ * When called, ptr indicates the starting point for skb->csum when
+ * ip_summed is CHECKSUM_COMPLETE. If we need create checksum complete
+ * here, skb_postpull_rcsum is done so skb->csum start is ptr.
+ */
+static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
+				       int start, int offset)
+{
+	__wsum delta;
+
+	 if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
+		__skb_checksum_complete(skb);
+		skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data);
+	}
+
+	delta = remcsum_adjust(ptr, skb->csum, start, offset);
+
+	/* Adjust skb->csum since we changed the packet */
+	skb->csum = csum_add(skb->csum, delta);
+}
+
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 void nf_conntrack_destroy(struct nf_conntrack *nfct);
 static inline void nf_conntrack_put(struct nf_conntrack *nfct)
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 3bc0cf07661c..92ddea1e6457 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -70,7 +70,6 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
 	size_t start = ntohs(pd[0]);
 	size_t offset = ntohs(pd[1]);
 	size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
-	__wsum delta;
 
 	if (skb->remcsum_offload) {
 		/* Already processed in GRO path */
@@ -82,14 +81,7 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
 		return NULL;
 	guehdr = (struct guehdr *)&udp_hdr(skb)[1];
 
-	if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE))
-		__skb_checksum_complete(skb);
-
-	delta = remcsum_adjust((void *)guehdr + hdrlen,
-			       skb->csum, start, offset);
-
-	/* Adjust skb->csum since we changed the packet */
-	skb->csum = csum_add(skb->csum, delta);
+	skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset);
 
 	return guehdr;
 }
@@ -228,7 +220,6 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 	size_t start = ntohs(pd[0]);
 	size_t offset = ntohs(pd[1]);
 	size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
-	__wsum delta;
 
 	if (skb->remcsum_offload)
 		return guehdr;
@@ -243,12 +234,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 			return NULL;
 	}
 
-	delta = remcsum_adjust((void *)guehdr + hdrlen,
-			       NAPI_GRO_CB(skb)->csum, start, offset);
-
-	/* Adjust skb->csum since we changed the packet */
-	skb->csum = csum_add(skb->csum, delta);
-	NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
+	skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset);
 
 	skb->remcsum_offload = 1;
 
-- 
cgit v1.2.3


From 9878196578286c5ed494778ada01da094377a686 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 3 Feb 2015 18:31:53 -0800
Subject: tcp: do not pace pure ack packets

When we added pacing to TCP, we decided to let sch_fq take care
of actual pacing.

All TCP had to do was to compute sk->pacing_rate using simple formula:

sk->pacing_rate = 2 * cwnd * mss / rtt

It works well for senders (bulk flows), but not very well for receivers
or even RPC :

cwnd on the receiver can be less than 10, rtt can be around 100ms, so we
can end up pacing ACK packets, slowing down the sender.

Really, only the sender should pace, according to its own logic.

Instead of adding a new bit in skb, or call yet another flow
dissection, we tweak skb->truesize to a small value (2), and
we instruct sch_fq to use new helper and not pace pure ack.

Note this also helps TCP small queue, as ack packets present
in qdisc/NIC do not prevent sending a data packet (RPC workload)

This helps to reduce tx completion overhead, ack packets can use regular
sock_wfree() instead of tcp_wfree() which is a bit more expensive.

This has no impact in the case packets are sent to loopback interface,
as we do not coalesce ack packets (were we would detect skb->truesize
lie)

In case netem (with a delay) is used, skb_orphan_partial() also sets
skb->truesize to 1.

This patch is a combination of two patches we used for about one year at
Google.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 15 +++++++++++++++
 net/ipv4/tcp_output.c | 10 +++++++++-
 net/sched/sch_fq.c    | 10 ++++++++--
 3 files changed, 32 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b8fdc6bab3f3..637ee490ec81 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1713,4 +1713,19 @@ static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
 	return dopt;
 }
 
+/* locally generated TCP pure ACKs have skb->truesize == 2
+ * (check tcp_send_ack() in net/ipv4/tcp_output.c )
+ * This is much faster than dissecting the packet to find out.
+ * (Think of GRE encapsulations, IPv4, IPv6, ...)
+ */
+static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb)
+{
+	return skb->truesize == 2;
+}
+
+static inline void skb_set_tcp_pure_ack(struct sk_buff *skb)
+{
+	skb->truesize = 2;
+}
+
 #endif	/* _TCP_H */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 20ab06b228ac..1b326ed46f7b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -948,7 +948,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_orphan(skb);
 	skb->sk = sk;
-	skb->destructor = tcp_wfree;
+	skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree;
 	skb_set_hash_from_sk(skb, sk);
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
@@ -3265,6 +3265,14 @@ void tcp_send_ack(struct sock *sk)
 	skb_reserve(buff, MAX_TCP_HEADER);
 	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
 
+	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
+	 * too much.
+	 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
+	 * We also avoid tcp_wfree() overhead (cache line miss accessing
+	 * tp->tsq_flags) by using regular sock_wfree()
+	 */
+	skb_set_tcp_pure_ack(buff);
+
 	/* Send it off, this clears delayed acks for us. */
 	skb_mstamp_get(&buff->skb_mstamp);
 	tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 2a50f5c62070..69a3dbf55c60 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -52,6 +52,7 @@
 #include <net/pkt_sched.h>
 #include <net/sock.h>
 #include <net/tcp_states.h>
+#include <net/tcp.h>
 
 /*
  * Per flow structure, dynamically allocated
@@ -445,7 +446,9 @@ begin:
 		goto begin;
 	}
 
-	if (unlikely(f->head && now < f->time_next_packet)) {
+	skb = f->head;
+	if (unlikely(skb && now < f->time_next_packet &&
+		     !skb_is_tcp_pure_ack(skb))) {
 		head->first = f->next;
 		fq_flow_set_throttled(q, f);
 		goto begin;
@@ -464,12 +467,15 @@ begin:
 		goto begin;
 	}
 	prefetch(&skb->end);
-	f->time_next_packet = now;
 	f->credit -= qdisc_pkt_len(skb);
 
 	if (f->credit > 0 || !q->rate_enable)
 		goto out;
 
+	/* Do not pace locally generated ack packets */
+	if (skb_is_tcp_pure_ack(skb))
+		goto out;
+
 	rate = q->flow_max_rate;
 	if (skb->sk)
 		rate = min(skb->sk->sk_pacing_rate, rate);
-- 
cgit v1.2.3


From 032ee4236954eb214651cb9bfc1b38ffa8fd7a01 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 6 Feb 2015 16:04:38 -0500
Subject: tcp: helpers to mitigate ACK loops by rate-limiting out-of-window
 dupacks

Helpers for mitigating ACK loops by rate-limiting dupacks sent in
response to incoming out-of-window packets.

This patch includes:

- rate-limiting logic
- sysctl to control how often we allow dupacks to out-of-window packets
- SNMP counter for cases where we rate-limited our dupack sending

The rate-limiting logic in this patch decides to not send dupacks in
response to out-of-window segments if (a) they are SYNs or pure ACKs
and (b) the remote endpoint is sending them faster than the configured
rate limit.

We rate-limit our responses rather than blocking them entirely or
resetting the connection, because legitimate connections can rely on
dupacks in response to some out-of-window segments. For example, zero
window probes are typically sent with a sequence number that is below
the current window, and ZWPs thus expect to thus elicit a dupack in
response.

We allow dupacks in response to TCP segments with data, because these
may be spurious retransmissions for which the remote endpoint wants to
receive DSACKs. This is safe because segments with data can't
realistically be part of ACK loops, which by their nature consist of
each side sending pure/data-less ACKs to each other.

The dupack interval is controlled by a new sysctl knob,
tcp_invalid_ratelimit, given in milliseconds, in case an administrator
needs to dial this upward in the face of a high-rate DoS attack. The
name and units are chosen to be analogous to the existing analogous
knob for ICMP, icmp_ratelimit.

The default value for tcp_invalid_ratelimit is 500ms, which allows at
most one such dupack per 500ms. This is chosen to be 2x faster than
the 1-second minimum RTO interval allowed by RFC 6298 (section 2, rule
2.4). We allow the extra 2x factor because network delay variations
can cause packets sent at 1 second intervals to be compressed and
arrive much closer.

Reported-by: Avery Fay <avery@mixpanel.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 22 ++++++++++++++++++++++
 include/net/tcp.h                      | 32 ++++++++++++++++++++++++++++++++
 include/uapi/linux/snmp.h              |  6 ++++++
 net/ipv4/proc.c                        |  6 ++++++
 net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
 net/ipv4/tcp_input.c                   |  1 +
 6 files changed, 74 insertions(+)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a5e4c813f17f..1b8c964b0d17 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -290,6 +290,28 @@ tcp_frto - INTEGER
 
 	By default it's enabled with a non-zero value. 0 disables F-RTO.
 
+tcp_invalid_ratelimit - INTEGER
+	Limit the maximal rate for sending duplicate acknowledgments
+	in response to incoming TCP packets that are for an existing
+	connection but that are invalid due to any of these reasons:
+
+	  (a) out-of-window sequence number,
+	  (b) out-of-window acknowledgment number, or
+	  (c) PAWS (Protection Against Wrapped Sequence numbers) check failure
+
+	This can help mitigate simple "ack loop" DoS attacks, wherein
+	a buggy or malicious middlebox or man-in-the-middle can
+	rewrite TCP header fields in manner that causes each endpoint
+	to think that the other is sending invalid TCP segments, thus
+	causing each side to send an unterminating stream of duplicate
+	acknowledgments for invalid segments.
+
+	Using 0 disables rate-limiting of dupacks in response to
+	invalid segments; otherwise this value specifies the minimal
+	space between sending such dupacks, in milliseconds.
+
+	Default: 500 (milliseconds).
+
 tcp_keepalive_time - INTEGER
 	How often TCP sends out keepalive messages when keepalive is enabled.
 	Default: 2hours.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 28e9bd3abceb..b81f45c67b2e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -274,6 +274,7 @@ extern int sysctl_tcp_challenge_ack_limit;
 extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
+extern int sysctl_tcp_invalid_ratelimit;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -1236,6 +1237,37 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
 	return true;
 }
 
+/* Return true if we're currently rate-limiting out-of-window ACKs and
+ * thus shouldn't send a dupack right now. We rate-limit dupacks in
+ * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
+ * attacks that send repeated SYNs or ACKs for the same connection. To
+ * do this, we do not send a duplicate SYNACK or ACK if the remote
+ * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
+ */
+static inline bool tcp_oow_rate_limited(struct net *net,
+					const struct sk_buff *skb,
+					int mib_idx, u32 *last_oow_ack_time)
+{
+	/* Data packets without SYNs are not likely part of an ACK loop. */
+	if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
+	    !tcp_hdr(skb)->syn)
+		goto not_rate_limited;
+
+	if (*last_oow_ack_time) {
+		s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+		if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+			NET_INC_STATS_BH(net, mib_idx);
+			return true;	/* rate-limited: don't send yet! */
+		}
+	}
+
+	*last_oow_ack_time = tcp_time_stamp;
+
+not_rate_limited:
+	return false;	/* not rate-limited: go ahead, send dupack now! */
+}
+
 static inline void tcp_mib_init(struct net *net)
 {
 	/* See RFC 2012 */
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index b22224100011..6a6fb747c78d 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -270,6 +270,12 @@ enum
 	LINUX_MIB_TCPHYSTARTTRAINCWND,		/* TCPHystartTrainCwnd */
 	LINUX_MIB_TCPHYSTARTDELAYDETECT,	/* TCPHystartDelayDetect */
 	LINUX_MIB_TCPHYSTARTDELAYCWND,		/* TCPHystartDelayCwnd */
+	LINUX_MIB_TCPACKSKIPPEDSYNRECV,		/* TCPACKSkippedSynRecv */
+	LINUX_MIB_TCPACKSKIPPEDPAWS,		/* TCPACKSkippedPAWS */
+	LINUX_MIB_TCPACKSKIPPEDSEQ,		/* TCPACKSkippedSeq */
+	LINUX_MIB_TCPACKSKIPPEDFINWAIT2,	/* TCPACKSkippedFinWait2 */
+	LINUX_MIB_TCPACKSKIPPEDTIMEWAIT,	/* TCPACKSkippedTimeWait */
+	LINUX_MIB_TCPACKSKIPPEDCHALLENGE,	/* TCPACKSkippedChallenge */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8f9cd200ce20..d8953ef0770c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -292,6 +292,12 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND),
 	SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT),
 	SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND),
+	SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV),
+	SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS),
+	SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ),
+	SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
+	SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
+	SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e0ee384a448f..82601a68cf90 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -728,6 +728,13 @@ static struct ctl_table ipv4_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+	{
+		.procname	= "tcp_invalid_ratelimit",
+		.data		= &sysctl_tcp_invalid_ratelimit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
 	{
 		.procname	= "icmp_msgs_per_sec",
 		.data		= &sysctl_icmp_msgs_per_sec,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d3dfff78fa19..9401aa43b814 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -100,6 +100,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
 
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
+int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
-- 
cgit v1.2.3


From a9b2c06dbef48ed31cff1764c5ce824829106f4f Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 6 Feb 2015 16:04:39 -0500
Subject: tcp: mitigate ACK loops for connections as tcp_request_sock

In the SYN_RECV state, where the TCP connection is represented by
tcp_request_sock, we now rate-limit SYNACKs in response to a client's
retransmitted SYNs: we do not send a SYNACK in response to client SYN
if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms)
since we last sent a SYNACK in response to a client's retransmitted
SYN.

This allows the vast majority of legitimate client connections to
proceed unimpeded, even for the most aggressive platforms, iOS and
MacOS, which actually retransmit SYNs 1-second intervals for several
times in a row. They use SYN RTO timeouts following the progression:
1,1,1,1,1,2,4,8,16,32.

Reported-by: Avery Fay <avery@mixpanel.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 1 +
 include/net/tcp.h        | 1 +
 net/ipv4/tcp_minisocks.c | 6 +++++-
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 67309ece0772..bcc828d3b9b9 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -115,6 +115,7 @@ struct tcp_request_sock {
 	u32				rcv_isn;
 	u32				snt_isn;
 	u32				snt_synack; /* synack sent time */
+	u32				last_oow_ack_time; /* last SYNACK */
 	u32				rcv_nxt; /* the ack # by SYNACK. For
 						  * FastOpen it's the seq#
 						  * after data-in-SYN.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index b81f45c67b2e..da4196fb78db 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1145,6 +1145,7 @@ static inline void tcp_openreq_init(struct request_sock *req,
 	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
 	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
 	tcp_rsk(req)->snt_synack = tcp_time_stamp;
+	tcp_rsk(req)->last_oow_ack_time = 0;
 	req->mss = rx_opt->mss_clamp;
 	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
 	ireq->tstamp_ok = rx_opt->tstamp_ok;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index bc9216dc9de1..131aa4950d1c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -605,7 +605,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		 * Reset timer after retransmitting SYNACK, similar to
 		 * the idea of fast retransmit in recovery.
 		 */
-		if (!inet_rtx_syn_ack(sk, req))
+		if (!tcp_oow_rate_limited(sock_net(sk), skb,
+					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
+					  &tcp_rsk(req)->last_oow_ack_time) &&
+
+		    !inet_rtx_syn_ack(sk, req))
 			req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
 					   TCP_RTO_MAX) + jiffies;
 		return NULL;
-- 
cgit v1.2.3


From f2b2c582e82429270d5818fbabe653f4359d7024 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 6 Feb 2015 16:04:40 -0500
Subject: tcp: mitigate ACK loops for connections as tcp_sock

Ensure that in state ESTABLISHED, where the connection is represented
by a tcp_sock, we rate limit dupacks in response to incoming packets
(a) with TCP timestamps that fail PAWS checks, or (b) with sequence
numbers or ACK numbers that are out of the acceptable window.

We do not send a dupack in response to out-of-window packets if it has
been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we
last sent a dupack in response to an out-of-window packet.

There is already a similar (although global) rate-limiting mechanism
for "challenge ACKs". When deciding whether to send a challence ACK,
we first consult the new per-connection rate limit, and then the
global rate limit.

Reported-by: Avery Fay <avery@mixpanel.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  1 +
 net/ipv4/tcp_input.c     | 29 ++++++++++++++++++++++-------
 net/ipv4/tcp_minisocks.c |  1 +
 3 files changed, 24 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index bcc828d3b9b9..66d85a80a1ec 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -153,6 +153,7 @@ struct tcp_sock {
  	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
 	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */
+	u32	last_oow_ack_time;  /* timestamp of last out-of-window ACK */
 
 	u32	tsoffset;	/* timestamp offset */
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9401aa43b814..8fdd27b17306 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3322,13 +3322,22 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 }
 
 /* RFC 5961 7 [ACK Throttling] */
-static void tcp_send_challenge_ack(struct sock *sk)
+static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
 {
 	/* unprotected vars, we dont care of overwrites */
 	static u32 challenge_timestamp;
 	static unsigned int challenge_count;
-	u32 now = jiffies / HZ;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 now;
+
+	/* First check our per-socket dupack rate limit. */
+	if (tcp_oow_rate_limited(sock_net(sk), skb,
+				 LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
+				 &tp->last_oow_ack_time))
+		return;
 
+	/* Then check the check host-wide RFC 5961 rate limit. */
+	now = jiffies / HZ;
 	if (now != challenge_timestamp) {
 		challenge_timestamp = now;
 		challenge_count = 0;
@@ -3424,7 +3433,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (before(ack, prior_snd_una)) {
 		/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
 		if (before(ack, prior_snd_una - tp->max_window)) {
-			tcp_send_challenge_ack(sk);
+			tcp_send_challenge_ack(sk, skb);
 			return -1;
 		}
 		goto old_ack;
@@ -4993,7 +5002,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 	    tcp_paws_discard(sk, skb)) {
 		if (!th->rst) {
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
-			tcp_send_dupack(sk, skb);
+			if (!tcp_oow_rate_limited(sock_net(sk), skb,
+						  LINUX_MIB_TCPACKSKIPPEDPAWS,
+						  &tp->last_oow_ack_time))
+				tcp_send_dupack(sk, skb);
 			goto discard;
 		}
 		/* Reset is accepted even if it did not pass PAWS. */
@@ -5010,7 +5022,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 		if (!th->rst) {
 			if (th->syn)
 				goto syn_challenge;
-			tcp_send_dupack(sk, skb);
+			if (!tcp_oow_rate_limited(sock_net(sk), skb,
+						  LINUX_MIB_TCPACKSKIPPEDSEQ,
+						  &tp->last_oow_ack_time))
+				tcp_send_dupack(sk, skb);
 		}
 		goto discard;
 	}
@@ -5026,7 +5041,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
 			tcp_reset(sk);
 		else
-			tcp_send_challenge_ack(sk);
+			tcp_send_challenge_ack(sk, skb);
 		goto discard;
 	}
 
@@ -5040,7 +5055,7 @@ syn_challenge:
 		if (syn_inerr)
 			TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
-		tcp_send_challenge_ack(sk);
+		tcp_send_challenge_ack(sk, skb);
 		goto discard;
 	}
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 131aa4950d1c..98a840561ec8 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -467,6 +467,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		tcp_enable_early_retrans(newtp);
 		newtp->tlp_high_seq = 0;
 		newtp->lsndtime = treq->snt_synack;
+		newtp->last_oow_ack_time = 0;
 		newtp->total_retrans = req->num_retrans;
 
 		/* So many TCP implementations out there (incorrectly) count the
-- 
cgit v1.2.3


From 4fb17a6091674f469e8ac85dc770fbf9a9ba7cc8 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 6 Feb 2015 16:04:41 -0500
Subject: tcp: mitigate ACK loops for connections as tcp_timewait_sock

Ensure that in state FIN_WAIT2 or TIME_WAIT, where the connection is
represented by a tcp_timewait_sock, we rate limit dupacks in response
to incoming packets (a) with TCP timestamps that fail PAWS checks, or
(b) with sequence numbers that are out of the acceptable window.

We do not send a dupack in response to out-of-window packets if it has
been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we
last sent a dupack in response to an out-of-window packet.

Reported-by: Avery Fay <avery@mixpanel.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  4 ++++
 net/ipv4/tcp_minisocks.c | 29 ++++++++++++++++++++++++-----
 2 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 66d85a80a1ec..1a7adb411647 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -342,6 +342,10 @@ struct tcp_timewait_sock {
 	u32			  tw_rcv_wnd;
 	u32			  tw_ts_offset;
 	u32			  tw_ts_recent;
+
+	/* The time we sent the last out-of-window ACK: */
+	u32			  tw_last_oow_ack_time;
+
 	long			  tw_ts_recent_stamp;
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	  *tw_md5_key;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 98a840561ec8..dd11ac7798c6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -58,6 +58,25 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 	return seq == e_win && seq == end_seq;
 }
 
+static enum tcp_tw_status
+tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
+				  const struct sk_buff *skb, int mib_idx)
+{
+	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+
+	if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
+				  &tcptw->tw_last_oow_ack_time)) {
+		/* Send ACK. Note, we do not put the bucket,
+		 * it will be released by caller.
+		 */
+		return TCP_TW_ACK;
+	}
+
+	/* We are rate-limiting, so just release the tw sock and drop skb. */
+	inet_twsk_put(tw);
+	return TCP_TW_SUCCESS;
+}
+
 /*
  * * Main purpose of TIME-WAIT state is to close connection gracefully,
  *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -116,7 +135,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
 				   tcptw->tw_rcv_nxt,
 				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
-			return TCP_TW_ACK;
+			return tcp_timewait_check_oow_rate_limit(
+				tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);
 
 		if (th->rst)
 			goto kill;
@@ -250,10 +270,8 @@ kill:
 			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
 					   TCP_TIMEWAIT_LEN);
 
-		/* Send ACK. Note, we do not put the bucket,
-		 * it will be released by caller.
-		 */
-		return TCP_TW_ACK;
+		return tcp_timewait_check_oow_rate_limit(
+			tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
 	}
 	inet_twsk_put(tw);
 	return TCP_TW_SUCCESS;
@@ -289,6 +307,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 		tcptw->tw_ts_offset	= tp->tsoffset;
+		tcptw->tw_last_oow_ack_time = 0;
 
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == PF_INET6) {
-- 
cgit v1.2.3


From 3e97fa7059c19f7cc2566dfb30fe9282f6f1e673 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 6 Feb 2015 17:22:22 +0100
Subject: gre/ipip: use be16 variants of netlink functions

encap.sport and encap.dport are __be16, use nla_{get,put}_be16 instead
of nla_{get,put}_u16.

Fixes the sparse warnings:

warning: incorrect type in assignment (different base types)
   expected restricted __be32 [addressable] [usertype] o_key
   got restricted __be16 [addressable] [usertype] i_flags
warning: incorrect type in assignment (different base types)
   expected restricted __be16 [usertype] sport
   got unsigned short
warning: incorrect type in assignment (different base types)
   expected restricted __be16 [usertype] dport
   got unsigned short
warning: incorrect type in argument 3 (different base types)
   expected unsigned short [unsigned] [usertype] value
   got restricted __be16 [usertype] sport
warning: incorrect type in argument 3 (different base types)
   expected unsigned short [unsigned] [usertype] value
   got restricted __be16 [usertype] dport

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 12 ++++++------
 net/ipv4/ipip.c   | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 6e7727f27393..6207275fc749 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -659,12 +659,12 @@ static bool ipgre_netlink_encap_parms(struct nlattr *data[],
 
 	if (data[IFLA_GRE_ENCAP_SPORT]) {
 		ret = true;
-		ipencap->sport = nla_get_u16(data[IFLA_GRE_ENCAP_SPORT]);
+		ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
 	}
 
 	if (data[IFLA_GRE_ENCAP_DPORT]) {
 		ret = true;
-		ipencap->dport = nla_get_u16(data[IFLA_GRE_ENCAP_DPORT]);
+		ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
 	}
 
 	return ret;
@@ -786,10 +786,10 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 
 	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
 			t->encap.type) ||
-	    nla_put_u16(skb, IFLA_GRE_ENCAP_SPORT,
-			t->encap.sport) ||
-	    nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT,
-			t->encap.dport) ||
+	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
+			 t->encap.sport) ||
+	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
+			 t->encap.dport) ||
 	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
 			t->encap.flags))
 		goto nla_put_failure;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index b58d6689874c..915d215a7d14 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -366,12 +366,12 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[],
 
 	if (data[IFLA_IPTUN_ENCAP_SPORT]) {
 		ret = true;
-		ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]);
+		ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
 	}
 
 	if (data[IFLA_IPTUN_ENCAP_DPORT]) {
 		ret = true;
-		ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]);
+		ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
 	}
 
 	return ret;
@@ -460,10 +460,10 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
 
 	if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
 			tunnel->encap.type) ||
-	    nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT,
-			tunnel->encap.sport) ||
-	    nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT,
-			tunnel->encap.dport) ||
+	    nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
+			 tunnel->encap.sport) ||
+	    nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
+			 tunnel->encap.dport) ||
 	    nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
 			tunnel->encap.flags))
 		goto nla_put_failure;
-- 
cgit v1.2.3


From 567e4b79731c352a17d73c483959f795d3593e03 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 6 Feb 2015 12:59:01 -0800
Subject: net: rfs: add hash collision detection

Receive Flow Steering is a nice solution but suffers from
hash collisions when a mix of connected and unconnected traffic
is received on the host, when flow hash table is populated.

Also, clearing flow in inet_release() makes RFS not very good
for short lived flows, as many packets can follow close().
(FIN , ACK packets, ...)

This patch extends the information stored into global hash table
to not only include cpu number, but upper part of the hash value.

I use a 32bit value, and dynamically split it in two parts.

For host with less than 64 possible cpus, this gives 6 bits for the
cpu number, and 26 (32-6) bits for the upper part of the hash.

Since hash bucket selection use low order bits of the hash, we have
a full hash match, if /proc/sys/net/core/rps_sock_flow_entries is big
enough.

If the hash found in flow table does not match, we fallback to RPS (if
it is enabled for the rxqueue).

This means that a packet for an non connected flow can avoid the
IPI through a unrelated/victim CPU.

This also means we no longer have to clear the table at socket
close time, and this helps short lived flows performance.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c          |  5 +----
 include/linux/netdevice.h  | 34 ++++++++++++++++----------------
 include/net/sock.h         | 24 +----------------------
 net/core/dev.c             | 48 ++++++++++++++++++++++++++--------------------
 net/core/sysctl_net_core.c |  2 +-
 net/ipv4/af_inet.c         |  2 --
 6 files changed, 47 insertions(+), 68 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ad7d3d5f3ee5..857dca47bf80 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -256,7 +256,6 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
 {
 	tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
 		  e->rxhash, e->queue_index);
-	sock_rps_reset_flow_hash(e->rps_rxhash);
 	hlist_del_rcu(&e->hash_link);
 	kfree_rcu(e, rcu);
 	--tun->flow_count;
@@ -373,10 +372,8 @@ unlock:
  */
 static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
 {
-	if (unlikely(e->rps_rxhash != hash)) {
-		sock_rps_reset_flow_hash(e->rps_rxhash);
+	if (unlikely(e->rps_rxhash != hash))
 		e->rps_rxhash = hash;
-	}
 }
 
 /* We try to identify a flow through its rxhash first. The reason that
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ce784d5018e0..ab3b7cef4638 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -644,39 +644,39 @@ struct rps_dev_flow_table {
 /*
  * The rps_sock_flow_table contains mappings of flows to the last CPU
  * on which they were processed by the application (set in recvmsg).
+ * Each entry is a 32bit value. Upper part is the high order bits
+ * of flow hash, lower part is cpu number.
+ * rps_cpu_mask is used to partition the space, depending on number of
+ * possible cpus : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
+ * For example, if 64 cpus are possible, rps_cpu_mask = 0x3f,
+ * meaning we use 32-6=26 bits for the hash.
  */
 struct rps_sock_flow_table {
-	unsigned int mask;
-	u16 ents[0];
+	u32	mask;
+	u32	ents[0];
 };
-#define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \
-    ((_num) * sizeof(u16)))
+#define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
 
 #define RPS_NO_CPU 0xffff
 
+extern u32 rps_cpu_mask;
+extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+
 static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
 					u32 hash)
 {
 	if (table && hash) {
-		unsigned int cpu, index = hash & table->mask;
+		unsigned int index = hash & table->mask;
+		u32 val = hash & ~rps_cpu_mask;
 
 		/* We only give a hint, preemption can change cpu under us */
-		cpu = raw_smp_processor_id();
+		val |= raw_smp_processor_id();
 
-		if (table->ents[index] != cpu)
-			table->ents[index] = cpu;
+		if (table->ents[index] != val)
+			table->ents[index] = val;
 	}
 }
 
-static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
-				       u32 hash)
-{
-	if (table && hash)
-		table->ents[hash & table->mask] = RPS_NO_CPU;
-}
-
-extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
-
 #ifdef CONFIG_RFS_ACCEL
 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
 			 u16 filter_id);
diff --git a/include/net/sock.h b/include/net/sock.h
index d28b8fededd6..e13824570b0f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -857,18 +857,6 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 #endif
 }
 
-static inline void sock_rps_reset_flow_hash(__u32 hash)
-{
-#ifdef CONFIG_RPS
-	struct rps_sock_flow_table *sock_flow_table;
-
-	rcu_read_lock();
-	sock_flow_table = rcu_dereference(rps_sock_flow_table);
-	rps_reset_sock_flow(sock_flow_table, hash);
-	rcu_read_unlock();
-#endif
-}
-
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
@@ -876,28 +864,18 @@ static inline void sock_rps_record_flow(const struct sock *sk)
 #endif
 }
 
-static inline void sock_rps_reset_flow(const struct sock *sk)
-{
-#ifdef CONFIG_RPS
-	sock_rps_reset_flow_hash(sk->sk_rxhash);
-#endif
-}
-
 static inline void sock_rps_save_rxhash(struct sock *sk,
 					const struct sk_buff *skb)
 {
 #ifdef CONFIG_RPS
-	if (unlikely(sk->sk_rxhash != skb->hash)) {
-		sock_rps_reset_flow(sk);
+	if (unlikely(sk->sk_rxhash != skb->hash))
 		sk->sk_rxhash = skb->hash;
-	}
 #endif
 }
 
 static inline void sock_rps_reset_rxhash(struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	sock_rps_reset_flow(sk);
 	sk->sk_rxhash = 0;
 #endif
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index a3a96ffc67f4..8be38675e1a8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3030,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 /* One global table that all flow-based protocols share. */
 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
+u32 rps_cpu_mask __read_mostly;
+EXPORT_SYMBOL(rps_cpu_mask);
 
 struct static_key rps_needed __read_mostly;
 
@@ -3086,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		       struct rps_dev_flow **rflowp)
 {
-	struct netdev_rx_queue *rxqueue;
-	struct rps_map *map;
+	const struct rps_sock_flow_table *sock_flow_table;
+	struct netdev_rx_queue *rxqueue = dev->_rx;
 	struct rps_dev_flow_table *flow_table;
-	struct rps_sock_flow_table *sock_flow_table;
+	struct rps_map *map;
 	int cpu = -1;
-	u16 tcpu;
+	u32 tcpu;
 	u32 hash;
 
 	if (skb_rx_queue_recorded(skb)) {
 		u16 index = skb_get_rx_queue(skb);
+
 		if (unlikely(index >= dev->real_num_rx_queues)) {
 			WARN_ONCE(dev->real_num_rx_queues > 1,
 				  "%s received packet on queue %u, but number "
@@ -3103,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 				  dev->name, index, dev->real_num_rx_queues);
 			goto done;
 		}
-		rxqueue = dev->_rx + index;
-	} else
-		rxqueue = dev->_rx;
+		rxqueue += index;
+	}
 
+	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
+
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	map = rcu_dereference(rxqueue->rps_map);
-	if (map) {
-		if (map->len == 1 &&
-		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
-			tcpu = map->cpus[0];
-			if (cpu_online(tcpu))
-				cpu = tcpu;
-			goto done;
-		}
-	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
+	if (!flow_table && !map)
 		goto done;
-	}
 
 	skb_reset_network_header(skb);
 	hash = skb_get_hash(skb);
 	if (!hash)
 		goto done;
 
-	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {
-		u16 next_cpu;
 		struct rps_dev_flow *rflow;
+		u32 next_cpu;
+		u32 ident;
+
+		/* First check into global flow table if there is a match */
+		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
+		if ((ident ^ hash) & ~rps_cpu_mask)
+			goto try_rps;
 
+		next_cpu = ident & rps_cpu_mask;
+
+		/* OK, now we know there is a match,
+		 * we can look at the local (per receive queue) flow table
+		 */
 		rflow = &flow_table->flows[hash & flow_table->mask];
 		tcpu = rflow->cpu;
 
-		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
-
 		/*
 		 * If the desired CPU (where last recvmsg was done) is
 		 * different from current CPU (one in the rx-queue flow
@@ -3162,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		}
 	}
 
+try_rps:
+
 	if (map) {
 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 		if (cpu_online(tcpu)) {
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index fde21d19e61b..7a31be5e361f 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -65,7 +65,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 					mutex_unlock(&sock_flow_mutex);
 					return -ENOMEM;
 				}
-
+				rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
 				sock_table->mask = size - 1;
 			} else
 				sock_table = orig_sock_table;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a44773c8346c..d2e49baaff63 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -395,8 +395,6 @@ int inet_release(struct socket *sock)
 	if (sk) {
 		long timeout;
 
-		sock_rps_reset_flow(sk);
-
 		/* Applications forget to leave groups before exiting */
 		ip_mc_drop_socket(sk);
 
-- 
cgit v1.2.3


From 531c94a9681b8c253fd0490a4ca8bbe01a38c78b Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 9 Feb 2015 12:35:23 -0800
Subject: tcp: don't include Fast Open option in SYN-ACK on pure SYN-data

If a server has enabled Fast Open and it receives a pure SYN-data
packet (without a Fast Open option), it won't accept the data but it
incorrectly returns a SYN-ACK with a Fast Open cookie and also
increments the SNMP stat LINUX_MIB_TCPFASTOPENPASSIVEFAIL.

This patch makes the server include a Fast Open cookie in SYN-ACK
only if the SYN has some Fast Open option (i.e., when client
requests or presents a cookie).

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_fastopen.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 815c85e3b1e0..53db2c309572 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -255,6 +255,9 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
 	bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
 
+	if (foc->len == 0) /* Client requests a cookie */
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+
 	if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
 	      (syn_data || foc->len >= 0) &&
 	      tcp_fastopen_queue_check(sk))) {
@@ -265,7 +268,8 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 	if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
 		goto fastopen;
 
-	if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
+	if (foc->len >= 0 &&  /* Client presents or requests a cookie */
+	    tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
 	    foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
 	    foc->len == valid_foc.len &&
 	    !memcmp(foc->val, valid_foc.val, foc->len)) {
@@ -284,11 +288,10 @@ fastopen:
 					 LINUX_MIB_TCPFASTOPENPASSIVE);
 			return true;
 		}
-	}
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+	} else if (foc->len > 0) /* Client presents an invalid cookie */
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
 
-	NET_INC_STATS_BH(sock_net(sk), foc->len ?
-			 LINUX_MIB_TCPFASTOPENPASSIVEFAIL :
-			 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
 	*foc = valid_foc;
 	return false;
 }
-- 
cgit v1.2.3


From b0f9ca53cbb103e9240a29a974e0b6085e58f9f7 Mon Sep 17 00:00:00 2001
From: Fan Du <fan.du@intel.com>
Date: Tue, 10 Feb 2015 09:53:16 +0800
Subject: ipv4: Namespecify TCP PMTU mechanism

Packetization Layer Path MTU Discovery works separately beside
Path MTU Discovery at IP level, different net namespace has
various requirements on which one to chose, e.g., a virutalized
container instance would require TCP PMTU to probe an usable
effective mtu for underlying tunnel, while the host would
employ classical ICMP based PMTU to function.

Hence making TCP PMTU mechanism per net namespace to decouple
two functionality. Furthermore the probe base MSS should also
be configured separately for each namespace.

Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  2 ++
 include/net/tcp.h          |  2 --
 net/ipv4/sysctl_net_ipv4.c | 28 ++++++++++++++--------------
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_output.c      |  8 +++-----
 net/ipv4/tcp_timer.c       |  7 +++++--
 6 files changed, 25 insertions(+), 23 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index e0bdcb147326..dbe225478adb 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -82,6 +82,8 @@ struct netns_ipv4 {
 
 	int sysctl_fwmark_reflect;
 	int sysctl_tcp_fwmark_accept;
+	int sysctl_tcp_mtu_probing;
+	int sysctl_tcp_base_mss;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index da4196fb78db..8d6b983d5099 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -262,8 +262,6 @@ extern int sysctl_tcp_low_latency;
 extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
-extern int sysctl_tcp_mtu_probing;
-extern int sysctl_tcp_base_mss;
 extern int sysctl_tcp_workaround_signed_windows;
 extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_thin_linear_timeouts;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 82601a68cf90..d151539da8e6 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -603,20 +603,6 @@ static struct ctl_table ipv4_table[] = {
 		.maxlen		= TCP_CA_NAME_MAX,
 		.proc_handler	= proc_tcp_congestion_control,
 	},
-	{
-		.procname	= "tcp_mtu_probing",
-		.data		= &sysctl_tcp_mtu_probing,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "tcp_base_mss",
-		.data		= &sysctl_tcp_base_mss,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "tcp_workaround_signed_windows",
 		.data		= &sysctl_tcp_workaround_signed_windows,
@@ -883,6 +869,20 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "tcp_mtu_probing",
+		.data		= &init_net.ipv4.sysctl_tcp_mtu_probing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_base_mss",
+		.data		= &init_net.ipv4.sysctl_tcp_base_mss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 67bc95fb5d9e..5a2dfed4783b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2459,6 +2459,7 @@ static int __net_init tcp_sk_init(struct net *net)
 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
 	}
 	net->ipv4.sysctl_tcp_ecn = 2;
+	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
 	return 0;
 
 fail:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4fcc9a768849..a2a796c5536b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,9 +59,6 @@ int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
  */
 int sysctl_tcp_tso_win_divisor __read_mostly = 3;
 
-int sysctl_tcp_mtu_probing __read_mostly = 0;
-int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
-
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
@@ -1350,11 +1347,12 @@ void tcp_mtup_init(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct net *net = sock_net(sk);
 
-	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
+	icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
 	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
 			       icsk->icsk_af_ops->net_header_len;
-	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
+	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
 	icsk->icsk_mtup.probe_size = 0;
 }
 EXPORT_SYMBOL(tcp_mtup_init);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 1829c7fbc77e..0732b787904e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -101,17 +101,20 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
 
 static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
 {
+	struct net *net = sock_net(sk);
+
 	/* Black hole detection */
-	if (sysctl_tcp_mtu_probing) {
+	if (net->ipv4.sysctl_tcp_mtu_probing) {
 		if (!icsk->icsk_mtup.enabled) {
 			icsk->icsk_mtup.enabled = 1;
 			tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		} else {
+			struct net *net = sock_net(sk);
 			struct tcp_sock *tp = tcp_sk(sk);
 			int mss;
 
 			mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
-			mss = min(sysctl_tcp_base_mss, mss);
+			mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
 			mss = max(mss, 68 - tp->tcp_header_len);
 			icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
 			tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
-- 
cgit v1.2.3