From d4546c2509b1e9cd082e3682dcec98472e37ee5a Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Sun, 24 Jun 2018 14:13:49 +0900
Subject: net: Convert GRO SKB handling to list_head.

Manage pending per-NAPI GRO packets via list_head.

Return an SKB pointer from the GRO receive handlers.  When GRO receive
handlers return non-NULL, it means that this SKB needs to be completed
at this time and removed from the NAPI queue.

Several operations are greatly simplified by this transformation,
especially timing out the oldest SKB in the list when gro_count
exceeds MAX_GRO_SKBS, and napi_gro_flush() which walks the queue
in reverse order.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 68 ++++++++++++++++++++++++----------------------------------
 1 file changed, 28 insertions(+), 40 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index a5aa1c7444e6..aa61b9344b46 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4881,36 +4881,25 @@ out:
  */
 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 {
-	struct sk_buff *skb, *prev = NULL;
-
-	/* scan list and build reverse chain */
-	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
-		skb->prev = prev;
-		prev = skb;
-	}
-
-	for (skb = prev; skb; skb = prev) {
-		skb->next = NULL;
+	struct sk_buff *skb, *p;
 
+	list_for_each_entry_safe_reverse(skb, p, &napi->gro_list, list) {
 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 			return;
-
-		prev = skb->prev;
+		list_del_init(&skb->list);
 		napi_gro_complete(skb);
 		napi->gro_count--;
 	}
-
-	napi->gro_list = NULL;
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 {
-	struct sk_buff *p;
 	unsigned int maclen = skb->dev->hard_header_len;
 	u32 hash = skb_get_hash_raw(skb);
+	struct sk_buff *p;
 
-	for (p = napi->gro_list; p; p = p->next) {
+	list_for_each_entry(p, &napi->gro_list, list) {
 		unsigned long diffs;
 
 		NAPI_GRO_CB(p)->flush = 0;
@@ -4977,12 +4966,12 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 
 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
-	struct sk_buff **pp = NULL;
+	struct list_head *head = &offload_base;
 	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
-	struct list_head *head = &offload_base;
-	int same_flow;
+	struct sk_buff *pp = NULL;
 	enum gro_result ret;
+	int same_flow;
 	int grow;
 
 	if (netif_elide_gro(skb->dev))
@@ -5039,11 +5028,8 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 
 	if (pp) {
-		struct sk_buff *nskb = *pp;
-
-		*pp = nskb->next;
-		nskb->next = NULL;
-		napi_gro_complete(nskb);
+		list_del_init(&pp->list);
+		napi_gro_complete(pp);
 		napi->gro_count--;
 	}
 
@@ -5054,15 +5040,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		goto normal;
 
 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
-		struct sk_buff *nskb = napi->gro_list;
+		struct sk_buff *nskb;
 
-		/* locate the end of the list to select the 'oldest' flow */
-		while (nskb->next) {
-			pp = &nskb->next;
-			nskb = *pp;
-		}
-		*pp = NULL;
-		nskb->next = NULL;
+		nskb = list_last_entry(&napi->gro_list, struct sk_buff, list);
+		list_del(&nskb->list);
 		napi_gro_complete(nskb);
 	} else {
 		napi->gro_count++;
@@ -5071,8 +5052,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	NAPI_GRO_CB(skb)->age = jiffies;
 	NAPI_GRO_CB(skb)->last = skb;
 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
-	skb->next = napi->gro_list;
-	napi->gro_list = skb;
+	list_add(&skb->list, &napi->gro_list);
 	ret = GRO_HELD;
 
 pull:
@@ -5478,7 +5458,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 				 NAPIF_STATE_IN_BUSY_POLL)))
 		return false;
 
-	if (n->gro_list) {
+	if (!list_empty(&n->gro_list)) {
 		unsigned long timeout = 0;
 
 		if (work_done)
@@ -5687,7 +5667,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 	 */
-	if (napi->gro_list && !napi_disable_pending(napi) &&
+	if (!list_empty(&napi->gro_list) && !napi_disable_pending(napi) &&
 	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 		__napi_schedule_irqoff(napi);
 
@@ -5701,7 +5681,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
 	napi->gro_count = 0;
-	napi->gro_list = NULL;
+	INIT_LIST_HEAD(&napi->gro_list);
 	napi->skb = NULL;
 	napi->poll = poll;
 	if (weight > NAPI_POLL_WEIGHT)
@@ -5734,6 +5714,14 @@ void napi_disable(struct napi_struct *n)
 }
 EXPORT_SYMBOL(napi_disable);
 
+static void gro_list_free(struct list_head *head)
+{
+	struct sk_buff *skb, *p;
+
+	list_for_each_entry_safe(skb, p, head, list)
+		kfree_skb(skb);
+}
+
 /* Must be called in process context */
 void netif_napi_del(struct napi_struct *napi)
 {
@@ -5743,8 +5731,8 @@ void netif_napi_del(struct napi_struct *napi)
 	list_del_init(&napi->dev_list);
 	napi_free_frags(napi);
 
-	kfree_skb_list(napi->gro_list);
-	napi->gro_list = NULL;
+	gro_list_free(&napi->gro_list);
+	INIT_LIST_HEAD(&napi->gro_list);
 	napi->gro_count = 0;
 }
 EXPORT_SYMBOL(netif_napi_del);
@@ -5787,7 +5775,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 		goto out_unlock;
 	}
 
-	if (n->gro_list) {
+	if (!list_empty(&n->gro_list)) {
 		/* flush too old packets
 		 * If HZ < 1000, flush all packets.
 		 */
-- 
cgit v1.2.3


From 07d78363dcffd9cb1bf6f06a6cac0e0847f3c1de Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Sun, 24 Jun 2018 14:14:02 +0900
Subject: net: Convert NAPI gro list into a small hash table.

Improve the performance of GRO receive by splitting flows into
multiple hash chains.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |   3 +-
 net/core/dev.c            | 105 ++++++++++++++++++++++++++++++++++------------
 2 files changed, 81 insertions(+), 27 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f176d9873910..c6b377a15869 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -305,6 +305,7 @@ int __init netdev_boot_setup(char *str);
 /*
  * Structure for NAPI scheduling similar to tasklet but with weighting
  */
+#define GRO_HASH_BUCKETS	8
 struct napi_struct {
 	/* The poll_list must only be managed by the entity which
 	 * changes the state of the NAPI_STATE_SCHED bit.  This means
@@ -322,7 +323,7 @@ struct napi_struct {
 	int			poll_owner;
 #endif
 	struct net_device	*dev;
-	struct list_head	gro_list;
+	struct list_head	gro_hash[GRO_HASH_BUCKETS];
 	struct sk_buff		*skb;
 	struct hrtimer		timer;
 	struct list_head	dev_list;
diff --git a/net/core/dev.c b/net/core/dev.c
index aa61b9344b46..dffed642e686 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4875,15 +4875,12 @@ out:
 	return netif_receive_skb_internal(skb);
 }
 
-/* napi->gro_list contains packets ordered by age.
- * youngest packets at the head of it.
- * Complete skbs in reverse order to reduce latencies.
- */
-void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *head,
+				   bool flush_old)
 {
 	struct sk_buff *skb, *p;
 
-	list_for_each_entry_safe_reverse(skb, p, &napi->gro_list, list) {
+	list_for_each_entry_safe_reverse(skb, p, head, list) {
 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 			return;
 		list_del_init(&skb->list);
@@ -4891,15 +4888,33 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 		napi->gro_count--;
 	}
 }
+
+/* napi->gro_hash contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+	int i;
+
+	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+		struct list_head *head = &napi->gro_hash[i];
+
+		__napi_gro_flush_chain(napi, head, flush_old);
+	}
+}
 EXPORT_SYMBOL(napi_gro_flush);
 
-static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
+static struct list_head *gro_list_prepare(struct napi_struct *napi,
+					  struct sk_buff *skb)
 {
 	unsigned int maclen = skb->dev->hard_header_len;
 	u32 hash = skb_get_hash_raw(skb);
+	struct list_head *head;
 	struct sk_buff *p;
 
-	list_for_each_entry(p, &napi->gro_list, list) {
+	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)];
+	list_for_each_entry(p, head, list) {
 		unsigned long diffs;
 
 		NAPI_GRO_CB(p)->flush = 0;
@@ -4922,6 +4937,8 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 				       maclen);
 		NAPI_GRO_CB(p)->same_flow = !diffs;
 	}
+
+	return head;
 }
 
 static void skb_gro_reset_offset(struct sk_buff *skb)
@@ -4964,11 +4981,45 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 	}
 }
 
+static void gro_flush_oldest(struct napi_struct *napi)
+{
+	struct sk_buff *oldest = NULL;
+	unsigned long age = jiffies;
+	int i;
+
+	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+		struct list_head *head = &napi->gro_hash[i];
+		struct sk_buff *skb;
+
+		if (list_empty(head))
+			continue;
+
+		skb = list_last_entry(head, struct sk_buff, list);
+		if (!oldest || time_before(NAPI_GRO_CB(skb)->age, age)) {
+			oldest = skb;
+			age = NAPI_GRO_CB(skb)->age;
+		}
+	}
+
+	/* We are called with napi->gro_count >= MAX_GRO_SKBS, so this is
+	 * impossible.
+	 */
+	if (WARN_ON_ONCE(!oldest))
+		return;
+
+	/* Do not adjust napi->gro_count, caller is adding a new SKB to
+	 * the chain.
+	 */
+	list_del(&oldest->list);
+	napi_gro_complete(oldest);
+}
+
 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct list_head *head = &offload_base;
 	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
+	struct list_head *gro_head;
 	struct sk_buff *pp = NULL;
 	enum gro_result ret;
 	int same_flow;
@@ -4977,7 +5028,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (netif_elide_gro(skb->dev))
 		goto normal;
 
-	gro_list_prepare(napi, skb);
+	gro_head = gro_list_prepare(napi, skb);
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
@@ -5011,7 +5062,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 			NAPI_GRO_CB(skb)->csum_valid = 0;
 		}
 
-		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
+		pp = ptype->callbacks.gro_receive(gro_head, skb);
 		break;
 	}
 	rcu_read_unlock();
@@ -5040,11 +5091,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		goto normal;
 
 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
-		struct sk_buff *nskb;
-
-		nskb = list_last_entry(&napi->gro_list, struct sk_buff, list);
-		list_del(&nskb->list);
-		napi_gro_complete(nskb);
+		gro_flush_oldest(napi);
 	} else {
 		napi->gro_count++;
 	}
@@ -5052,7 +5099,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	NAPI_GRO_CB(skb)->age = jiffies;
 	NAPI_GRO_CB(skb)->last = skb;
 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
-	list_add(&skb->list, &napi->gro_list);
+	list_add(&skb->list, gro_head);
 	ret = GRO_HELD;
 
 pull:
@@ -5458,7 +5505,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 				 NAPIF_STATE_IN_BUSY_POLL)))
 		return false;
 
-	if (!list_empty(&n->gro_list)) {
+	if (n->gro_count) {
 		unsigned long timeout = 0;
 
 		if (work_done)
@@ -5667,7 +5714,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 	 */
-	if (!list_empty(&napi->gro_list) && !napi_disable_pending(napi) &&
+	if (napi->gro_count && !napi_disable_pending(napi) &&
 	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 		__napi_schedule_irqoff(napi);
 
@@ -5677,11 +5724,14 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
+	int i;
+
 	INIT_LIST_HEAD(&napi->poll_list);
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
 	napi->gro_count = 0;
-	INIT_LIST_HEAD(&napi->gro_list);
+	for (i = 0; i < GRO_HASH_BUCKETS; i++)
+		INIT_LIST_HEAD(&napi->gro_hash[i]);
 	napi->skb = NULL;
 	napi->poll = poll;
 	if (weight > NAPI_POLL_WEIGHT)
@@ -5714,12 +5764,16 @@ void napi_disable(struct napi_struct *n)
 }
 EXPORT_SYMBOL(napi_disable);
 
-static void gro_list_free(struct list_head *head)
+static void flush_gro_hash(struct napi_struct *napi)
 {
-	struct sk_buff *skb, *p;
+	int i;
 
-	list_for_each_entry_safe(skb, p, head, list)
-		kfree_skb(skb);
+	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+		struct sk_buff *skb, *n;
+
+		list_for_each_entry_safe(skb, n, &napi->gro_hash[i], list)
+			kfree_skb(skb);
+	}
 }
 
 /* Must be called in process context */
@@ -5731,8 +5785,7 @@ void netif_napi_del(struct napi_struct *napi)
 	list_del_init(&napi->dev_list);
 	napi_free_frags(napi);
 
-	gro_list_free(&napi->gro_list);
-	INIT_LIST_HEAD(&napi->gro_list);
+	flush_gro_hash(napi);
 	napi->gro_count = 0;
 }
 EXPORT_SYMBOL(netif_napi_del);
@@ -5775,7 +5828,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 		goto out_unlock;
 	}
 
-	if (!list_empty(&n->gro_list)) {
+	if (n->gro_count) {
 		/* flush too old packets
 		 * If HZ < 1000, flush all packets.
 		 */
-- 
cgit v1.2.3


From 80d19669ecd34423e85ca04f2210b0e42a47cb16 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Fri, 29 Jun 2018 21:26:41 -0700
Subject: net: Refactor XPS for CPUs and Rx queues

Refactor XPS code to support Tx queue selection based on
CPU(s) map or Rx queue(s) map.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cpumask.h   |  11 ++-
 include/linux/netdevice.h |  98 ++++++++++++++++++++-
 net/core/dev.c            | 211 ++++++++++++++++++++++++++++++----------------
 net/core/net-sysfs.c      |   4 +-
 4 files changed, 244 insertions(+), 80 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index bf53d893ad02..57f20a0a7794 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask;
 #define cpu_active(cpu)		((cpu) == 0)
 #endif
 
-/* verify cpu argument to cpumask_* operators */
-static inline unsigned int cpumask_check(unsigned int cpu)
+static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
 {
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
-	WARN_ON_ONCE(cpu >= nr_cpumask_bits);
+	WARN_ON_ONCE(cpu >= bits);
 #endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+}
+
+/* verify cpu argument to cpumask_* operators */
+static inline unsigned int cpumask_check(unsigned int cpu)
+{
+	cpu_max_bits_warn(cpu, nr_cpumask_bits);
 	return cpu;
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c6b377a15869..8bf8d6149f79 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -731,10 +731,15 @@ struct xps_map {
  */
 struct xps_dev_maps {
 	struct rcu_head rcu;
-	struct xps_map __rcu *cpu_map[0];
+	struct xps_map __rcu *attr_map[0]; /* Either CPUs map or RXQs map */
 };
-#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +		\
+
+#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +	\
 	(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
+
+#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
+	(_rxqs * (_tcs) * sizeof(struct xps_map *)))
+
 #endif /* CONFIG_XPS */
 
 #define TC_MAX_QUEUE	16
@@ -1910,7 +1915,8 @@ struct net_device {
 	int			watchdog_timeo;
 
 #ifdef CONFIG_XPS
-	struct xps_dev_maps __rcu *xps_maps;
+	struct xps_dev_maps __rcu *xps_cpus_map;
+	struct xps_dev_maps __rcu *xps_rxqs_map;
 #endif
 #ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc __rcu	*miniq_egress;
@@ -3259,6 +3265,92 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
 #ifdef CONFIG_XPS
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 			u16 index);
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+			  u16 index, bool is_rxqs_map);
+
+/**
+ *	netif_attr_test_mask - Test a CPU or Rx queue set in a mask
+ *	@j: CPU/Rx queue index
+ *	@mask: bitmask of all cpus/rx queues
+ *	@nr_bits: number of bits in the bitmask
+ *
+ * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.
+ */
+static inline bool netif_attr_test_mask(unsigned long j,
+					const unsigned long *mask,
+					unsigned int nr_bits)
+{
+	cpu_max_bits_warn(j, nr_bits);
+	return test_bit(j, mask);
+}
+
+/**
+ *	netif_attr_test_online - Test for online CPU/Rx queue
+ *	@j: CPU/Rx queue index
+ *	@online_mask: bitmask for CPUs/Rx queues that are online
+ *	@nr_bits: number of bits in the bitmask
+ *
+ * Returns true if a CPU/Rx queue is online.
+ */
+static inline bool netif_attr_test_online(unsigned long j,
+					  const unsigned long *online_mask,
+					  unsigned int nr_bits)
+{
+	cpu_max_bits_warn(j, nr_bits);
+
+	if (online_mask)
+		return test_bit(j, online_mask);
+
+	return (j < nr_bits);
+}
+
+/**
+ *	netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask
+ *	@n: CPU/Rx queue index
+ *	@srcp: the cpumask/Rx queue mask pointer
+ *	@nr_bits: number of bits in the bitmask
+ *
+ * Returns >= nr_bits if no further CPUs/Rx queues set.
+ */
+static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp,
+					       unsigned int nr_bits)
+{
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpu_max_bits_warn(n, nr_bits);
+
+	if (srcp)
+		return find_next_bit(srcp, nr_bits, n + 1);
+
+	return n + 1;
+}
+
+/**
+ *	netif_attrmask_next_and - get the next CPU/Rx queue in *src1p & *src2p
+ *	@n: CPU/Rx queue index
+ *	@src1p: the first CPUs/Rx queues mask pointer
+ *	@src2p: the second CPUs/Rx queues mask pointer
+ *	@nr_bits: number of bits in the bitmask
+ *
+ * Returns >= nr_bits if no further CPUs/Rx queues set in both.
+ */
+static inline int netif_attrmask_next_and(int n, const unsigned long *src1p,
+					  const unsigned long *src2p,
+					  unsigned int nr_bits)
+{
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpu_max_bits_warn(n, nr_bits);
+
+	if (src1p && src2p)
+		return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
+	else if (src1p)
+		return find_next_bit(src1p, nr_bits, n + 1);
+	else if (src2p)
+		return find_next_bit(src2p, nr_bits, n + 1);
+
+	return n + 1;
+}
 #else
 static inline int netif_set_xps_queue(struct net_device *dev,
 				      const struct cpumask *mask,
diff --git a/net/core/dev.c b/net/core/dev.c
index dffed642e686..71059558dc39 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 	int pos;
 
 	if (dev_maps)
-		map = xmap_dereference(dev_maps->cpu_map[tci]);
+		map = xmap_dereference(dev_maps->attr_map[tci]);
 	if (!map)
 		return false;
 
@@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 			break;
 		}
 
-		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
+		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 		kfree_rcu(map, rcu);
 		return false;
 	}
@@ -2135,31 +2135,58 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
 	return active;
 }
 
+static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
+			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
+			   u16 offset, u16 count, bool is_rxqs_map)
+{
+	bool active = false;
+	int i, j;
+
+	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
+	     j < nr_ids;)
+		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
+					       count);
+	if (!active) {
+		if (is_rxqs_map) {
+			RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
+		} else {
+			RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
+
+			for (i = offset + (count - 1); count--; i--)
+				netdev_queue_numa_node_write(
+					netdev_get_tx_queue(dev, i),
+							NUMA_NO_NODE);
+		}
+		kfree_rcu(dev_maps, rcu);
+	}
+}
+
 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 				   u16 count)
 {
+	const unsigned long *possible_mask = NULL;
 	struct xps_dev_maps *dev_maps;
-	int cpu, i;
-	bool active = false;
+	unsigned int nr_ids;
 
 	mutex_lock(&xps_map_mutex);
-	dev_maps = xmap_dereference(dev->xps_maps);
 
-	if (!dev_maps)
-		goto out_no_maps;
-
-	for_each_possible_cpu(cpu)
-		active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
-					       offset, count);
+	dev_maps = xmap_dereference(dev->xps_rxqs_map);
+	if (dev_maps) {
+		nr_ids = dev->num_rx_queues;
+		clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset,
+			       count, true);
 
-	if (!active) {
-		RCU_INIT_POINTER(dev->xps_maps, NULL);
-		kfree_rcu(dev_maps, rcu);
 	}
 
-	for (i = offset + (count - 1); count--; i--)
-		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
-					     NUMA_NO_NODE);
+	dev_maps = xmap_dereference(dev->xps_cpus_map);
+	if (!dev_maps)
+		goto out_no_maps;
+
+	if (num_possible_cpus() > 1)
+		possible_mask = cpumask_bits(cpu_possible_mask);
+	nr_ids = nr_cpu_ids;
+	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
+		       false);
 
 out_no_maps:
 	mutex_unlock(&xps_map_mutex);
@@ -2170,8 +2197,8 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 }
 
-static struct xps_map *expand_xps_map(struct xps_map *map,
-				      int cpu, u16 index)
+static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
+				      u16 index, bool is_rxqs_map)
 {
 	struct xps_map *new_map;
 	int alloc_len = XPS_MIN_MAP_ALLOC;
@@ -2183,7 +2210,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
 		return map;
 	}
 
-	/* Need to add queue to this CPU's existing map */
+	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 	if (map) {
 		if (pos < map->alloc_len)
 			return map;
@@ -2191,9 +2218,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
 		alloc_len = map->alloc_len * 2;
 	}
 
-	/* Need to allocate new map to store queue on this CPU's map */
-	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
-			       cpu_to_node(cpu));
+	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
+	 *  map
+	 */
+	if (is_rxqs_map)
+		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
+	else
+		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+				       cpu_to_node(attr_index));
 	if (!new_map)
 		return NULL;
 
@@ -2205,14 +2237,16 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
 	return new_map;
 }
 
-int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
-			u16 index)
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+			  u16 index, bool is_rxqs_map)
 {
+	const unsigned long *online_mask = NULL, *possible_mask = NULL;
 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
-	int i, cpu, tci, numa_node_id = -2;
+	int i, j, tci, numa_node_id = -2;
 	int maps_sz, num_tc = 1, tc = 0;
 	struct xps_map *map, *new_map;
 	bool active = false;
+	unsigned int nr_ids;
 
 	if (dev->num_tc) {
 		num_tc = dev->num_tc;
@@ -2221,16 +2255,27 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 			return -EINVAL;
 	}
 
-	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
-	if (maps_sz < L1_CACHE_BYTES)
-		maps_sz = L1_CACHE_BYTES;
-
 	mutex_lock(&xps_map_mutex);
+	if (is_rxqs_map) {
+		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
+		dev_maps = xmap_dereference(dev->xps_rxqs_map);
+		nr_ids = dev->num_rx_queues;
+	} else {
+		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
+		if (num_possible_cpus() > 1) {
+			online_mask = cpumask_bits(cpu_online_mask);
+			possible_mask = cpumask_bits(cpu_possible_mask);
+		}
+		dev_maps = xmap_dereference(dev->xps_cpus_map);
+		nr_ids = nr_cpu_ids;
+	}
 
-	dev_maps = xmap_dereference(dev->xps_maps);
+	if (maps_sz < L1_CACHE_BYTES)
+		maps_sz = L1_CACHE_BYTES;
 
 	/* allocate memory for queue storage */
-	for_each_cpu_and(cpu, cpu_online_mask, mask) {
+	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
+	     j < nr_ids;) {
 		if (!new_dev_maps)
 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 		if (!new_dev_maps) {
@@ -2238,73 +2283,81 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 			return -ENOMEM;
 		}
 
-		tci = cpu * num_tc + tc;
-		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
+		tci = j * num_tc + tc;
+		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
 				 NULL;
 
-		map = expand_xps_map(map, cpu, index);
+		map = expand_xps_map(map, j, index, is_rxqs_map);
 		if (!map)
 			goto error;
 
-		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 	}
 
 	if (!new_dev_maps)
 		goto out_no_new_maps;
 
-	for_each_possible_cpu(cpu) {
+	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+	     j < nr_ids;) {
 		/* copy maps belonging to foreign traffic classes */
-		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
+		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
 			/* fill in the new device map from the old device map */
-			map = xmap_dereference(dev_maps->cpu_map[tci]);
-			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+			map = xmap_dereference(dev_maps->attr_map[tci]);
+			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 		}
 
 		/* We need to explicitly update tci as prevous loop
 		 * could break out early if dev_maps is NULL.
 		 */
-		tci = cpu * num_tc + tc;
+		tci = j * num_tc + tc;
 
-		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
-			/* add queue to CPU maps */
+		if (netif_attr_test_mask(j, mask, nr_ids) &&
+		    netif_attr_test_online(j, online_mask, nr_ids)) {
+			/* add tx-queue to CPU/rx-queue maps */
 			int pos = 0;
 
-			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+			map = xmap_dereference(new_dev_maps->attr_map[tci]);
 			while ((pos < map->len) && (map->queues[pos] != index))
 				pos++;
 
 			if (pos == map->len)
 				map->queues[map->len++] = index;
 #ifdef CONFIG_NUMA
-			if (numa_node_id == -2)
-				numa_node_id = cpu_to_node(cpu);
-			else if (numa_node_id != cpu_to_node(cpu))
-				numa_node_id = -1;
+			if (!is_rxqs_map) {
+				if (numa_node_id == -2)
+					numa_node_id = cpu_to_node(j);
+				else if (numa_node_id != cpu_to_node(j))
+					numa_node_id = -1;
+			}
 #endif
 		} else if (dev_maps) {
 			/* fill in the new device map from the old device map */
-			map = xmap_dereference(dev_maps->cpu_map[tci]);
-			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+			map = xmap_dereference(dev_maps->attr_map[tci]);
+			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 		}
 
 		/* copy maps belonging to foreign traffic classes */
 		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
 			/* fill in the new device map from the old device map */
-			map = xmap_dereference(dev_maps->cpu_map[tci]);
-			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+			map = xmap_dereference(dev_maps->attr_map[tci]);
+			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 		}
 	}
 
-	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+	if (is_rxqs_map)
+		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
+	else
+		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
 
 	/* Cleanup old maps */
 	if (!dev_maps)
 		goto out_no_old_maps;
 
-	for_each_possible_cpu(cpu) {
-		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
-			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
-			map = xmap_dereference(dev_maps->cpu_map[tci]);
+	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+	     j < nr_ids;) {
+		for (i = num_tc, tci = j * num_tc; i--; tci++) {
+			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+			map = xmap_dereference(dev_maps->attr_map[tci]);
 			if (map && map != new_map)
 				kfree_rcu(map, rcu);
 		}
@@ -2317,19 +2370,23 @@ out_no_old_maps:
 	active = true;
 
 out_no_new_maps:
-	/* update Tx queue numa node */
-	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
-				     (numa_node_id >= 0) ? numa_node_id :
-				     NUMA_NO_NODE);
+	if (!is_rxqs_map) {
+		/* update Tx queue numa node */
+		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+					     (numa_node_id >= 0) ?
+					     numa_node_id : NUMA_NO_NODE);
+	}
 
 	if (!dev_maps)
 		goto out_no_maps;
 
-	/* removes queue from unused CPUs */
-	for_each_possible_cpu(cpu) {
-		for (i = tc, tci = cpu * num_tc; i--; tci++)
+	/* removes tx-queue from unused CPUs/rx-queues */
+	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+	     j < nr_ids;) {
+		for (i = tc, tci = j * num_tc; i--; tci++)
 			active |= remove_xps_queue(dev_maps, tci, index);
-		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
+		if (!netif_attr_test_mask(j, mask, nr_ids) ||
+		    !netif_attr_test_online(j, online_mask, nr_ids))
 			active |= remove_xps_queue(dev_maps, tci, index);
 		for (i = num_tc - tc, tci++; --i; tci++)
 			active |= remove_xps_queue(dev_maps, tci, index);
@@ -2337,7 +2394,10 @@ out_no_new_maps:
 
 	/* free map if not active */
 	if (!active) {
-		RCU_INIT_POINTER(dev->xps_maps, NULL);
+		if (is_rxqs_map)
+			RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
+		else
+			RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
 		kfree_rcu(dev_maps, rcu);
 	}
 
@@ -2347,11 +2407,12 @@ out_no_maps:
 	return 0;
 error:
 	/* remove any maps that we added */
-	for_each_possible_cpu(cpu) {
-		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
-			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+	     j < nr_ids;) {
+		for (i = num_tc, tci = j * num_tc; i--; tci++) {
+			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 			map = dev_maps ?
-			      xmap_dereference(dev_maps->cpu_map[tci]) :
+			      xmap_dereference(dev_maps->attr_map[tci]) :
 			      NULL;
 			if (new_map && new_map != map)
 				kfree(new_map);
@@ -2363,6 +2424,12 @@ error:
 	kfree(new_dev_maps);
 	return -ENOMEM;
 }
+
+int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
+			u16 index)
+{
+	return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+}
 EXPORT_SYMBOL(netif_set_xps_queue);
 
 #endif
@@ -3384,7 +3451,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 	int queue_index = -1;
 
 	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_maps);
+	dev_maps = rcu_dereference(dev->xps_cpus_map);
 	if (dev_maps) {
 		unsigned int tci = skb->sender_cpu - 1;
 
@@ -3393,7 +3460,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 			tci += netdev_get_prio_tc_map(dev, skb->priority);
 		}
 
-		map = rcu_dereference(dev_maps->cpu_map[tci]);
+		map = rcu_dereference(dev_maps->attr_map[tci]);
 		if (map) {
 			if (map->len == 1)
 				queue_index = map->queues[0];
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index bb7e80f4ced3..b39987c81d53 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
 		return -ENOMEM;
 
 	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_maps);
+	dev_maps = rcu_dereference(dev->xps_cpus_map);
 	if (dev_maps) {
 		for_each_possible_cpu(cpu) {
 			int i, tci = cpu * num_tc + tc;
 			struct xps_map *map;
 
-			map = rcu_dereference(dev_maps->cpu_map[tci]);
+			map = rcu_dereference(dev_maps->attr_map[tci]);
 			if (!map)
 				continue;
 
-- 
cgit v1.2.3


From 04157469b7b848f4a9978b63b1ea2ce62ad3a0a3 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Fri, 29 Jun 2018 21:26:46 -0700
Subject: net: Use static_key for XPS maps

Use static_key for XPS maps to reduce the cost of extra map checks,
similar to how it is used for RPS and RFS. This includes static_key
'xps_needed' for XPS and another for 'xps_rxqs_needed' for XPS using
Rx queues map.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 71059558dc39..43b5575e40c5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2081,6 +2081,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 EXPORT_SYMBOL(netdev_txq_to_tc);
 
 #ifdef CONFIG_XPS
+struct static_key xps_needed __read_mostly;
+EXPORT_SYMBOL(xps_needed);
+struct static_key xps_rxqs_needed __read_mostly;
+EXPORT_SYMBOL(xps_rxqs_needed);
 static DEFINE_MUTEX(xps_map_mutex);
 #define xmap_dereference(P)		\
 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
@@ -2168,14 +2172,18 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 	struct xps_dev_maps *dev_maps;
 	unsigned int nr_ids;
 
-	mutex_lock(&xps_map_mutex);
+	if (!static_key_false(&xps_needed))
+		return;
 
-	dev_maps = xmap_dereference(dev->xps_rxqs_map);
-	if (dev_maps) {
-		nr_ids = dev->num_rx_queues;
-		clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset,
-			       count, true);
+	mutex_lock(&xps_map_mutex);
 
+	if (static_key_false(&xps_rxqs_needed)) {
+		dev_maps = xmap_dereference(dev->xps_rxqs_map);
+		if (dev_maps) {
+			nr_ids = dev->num_rx_queues;
+			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
+				       offset, count, true);
+		}
 	}
 
 	dev_maps = xmap_dereference(dev->xps_cpus_map);
@@ -2189,6 +2197,10 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 		       false);
 
 out_no_maps:
+	if (static_key_enabled(&xps_rxqs_needed))
+		static_key_slow_dec(&xps_rxqs_needed);
+
+	static_key_slow_dec(&xps_needed);
 	mutex_unlock(&xps_map_mutex);
 }
 
@@ -2297,6 +2309,10 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 	if (!new_dev_maps)
 		goto out_no_new_maps;
 
+	static_key_slow_inc(&xps_needed);
+	if (is_rxqs_map)
+		static_key_slow_inc(&xps_rxqs_needed);
+
 	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 	     j < nr_ids;) {
 		/* copy maps belonging to foreign traffic classes */
@@ -3450,6 +3466,9 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 	struct xps_map *map;
 	int queue_index = -1;
 
+	if (!static_key_false(&xps_needed))
+		return -1;
+
 	rcu_read_lock();
 	dev_maps = rcu_dereference(dev->xps_cpus_map);
 	if (dev_maps) {
-- 
cgit v1.2.3


From fc9bab24e9c654f62f3d411fc0b041be9e487e9d Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Fri, 29 Jun 2018 21:27:02 -0700
Subject: net: Enable Tx queue selection based on Rx queues

This patch adds support to pick Tx queue based on the Rx queue(s) map
configuration set by the admin through the sysfs attribute
for each Tx queue. If the user configuration for receive queue(s) map
does not apply, then the Tx queue selection falls back to CPU(s) map
based selection and finally to hashing.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 10 +++++++++
 net/core/dev.c     | 62 +++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 55 insertions(+), 17 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/net/sock.h b/include/net/sock.h
index 2b097cc89727..2ed99bfa4595 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1730,6 +1730,16 @@ static inline void sk_rx_queue_clear(struct sock *sk)
 #endif
 }
 
+#ifdef CONFIG_XPS
+static inline int sk_rx_queue_get(const struct sock *sk)
+{
+	if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING)
+		return sk->sk_rx_queue_mapping;
+
+	return -1;
+}
+#endif
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
 	sk_tx_queue_clear(sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index 43b5575e40c5..08d58e0debe5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3459,35 +3459,63 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 }
 #endif /* CONFIG_NET_EGRESS */
 
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+#ifdef CONFIG_XPS
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+			       struct xps_dev_maps *dev_maps, unsigned int tci)
+{
+	struct xps_map *map;
+	int queue_index = -1;
+
+	if (dev->num_tc) {
+		tci *= dev->num_tc;
+		tci += netdev_get_prio_tc_map(dev, skb->priority);
+	}
+
+	map = rcu_dereference(dev_maps->attr_map[tci]);
+	if (map) {
+		if (map->len == 1)
+			queue_index = map->queues[0];
+		else
+			queue_index = map->queues[reciprocal_scale(
+						skb_get_hash(skb), map->len)];
+		if (unlikely(queue_index >= dev->real_num_tx_queues))
+			queue_index = -1;
+	}
+	return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_XPS
 	struct xps_dev_maps *dev_maps;
-	struct xps_map *map;
+	struct sock *sk = skb->sk;
 	int queue_index = -1;
 
 	if (!static_key_false(&xps_needed))
 		return -1;
 
 	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_cpus_map);
+	if (!static_key_false(&xps_rxqs_needed))
+		goto get_cpus_map;
+
+	dev_maps = rcu_dereference(dev->xps_rxqs_map);
 	if (dev_maps) {
-		unsigned int tci = skb->sender_cpu - 1;
+		int tci = sk_rx_queue_get(sk);
 
-		if (dev->num_tc) {
-			tci *= dev->num_tc;
-			tci += netdev_get_prio_tc_map(dev, skb->priority);
-		}
+		if (tci >= 0 && tci < dev->num_rx_queues)
+			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+							  tci);
+	}
 
-		map = rcu_dereference(dev_maps->attr_map[tci]);
-		if (map) {
-			if (map->len == 1)
-				queue_index = map->queues[0];
-			else
-				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
-									   map->len)];
-			if (unlikely(queue_index >= dev->real_num_tx_queues))
-				queue_index = -1;
+get_cpus_map:
+	if (queue_index < 0) {
+		dev_maps = rcu_dereference(dev->xps_cpus_map);
+		if (dev_maps) {
+			unsigned int tci = skb->sender_cpu - 1;
+
+			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+							  tci);
 		}
 	}
 	rcu_read_unlock();
-- 
cgit v1.2.3


From f6ad8c1bcdf014272d08c55b9469536952a0a771 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 2 Jul 2018 16:12:45 +0100
Subject: net: core: trivial netif_receive_skb_list() entry point

Just calls netif_receive_skb() in a loop.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 64480a0f2c16..f67258f057ca 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3477,6 +3477,7 @@ int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
 int netif_receive_skb_core(struct sk_buff *skb);
+void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
diff --git a/net/core/dev.c b/net/core/dev.c
index 08d58e0debe5..85c456a4b551 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4906,6 +4906,25 @@ int netif_receive_skb(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_receive_skb);
 
+/**
+ *	netif_receive_skb_list - process many receive buffers from network
+ *	@head: list of skbs to process.
+ *
+ *	For now, just calls netif_receive_skb() in a loop, ignoring the
+ *	return value.
+ *
+ *	This function may only be called from softirq context and interrupts
+ *	should be enabled.
+ */
+void netif_receive_skb_list(struct list_head *head)
+{
+	struct sk_buff *skb, *next;
+
+	list_for_each_entry_safe(skb, next, head, list)
+		netif_receive_skb(skb);
+}
+EXPORT_SYMBOL(netif_receive_skb_list);
+
 DEFINE_PER_CPU(struct work_struct, flush_works);
 
 /* Network device is going away, flush any packets still pending */
-- 
cgit v1.2.3


From 920572b73280a29e3a9f58807a8b90051b19ee60 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 2 Jul 2018 16:13:11 +0100
Subject: net: core: unwrap skb list receive slightly further

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/net.h | 7 +++++++
 net/core/dev.c             | 4 +++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 9c886739246a..00aa72ce0e7c 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -223,6 +223,13 @@ DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry,
 	TP_ARGS(skb)
 );
 
+DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry,
+
+	TP_PROTO(const struct sk_buff *skb),
+
+	TP_ARGS(skb)
+);
+
 DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry,
 
 	TP_PROTO(const struct sk_buff *skb),
diff --git a/net/core/dev.c b/net/core/dev.c
index 85c456a4b551..308acfd48139 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4920,8 +4920,10 @@ void netif_receive_skb_list(struct list_head *head)
 {
 	struct sk_buff *skb, *next;
 
+	list_for_each_entry(skb, head, list)
+		trace_netif_receive_skb_list_entry(skb);
 	list_for_each_entry_safe(skb, next, head, list)
-		netif_receive_skb(skb);
+		netif_receive_skb_internal(skb);
 }
 EXPORT_SYMBOL(netif_receive_skb_list);
 
-- 
cgit v1.2.3


From 7da517a3bc529dc5399e742688b32cafa2ca5ca0 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 2 Jul 2018 16:13:24 +0100
Subject: net: core: Another step of skb receive list processing

netif_receive_skb_list_internal() now processes a list and hands it
 on to the next function.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 5 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 308acfd48139..1e87361df2ab 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4843,6 +4843,14 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 	return ret;
 }
 
+static void __netif_receive_skb_list(struct list_head *head)
+{
+	struct sk_buff *skb, *next;
+
+	list_for_each_entry_safe(skb, next, head, list)
+		__netif_receive_skb(skb);
+}
+
 static int netif_receive_skb_internal(struct sk_buff *skb)
 {
 	int ret;
@@ -4883,6 +4891,50 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
 	return ret;
 }
 
+static void netif_receive_skb_list_internal(struct list_head *head)
+{
+	struct bpf_prog *xdp_prog = NULL;
+	struct sk_buff *skb, *next;
+
+	list_for_each_entry_safe(skb, next, head, list) {
+		net_timestamp_check(netdev_tstamp_prequeue, skb);
+		if (skb_defer_rx_timestamp(skb))
+			/* Handled, remove from list */
+			list_del(&skb->list);
+	}
+
+	if (static_branch_unlikely(&generic_xdp_needed_key)) {
+		preempt_disable();
+		rcu_read_lock();
+		list_for_each_entry_safe(skb, next, head, list) {
+			xdp_prog = rcu_dereference(skb->dev->xdp_prog);
+			if (do_xdp_generic(xdp_prog, skb) != XDP_PASS)
+				/* Dropped, remove from list */
+				list_del(&skb->list);
+		}
+		rcu_read_unlock();
+		preempt_enable();
+	}
+
+	rcu_read_lock();
+#ifdef CONFIG_RPS
+	if (static_key_false(&rps_needed)) {
+		list_for_each_entry_safe(skb, next, head, list) {
+			struct rps_dev_flow voidflow, *rflow = &voidflow;
+			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+			if (cpu >= 0) {
+				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+				/* Handled, remove from list */
+				list_del(&skb->list);
+			}
+		}
+	}
+#endif
+	__netif_receive_skb_list(head);
+	rcu_read_unlock();
+}
+
 /**
  *	netif_receive_skb - process receive buffer from network
  *	@skb: buffer to process
@@ -4910,20 +4962,19 @@ EXPORT_SYMBOL(netif_receive_skb);
  *	netif_receive_skb_list - process many receive buffers from network
  *	@head: list of skbs to process.
  *
- *	For now, just calls netif_receive_skb() in a loop, ignoring the
- *	return value.
+ *	Since return value of netif_receive_skb() is normally ignored, and
+ *	wouldn't be meaningful for a list, this function returns void.
  *
  *	This function may only be called from softirq context and interrupts
  *	should be enabled.
  */
 void netif_receive_skb_list(struct list_head *head)
 {
-	struct sk_buff *skb, *next;
+	struct sk_buff *skb;
 
 	list_for_each_entry(skb, head, list)
 		trace_netif_receive_skb_list_entry(skb);
-	list_for_each_entry_safe(skb, next, head, list)
-		netif_receive_skb_internal(skb);
+	netif_receive_skb_list_internal(head);
 }
 EXPORT_SYMBOL(netif_receive_skb_list);
 
-- 
cgit v1.2.3


From 4ce0017a373afaaa9ef17614d8fa4f6fde261d18 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 2 Jul 2018 16:13:40 +0100
Subject: net: core: another layer of lists, around PF_MEMALLOC skb handling

First example of a layer splitting the list (rather than merely taking
 individual packets off it).
Involves new list.h function, list_cut_before(), like list_cut_position()
 but cuts on the other side of the given entry.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/list.h | 30 ++++++++++++++++++++++++++++++
 net/core/dev.c       | 44 ++++++++++++++++++++++++++++++++++++--------
 2 files changed, 66 insertions(+), 8 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/list.h b/include/linux/list.h
index 4b129df4d46b..de04cc5ed536 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -285,6 +285,36 @@ static inline void list_cut_position(struct list_head *list,
 		__list_cut_position(list, head, entry);
 }
 
+/**
+ * list_cut_before - cut a list into two, before given entry
+ * @list: a new list to add all removed entries
+ * @head: a list with entries
+ * @entry: an entry within head, could be the head itself
+ *
+ * This helper moves the initial part of @head, up to but
+ * excluding @entry, from @head to @list.  You should pass
+ * in @entry an element you know is on @head.  @list should
+ * be an empty list or a list you do not care about losing
+ * its data.
+ * If @entry == @head, all entries on @head are moved to
+ * @list.
+ */
+static inline void list_cut_before(struct list_head *list,
+				   struct list_head *head,
+				   struct list_head *entry)
+{
+	if (head->next == entry) {
+		INIT_LIST_HEAD(list);
+		return;
+	}
+	list->next = head->next;
+	list->next->prev = list;
+	list->prev = entry->prev;
+	list->prev->next = list;
+	head->next = entry;
+	entry->prev = head;
+}
+
 static inline void __list_splice(const struct list_head *list,
 				 struct list_head *prev,
 				 struct list_head *next)
diff --git a/net/core/dev.c b/net/core/dev.c
index 1e87361df2ab..9aadef976e8c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4784,6 +4784,14 @@ int netif_receive_skb_core(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_receive_skb_core);
 
+static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
+{
+	struct sk_buff *skb, *next;
+
+	list_for_each_entry_safe(skb, next, head, list)
+		__netif_receive_skb_core(skb, pfmemalloc);
+}
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	int ret;
@@ -4809,6 +4817,34 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	return ret;
 }
 
+static void __netif_receive_skb_list(struct list_head *head)
+{
+	unsigned long noreclaim_flag = 0;
+	struct sk_buff *skb, *next;
+	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
+
+	list_for_each_entry_safe(skb, next, head, list) {
+		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
+			struct list_head sublist;
+
+			/* Handle the previous sublist */
+			list_cut_before(&sublist, head, &skb->list);
+			__netif_receive_skb_list_core(&sublist, pfmemalloc);
+			pfmemalloc = !pfmemalloc;
+			/* See comments in __netif_receive_skb */
+			if (pfmemalloc)
+				noreclaim_flag = memalloc_noreclaim_save();
+			else
+				memalloc_noreclaim_restore(noreclaim_flag);
+		}
+	}
+	/* Handle the remaining sublist */
+	__netif_receive_skb_list_core(head, pfmemalloc);
+	/* Restore pflags */
+	if (pfmemalloc)
+		memalloc_noreclaim_restore(noreclaim_flag);
+}
+
 static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 {
 	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
@@ -4843,14 +4879,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 	return ret;
 }
 
-static void __netif_receive_skb_list(struct list_head *head)
-{
-	struct sk_buff *skb, *next;
-
-	list_for_each_entry_safe(skb, next, head, list)
-		__netif_receive_skb(skb);
-}
-
 static int netif_receive_skb_internal(struct sk_buff *skb)
 {
 	int ret;
-- 
cgit v1.2.3


From 88eb1944e18c1ba61da538ae9d1732832eb79b9d Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 2 Jul 2018 16:13:56 +0100
Subject: net: core: propagate SKB lists through packet_type lookup

__netif_receive_skb_core() does a depressingly large amount of per-packet
 work that can't easily be listified, because the another_round looping
 makes it nontrivial to slice up into smaller functions.
Fortunately, most of that work disappears in the fast path:
 * Hardware devices generally don't have an rx_handler
 * Unless you're tcpdumping or something, there is usually only one ptype
 * VLAN processing comes before the protocol ptype lookup, so doesn't force
   a pt_prev deliver
 so normally, __netif_receive_skb_core() will run straight through and pass
 back the one ptype found in ptype_base[hash of skb->protocol].

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 64 insertions(+), 8 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 9aadef976e8c..1bc485bb0678 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4608,7 +4608,8 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 	return 0;
 }
 
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+				    struct packet_type **ppt_prev)
 {
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
@@ -4738,8 +4739,7 @@ skip_classify:
 	if (pt_prev) {
 		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 			goto drop;
-		else
-			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+		*ppt_prev = pt_prev;
 	} else {
 drop:
 		if (!deliver_exact)
@@ -4757,6 +4757,18 @@ out:
 	return ret;
 }
 
+static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
+{
+	struct net_device *orig_dev = skb->dev;
+	struct packet_type *pt_prev = NULL;
+	int ret;
+
+	ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+	if (pt_prev)
+		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+	return ret;
+}
+
 /**
  *	netif_receive_skb_core - special purpose version of netif_receive_skb
  *	@skb: buffer to process
@@ -4777,19 +4789,63 @@ int netif_receive_skb_core(struct sk_buff *skb)
 	int ret;
 
 	rcu_read_lock();
-	ret = __netif_receive_skb_core(skb, false);
+	ret = __netif_receive_skb_one_core(skb, false);
 	rcu_read_unlock();
 
 	return ret;
 }
 EXPORT_SYMBOL(netif_receive_skb_core);
 
-static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
+static inline void __netif_receive_skb_list_ptype(struct list_head *head,
+						  struct packet_type *pt_prev,
+						  struct net_device *orig_dev)
 {
 	struct sk_buff *skb, *next;
 
+	if (!pt_prev)
+		return;
+	if (list_empty(head))
+		return;
+
 	list_for_each_entry_safe(skb, next, head, list)
-		__netif_receive_skb_core(skb, pfmemalloc);
+		pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
+{
+	/* Fast-path assumptions:
+	 * - There is no RX handler.
+	 * - Only one packet_type matches.
+	 * If either of these fails, we will end up doing some per-packet
+	 * processing in-line, then handling the 'last ptype' for the whole
+	 * sublist.  This can't cause out-of-order delivery to any single ptype,
+	 * because the 'last ptype' must be constant across the sublist, and all
+	 * other ptypes are handled per-packet.
+	 */
+	/* Current (common) ptype of sublist */
+	struct packet_type *pt_curr = NULL;
+	/* Current (common) orig_dev of sublist */
+	struct net_device *od_curr = NULL;
+	struct list_head sublist;
+	struct sk_buff *skb, *next;
+
+	list_for_each_entry_safe(skb, next, head, list) {
+		struct net_device *orig_dev = skb->dev;
+		struct packet_type *pt_prev = NULL;
+
+		__netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+		if (pt_curr != pt_prev || od_curr != orig_dev) {
+			/* dispatch old sublist */
+			list_cut_before(&sublist, head, &skb->list);
+			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+			/* start new sublist */
+			pt_curr = pt_prev;
+			od_curr = orig_dev;
+		}
+	}
+
+	/* dispatch final sublist */
+	__netif_receive_skb_list_ptype(head, pt_curr, od_curr);
 }
 
 static int __netif_receive_skb(struct sk_buff *skb)
@@ -4809,10 +4865,10 @@ static int __netif_receive_skb(struct sk_buff *skb)
 		 * context down to all allocation sites.
 		 */
 		noreclaim_flag = memalloc_noreclaim_save();
-		ret = __netif_receive_skb_core(skb, true);
+		ret = __netif_receive_skb_one_core(skb, true);
 		memalloc_noreclaim_restore(noreclaim_flag);
 	} else
-		ret = __netif_receive_skb_core(skb, false);
+		ret = __netif_receive_skb_one_core(skb, false);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 17266ee939849cb095ed7dd9edbec4162172226b Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 2 Jul 2018 16:14:12 +0100
Subject: net: ipv4: listified version of ip_rcv

Also involved adding a way to run a netfilter hook over a list of packets.
 Rather than attempting to make netfilter know about lists (which would be
 a major project in itself) we just let it call the regular okfn (in this
 case ip_rcv_finish()) for any packets it steals, and have it give us back
 a list of packets it's synchronously accepted (which normally NF_HOOK
 would automatically call okfn() on, but we want to be able to potentially
 pass the list to a listified version of okfn().)
The netfilter hooks themselves are indirect calls that still happen per-
 packet (see nf_hook_entry_hookfn()), but again, changing that can be left
 for future work.

There is potential for out-of-order receives if the netfilter hook ends up
 synchronously stealing packets, as they will be processed before any
 accepts earlier in the list.  However, it was already possible for an
 asynchronous accept to cause out-of-order receives, so presumably this is
 considered OK.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  3 +++
 include/linux/netfilter.h | 22 +++++++++++++++
 include/net/ip.h          |  2 ++
 net/core/dev.c            |  8 +++---
 net/ipv4/af_inet.c        |  1 +
 net/ipv4/ip_input.c       | 68 ++++++++++++++++++++++++++++++++++++++++++-----
 6 files changed, 94 insertions(+), 10 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f67258f057ca..c1ef749b6f9f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2297,6 +2297,9 @@ struct packet_type {
 					 struct net_device *,
 					 struct packet_type *,
 					 struct net_device *);
+	void			(*list_func) (struct list_head *,
+					      struct packet_type *,
+					      struct net_device *);
 	bool			(*id_match)(struct packet_type *ptype,
 					    struct sock *sk);
 	void			*af_packet_priv;
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index dd2052f0efb7..5a5e0a2ab2a3 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -288,6 +288,20 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct
 	return ret;
 }
 
+static inline void
+NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+	     struct list_head *head, struct net_device *in, struct net_device *out,
+	     int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+{
+	struct sk_buff *skb, *next;
+
+	list_for_each_entry_safe(skb, next, head, list) {
+		int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
+		if (ret != 1)
+			list_del(&skb->list);
+	}
+}
+
 /* Call setsockopt() */
 int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt,
 		  unsigned int len);
@@ -369,6 +383,14 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 	return okfn(net, sk, skb);
 }
 
+static inline void
+NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+	     struct list_head *head, struct net_device *in, struct net_device *out,
+	     int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+{
+	/* nothing to do */
+}
+
 static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 			  struct sock *sk, struct sk_buff *skb,
 			  struct net_device *indev, struct net_device *outdev,
diff --git a/include/net/ip.h b/include/net/ip.h
index 09da79d8ceea..99d1b835d2aa 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -138,6 +138,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
 			  struct ip_options_rcu *opt);
 int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	   struct net_device *orig_dev);
+void ip_list_rcv(struct list_head *head, struct packet_type *pt,
+		 struct net_device *orig_dev);
 int ip_local_deliver(struct sk_buff *skb);
 int ip_mr_input(struct sk_buff *skb);
 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 1bc485bb0678..5e22719ce71d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4806,9 +4806,11 @@ static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 		return;
 	if (list_empty(head))
 		return;
-
-	list_for_each_entry_safe(skb, next, head, list)
-		pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+	if (pt_prev->list_func != NULL)
+		pt_prev->list_func(head, pt_prev, orig_dev);
+	else
+		list_for_each_entry_safe(skb, next, head, list)
+			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
 static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9263a2c114e0..c716be13d58c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1882,6 +1882,7 @@ fs_initcall(ipv4_offload_init);
 static struct packet_type ip_packet_type __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IP),
 	.func = ip_rcv,
+	.list_func = ip_list_rcv,
 };
 
 static int __init inet_init(void)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7582713dd18f..914240830bdf 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -408,10 +408,9 @@ drop_error:
 /*
  * 	Main IP Receive routine.
  */
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
 {
 	const struct iphdr *iph;
-	struct net *net;
 	u32 len;
 
 	/* When the interface is in promisc. mode, drop all the crap
@@ -421,7 +420,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 		goto drop;
 
 
-	net = dev_net(dev);
 	__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
 
 	skb = skb_share_check(skb, GFP_ATOMIC);
@@ -489,9 +487,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	/* Must drop socket now because of tproxy. */
 	skb_orphan(skb);
 
-	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
-		       net, NULL, skb, dev, NULL,
-		       ip_rcv_finish);
+	return skb;
 
 csum_error:
 	__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
@@ -500,5 +496,63 @@ inhdr_error:
 drop:
 	kfree_skb(skb);
 out:
-	return NET_RX_DROP;
+	return NULL;
+}
+
+/*
+ * IP receive entry point
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
+	   struct net_device *orig_dev)
+{
+	struct net *net = dev_net(dev);
+
+	skb = ip_rcv_core(skb, net);
+	if (skb == NULL)
+		return NET_RX_DROP;
+	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+		       net, NULL, skb, dev, NULL,
+		       ip_rcv_finish);
+}
+
+static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
+			   struct net *net)
+{
+	struct sk_buff *skb, *next;
+
+	NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
+		     head, dev, NULL, ip_rcv_finish);
+	list_for_each_entry_safe(skb, next, head, list)
+		ip_rcv_finish(net, NULL, skb);
+}
+
+/* Receive a list of IP packets */
+void ip_list_rcv(struct list_head *head, struct packet_type *pt,
+		 struct net_device *orig_dev)
+{
+	struct net_device *curr_dev = NULL;
+	struct net *curr_net = NULL;
+	struct sk_buff *skb, *next;
+	struct list_head sublist;
+
+	list_for_each_entry_safe(skb, next, head, list) {
+		struct net_device *dev = skb->dev;
+		struct net *net = dev_net(dev);
+
+		skb = ip_rcv_core(skb, net);
+		if (skb == NULL)
+			continue;
+
+		if (curr_dev != dev || curr_net != net) {
+			/* dispatch old sublist */
+			list_cut_before(&sublist, head, &skb->list);
+			if (!list_empty(&sublist))
+				ip_sublist_rcv(&sublist, dev, net);
+			/* start new sublist */
+			curr_dev = dev;
+			curr_net = net;
+		}
+	}
+	/* dispatch final sublist */
+	ip_sublist_rcv(head, curr_dev, curr_net);
 }
-- 
cgit v1.2.3


From b9f463d6c9849230043123a6335d59ac7fea4d5a Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 2 Jul 2018 16:14:44 +0100
Subject: net: don't bother calling list RX functions on empty lists

Generally the check should be very cheap, as the sk_buff_head is in cache.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 5e22719ce71d..7e6a2f66db5c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4887,7 +4887,8 @@ static void __netif_receive_skb_list(struct list_head *head)
 
 			/* Handle the previous sublist */
 			list_cut_before(&sublist, head, &skb->list);
-			__netif_receive_skb_list_core(&sublist, pfmemalloc);
+			if (!list_empty(&sublist))
+				__netif_receive_skb_list_core(&sublist, pfmemalloc);
 			pfmemalloc = !pfmemalloc;
 			/* See comments in __netif_receive_skb */
 			if (pfmemalloc)
@@ -4897,7 +4898,8 @@ static void __netif_receive_skb_list(struct list_head *head)
 		}
 	}
 	/* Handle the remaining sublist */
-	__netif_receive_skb_list_core(head, pfmemalloc);
+	if (!list_empty(head))
+		__netif_receive_skb_list_core(head, pfmemalloc);
 	/* Restore pflags */
 	if (pfmemalloc)
 		memalloc_noreclaim_restore(noreclaim_flag);
@@ -5058,6 +5060,8 @@ void netif_receive_skb_list(struct list_head *head)
 {
 	struct sk_buff *skb;
 
+	if (list_empty(head))
+		return;
 	list_for_each_entry(skb, head, list)
 		trace_netif_receive_skb_list_entry(skb);
 	netif_receive_skb_list_internal(head);
-- 
cgit v1.2.3


From 6312fe77751f57d4fa2b28abeef84c6a95c28136 Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Thu, 5 Jul 2018 14:34:32 +0800
Subject: net: limit each hash list length to MAX_GRO_SKBS

After commit 07d78363dcff ("net: Convert NAPI gro list into a small hash
table.")' there is 8 hash buckets, which allows more flows to be held for
merging.  but MAX_GRO_SKBS, the total held skb for merging, is 8 skb still,
limit the hash table performance.

keep MAX_GRO_SKBS as 8 skb, but limit each hash list length to 8 skb, not
the total 8 skb

Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  7 +++++-
 net/core/dev.c            | 56 +++++++++++++++++++----------------------------
 2 files changed, 29 insertions(+), 34 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f06ee8f91e74..b683971e500d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -302,6 +302,11 @@ struct netdev_boot_setup {
 
 int __init netdev_boot_setup(char *str);
 
+struct gro_list {
+	struct list_head	list;
+	int			count;
+};
+
 /*
  * Structure for NAPI scheduling similar to tasklet but with weighting
  */
@@ -323,7 +328,7 @@ struct napi_struct {
 	int			poll_owner;
 #endif
 	struct net_device	*dev;
-	struct list_head	gro_hash[GRO_HASH_BUCKETS];
+	struct gro_list		gro_hash[GRO_HASH_BUCKETS];
 	struct sk_buff		*skb;
 	struct hrtimer		timer;
 	struct list_head	dev_list;
diff --git a/net/core/dev.c b/net/core/dev.c
index 7e6a2f66db5c..89825c1eccdc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -149,7 +149,6 @@
 
 #include "net-sysfs.h"
 
-/* Instead of increasing this, you should create a hash table. */
 #define MAX_GRO_SKBS 8
 
 /* This should be increased if a protocol with a bigger head is added. */
@@ -5151,9 +5150,10 @@ out:
 	return netif_receive_skb_internal(skb);
 }
 
-static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *head,
+static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 				   bool flush_old)
 {
+	struct list_head *head = &napi->gro_hash[index].list;
 	struct sk_buff *skb, *p;
 
 	list_for_each_entry_safe_reverse(skb, p, head, list) {
@@ -5162,22 +5162,20 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *h
 		list_del_init(&skb->list);
 		napi_gro_complete(skb);
 		napi->gro_count--;
+		napi->gro_hash[index].count--;
 	}
 }
 
-/* napi->gro_hash contains packets ordered by age.
+/* napi->gro_hash[].list contains packets ordered by age.
  * youngest packets at the head of it.
  * Complete skbs in reverse order to reduce latencies.
  */
 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 {
-	int i;
-
-	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
-		struct list_head *head = &napi->gro_hash[i];
+	u32 i;
 
-		__napi_gro_flush_chain(napi, head, flush_old);
-	}
+	for (i = 0; i < GRO_HASH_BUCKETS; i++)
+		__napi_gro_flush_chain(napi, i, flush_old);
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
@@ -5189,7 +5187,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi,
 	struct list_head *head;
 	struct sk_buff *p;
 
-	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)];
+	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
 	list_for_each_entry(p, head, list) {
 		unsigned long diffs;
 
@@ -5257,27 +5255,13 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 	}
 }
 
-static void gro_flush_oldest(struct napi_struct *napi)
+static void gro_flush_oldest(struct list_head *head)
 {
-	struct sk_buff *oldest = NULL;
-	unsigned long age = jiffies;
-	int i;
-
-	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
-		struct list_head *head = &napi->gro_hash[i];
-		struct sk_buff *skb;
-
-		if (list_empty(head))
-			continue;
+	struct sk_buff *oldest;
 
-		skb = list_last_entry(head, struct sk_buff, list);
-		if (!oldest || time_before(NAPI_GRO_CB(skb)->age, age)) {
-			oldest = skb;
-			age = NAPI_GRO_CB(skb)->age;
-		}
-	}
+	oldest = list_last_entry(head, struct sk_buff, list);
 
-	/* We are called with napi->gro_count >= MAX_GRO_SKBS, so this is
+	/* We are called with head length >= MAX_GRO_SKBS, so this is
 	 * impossible.
 	 */
 	if (WARN_ON_ONCE(!oldest))
@@ -5292,6 +5276,7 @@ static void gro_flush_oldest(struct napi_struct *napi)
 
 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
+	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 	struct list_head *head = &offload_base;
 	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
@@ -5358,6 +5343,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		list_del_init(&pp->list);
 		napi_gro_complete(pp);
 		napi->gro_count--;
+		napi->gro_hash[hash].count--;
 	}
 
 	if (same_flow)
@@ -5366,10 +5352,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (NAPI_GRO_CB(skb)->flush)
 		goto normal;
 
-	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
-		gro_flush_oldest(napi);
+	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
+		gro_flush_oldest(gro_head);
 	} else {
 		napi->gro_count++;
+		napi->gro_hash[hash].count++;
 	}
 	NAPI_GRO_CB(skb)->count = 1;
 	NAPI_GRO_CB(skb)->age = jiffies;
@@ -6006,8 +5993,10 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
 	napi->gro_count = 0;
-	for (i = 0; i < GRO_HASH_BUCKETS; i++)
-		INIT_LIST_HEAD(&napi->gro_hash[i]);
+	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+		INIT_LIST_HEAD(&napi->gro_hash[i].list);
+		napi->gro_hash[i].count = 0;
+	}
 	napi->skb = NULL;
 	napi->poll = poll;
 	if (weight > NAPI_POLL_WEIGHT)
@@ -6047,8 +6036,9 @@ static void flush_gro_hash(struct napi_struct *napi)
 	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 		struct sk_buff *skb, *n;
 
-		list_for_each_entry_safe(skb, n, &napi->gro_hash[i], list)
+		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 			kfree_skb(skb);
+		napi->gro_hash[i].count = 0;
 	}
 }
 
-- 
cgit v1.2.3


From ffcfe25bb50f27395e15fa999f1a7eb769f55360 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 9 Jul 2018 12:19:38 -0400
Subject: net: Add support for subordinate device traffic classes

This patch is meant to provide the basic tools needed to allow us to create
subordinate device traffic classes. The general idea here is to allow
subdividing the queues of a device into queue groups accessible through an
upper device such as a macvlan.

The idea here is to enforce the idea that an upper device has to be a
single queue device, ideally with IFF_NO_QUQUE set. With that being the
case we can pretty much guarantee that the tc_to_txq mappings and XPS maps
for the upper device are unused. As such we could reuse those in order to
support subdividing the lower device and distributing those queues between
the subordinate devices.

In order to distinguish between a regular set of traffic classes and if a
device is carrying subordinate traffic classes I changed num_tc from a u8
to a s16 value and use the negative values to represent the subordinate
pool values. So starting at -1 and running to -32768 we can encode those as
pool values, and the existing values of 0 to 15 can be maintained.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/netdevice.h | 16 ++++++++-
 net/core/dev.c            | 89 +++++++++++++++++++++++++++++++++++++++++++++++
 net/core/net-sysfs.c      | 21 ++++++++++-
 3 files changed, 124 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b683971e500d..b1ff77276bc4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -575,6 +575,9 @@ struct netdev_queue {
 	 * (/sys/class/net/DEV/Q/trans_timeout)
 	 */
 	unsigned long		trans_timeout;
+
+	/* Subordinate device that the queue has been assigned to */
+	struct net_device	*sb_dev;
 /*
  * write-mostly part
  */
@@ -1991,7 +1994,7 @@ struct net_device {
 #ifdef CONFIG_DCB
 	const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
-	u8			num_tc;
+	s16			num_tc;
 	struct netdev_tc_txq	tc_to_txq[TC_MAX_QUEUE];
 	u8			prio_tc_map[TC_BITMASK + 1];
 
@@ -2045,6 +2048,17 @@ int netdev_get_num_tc(struct net_device *dev)
 	return dev->num_tc;
 }
 
+void netdev_unbind_sb_channel(struct net_device *dev,
+			      struct net_device *sb_dev);
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+				 struct net_device *sb_dev,
+				 u8 tc, u16 count, u16 offset);
+int netdev_set_sb_channel(struct net_device *dev, u16 channel);
+static inline int netdev_get_sb_channel(struct net_device *dev)
+{
+	return max_t(int, -dev->num_tc, 0);
+}
+
 static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 					 unsigned int index)
diff --git a/net/core/dev.c b/net/core/dev.c
index 89825c1eccdc..cc1d6bba017a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2067,11 +2067,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 		int i;
 
+		/* walk through the TCs and see if it falls into any of them */
 		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 			if ((txq - tc->offset) < tc->count)
 				return i;
 		}
 
+		/* didn't find it, just return -1 to indicate no match */
 		return -1;
 	}
 
@@ -2260,7 +2262,14 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 	unsigned int nr_ids;
 
 	if (dev->num_tc) {
+		/* Do not allow XPS on subordinate device directly */
 		num_tc = dev->num_tc;
+		if (num_tc < 0)
+			return -EINVAL;
+
+		/* If queue belongs to subordinate dev use its map */
+		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
 		tc = netdev_txq_to_tc(dev, index);
 		if (tc < 0)
 			return -EINVAL;
@@ -2448,11 +2457,25 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 EXPORT_SYMBOL(netif_set_xps_queue);
 
 #endif
+static void netdev_unbind_all_sb_channels(struct net_device *dev)
+{
+	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+	/* Unbind any subordinate channels */
+	while (txq-- != &dev->_tx[0]) {
+		if (txq->sb_dev)
+			netdev_unbind_sb_channel(dev, txq->sb_dev);
+	}
+}
+
 void netdev_reset_tc(struct net_device *dev)
 {
 #ifdef CONFIG_XPS
 	netif_reset_xps_queues_gt(dev, 0);
 #endif
+	netdev_unbind_all_sb_channels(dev);
+
+	/* Reset TC configuration of device */
 	dev->num_tc = 0;
 	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
@@ -2481,11 +2504,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 #ifdef CONFIG_XPS
 	netif_reset_xps_queues_gt(dev, 0);
 #endif
+	netdev_unbind_all_sb_channels(dev);
+
 	dev->num_tc = num_tc;
 	return 0;
 }
 EXPORT_SYMBOL(netdev_set_num_tc);
 
+void netdev_unbind_sb_channel(struct net_device *dev,
+			      struct net_device *sb_dev)
+{
+	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+#ifdef CONFIG_XPS
+	netif_reset_xps_queues_gt(sb_dev, 0);
+#endif
+	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
+	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
+
+	while (txq-- != &dev->_tx[0]) {
+		if (txq->sb_dev == sb_dev)
+			txq->sb_dev = NULL;
+	}
+}
+EXPORT_SYMBOL(netdev_unbind_sb_channel);
+
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+				 struct net_device *sb_dev,
+				 u8 tc, u16 count, u16 offset)
+{
+	/* Make certain the sb_dev and dev are already configured */
+	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
+		return -EINVAL;
+
+	/* We cannot hand out queues we don't have */
+	if ((offset + count) > dev->real_num_tx_queues)
+		return -EINVAL;
+
+	/* Record the mapping */
+	sb_dev->tc_to_txq[tc].count = count;
+	sb_dev->tc_to_txq[tc].offset = offset;
+
+	/* Provide a way for Tx queue to find the tc_to_txq map or
+	 * XPS map for itself.
+	 */
+	while (count--)
+		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
+
+	return 0;
+}
+EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
+
+int netdev_set_sb_channel(struct net_device *dev, u16 channel)
+{
+	/* Do not use a multiqueue device to represent a subordinate channel */
+	if (netif_is_multiqueue(dev))
+		return -ENODEV;
+
+	/* We allow channels 1 - 32767 to be used for subordinate channels.
+	 * Channel 0 is meant to be "native" mode and used only to represent
+	 * the main root device. We allow writing 0 to reset the device back
+	 * to normal mode after being used as a subordinate channel.
+	 */
+	if (channel > S16_MAX)
+		return -EINVAL;
+
+	dev->num_tc = -channel;
+
+	return 0;
+}
+EXPORT_SYMBOL(netdev_set_sb_channel);
+
 /*
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
  * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index dce3ae0fbca2..ffa1d18f2c2c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1054,11 +1054,23 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
 		return -ENOENT;
 
 	index = get_netdev_queue_index(queue);
+
+	/* If queue belongs to subordinate dev use its TC mapping */
+	dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
 	tc = netdev_txq_to_tc(dev, index);
 	if (tc < 0)
 		return -EINVAL;
 
-	return sprintf(buf, "%u\n", tc);
+	/* We can report the traffic class one of two ways:
+	 * Subordinate device traffic classes are reported with the traffic
+	 * class first, and then the subordinate class so for example TC0 on
+	 * subordinate device 2 will be reported as "0-2". If the queue
+	 * belongs to the root device it will be reported with just the
+	 * traffic class, so just "0" for TC 0 for example.
+	 */
+	return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) :
+				 sprintf(buf, "%u\n", tc);
 }
 
 #ifdef CONFIG_XPS
@@ -1225,7 +1237,14 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
 	index = get_netdev_queue_index(queue);
 
 	if (dev->num_tc) {
+		/* Do not allow XPS on subordinate device directly */
 		num_tc = dev->num_tc;
+		if (num_tc < 0)
+			return -EINVAL;
+
+		/* If queue belongs to subordinate dev use its map */
+		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
 		tc = netdev_txq_to_tc(dev, index);
 		if (tc < 0)
 			return -EINVAL;
-- 
cgit v1.2.3


From eadec877ce9ca46a94e9036b5a44e7941d4fc501 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 9 Jul 2018 12:19:48 -0400
Subject: net: Add support for subordinate traffic classes to netdev_pick_tx

This change makes it so that we can support the concept of subordinate
device traffic classes to the core networking code. In doing this we can
start pulling out the driver specific bits needed to support selecting a
queue based on an upper device.

The solution at is currently stands is only partially implemented. I have
the start of some XPS bits in here, but I would still need to allow for
configuration of the XPS maps on the queues reserved for the subordinate
devices. For now I am using the reference to the sb_dev XPS map as just a
way to skip the lookup of the lower device XPS map for now as that would
result in the wrong queue being picked.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 19 +++------
 drivers/net/macvlan.c                         | 10 +----
 include/linux/netdevice.h                     |  4 +-
 net/core/dev.c                                | 58 ++++++++++++++++-----------
 4 files changed, 45 insertions(+), 46 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 80225af2acb1..abb176df2e7f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8208,20 +8208,17 @@ static void ixgbe_atr(struct ixgbe_ring *ring,
 					      input, common, ring->queue_index);
 }
 
+#ifdef IXGBE_FCOE
 static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 			      void *accel_priv, select_queue_fallback_t fallback)
 {
-	struct ixgbe_fwd_adapter *fwd_adapter = accel_priv;
-#ifdef IXGBE_FCOE
 	struct ixgbe_adapter *adapter;
 	struct ixgbe_ring_feature *f;
-#endif
 	int txq;
 
-	if (fwd_adapter) {
-		u8 tc = netdev_get_num_tc(dev) ?
-			netdev_get_prio_tc_map(dev, skb->priority) : 0;
-		struct net_device *vdev = fwd_adapter->netdev;
+	if (accel_priv) {
+		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+		struct net_device *vdev = accel_priv;
 
 		txq = vdev->tc_to_txq[tc].offset;
 		txq += reciprocal_scale(skb_get_hash(skb),
@@ -8230,8 +8227,6 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 		return txq;
 	}
 
-#ifdef IXGBE_FCOE
-
 	/*
 	 * only execute the code below if protocol is FCoE
 	 * or FIP and we have FCoE enabled on the adapter
@@ -8257,11 +8252,9 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 		txq -= f->indices;
 
 	return txq + f->offset;
-#else
-	return fallback(dev, skb);
-#endif
 }
 
+#endif
 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
 			       struct xdp_frame *xdpf)
 {
@@ -10058,7 +10051,6 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_open		= ixgbe_open,
 	.ndo_stop		= ixgbe_close,
 	.ndo_start_xmit		= ixgbe_xmit_frame,
-	.ndo_select_queue	= ixgbe_select_queue,
 	.ndo_set_rx_mode	= ixgbe_set_rx_mode,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= ixgbe_set_mac,
@@ -10081,6 +10073,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_poll_controller	= ixgbe_netpoll,
 #endif
 #ifdef IXGBE_FCOE
+	.ndo_select_queue	= ixgbe_select_queue,
 	.ndo_fcoe_ddp_setup = ixgbe_fcoe_ddp_get,
 	.ndo_fcoe_ddp_target = ixgbe_fcoe_ddp_target,
 	.ndo_fcoe_ddp_done = ixgbe_fcoe_ddp_put,
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index adde8fc45588..401e1d1ce1ec 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -514,7 +514,6 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
 	const struct macvlan_dev *vlan = netdev_priv(dev);
 	const struct macvlan_port *port = vlan->port;
 	const struct macvlan_dev *dest;
-	void *accel_priv = NULL;
 
 	if (vlan->mode == MACVLAN_MODE_BRIDGE) {
 		const struct ethhdr *eth = (void *)skb->data;
@@ -533,15 +532,10 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
 			return NET_XMIT_SUCCESS;
 		}
 	}
-
-	/* For packets that are non-multicast and not bridged we will pass
-	 * the necessary information so that the lowerdev can distinguish
-	 * the source of the packets via the accel_priv value.
-	 */
-	accel_priv = vlan->accel_priv;
 xmit_world:
 	skb->dev = vlan->lowerdev;
-	return dev_queue_xmit_accel(skb, accel_priv);
+	return dev_queue_xmit_accel(skb,
+				    netdev_get_sb_channel(dev) ? dev : NULL);
 }
 
 static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b1ff77276bc4..fda0bcda7a42 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2103,7 +2103,7 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 
 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 				    struct sk_buff *skb,
-				    void *accel_priv);
+				    struct net_device *sb_dev);
 
 /* returns the headroom that the master device needs to take in account
  * when forwarding to this dev
@@ -2568,7 +2568,7 @@ void dev_close_many(struct list_head *head, bool unlink);
 void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
 int dev_queue_xmit(struct sk_buff *skb);
-int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv);
+int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
 int dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
 int register_netdevice(struct net_device *dev);
 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
diff --git a/net/core/dev.c b/net/core/dev.c
index cc1d6bba017a..09a7cc2f3c55 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2786,24 +2786,26 @@ EXPORT_SYMBOL(netif_device_attach);
  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
  * to be used as a distribution range.
  */
-static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
+static u16 skb_tx_hash(const struct net_device *dev,
+		       const struct net_device *sb_dev,
+		       struct sk_buff *skb)
 {
 	u32 hash;
 	u16 qoffset = 0;
 	u16 qcount = dev->real_num_tx_queues;
 
+	if (dev->num_tc) {
+		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+
+		qoffset = sb_dev->tc_to_txq[tc].offset;
+		qcount = sb_dev->tc_to_txq[tc].count;
+	}
+
 	if (skb_rx_queue_recorded(skb)) {
 		hash = skb_get_rx_queue(skb);
 		while (unlikely(hash >= qcount))
 			hash -= qcount;
-		return hash;
-	}
-
-	if (dev->num_tc) {
-		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
-
-		qoffset = dev->tc_to_txq[tc].offset;
-		qcount = dev->tc_to_txq[tc].count;
+		return hash + qoffset;
 	}
 
 	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
@@ -3573,7 +3575,8 @@ static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 }
 #endif
 
-static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
+			 struct sk_buff *skb)
 {
 #ifdef CONFIG_XPS
 	struct xps_dev_maps *dev_maps;
@@ -3587,7 +3590,7 @@ static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 	if (!static_key_false(&xps_rxqs_needed))
 		goto get_cpus_map;
 
-	dev_maps = rcu_dereference(dev->xps_rxqs_map);
+	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
 	if (dev_maps) {
 		int tci = sk_rx_queue_get(sk);
 
@@ -3598,7 +3601,7 @@ static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 
 get_cpus_map:
 	if (queue_index < 0) {
-		dev_maps = rcu_dereference(dev->xps_cpus_map);
+		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
 		if (dev_maps) {
 			unsigned int tci = skb->sender_cpu - 1;
 
@@ -3614,17 +3617,20 @@ get_cpus_map:
 #endif
 }
 
-static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
+			     struct net_device *sb_dev)
 {
 	struct sock *sk = skb->sk;
 	int queue_index = sk_tx_queue_get(sk);
 
+	sb_dev = sb_dev ? : dev;
+
 	if (queue_index < 0 || skb->ooo_okay ||
 	    queue_index >= dev->real_num_tx_queues) {
-		int new_index = get_xps_queue(dev, skb);
+		int new_index = get_xps_queue(dev, sb_dev, skb);
 
 		if (new_index < 0)
-			new_index = skb_tx_hash(dev, skb);
+			new_index = skb_tx_hash(dev, sb_dev, skb);
 
 		if (queue_index != new_index && sk &&
 		    sk_fullsock(sk) &&
@@ -3637,9 +3643,15 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 	return queue_index;
 }
 
+static u16 __netdev_pick_tx(struct net_device *dev,
+			    struct sk_buff *skb)
+{
+	return ___netdev_pick_tx(dev, skb, NULL);
+}
+
 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 				    struct sk_buff *skb,
-				    void *accel_priv)
+				    struct net_device *sb_dev)
 {
 	int queue_index = 0;
 
@@ -3654,10 +3666,10 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 		const struct net_device_ops *ops = dev->netdev_ops;
 
 		if (ops->ndo_select_queue)
-			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+			queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
 							    __netdev_pick_tx);
 		else
-			queue_index = __netdev_pick_tx(dev, skb);
+			queue_index = ___netdev_pick_tx(dev, skb, sb_dev);
 
 		queue_index = netdev_cap_txqueue(dev, queue_index);
 	}
@@ -3669,7 +3681,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 /**
  *	__dev_queue_xmit - transmit a buffer
  *	@skb: buffer to transmit
- *	@accel_priv: private data used for L2 forwarding offload
+ *	@sb_dev: suboordinate device used for L2 forwarding offload
  *
  *	Queue a buffer for transmission to a network device. The caller must
  *	have set the device and priority and built the buffer before calling
@@ -3692,7 +3704,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
  *      the BH enable code must have IRQs enabled so that it will not deadlock.
  *          --BLG
  */
-static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
+static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 {
 	struct net_device *dev = skb->dev;
 	struct netdev_queue *txq;
@@ -3731,7 +3743,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 	else
 		skb_dst_force(skb);
 
-	txq = netdev_pick_tx(dev, skb, accel_priv);
+	txq = netdev_pick_tx(dev, skb, sb_dev);
 	q = rcu_dereference_bh(txq->qdisc);
 
 	trace_net_dev_queue(skb);
@@ -3805,9 +3817,9 @@ int dev_queue_xmit(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(dev_queue_xmit);
 
-int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
+int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
 {
-	return __dev_queue_xmit(skb, accel_priv);
+	return __dev_queue_xmit(skb, sb_dev);
 }
 EXPORT_SYMBOL(dev_queue_xmit_accel);
 
-- 
cgit v1.2.3


From a4ea8a3dacc312c3402c78f6e4843afdda9b43a0 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 9 Jul 2018 12:19:54 -0400
Subject: net: Add generic ndo_select_queue functions

This patch adds a generic version of the ndo_select_queue functions for
either returning 0 or selecting a queue based on the processor ID. This is
generally meant to just reduce the number of functions we have to change
in the future when we have to deal with ndo_select_queue changes.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/lantiq_etop.c   | 10 +---------
 drivers/net/ethernet/ti/netcp_core.c |  9 +--------
 drivers/staging/netlogic/xlr_net.c   |  9 +--------
 include/linux/netdevice.h            |  4 ++++
 net/core/dev.c                       | 14 ++++++++++++++
 net/packet/af_packet.c               |  2 +-
 6 files changed, 22 insertions(+), 26 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c
index afc810069440..7a637b51c7d2 100644
--- a/drivers/net/ethernet/lantiq_etop.c
+++ b/drivers/net/ethernet/lantiq_etop.c
@@ -563,14 +563,6 @@ ltq_etop_set_multicast_list(struct net_device *dev)
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
-static u16
-ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb,
-		      void *accel_priv, select_queue_fallback_t fallback)
-{
-	/* we are currently only using the first queue */
-	return 0;
-}
-
 static int
 ltq_etop_init(struct net_device *dev)
 {
@@ -641,7 +633,7 @@ static const struct net_device_ops ltq_eth_netdev_ops = {
 	.ndo_set_mac_address = ltq_etop_set_mac_address,
 	.ndo_validate_addr = eth_validate_addr,
 	.ndo_set_rx_mode = ltq_etop_set_multicast_list,
-	.ndo_select_queue = ltq_etop_select_queue,
+	.ndo_select_queue = dev_pick_tx_zero,
 	.ndo_init = ltq_etop_init,
 	.ndo_tx_timeout = ltq_etop_tx_timeout,
 };
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 6ebf110cd594..a1d335a3c5e4 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1889,13 +1889,6 @@ static int netcp_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vid)
 	return err;
 }
 
-static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb,
-			      void *accel_priv,
-			      select_queue_fallback_t fallback)
-{
-	return 0;
-}
-
 static int netcp_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			  void *type_data)
 {
@@ -1972,7 +1965,7 @@ static const struct net_device_ops netcp_netdev_ops = {
 	.ndo_vlan_rx_add_vid	= netcp_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= netcp_rx_kill_vid,
 	.ndo_tx_timeout		= netcp_ndo_tx_timeout,
-	.ndo_select_queue	= netcp_select_queue,
+	.ndo_select_queue	= dev_pick_tx_zero,
 	.ndo_setup_tc		= netcp_setup_tc,
 };
 
diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c
index e461168313bf..4e6611e4c59b 100644
--- a/drivers/staging/netlogic/xlr_net.c
+++ b/drivers/staging/netlogic/xlr_net.c
@@ -290,13 +290,6 @@ static netdev_tx_t xlr_net_start_xmit(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 }
 
-static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb,
-				void *accel_priv,
-				select_queue_fallback_t fallback)
-{
-	return (u16)smp_processor_id();
-}
-
 static void xlr_hw_set_mac_addr(struct net_device *ndev)
 {
 	struct xlr_net_priv *priv = netdev_priv(ndev);
@@ -403,7 +396,7 @@ static const struct net_device_ops xlr_netdev_ops = {
 	.ndo_open = xlr_net_open,
 	.ndo_stop = xlr_net_stop,
 	.ndo_start_xmit = xlr_net_start_xmit,
-	.ndo_select_queue = xlr_net_select_queue,
+	.ndo_select_queue = dev_pick_tx_cpu_id,
 	.ndo_set_mac_address = xlr_net_set_mac_addr,
 	.ndo_set_rx_mode = xlr_set_rx_mode,
 	.ndo_get_stats64 = xlr_stats,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fda0bcda7a42..46f4c44ce3e4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2567,6 +2567,10 @@ void dev_close(struct net_device *dev);
 void dev_close_many(struct list_head *head, bool unlink);
 void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
+u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
+		     void *accel_priv, select_queue_fallback_t fallback);
+u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
+		       void *accel_priv, select_queue_fallback_t fallback);
 int dev_queue_xmit(struct sk_buff *skb);
 int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
 int dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
diff --git a/net/core/dev.c b/net/core/dev.c
index 09a7cc2f3c55..b5e538032d5e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3617,6 +3617,20 @@ get_cpus_map:
 #endif
 }
 
+u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
+		     void *accel_priv, select_queue_fallback_t fallback)
+{
+	return 0;
+}
+EXPORT_SYMBOL(dev_pick_tx_zero);
+
+u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
+		       void *accel_priv, select_queue_fallback_t fallback)
+{
+	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
+}
+EXPORT_SYMBOL(dev_pick_tx_cpu_id);
+
 static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 			     struct net_device *sb_dev)
 {
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 47931ebfaef3..f37d087ae652 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -277,7 +277,7 @@ static bool packet_use_direct_xmit(const struct packet_sock *po)
 
 static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
 {
-	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
+	return dev_pick_tx_cpu_id(dev, skb, NULL, NULL);
 }
 
 static u16 packet_pick_tx_queue(struct sk_buff *skb)
-- 
cgit v1.2.3


From 4f49dec9075aa0277b8c9c657ec31e6361f88724 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 9 Jul 2018 12:19:59 -0400
Subject: net: allow ndo_select_queue to pass netdev

This patch makes it so that instead of passing a void pointer as the
accel_priv we instead pass a net_device pointer as sb_dev. Making this
change allows us to pass the subordinate device through to the fallback
function eventually so that we can keep the actual code in the
ndo_select_queue call as focused on possible on the exception cases.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/infiniband/hw/hfi1/vnic_main.c            |  2 +-
 drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c |  4 ++--
 drivers/net/bonding/bond_main.c                   |  3 ++-
 drivers/net/ethernet/amazon/ena/ena_netdev.c      |  3 ++-
 drivers/net/ethernet/broadcom/bcmsysport.c        |  2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   |  3 ++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 ++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c   |  3 ++-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c     |  3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  7 ++++---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c        |  3 ++-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h      |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |  3 ++-
 drivers/net/ethernet/renesas/ravb_main.c          |  3 ++-
 drivers/net/ethernet/sun/ldmvsw.c                 |  3 ++-
 drivers/net/ethernet/sun/sunvnet.c                |  3 ++-
 drivers/net/hyperv/netvsc_drv.c                   |  4 ++--
 drivers/net/net_failover.c                        |  5 +++--
 drivers/net/team/team.c                           |  3 ++-
 drivers/net/tun.c                                 |  3 ++-
 drivers/net/wireless/marvell/mwifiex/main.c       |  3 ++-
 drivers/net/xen-netback/interface.c               |  2 +-
 drivers/net/xen-netfront.c                        |  3 ++-
 drivers/staging/rtl8188eu/os_dep/os_intfs.c       |  3 ++-
 drivers/staging/rtl8723bs/os_dep/os_intfs.c       |  7 +++----
 include/linux/netdevice.h                         | 11 +++++++----
 net/core/dev.c                                    |  6 ++++--
 net/mac80211/iface.c                              |  4 ++--
 29 files changed, 66 insertions(+), 42 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c
index 5d65582fe4d9..616fc9b6fad8 100644
--- a/drivers/infiniband/hw/hfi1/vnic_main.c
+++ b/drivers/infiniband/hw/hfi1/vnic_main.c
@@ -423,7 +423,7 @@ tx_finish:
 
 static u16 hfi1_vnic_select_queue(struct net_device *netdev,
 				  struct sk_buff *skb,
-				  void *accel_priv,
+				  struct net_device *sb_dev,
 				  select_queue_fallback_t fallback)
 {
 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
index 0c8aec62a425..61558788b3fa 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
@@ -95,7 +95,7 @@ static netdev_tx_t opa_netdev_start_xmit(struct sk_buff *skb,
 }
 
 static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb,
-				 void *accel_priv,
+				 struct net_device *sb_dev,
 				 select_queue_fallback_t fallback)
 {
 	struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev);
@@ -107,7 +107,7 @@ static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb,
 	mdata->entropy = opa_vnic_calc_entropy(skb);
 	mdata->vl = opa_vnic_get_vl(adapter, skb);
 	rc = adapter->rn_ops->ndo_select_queue(netdev, skb,
-					       accel_priv, fallback);
+					       sb_dev, fallback);
 	skb_pull(skb, sizeof(*mdata));
 	return rc;
 }
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 63e3844c5bec..9a2ea3c1f949 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4094,7 +4094,8 @@ static inline int bond_slave_override(struct bonding *bond,
 
 
 static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
-			     void *accel_priv, select_queue_fallback_t fallback)
+			     struct net_device *sb_dev,
+			     select_queue_fallback_t fallback)
 {
 	/* This helper function exists to help dev_pick_tx get the correct
 	 * destination queue.  Using a helper function skips a call to
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index f2af87d70594..e3befb1f9204 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2213,7 +2213,8 @@ static void ena_netpoll(struct net_device *netdev)
 #endif /* CONFIG_NET_POLL_CONTROLLER */
 
 static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv, select_queue_fallback_t fallback)
+			    struct net_device *sb_dev,
+			    select_queue_fallback_t fallback)
 {
 	u16 qid;
 	/* we suspect that this is good for in--kernel network services that
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index d5fca2e5a9bc..32f548e6431d 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2107,7 +2107,7 @@ static const struct ethtool_ops bcm_sysport_ethtool_ops = {
 };
 
 static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb,
-				    void *accel_priv,
+				    struct net_device *sb_dev,
 				    select_queue_fallback_t fallback)
 {
 	struct bcm_sysport_priv *priv = netdev_priv(dev);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index af7b5a4d8ba0..e4e1cf907ac6 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -1910,7 +1910,8 @@ void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw)
 }
 
 u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv, select_queue_fallback_t fallback)
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index a8ce5c55bbb0..0e508e5defce 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -497,7 +497,8 @@ int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos,
 
 /* select_queue callback */
 u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv, select_queue_fallback_t fallback);
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback);
 
 static inline void bnx2x_update_rx_prod(struct bnx2x *bp,
 					struct bnx2x_fastpath *fp,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 0d91716a2566..5dc5e5604f05 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -930,7 +930,8 @@ freeout:
 }
 
 static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb,
-			     void *accel_priv, select_queue_fallback_t fallback)
+			     struct net_device *sb_dev,
+			     select_queue_fallback_t fallback)
 {
 	int txq;
 
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index ef9ef703d13a..ff7a74ec8f11 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -2022,7 +2022,8 @@ static void hns_nic_get_stats64(struct net_device *ndev,
 
 static u16
 hns_nic_select_queue(struct net_device *ndev, struct sk_buff *skb,
-		     void *accel_priv, select_queue_fallback_t fallback)
+		     struct net_device *sb_dev,
+		     select_queue_fallback_t fallback)
 {
 	struct ethhdr *eth_hdr = (struct ethhdr *)skb->data;
 	struct hns_nic_priv *priv = netdev_priv(ndev);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index abb176df2e7f..8c7a68c57afa 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8210,15 +8210,16 @@ static void ixgbe_atr(struct ixgbe_ring *ring,
 
 #ifdef IXGBE_FCOE
 static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
-			      void *accel_priv, select_queue_fallback_t fallback)
+			      struct net_device *sb_dev,
+			      select_queue_fallback_t fallback)
 {
 	struct ixgbe_adapter *adapter;
 	struct ixgbe_ring_feature *f;
 	int txq;
 
-	if (accel_priv) {
+	if (sb_dev) {
 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
-		struct net_device *vdev = accel_priv;
+		struct net_device *vdev = sb_dev;
 
 		txq = vdev->tc_to_txq[tc].offset;
 		txq += reciprocal_scale(skb_get_hash(skb),
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 0227786308af..df2996618cd1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -688,7 +688,8 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
 }
 
 u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
-			 void *accel_priv, select_queue_fallback_t fallback)
+			 struct net_device *sb_dev,
+			 select_queue_fallback_t fallback)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	u16 rings_p_up = priv->num_tx_rings_p_up;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index ace6545f82e6..c3228b89df46 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -699,7 +699,8 @@ void mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
 
 void mlx4_en_tx_irq(struct mlx4_cq *mcq);
 u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
-			 void *accel_priv, select_queue_fallback_t fallback);
+			 struct net_device *sb_dev,
+			 select_queue_fallback_t fallback);
 netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev);
 netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
 			       struct mlx4_en_rx_alloc *frame,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index e2b7586ed7a0..e1b237ccdf56 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -865,7 +865,8 @@ struct mlx5e_profile {
 void mlx5e_build_ptys2ethtool_map(void);
 
 u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv, select_queue_fallback_t fallback);
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback);
 netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev);
 netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 			  struct mlx5e_tx_wqe *wqe, u16 pi);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index f0739dae7b56..dfcc3710b65f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -111,7 +111,8 @@ static inline int mlx5e_get_dscp_up(struct mlx5e_priv *priv, struct sk_buff *skb
 #endif
 
 u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv, select_queue_fallback_t fallback)
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	int channel_ix = fallback(dev, skb);
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index 68f122140966..4a7f54c8e7aa 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -1656,7 +1656,8 @@ drop:
 }
 
 static u16 ravb_select_queue(struct net_device *ndev, struct sk_buff *skb,
-			     void *accel_priv, select_queue_fallback_t fallback)
+			     struct net_device *sb_dev,
+			     select_queue_fallback_t fallback)
 {
 	/* If skb needs TX timestamp, it is handled in network control queue */
 	return (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) ? RAVB_NC :
diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c
index a5dd627fe2f9..d42f47f6c632 100644
--- a/drivers/net/ethernet/sun/ldmvsw.c
+++ b/drivers/net/ethernet/sun/ldmvsw.c
@@ -101,7 +101,8 @@ static struct vnet_port *vsw_tx_port_find(struct sk_buff *skb,
 }
 
 static u16 vsw_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv, select_queue_fallback_t fallback)
+			    struct net_device *sb_dev,
+			    select_queue_fallback_t fallback)
 {
 	struct vnet_port *port = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index a94f50442613..12539b357a78 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -234,7 +234,8 @@ static struct vnet_port *vnet_tx_port_find(struct sk_buff *skb,
 }
 
 static u16 vnet_select_queue(struct net_device *dev, struct sk_buff *skb,
-			     void *accel_priv, select_queue_fallback_t fallback)
+			     struct net_device *sb_dev,
+			     select_queue_fallback_t fallback)
 {
 	struct vnet *vp = netdev_priv(dev);
 	struct vnet_port *port = __tx_port_find(vp, skb);
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index dd1d6e115145..98c0107d6ca1 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -329,7 +329,7 @@ static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb)
 }
 
 static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
-			       void *accel_priv,
+			       struct net_device *sb_dev,
 			       select_queue_fallback_t fallback)
 {
 	struct net_device_context *ndc = netdev_priv(ndev);
@@ -343,7 +343,7 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
 
 		if (vf_ops->ndo_select_queue)
 			txq = vf_ops->ndo_select_queue(vf_netdev, skb,
-						       accel_priv, fallback);
+						       sb_dev, fallback);
 		else
 			txq = fallback(vf_netdev, skb);
 
diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
index 4f390fa557e4..78b549698b7b 100644
--- a/drivers/net/net_failover.c
+++ b/drivers/net/net_failover.c
@@ -115,7 +115,8 @@ static netdev_tx_t net_failover_start_xmit(struct sk_buff *skb,
 }
 
 static u16 net_failover_select_queue(struct net_device *dev,
-				     struct sk_buff *skb, void *accel_priv,
+				     struct sk_buff *skb,
+				     struct net_device *sb_dev,
 				     select_queue_fallback_t fallback)
 {
 	struct net_failover_info *nfo_info = netdev_priv(dev);
@@ -128,7 +129,7 @@ static u16 net_failover_select_queue(struct net_device *dev,
 
 		if (ops->ndo_select_queue)
 			txq = ops->ndo_select_queue(primary_dev, skb,
-						    accel_priv, fallback);
+						    sb_dev, fallback);
 		else
 			txq = fallback(primary_dev, skb);
 
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index b070959737ff..3a95eaae0c98 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1707,7 +1707,8 @@ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb,
-			     void *accel_priv, select_queue_fallback_t fallback)
+			     struct net_device *sb_dev,
+			     select_queue_fallback_t fallback)
 {
 	/*
 	 * This helper function exists to help dev_pick_tx get the correct
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a192a017cc68..76f0f4131197 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -607,7 +607,8 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
 }
 
 static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv, select_queue_fallback_t fallback)
+			    struct net_device *sb_dev,
+			    select_queue_fallback_t fallback)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	u16 ret;
diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index 510f6b8e717d..fa3e8ddfe9a9 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -1279,7 +1279,8 @@ static struct net_device_stats *mwifiex_get_stats(struct net_device *dev)
 
 static u16
 mwifiex_netdev_select_wmm_queue(struct net_device *dev, struct sk_buff *skb,
-				void *accel_priv, select_queue_fallback_t fallback)
+				struct net_device *sb_dev,
+				select_queue_fallback_t fallback)
 {
 	skb->priority = cfg80211_classify8021d(skb, NULL);
 	return mwifiex_1d_to_wmm_queue[skb->priority];
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 78ebe494fef0..19c4c585f472 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -148,7 +148,7 @@ void xenvif_wake_queue(struct xenvif_queue *queue)
 }
 
 static u16 xenvif_select_queue(struct net_device *dev, struct sk_buff *skb,
-			       void *accel_priv,
+			       struct net_device *sb_dev,
 			       select_queue_fallback_t fallback)
 {
 	struct xenvif *vif = netdev_priv(dev);
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index a57daecf1d57..d67cd379d156 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -545,7 +545,8 @@ static int xennet_count_skb_slots(struct sk_buff *skb)
 }
 
 static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb,
-			       void *accel_priv, select_queue_fallback_t fallback)
+			       struct net_device *sb_dev,
+			       select_queue_fallback_t fallback)
 {
 	unsigned int num_queues = dev->real_num_tx_queues;
 	u32 hash;
diff --git a/drivers/staging/rtl8188eu/os_dep/os_intfs.c b/drivers/staging/rtl8188eu/os_dep/os_intfs.c
index add1ba00f3e9..38e85c8a85c8 100644
--- a/drivers/staging/rtl8188eu/os_dep/os_intfs.c
+++ b/drivers/staging/rtl8188eu/os_dep/os_intfs.c
@@ -253,7 +253,8 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb)
 }
 
 static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv, select_queue_fallback_t fallback)
+			    struct net_device *sb_dev,
+			    select_queue_fallback_t fallback)
 {
 	struct adapter	*padapter = rtw_netdev_priv(dev);
 	struct mlme_priv *pmlmepriv = &padapter->mlmepriv;
diff --git a/drivers/staging/rtl8723bs/os_dep/os_intfs.c b/drivers/staging/rtl8723bs/os_dep/os_intfs.c
index ace68f023b49..181642358e3f 100644
--- a/drivers/staging/rtl8723bs/os_dep/os_intfs.c
+++ b/drivers/staging/rtl8723bs/os_dep/os_intfs.c
@@ -403,10 +403,9 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb)
 }
 
 
-static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb
-				, void *accel_priv
-				, select_queue_fallback_t fallback
-)
+static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb,
+			    struct net_device *sb_dev,
+			    select_queue_fallback_t fallback)
 {
 	struct adapter	*padapter = rtw_netdev_priv(dev);
 	struct mlme_priv *pmlmepriv = &padapter->mlmepriv;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 46f4c44ce3e4..bbf062c1ca8a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -957,7 +957,8 @@ struct dev_ifalias {
  *	those the driver believes to be appropriate.
  *
  * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
- *                         void *accel_priv, select_queue_fallback_t fallback);
+ *                         struct net_device *sb_dev,
+ *                         select_queue_fallback_t fallback);
  *	Called to decide which queue to use when device supports multiple
  *	transmit queues.
  *
@@ -1229,7 +1230,7 @@ struct net_device_ops {
 						      netdev_features_t features);
 	u16			(*ndo_select_queue)(struct net_device *dev,
 						    struct sk_buff *skb,
-						    void *accel_priv,
+						    struct net_device *sb_dev,
 						    select_queue_fallback_t fallback);
 	void			(*ndo_change_rx_flags)(struct net_device *dev,
 						       int flags);
@@ -2568,9 +2569,11 @@ void dev_close_many(struct list_head *head, bool unlink);
 void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
-		     void *accel_priv, select_queue_fallback_t fallback);
+		     struct net_device *sb_dev,
+		     select_queue_fallback_t fallback);
 u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv, select_queue_fallback_t fallback);
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback);
 int dev_queue_xmit(struct sk_buff *skb);
 int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
 int dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
diff --git a/net/core/dev.c b/net/core/dev.c
index b5e538032d5e..a051ce27198b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3618,14 +3618,16 @@ get_cpus_map:
 }
 
 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
-		     void *accel_priv, select_queue_fallback_t fallback)
+		     struct net_device *sb_dev,
+		     select_queue_fallback_t fallback)
 {
 	return 0;
 }
 EXPORT_SYMBOL(dev_pick_tx_zero);
 
 u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv, select_queue_fallback_t fallback)
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback)
 {
 	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 }
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 555e389b7dfa..5e6cf2cee965 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1130,7 +1130,7 @@ static void ieee80211_uninit(struct net_device *dev)
 
 static u16 ieee80211_netdev_select_queue(struct net_device *dev,
 					 struct sk_buff *skb,
-					 void *accel_priv,
+					 struct net_device *sb_dev,
 					 select_queue_fallback_t fallback)
 {
 	return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
@@ -1176,7 +1176,7 @@ static const struct net_device_ops ieee80211_dataif_ops = {
 
 static u16 ieee80211_monitor_select_queue(struct net_device *dev,
 					  struct sk_buff *skb,
-					  void *accel_priv,
+					  struct net_device *sb_dev,
 					  select_queue_fallback_t fallback)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-- 
cgit v1.2.3


From 8ec56fc3c5ee6f9700adac190e9ce5b8859a58b6 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 9 Jul 2018 12:20:04 -0400
Subject: net: allow fallback function to pass netdev

For most of these calls we can just pass NULL through to the fallback
function as the sb_dev. The only cases where we cannot are the cases where
we might be dealing with either an upper device or a driver that would
have configured things to support an sb_dev itself.

The only driver that has any significant change in this patch set should be
ixgbe as we can drop the redundant functionality that existed in both the
ndo_select_queue function and the fallback function that was passed through
to us.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c    |  2 +-
 drivers/net/ethernet/broadcom/bcmsysport.c      |  4 ++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |  3 ++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |  2 +-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c   |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |  4 ++--
 drivers/net/ethernet/mellanox/mlx4/en_tx.c      |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c |  2 +-
 drivers/net/hyperv/netvsc_drv.c                 |  2 +-
 drivers/net/net_failover.c                      |  2 +-
 drivers/net/xen-netback/interface.c             |  2 +-
 include/linux/netdevice.h                       |  3 ++-
 net/core/dev.c                                  | 12 +++---------
 net/packet/af_packet.c                          |  7 ++++---
 14 files changed, 24 insertions(+), 27 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index e3befb1f9204..c673ac2df65b 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2224,7 +2224,7 @@ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb,
 	if (skb_rx_queue_recorded(skb))
 		qid = skb_get_rx_queue(skb);
 	else
-		qid = fallback(dev, skb);
+		qid = fallback(dev, skb, NULL);
 
 	return qid;
 }
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index 32f548e6431d..eb890c4b3b2d 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2116,7 +2116,7 @@ static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb,
 	unsigned int q, port;
 
 	if (!netdev_uses_dsa(dev))
-		return fallback(dev, skb);
+		return fallback(dev, skb, NULL);
 
 	/* DSA tagging layer will have configured the correct queue */
 	q = BRCM_TAG_GET_QUEUE(queue);
@@ -2124,7 +2124,7 @@ static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb,
 	tx_ring = priv->ring_map[q + port * priv->per_port_num_tx_queues];
 
 	if (unlikely(!tx_ring))
-		return fallback(dev, skb);
+		return fallback(dev, skb, NULL);
 
 	return tx_ring->index;
 }
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index e4e1cf907ac6..5a727d4729da 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -1933,7 +1933,8 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
 	}
 
 	/* select a non-FCoE queue */
-	return fallback(dev, skb) % (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos);
+	return fallback(dev, skb, NULL) %
+	       (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos);
 }
 
 void bnx2x_set_num_queues(struct bnx2x *bp)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 5dc5e5604f05..40cf8dc9f163 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -973,7 +973,7 @@ static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb,
 		return txq;
 	}
 
-	return fallback(dev, skb) % dev->real_num_tx_queues;
+	return fallback(dev, skb, NULL) % dev->real_num_tx_queues;
 }
 
 static int closest_timer(const struct sge *s, int time)
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index ff7a74ec8f11..948b3e0d18f4 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -2033,7 +2033,7 @@ hns_nic_select_queue(struct net_device *ndev, struct sk_buff *skb,
 	    is_multicast_ether_addr(eth_hdr->h_dest))
 		return 0;
 	else
-		return fallback(ndev, skb);
+		return fallback(ndev, skb, NULL);
 }
 
 static const struct net_device_ops hns_nic_netdev_ops = {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 8c7a68c57afa..bd6d9ea27b4b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8237,11 +8237,11 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 	case htons(ETH_P_FIP):
 		adapter = netdev_priv(dev);
 
-		if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED)
+		if (!sb_dev && (adapter->flags & IXGBE_FLAG_FCOE_ENABLED))
 			break;
 		/* fall through */
 	default:
-		return fallback(dev, skb);
+		return fallback(dev, skb, sb_dev);
 	}
 
 	f = &adapter->ring_feature[RING_F_FCOE];
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index df2996618cd1..1857ee0f0871 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -695,9 +695,9 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
 	u16 rings_p_up = priv->num_tx_rings_p_up;
 
 	if (netdev_get_num_tc(dev))
-		return fallback(dev, skb);
+		return fallback(dev, skb, NULL);
 
-	return fallback(dev, skb) % rings_p_up;
+	return fallback(dev, skb, NULL) % rings_p_up;
 }
 
 static void mlx4_bf_copy(void __iomem *dst, const void *src,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index dfcc3710b65f..9106ea45e3cb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -115,7 +115,7 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
 		       select_queue_fallback_t fallback)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
-	int channel_ix = fallback(dev, skb);
+	int channel_ix = fallback(dev, skb, NULL);
 	u16 num_channels;
 	int up = 0;
 
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 98c0107d6ca1..cf4f40a04194 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -345,7 +345,7 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
 			txq = vf_ops->ndo_select_queue(vf_netdev, skb,
 						       sb_dev, fallback);
 		else
-			txq = fallback(vf_netdev, skb);
+			txq = fallback(vf_netdev, skb, NULL);
 
 		/* Record the queue selected by VF so that it can be
 		 * used for common case where VF has more queues than
diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
index 78b549698b7b..d00d42c845b7 100644
--- a/drivers/net/net_failover.c
+++ b/drivers/net/net_failover.c
@@ -131,7 +131,7 @@ static u16 net_failover_select_queue(struct net_device *dev,
 			txq = ops->ndo_select_queue(primary_dev, skb,
 						    sb_dev, fallback);
 		else
-			txq = fallback(primary_dev, skb);
+			txq = fallback(primary_dev, skb, NULL);
 
 		qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
 
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 19c4c585f472..92274c237200 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -155,7 +155,7 @@ static u16 xenvif_select_queue(struct net_device *dev, struct sk_buff *skb,
 	unsigned int size = vif->hash.size;
 
 	if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE)
-		return fallback(dev, skb) % dev->real_num_tx_queues;
+		return fallback(dev, skb, NULL) % dev->real_num_tx_queues;
 
 	xenvif_set_skb_hash(vif, skb);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bbf062c1ca8a..2daf2fa6554f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -793,7 +793,8 @@ static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a,
 }
 
 typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
-				       struct sk_buff *skb);
+				       struct sk_buff *skb,
+				       struct net_device *sb_dev);
 
 enum tc_setup_type {
 	TC_SETUP_QDISC_MQPRIO,
diff --git a/net/core/dev.c b/net/core/dev.c
index a051ce27198b..e18d81837a6c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3633,8 +3633,8 @@ u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(dev_pick_tx_cpu_id);
 
-static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
-			     struct net_device *sb_dev)
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
+			    struct net_device *sb_dev)
 {
 	struct sock *sk = skb->sk;
 	int queue_index = sk_tx_queue_get(sk);
@@ -3659,12 +3659,6 @@ static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 	return queue_index;
 }
 
-static u16 __netdev_pick_tx(struct net_device *dev,
-			    struct sk_buff *skb)
-{
-	return ___netdev_pick_tx(dev, skb, NULL);
-}
-
 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 				    struct sk_buff *skb,
 				    struct net_device *sb_dev)
@@ -3685,7 +3679,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 			queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
 							    __netdev_pick_tx);
 		else
-			queue_index = ___netdev_pick_tx(dev, skb, sb_dev);
+			queue_index = __netdev_pick_tx(dev, skb, sb_dev);
 
 		queue_index = netdev_cap_txqueue(dev, queue_index);
 	}
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f37d087ae652..00189a3b07f2 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -275,9 +275,10 @@ static bool packet_use_direct_xmit(const struct packet_sock *po)
 	return po->xmit == packet_direct_xmit;
 }
 
-static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
+				  struct net_device *sb_dev)
 {
-	return dev_pick_tx_cpu_id(dev, skb, NULL, NULL);
+	return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
 }
 
 static u16 packet_pick_tx_queue(struct sk_buff *skb)
@@ -291,7 +292,7 @@ static u16 packet_pick_tx_queue(struct sk_buff *skb)
 						    __packet_pick_tx_queue);
 		queue_index = netdev_cap_txqueue(dev, queue_index);
 	} else {
-		queue_index = __packet_pick_tx_queue(dev, skb);
+		queue_index = __packet_pick_tx_queue(dev, skb, NULL);
 	}
 
 	return queue_index;
-- 
cgit v1.2.3


From 8c057efaebb557b60ba514b5e39e8000a1eab0f1 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 9 Jul 2018 18:09:54 +0100
Subject: net: core: fix uses-after-free in list processing

In netif_receive_skb_list_internal(), all of skb_defer_rx_timestamp(),
 do_xdp_generic() and enqueue_to_backlog() can lead to kfree(skb).  Thus,
 we cannot wait until after they return to remove the skb from the list;
 instead, we remove it first and, in the pass case, add it to a sublist
 afterwards.
In the case of enqueue_to_backlog() we have already decided not to pass
 when we call the function, so we do not need a sublist.

Fixes: 7da517a3bc52 ("net: core: Another step of skb receive list processing")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 89825c1eccdc..ce4583564e00 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4982,25 +4982,30 @@ static void netif_receive_skb_list_internal(struct list_head *head)
 {
 	struct bpf_prog *xdp_prog = NULL;
 	struct sk_buff *skb, *next;
+	struct list_head sublist;
 
+	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
 		net_timestamp_check(netdev_tstamp_prequeue, skb);
-		if (skb_defer_rx_timestamp(skb))
-			/* Handled, remove from list */
-			list_del(&skb->list);
+		list_del(&skb->list);
+		if (!skb_defer_rx_timestamp(skb))
+			list_add_tail(&skb->list, &sublist);
 	}
+	list_splice_init(&sublist, head);
 
 	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 		preempt_disable();
 		rcu_read_lock();
 		list_for_each_entry_safe(skb, next, head, list) {
 			xdp_prog = rcu_dereference(skb->dev->xdp_prog);
-			if (do_xdp_generic(xdp_prog, skb) != XDP_PASS)
-				/* Dropped, remove from list */
-				list_del(&skb->list);
+			list_del(&skb->list);
+			if (do_xdp_generic(xdp_prog, skb) == XDP_PASS)
+				list_add_tail(&skb->list, &sublist);
 		}
 		rcu_read_unlock();
 		preempt_enable();
+		/* Put passed packets back on main list */
+		list_splice_init(&sublist, head);
 	}
 
 	rcu_read_lock();
@@ -5011,9 +5016,9 @@ static void netif_receive_skb_list_internal(struct list_head *head)
 			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 
 			if (cpu >= 0) {
-				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
-				/* Handled, remove from list */
+				/* Will be handled, remove from list */
 				list_del(&skb->list);
+				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 			}
 		}
 	}
-- 
cgit v1.2.3


From 9af86f9338949a9369bda5e6fed69347d1813054 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 9 Jul 2018 18:10:19 +0100
Subject: net: core: fix use-after-free in __netif_receive_skb_list_core

__netif_receive_skb_core can free the skb, so we have to use the dequeue-
 enqueue model when calling it from __netif_receive_skb_list_core.

Fixes: 88eb1944e18c ("net: core: propagate SKB lists through packet_type lookup")
Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index ce4583564e00..d13cddcac41f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4830,23 +4830,28 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
 	struct list_head sublist;
 	struct sk_buff *skb, *next;
 
+	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
 		struct net_device *orig_dev = skb->dev;
 		struct packet_type *pt_prev = NULL;
 
+		list_del(&skb->list);
 		__netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+		if (!pt_prev)
+			continue;
 		if (pt_curr != pt_prev || od_curr != orig_dev) {
 			/* dispatch old sublist */
-			list_cut_before(&sublist, head, &skb->list);
 			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 			/* start new sublist */
+			INIT_LIST_HEAD(&sublist);
 			pt_curr = pt_prev;
 			od_curr = orig_dev;
 		}
+		list_add_tail(&skb->list, &sublist);
 	}
 
 	/* dispatch final sublist */
-	__netif_receive_skb_list_ptype(head, pt_curr, od_curr);
+	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 }
 
 static int __netif_receive_skb(struct sk_buff *skb)
-- 
cgit v1.2.3


From 68d2f84a1368cc5d4ccbbbfc6821f159d27681c9 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Thu, 12 Jul 2018 16:24:59 +0900
Subject: net: gro: properly remove skb from list

Following crash occurs in validate_xmit_skb_list() when same skb is
iterated multiple times in the loop and consume_skb() is called.

The root cause is calling list_del_init(&skb->list) and not clearing
skb->next in d4546c2509b1. list_del_init(&skb->list) sets skb->next
to point to skb itself. skb->next needs to be cleared because other
parts of network stack uses another kind of SKB lists.
validate_xmit_skb_list() uses such list.

A similar type of bugfix was reported by Jesper Dangaard Brouer.
https://patchwork.ozlabs.org/patch/942541/

This patch clears skb->next and changes list_del_init() to list_del()
so that list->prev will maintain the list poison.

[  148.185511] ==================================================================
[  148.187865] BUG: KASAN: use-after-free in validate_xmit_skb_list+0x4b/0xa0
[  148.190158] Read of size 8 at addr ffff8801e52eefc0 by task swapper/1/0
[  148.192940]
[  148.193642] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.18.0-rc3+ #25
[  148.195423] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014
[  148.199129] Call Trace:
[  148.200565]  <IRQ>
[  148.201911]  dump_stack+0xc6/0x14c
[  148.203572]  ? dump_stack_print_info.cold.1+0x2f/0x2f
[  148.205083]  ? kmsg_dump_rewind_nolock+0x59/0x59
[  148.206307]  ? validate_xmit_skb+0x2c6/0x560
[  148.207432]  ? debug_show_held_locks+0x30/0x30
[  148.208571]  ? validate_xmit_skb_list+0x4b/0xa0
[  148.211144]  print_address_description+0x6c/0x23c
[  148.212601]  ? validate_xmit_skb_list+0x4b/0xa0
[  148.213782]  kasan_report.cold.6+0x241/0x2fd
[  148.214958]  validate_xmit_skb_list+0x4b/0xa0
[  148.216494]  sch_direct_xmit+0x1b0/0x680
[  148.217601]  ? dev_watchdog+0x4e0/0x4e0
[  148.218675]  ? do_raw_spin_trylock+0x10/0x120
[  148.219818]  ? do_raw_spin_lock+0xe0/0xe0
[  148.221032]  __dev_queue_xmit+0x1167/0x1810
[  148.222155]  ? sched_clock+0x5/0x10
[...]

[  148.474257] Allocated by task 0:
[  148.475363]  kasan_kmalloc+0xbf/0xe0
[  148.476503]  kmem_cache_alloc+0xb4/0x1b0
[  148.477654]  __build_skb+0x91/0x250
[  148.478677]  build_skb+0x67/0x180
[  148.479657]  e1000_clean_rx_irq+0x542/0x8a0
[  148.480757]  e1000_clean+0x652/0xd10
[  148.481772]  net_rx_action+0x4ea/0xc20
[  148.482808]  __do_softirq+0x1f9/0x574
[  148.483831]
[  148.484575] Freed by task 0:
[  148.485504]  __kasan_slab_free+0x12e/0x180
[  148.486589]  kmem_cache_free+0xb4/0x240
[  148.487634]  kfree_skbmem+0xed/0x150
[  148.488648]  consume_skb+0x146/0x250
[  148.489665]  validate_xmit_skb+0x2b7/0x560
[  148.490754]  validate_xmit_skb_list+0x70/0xa0
[  148.491897]  sch_direct_xmit+0x1b0/0x680
[  148.493949]  __dev_queue_xmit+0x1167/0x1810
[  148.495103]  br_dev_queue_push_xmit+0xce/0x250
[  148.496196]  br_forward_finish+0x276/0x280
[  148.497234]  __br_forward+0x44f/0x520
[  148.498260]  br_forward+0x19f/0x1b0
[  148.499264]  br_handle_frame_finish+0x65e/0x980
[  148.500398]  NF_HOOK.constprop.10+0x290/0x2a0
[  148.501522]  br_handle_frame+0x417/0x640
[  148.502582]  __netif_receive_skb_core+0xaac/0x18f0
[  148.503753]  __netif_receive_skb_one_core+0x98/0x120
[  148.504958]  netif_receive_skb_internal+0xe3/0x330
[  148.506154]  napi_gro_complete+0x190/0x2a0
[  148.507243]  dev_gro_receive+0x9f7/0x1100
[  148.508316]  napi_gro_receive+0xcb/0x260
[  148.509387]  e1000_clean_rx_irq+0x2fc/0x8a0
[  148.510501]  e1000_clean+0x652/0xd10
[  148.511523]  net_rx_action+0x4ea/0xc20
[  148.512566]  __do_softirq+0x1f9/0x574
[  148.513598]
[  148.514346] The buggy address belongs to the object at ffff8801e52eefc0
[  148.514346]  which belongs to the cache skbuff_head_cache of size 232
[  148.517047] The buggy address is located 0 bytes inside of
[  148.517047]  232-byte region [ffff8801e52eefc0, ffff8801e52ef0a8)
[  148.519549] The buggy address belongs to the page:
[  148.520726] page:ffffea000794bb00 count:1 mapcount:0 mapping:ffff880106f4dfc0 index:0xffff8801e52ee840 compound_mapcount: 0
[  148.524325] flags: 0x17ffffc0008100(slab|head)
[  148.525481] raw: 0017ffffc0008100 ffff880106b938d0 ffff880106b938d0 ffff880106f4dfc0
[  148.527503] raw: ffff8801e52ee840 0000000000190011 00000001ffffffff 0000000000000000
[  148.529547] page dumped because: kasan: bad access detected

Fixes: d4546c2509b1 ("net: Convert GRO SKB handling to list_head.")
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Reported-by: Tyler Hicks <tyhicks@canonical.com>
Tested-by: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 1c3f0997e857..14a748ee8cc9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5280,7 +5280,8 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 	list_for_each_entry_safe_reverse(skb, p, head, list) {
 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 			return;
-		list_del_init(&skb->list);
+		list_del(&skb->list);
+		skb->next = NULL;
 		napi_gro_complete(skb);
 		napi->gro_count--;
 		napi->gro_hash[index].count--;
@@ -5461,7 +5462,8 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 
 	if (pp) {
-		list_del_init(&pp->list);
+		list_del(&pp->list);
+		pp->next = NULL;
 		napi_gro_complete(pp);
 		napi->gro_count--;
 		napi->gro_hash[hash].count--;
-- 
cgit v1.2.3


From 6b8675897338f874c41612655a85d8e10cdb23d8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 11 Jul 2018 20:36:39 -0700
Subject: xdp: don't make drivers report attachment mode

prog_attached of struct netdev_bpf should have been superseded
by simply setting prog_id long time ago, but we kept it around
to allow offloading drivers to communicate attachment mode (drv
vs hw).  Subsequently drivers were also allowed to report back
attachment flags (prog_flags), and since nowadays only programs
attached will XDP_FLAGS_HW_MODE can get offloaded, we can tell
the attachment mode from the flags driver reports.  Remove
prog_attached member.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c       | 1 -
 drivers/net/ethernet/cavium/thunder/nicvf_main.c    | 1 -
 drivers/net/ethernet/intel/i40e/i40e_main.c         | 1 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c       | 1 -
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c   | 1 -
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c      | 1 -
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c   | 1 -
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 3 ---
 drivers/net/ethernet/qlogic/qede/qede_filter.c      | 1 -
 drivers/net/netdevsim/bpf.c                         | 1 -
 drivers/net/tun.c                                   | 1 -
 drivers/net/virtio_net.c                            | 1 -
 include/linux/netdevice.h                           | 5 -----
 net/core/dev.c                                      | 7 +++----
 net/core/rtnetlink.c                                | 8 ++++++--
 15 files changed, 9 insertions(+), 25 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index 1f0e872d0667..0584d07c8c33 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -219,7 +219,6 @@ int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 		rc = bnxt_xdp_set(bp, xdp->prog);
 		break;
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!bp->xdp_prog;
 		xdp->prog_id = bp->xdp_prog ? bp->xdp_prog->aux->id : 0;
 		rc = 0;
 		break;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 135766c4296b..768f584f8392 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -1848,7 +1848,6 @@ static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
 	case XDP_SETUP_PROG:
 		return nicvf_xdp_setup(nic, xdp->prog);
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!nic->xdp_prog;
 		xdp->prog_id = nic->xdp_prog ? nic->xdp_prog->aux->id : 0;
 		return 0;
 	default:
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 426b0ccb1fc6..51762428b40e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11841,7 +11841,6 @@ static int i40e_xdp(struct net_device *dev,
 	case XDP_SETUP_PROG:
 		return i40e_xdp_setup(vsi, xdp->prog);
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = i40e_enabled_xdp_vsi(vsi);
 		xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0;
 		return 0;
 	default:
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a8e21becb619..3862fea1c923 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9966,7 +9966,6 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	case XDP_SETUP_PROG:
 		return ixgbe_xdp_setup(dev, xdp->prog);
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!(adapter->xdp_prog);
 		xdp->prog_id = adapter->xdp_prog ?
 			adapter->xdp_prog->aux->id : 0;
 		return 0;
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 59416eddd840..d86446d202d5 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -4462,7 +4462,6 @@ static int ixgbevf_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	case XDP_SETUP_PROG:
 		return ixgbevf_xdp_setup(dev, xdp->prog);
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!(adapter->xdp_prog);
 		xdp->prog_id = adapter->xdp_prog ?
 			       adapter->xdp_prog->aux->id : 0;
 		return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 65eb06e017e4..6785661d1a72 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2926,7 +2926,6 @@ static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 		return mlx4_xdp_set(dev, xdp->prog);
 	case XDP_QUERY_PROG:
 		xdp->prog_id = mlx4_xdp_query(dev);
-		xdp->prog_attached = !!xdp->prog_id;
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bbd2fd0b2e06..e4a9a0768a81 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4192,7 +4192,6 @@ static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 		return mlx5e_xdp_set(dev, xdp->prog);
 	case XDP_QUERY_PROG:
 		xdp->prog_id = mlx5e_xdp_query(dev);
-		xdp->prog_attached = !!xdp->prog_id;
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 7df5ca37bfb8..d20714598613 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3459,9 +3459,6 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
 		return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags,
 					 xdp->extack);
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!nn->xdp_prog;
-		if (nn->dp.bpf_offload_xdp)
-			xdp->prog_attached = XDP_ATTACHED_HW;
 		xdp->prog_id = nn->xdp_prog ? nn->xdp_prog->aux->id : 0;
 		xdp->prog_flags = nn->xdp_prog ? nn->xdp_flags : 0;
 		return 0;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index b823bfe2ea4d..f9a327c821eb 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -1116,7 +1116,6 @@ int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	case XDP_SETUP_PROG:
 		return qede_xdp_set(edev, xdp->prog);
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!edev->xdp_prog;
 		xdp->prog_id = edev->xdp_prog ? edev->xdp_prog->aux->id : 0;
 		return 0;
 	default:
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 75c25306d234..712e6f918065 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -567,7 +567,6 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 		nsim_bpf_destroy_prog(bpf->offload.prog);
 		return 0;
 	case XDP_QUERY_PROG:
-		bpf->prog_attached = ns->xdp_prog_mode;
 		bpf->prog_id = ns->xdp_prog ? ns->xdp_prog->aux->id : 0;
 		bpf->prog_flags = ns->xdp_prog ? ns->xdp_flags : 0;
 		return 0;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a192a017cc68..49a50219d0da 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1268,7 +1268,6 @@ static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 		return tun_xdp_set(dev, xdp->prog, xdp->extack);
 	case XDP_QUERY_PROG:
 		xdp->prog_id = tun_xdp_query(dev);
-		xdp->prog_attached = !!xdp->prog_id;
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 53085c63277b..2ff08bc103a9 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2343,7 +2343,6 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
 	case XDP_QUERY_PROG:
 		xdp->prog_id = virtnet_xdp_query(dev);
-		xdp->prog_attached = !!xdp->prog_id;
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b683971e500d..69a664789b33 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -819,10 +819,6 @@ enum bpf_netdev_command {
 	 */
 	XDP_SETUP_PROG,
 	XDP_SETUP_PROG_HW,
-	/* Check if a bpf program is set on the device.  The callee should
-	 * set @prog_attached to one of XDP_ATTACHED_* values, note that "true"
-	 * is equivalent to XDP_ATTACHED_DRV.
-	 */
 	XDP_QUERY_PROG,
 	/* BPF program for offload callbacks, invoked at program load time. */
 	BPF_OFFLOAD_VERIFIER_PREP,
@@ -849,7 +845,6 @@ struct netdev_bpf {
 		};
 		/* XDP_QUERY_PROG */
 		struct {
-			u8 prog_attached;
 			u32 prog_id;
 			/* flags with which program was installed */
 			u32 prog_flags;
diff --git a/net/core/dev.c b/net/core/dev.c
index 89825c1eccdc..9fa3b3705a8e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4926,7 +4926,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 		break;
 
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!old;
 		xdp->prog_id = old ? old->aux->id : 0;
 		break;
 
@@ -7593,13 +7592,13 @@ void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
 	WARN_ON(bpf_op(dev, xdp) < 0);
 }
 
-static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
+static bool __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
 {
 	struct netdev_bpf xdp;
 
 	__dev_xdp_query(dev, bpf_op, &xdp);
 
-	return xdp.prog_attached;
+	return xdp.prog_id;
 }
 
 static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
@@ -7634,7 +7633,7 @@ static void dev_xdp_uninstall(struct net_device *dev)
 		return;
 
 	__dev_xdp_query(dev, ndo_bpf, &xdp);
-	if (xdp.prog_attached == XDP_ATTACHED_NONE)
+	if (!xdp.prog_id)
 		return;
 
 	/* Program removal should always succeed */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b40242459907..02ebc056a688 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1372,9 +1372,13 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
 		return XDP_ATTACHED_NONE;
 
 	__dev_xdp_query(dev, ops->ndo_bpf, &xdp);
-	*prog_id = xdp.prog_id;
+	if (!xdp.prog_id)
+		return XDP_ATTACHED_NONE;
 
-	return xdp.prog_attached;
+	*prog_id = xdp.prog_id;
+	if (xdp.prog_flags & XDP_FLAGS_HW_MODE)
+		return XDP_ATTACHED_HW;
+	return XDP_ATTACHED_DRV;
 }
 
 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
-- 
cgit v1.2.3


From a25717d2b604347d9af8da81deea7b08e8c94220 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 11 Jul 2018 20:36:41 -0700
Subject: xdp: support simultaneous driver and hw XDP attachment

Split the query of HW-attached program from the software one.
Introduce new .ndo_bpf command to query HW-attached program.
This will allow drivers to install different programs in HW
and SW at the same time.  Netlink can now also carry multiple
programs on dump (in which case mode will be set to
XDP_ATTACHED_MULTI and user has to check per-attachment point
attributes, IFLA_XDP_PROG_ID will not be present).  We reuse
IFLA_XDP_PROG_ID skb space for second mode, so rtnl_xdp_size()
doesn't need to be updated.

Note that the installation side is still not there, since all
drivers currently reject installing more than one program at
the time.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |  6 ++
 drivers/net/netdevsim/bpf.c                        |  6 ++
 include/linux/netdevice.h                          |  7 +-
 include/uapi/linux/if_link.h                       |  1 +
 net/core/dev.c                                     | 45 ++++++-----
 net/core/rtnetlink.c                               | 93 ++++++++++++----------
 6 files changed, 96 insertions(+), 62 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 4bb589dbffbc..bb1e72e8dbc2 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3453,6 +3453,12 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
 	case XDP_SETUP_PROG_HW:
 		return nfp_net_xdp_setup(nn, xdp);
 	case XDP_QUERY_PROG:
+		if (nn->dp.bpf_offload_xdp)
+			return 0;
+		return xdp_attachment_query(&nn->xdp, xdp);
+	case XDP_QUERY_PROG_HW:
+		if (!nn->dp.bpf_offload_xdp)
+			return 0;
 		return xdp_attachment_query(&nn->xdp, xdp);
 	default:
 		return nfp_app_bpf(nn->app, nn, xdp);
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index c485d97b5df4..5544c9b51173 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -561,6 +561,12 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 		nsim_bpf_destroy_prog(bpf->offload.prog);
 		return 0;
 	case XDP_QUERY_PROG:
+		if (ns->xdp_prog_mode != XDP_ATTACHED_DRV)
+			return 0;
+		return xdp_attachment_query(&ns->xdp, bpf);
+	case XDP_QUERY_PROG_HW:
+		if (ns->xdp_prog_mode != XDP_ATTACHED_HW)
+			return 0;
 		return xdp_attachment_query(&ns->xdp, bpf);
 	case XDP_SETUP_PROG:
 		err = nsim_setup_prog_checks(ns, bpf);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 69a664789b33..2422c0e88f5c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -820,6 +820,7 @@ enum bpf_netdev_command {
 	XDP_SETUP_PROG,
 	XDP_SETUP_PROG_HW,
 	XDP_QUERY_PROG,
+	XDP_QUERY_PROG_HW,
 	/* BPF program for offload callbacks, invoked at program load time. */
 	BPF_OFFLOAD_VERIFIER_PREP,
 	BPF_OFFLOAD_TRANSLATE,
@@ -843,7 +844,7 @@ struct netdev_bpf {
 			struct bpf_prog *prog;
 			struct netlink_ext_ack *extack;
 		};
-		/* XDP_QUERY_PROG */
+		/* XDP_QUERY_PROG, XDP_QUERY_PROG_HW */
 		struct {
 			u32 prog_id;
 			/* flags with which program was installed */
@@ -3533,8 +3534,8 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, u32 flags);
-void __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op,
-		     struct netdev_bpf *xdp);
+u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op,
+		    enum bpf_netdev_command cmd);
 
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index bc86c2b105ec..8759cfb8aa2e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -920,6 +920,7 @@ enum {
 	XDP_ATTACHED_DRV,
 	XDP_ATTACHED_SKB,
 	XDP_ATTACHED_HW,
+	XDP_ATTACHED_MULTI,
 };
 
 enum {
diff --git a/net/core/dev.c b/net/core/dev.c
index 9fa3b3705a8e..993cdc3cd086 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7582,21 +7582,19 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down);
 
-void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
-		     struct netdev_bpf *xdp)
+u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
+		    enum bpf_netdev_command cmd)
 {
-	memset(xdp, 0, sizeof(*xdp));
-	xdp->command = XDP_QUERY_PROG;
+	struct netdev_bpf xdp;
 
-	/* Query must always succeed. */
-	WARN_ON(bpf_op(dev, xdp) < 0);
-}
+	if (!bpf_op)
+		return 0;
 
-static bool __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
-{
-	struct netdev_bpf xdp;
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = cmd;
 
-	__dev_xdp_query(dev, bpf_op, &xdp);
+	/* Query must always succeed. */
+	WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
 
 	return xdp.prog_id;
 }
@@ -7632,12 +7630,19 @@ static void dev_xdp_uninstall(struct net_device *dev)
 	if (!ndo_bpf)
 		return;
 
-	__dev_xdp_query(dev, ndo_bpf, &xdp);
-	if (!xdp.prog_id)
-		return;
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = XDP_QUERY_PROG;
+	WARN_ON(ndo_bpf(dev, &xdp));
+	if (xdp.prog_id)
+		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+					NULL));
 
-	/* Program removal should always succeed */
-	WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
+	/* Remove HW offload */
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = XDP_QUERY_PROG_HW;
+	if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
+		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+					NULL));
 }
 
 /**
@@ -7653,12 +7658,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, u32 flags)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
+	enum bpf_netdev_command query;
 	struct bpf_prog *prog = NULL;
 	bpf_op_t bpf_op, bpf_chk;
 	int err;
 
 	ASSERT_RTNL();
 
+	query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
+
 	bpf_op = bpf_chk = ops->ndo_bpf;
 	if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
 		return -EOPNOTSUPP;
@@ -7668,10 +7676,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		bpf_chk = generic_xdp_install;
 
 	if (fd >= 0) {
-		if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
+		if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
+		    __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
 			return -EEXIST;
 		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
-		    __dev_xdp_attached(dev, bpf_op))
+		    __dev_xdp_query(dev, bpf_op, query))
 			return -EBUSY;
 
 		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 02ebc056a688..c9929ef17539 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -964,7 +964,7 @@ static size_t rtnl_xdp_size(void)
 {
 	size_t xdp_size = nla_total_size(0) +	/* nest IFLA_XDP */
 			  nla_total_size(1) +	/* XDP_ATTACHED */
-			  nla_total_size(4) +	/* XDP_PROG_ID */
+			  nla_total_size(4) +	/* XDP_PROG_ID (or 1st mode) */
 			  nla_total_size(4);	/* XDP_<mode>_PROG_ID */
 
 	return xdp_size;
@@ -1354,37 +1354,57 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }
 
-static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
+static u32 rtnl_xdp_prog_skb(struct net_device *dev)
 {
-	const struct net_device_ops *ops = dev->netdev_ops;
 	const struct bpf_prog *generic_xdp_prog;
-	struct netdev_bpf xdp;
 
 	ASSERT_RTNL();
 
-	*prog_id = 0;
 	generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
-	if (generic_xdp_prog) {
-		*prog_id = generic_xdp_prog->aux->id;
-		return XDP_ATTACHED_SKB;
-	}
-	if (!ops->ndo_bpf)
-		return XDP_ATTACHED_NONE;
+	if (!generic_xdp_prog)
+		return 0;
+	return generic_xdp_prog->aux->id;
+}
+
+static u32 rtnl_xdp_prog_drv(struct net_device *dev)
+{
+	return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
+}
+
+static u32 rtnl_xdp_prog_hw(struct net_device *dev)
+{
+	return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf,
+			       XDP_QUERY_PROG_HW);
+}
+
+static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
+			       u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr,
+			       u32 (*get_prog_id)(struct net_device *dev))
+{
+	u32 curr_id;
+	int err;
+
+	curr_id = get_prog_id(dev);
+	if (!curr_id)
+		return 0;
+
+	*prog_id = curr_id;
+	err = nla_put_u32(skb, attr, curr_id);
+	if (err)
+		return err;
 
-	__dev_xdp_query(dev, ops->ndo_bpf, &xdp);
-	if (!xdp.prog_id)
-		return XDP_ATTACHED_NONE;
+	if (*mode != XDP_ATTACHED_NONE)
+		*mode = XDP_ATTACHED_MULTI;
+	else
+		*mode = tgt_mode;
 
-	*prog_id = xdp.prog_id;
-	if (xdp.prog_flags & XDP_FLAGS_HW_MODE)
-		return XDP_ATTACHED_HW;
-	return XDP_ATTACHED_DRV;
+	return 0;
 }
 
 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 {
-	u32 prog_attr, prog_id;
 	struct nlattr *xdp;
+	u32 prog_id;
 	int err;
 	u8 mode;
 
@@ -1392,35 +1412,26 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 	if (!xdp)
 		return -EMSGSIZE;
 
-	mode = rtnl_xdp_attached_mode(dev, &prog_id);
+	prog_id = 0;
+	mode = XDP_ATTACHED_NONE;
+	if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB,
+				IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb))
+		goto err_cancel;
+	if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV,
+				IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv))
+		goto err_cancel;
+	if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW,
+				IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw))
+		goto err_cancel;
+
 	err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode);
 	if (err)
 		goto err_cancel;
 
-	if (prog_id) {
+	if (prog_id && mode != XDP_ATTACHED_MULTI) {
 		err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);
 		if (err)
 			goto err_cancel;
-
-		switch (mode) {
-		case XDP_ATTACHED_DRV:
-			prog_attr = IFLA_XDP_DRV_PROG_ID;
-			break;
-		case XDP_ATTACHED_SKB:
-			prog_attr = IFLA_XDP_SKB_PROG_ID;
-			break;
-		case XDP_ATTACHED_HW:
-			prog_attr = IFLA_XDP_HW_PROG_ID;
-			break;
-		case XDP_ATTACHED_NONE:
-		default:
-			err = -EINVAL;
-			goto err_cancel;
-		}
-
-		err = nla_put_u32(skb, prog_attr, prog_id);
-		if (err)
-			goto err_cancel;
 	}
 
 	nla_nest_end(skb, xdp);
-- 
cgit v1.2.3


From d9f37d01e294e5338aa3e9d3b2eda61b59b619df Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Fri, 13 Jul 2018 14:41:36 +0800
Subject: net: convert gro_count to bitmask

gro_hash size is 192 bytes, and uses 3 cache lines, if there is few
flows, gro_hash may be not fully used, so it is unnecessary to iterate
all gro_hash in napi_gro_flush(), to occupy unnecessary cacheline.

convert gro_count to a bitmask, and rename it as gro_bitmask, each bit
represents a element of gro_hash, only flush a gro_hash element if the
related bit is set, to speed up napi_gro_flush().

and update gro_bitmask only if it will be changed, to reduce cache
update

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Cc: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  9 +++++++--
 net/core/dev.c            | 36 ++++++++++++++++++++++++------------
 2 files changed, 31 insertions(+), 14 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3514d67112b3..c1295c7a452e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -308,9 +308,14 @@ struct gro_list {
 };
 
 /*
- * Structure for NAPI scheduling similar to tasklet but with weighting
+ * size of gro hash buckets, must less than bit number of
+ * napi_struct::gro_bitmask
  */
 #define GRO_HASH_BUCKETS	8
+
+/*
+ * Structure for NAPI scheduling similar to tasklet but with weighting
+ */
 struct napi_struct {
 	/* The poll_list must only be managed by the entity which
 	 * changes the state of the NAPI_STATE_SCHED bit.  This means
@@ -322,7 +327,7 @@ struct napi_struct {
 
 	unsigned long		state;
 	int			weight;
-	unsigned int		gro_count;
+	unsigned long		gro_bitmask;
 	int			(*poll)(struct napi_struct *, int);
 #ifdef CONFIG_NETPOLL
 	int			poll_owner;
diff --git a/net/core/dev.c b/net/core/dev.c
index 0df1771a12f9..c883b17ee0fe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5282,9 +5282,11 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 		list_del(&skb->list);
 		skb->next = NULL;
 		napi_gro_complete(skb);
-		napi->gro_count--;
 		napi->gro_hash[index].count--;
 	}
+
+	if (!napi->gro_hash[index].count)
+		__clear_bit(index, &napi->gro_bitmask);
 }
 
 /* napi->gro_hash[].list contains packets ordered by age.
@@ -5295,8 +5297,10 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 {
 	u32 i;
 
-	for (i = 0; i < GRO_HASH_BUCKETS; i++)
-		__napi_gro_flush_chain(napi, i, flush_old);
+	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+		if (test_bit(i, &napi->gro_bitmask))
+			__napi_gro_flush_chain(napi, i, flush_old);
+	}
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
@@ -5388,8 +5392,8 @@ static void gro_flush_oldest(struct list_head *head)
 	if (WARN_ON_ONCE(!oldest))
 		return;
 
-	/* Do not adjust napi->gro_count, caller is adding a new SKB to
-	 * the chain.
+	/* Do not adjust napi->gro_hash[].count, caller is adding a new
+	 * SKB to the chain.
 	 */
 	list_del(&oldest->list);
 	napi_gro_complete(oldest);
@@ -5464,7 +5468,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		list_del(&pp->list);
 		pp->next = NULL;
 		napi_gro_complete(pp);
-		napi->gro_count--;
 		napi->gro_hash[hash].count--;
 	}
 
@@ -5477,7 +5480,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
 		gro_flush_oldest(gro_head);
 	} else {
-		napi->gro_count++;
 		napi->gro_hash[hash].count++;
 	}
 	NAPI_GRO_CB(skb)->count = 1;
@@ -5492,6 +5494,13 @@ pull:
 	if (grow > 0)
 		gro_pull_from_frag0(skb, grow);
 ok:
+	if (napi->gro_hash[hash].count) {
+		if (!test_bit(hash, &napi->gro_bitmask))
+			__set_bit(hash, &napi->gro_bitmask);
+	} else if (test_bit(hash, &napi->gro_bitmask)) {
+		__clear_bit(hash, &napi->gro_bitmask);
+	}
+
 	return ret;
 
 normal:
@@ -5890,7 +5899,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 				 NAPIF_STATE_IN_BUSY_POLL)))
 		return false;
 
-	if (n->gro_count) {
+	if (n->gro_bitmask) {
 		unsigned long timeout = 0;
 
 		if (work_done)
@@ -6099,7 +6108,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 	 */
-	if (napi->gro_count && !napi_disable_pending(napi) &&
+	if (napi->gro_bitmask && !napi_disable_pending(napi) &&
 	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 		__napi_schedule_irqoff(napi);
 
@@ -6114,7 +6123,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 	INIT_LIST_HEAD(&napi->poll_list);
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
-	napi->gro_count = 0;
+	napi->gro_bitmask = 0;
 	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 		napi->gro_hash[i].count = 0;
@@ -6174,7 +6183,7 @@ void netif_napi_del(struct napi_struct *napi)
 	napi_free_frags(napi);
 
 	flush_gro_hash(napi);
-	napi->gro_count = 0;
+	napi->gro_bitmask = 0;
 }
 EXPORT_SYMBOL(netif_napi_del);
 
@@ -6216,7 +6225,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 		goto out_unlock;
 	}
 
-	if (n->gro_count) {
+	if (n->gro_bitmask) {
 		/* flush too old packets
 		 * If HZ < 1000, flush all packets.
 		 */
@@ -9272,6 +9281,9 @@ static struct hlist_head * __net_init netdev_create_hash(void)
 /* Initialize per network namespace state */
 static int __net_init netdev_init(struct net *net)
 {
+	BUILD_BUG_ON(GRO_HASH_BUCKETS >
+			FIELD_SIZEOF(struct napi_struct, gro_bitmask));
+
 	if (net != &init_net)
 		INIT_LIST_HEAD(&net->dev_base_head);
 
-- 
cgit v1.2.3


From ccdb51717ba3bdc9585998e4ffd41d70c04dedea Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 16 Jul 2018 17:02:04 -0700
Subject: net: Fix GRO_HASH_BUCKETS assertion.

FIELD_SIZEOF() is in bytes, but we want bits.

Fixes: d9f37d01e294 ("net: convert gro_count to bitmask")
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index c883b17ee0fe..4f8b92d81d10 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9282,7 +9282,7 @@ static struct hlist_head * __net_init netdev_create_hash(void)
 static int __net_init netdev_init(struct net *net)
 {
 	BUILD_BUG_ON(GRO_HASH_BUCKETS >
-			FIELD_SIZEOF(struct napi_struct, gro_bitmask));
+		     8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
 
 	if (net != &init_net)
 		INIT_LIST_HEAD(&net->dev_base_head);
-- 
cgit v1.2.3


From 7c4ec749a3bd89237d7195ccd621bf5d4124d6b5 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 20 Jul 2018 23:37:55 -0700
Subject: net: Init backlog NAPI's gro_hash.

Based upon a patch by Sean Tranchetti.

Fixes: d4546c2509b1 ("net: Convert GRO SKB handling to list_head.")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 4f8b92d81d10..87c42c8249ae 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6115,19 +6115,24 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
-		    int (*poll)(struct napi_struct *, int), int weight)
+static void init_gro_hash(struct napi_struct *napi)
 {
 	int i;
 
-	INIT_LIST_HEAD(&napi->poll_list);
-	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
-	napi->timer.function = napi_watchdog;
-	napi->gro_bitmask = 0;
 	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 		napi->gro_hash[i].count = 0;
 	}
+	napi->gro_bitmask = 0;
+}
+
+void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
+		    int (*poll)(struct napi_struct *, int), int weight)
+{
+	INIT_LIST_HEAD(&napi->poll_list);
+	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	napi->timer.function = napi_watchdog;
+	init_gro_hash(napi);
 	napi->skb = NULL;
 	napi->poll = poll;
 	if (weight > NAPI_POLL_WEIGHT)
@@ -9554,6 +9559,7 @@ static int __init net_dev_init(void)
 		sd->cpu = i;
 #endif
 
+		init_gro_hash(&sd->backlog);
 		sd->backlog.poll = process_backlog;
 		sd->backlog.weight = weight_p;
 	}
-- 
cgit v1.2.3


From 7a4c53bee3324ac00bf964aa2f82d15d279e86e4 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Fri, 27 Jul 2018 13:43:23 -0700
Subject: net: report invalid mtu value via netlink extack

If an invalid MTU value is set through rtnetlink return extra error
information instead of putting message in kernel log. For other cases
where there is no visible API, keep the error report in the log.

Example:
	# ip li set dev enp12s0 mtu 10000
	Error: mtu greater than device maximum.

	# ifconfig enp12s0 mtu 10000
	SIOCSIFMTU: Invalid argument
	# dmesg | tail -1
	[ 2047.795467] enp12s0: mtu greater than device maximum

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 23 +++++++++++++++++------
 net/core/rtnetlink.c      |  2 +-
 3 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c1295c7a452e..9c917467a2c7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3546,6 +3546,8 @@ int dev_set_alias(struct net_device *, const char *, size_t);
 int dev_get_alias(const struct net_device *, char *, size_t);
 int dev_change_net_namespace(struct net_device *, struct net *, const char *);
 int __dev_set_mtu(struct net_device *, int);
+int dev_set_mtu_ext(struct net_device *dev, int mtu,
+		    struct netlink_ext_ack *extack);
 int dev_set_mtu(struct net_device *, int);
 int dev_change_tx_queue_len(struct net_device *, unsigned long);
 void dev_set_group(struct net_device *, int);
diff --git a/net/core/dev.c b/net/core/dev.c
index 87c42c8249ae..89031b5fef9f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7523,13 +7523,15 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu)
 EXPORT_SYMBOL(__dev_set_mtu);
 
 /**
- *	dev_set_mtu - Change maximum transfer unit
+ *	dev_set_mtu_ext - Change maximum transfer unit
  *	@dev: device
  *	@new_mtu: new transfer unit
+ *	@extack: netlink extended ack
  *
  *	Change the maximum transfer size of the network device.
  */
-int dev_set_mtu(struct net_device *dev, int new_mtu)
+int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
+		    struct netlink_ext_ack *extack)
 {
 	int err, orig_mtu;
 
@@ -7538,14 +7540,12 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 
 	/* MTU must be positive, and in range */
 	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
-		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
-				    dev->name, new_mtu, dev->min_mtu);
+		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 		return -EINVAL;
 	}
 
 	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
-		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
-				    dev->name, new_mtu, dev->max_mtu);
+		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 		return -EINVAL;
 	}
 
@@ -7573,6 +7573,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 	}
 	return err;
 }
+
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+	struct netlink_ext_ack extack;
+	int err;
+
+	err = dev_set_mtu_ext(dev, new_mtu, &extack);
+	if (err)
+		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
+	return err;
+}
 EXPORT_SYMBOL(dev_set_mtu);
 
 /**
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 510d4f765a13..24431e578310 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2382,7 +2382,7 @@ static int do_setlink(const struct sk_buff *skb,
 	}
 
 	if (tb[IFLA_MTU]) {
-		err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+		err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
 		if (err < 0)
 			goto errout;
 		status |= DO_SETLINK_MODIFIED;
-- 
cgit v1.2.3


From cd11b164073b719203318227918f9510809d5e10 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 30 Jul 2018 14:30:44 +0200
Subject: net/tc: introduce TC_ACT_REINSERT.

This is similar TC_ACT_REDIRECT, but with a slightly different
semantic:
- on ingress the mirred skbs are passed to the target device
network stack without any additional check not scrubbing.
- the rcu-protected stats provided via the tcf_result struct
  are updated on error conditions.

This new tcfa_action value is not exposed to the user-space
and can be used only internally by clsact.

v1 -> v2: do not touch TC_ACT_REDIRECT code path, introduce
 a new action type instead
v2 -> v3:
 - rename the new action value TC_ACT_REINJECT, update the
   helper accordingly
 - take care of uncloned reinjected packets in XDP generic
   hook
v3 -> v4:
 - renamed again the new action value (JiriP)
v4 -> v5:
 - fix build error with !NET_CLS_ACT (kbuild bot)

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h     |  3 +++
 include/net/sch_generic.h | 28 ++++++++++++++++++++++++++++
 net/core/dev.c            |  6 +++++-
 3 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 6d02f31abba8..22bfc3a13c25 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -7,6 +7,9 @@
 #include <net/sch_generic.h>
 #include <net/act_api.h>
 
+/* TC action not accessible from user space */
+#define TC_ACT_REINSERT		(TC_ACT_VALUE_MAX + 1)
+
 /* Basic packet classifier frontend definitions. */
 
 struct tcf_walker {
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index bcae181c1857..a6d00093f35e 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -235,6 +235,12 @@ struct tcf_result {
 			u32		classid;
 		};
 		const struct tcf_proto *goto_tp;
+
+		/* used by the TC_ACT_REINSERT action */
+		struct {
+			bool		ingress;
+			struct gnet_stats_queue *qstats;
+		};
 	};
 };
 
@@ -569,6 +575,15 @@ static inline void skb_reset_tc(struct sk_buff *skb)
 #endif
 }
 
+static inline bool skb_is_tc_redirected(const struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	return skb->tc_redirected;
+#else
+	return false;
+#endif
+}
+
 static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_CLS_ACT
@@ -1108,4 +1123,17 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
 void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
 			  struct mini_Qdisc __rcu **p_miniq);
 
+static inline void skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res)
+{
+	struct gnet_stats_queue *stats = res->qstats;
+	int ret;
+
+	if (res->ingress)
+		ret = netif_receive_skb(skb);
+	else
+		ret = dev_queue_xmit(skb);
+	if (ret && stats)
+		qstats_overlimit_inc(res->qstats);
+}
+
 #endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 89031b5fef9f..38b0c414d780 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4252,7 +4252,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	/* Reinjected packets coming from act_mirred or similar should
 	 * not get XDP generic processing.
 	 */
-	if (skb_cloned(skb))
+	if (skb_cloned(skb) || skb_is_tc_redirected(skb))
 		return XDP_PASS;
 
 	/* XDP packets must be linear and must have sufficient headroom
@@ -4602,6 +4602,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 		__skb_push(skb, skb->mac_len);
 		skb_do_redirect(skb);
 		return NULL;
+	case TC_ACT_REINSERT:
+		/* this does not scrub the packet, and updates stats on error */
+		skb_tc_reinsert(skb, &cl_res);
+		return NULL;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From a6bcfc89694ed8cb482a82cdc8b93aae63a8b691 Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Fri, 3 Aug 2018 15:45:21 +0800
Subject: net: check extack._msg before print

dev_set_mtu_ext is able to fail with a valid mtu value, at that
condition, extack._msg is not set and random since it is in stack,
then kernel will crash when print it.

Fixes: 7a4c53bee3324a ("net: report invalid mtu value via netlink extack")
Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 36e994519488..f68122f0ab02 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7583,8 +7583,9 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 	struct netlink_ext_ack extack;
 	int err;
 
+	memset(&extack, 0, sizeof(extack));
 	err = dev_set_mtu_ext(dev, new_mtu, &extack);
-	if (err)
+	if (err && extack._msg)
 		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 	return err;
 }
-- 
cgit v1.2.3


From 4d99f6602cb552fb58db0c3b1d935bb6fa017f24 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Wed, 8 Aug 2018 20:07:35 -0700
Subject: net: allow to call netif_reset_xps_queues() under cpus_read_lock

The definition of static_key_slow_inc() has cpus_read_lock in place. In the
virtio_net driver, XPS queues are initialized after setting the queue:cpu
affinity in virtnet_set_affinity() which is already protected within
cpus_read_lock. Lockdep prints a warning when we are trying to acquire
cpus_read_lock when it is already held.

This patch adds an ability to call __netif_set_xps_queue under
cpus_read_lock().
Acked-by: Jason Wang <jasowang@redhat.com>

============================================
WARNING: possible recursive locking detected
4.18.0-rc3-next-20180703+ #1 Not tainted
--------------------------------------------
swapper/0/1 is trying to acquire lock:
00000000cf973d46 (cpu_hotplug_lock.rw_sem){++++}, at: static_key_slow_inc+0xe/0x20

but task is already holding lock:
00000000cf973d46 (cpu_hotplug_lock.rw_sem){++++}, at: init_vqs+0x513/0x5a0

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(cpu_hotplug_lock.rw_sem);
  lock(cpu_hotplug_lock.rw_sem);

 *** DEADLOCK ***

 May be due to missing lock nesting notation

3 locks held by swapper/0/1:
 #0: 00000000244bc7da (&dev->mutex){....}, at: __driver_attach+0x5a/0x110
 #1: 00000000cf973d46 (cpu_hotplug_lock.rw_sem){++++}, at: init_vqs+0x513/0x5a0
 #2: 000000005cd8463f (xps_map_mutex){+.+.}, at: __netif_set_xps_queue+0x8d/0xc60

v2: move cpus_read_lock() out of __netif_set_xps_queue()

Cc: "Nambiar, Amritha" <amritha.nambiar@intel.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Fixes: 8af2c06ff4b1 ("net-sysfs: Add interface for Rx queue(s) map per Tx queue")

Signed-off-by: Andrei Vagin <avagin@gmail.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c |  4 +++-
 net/core/dev.c           | 20 +++++++++++++++-----
 net/core/net-sysfs.c     |  4 ++++
 3 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 62311dde6e71..39a7f4452587 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1903,9 +1903,11 @@ static void virtnet_set_affinity(struct virtnet_info *vi)
 
 	i = 0;
 	for_each_online_cpu(cpu) {
+		const unsigned long *mask = cpumask_bits(cpumask_of(cpu));
+
 		virtqueue_set_affinity(vi->rq[i].vq, cpu);
 		virtqueue_set_affinity(vi->sq[i].vq, cpu);
-		netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
+		__netif_set_xps_queue(vi->dev, mask, i, false);
 		i++;
 	}
 
diff --git a/net/core/dev.c b/net/core/dev.c
index f68122f0ab02..325fc5088370 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2176,6 +2176,7 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 	if (!static_key_false(&xps_needed))
 		return;
 
+	cpus_read_lock();
 	mutex_lock(&xps_map_mutex);
 
 	if (static_key_false(&xps_rxqs_needed)) {
@@ -2199,10 +2200,11 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 
 out_no_maps:
 	if (static_key_enabled(&xps_rxqs_needed))
-		static_key_slow_dec(&xps_rxqs_needed);
+		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 
-	static_key_slow_dec(&xps_needed);
+	static_key_slow_dec_cpuslocked(&xps_needed);
 	mutex_unlock(&xps_map_mutex);
+	cpus_read_unlock();
 }
 
 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
@@ -2250,6 +2252,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 	return new_map;
 }
 
+/* Must be called under cpus_read_lock */
 int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 			  u16 index, bool is_rxqs_map)
 {
@@ -2317,9 +2320,9 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 	if (!new_dev_maps)
 		goto out_no_new_maps;
 
-	static_key_slow_inc(&xps_needed);
+	static_key_slow_inc_cpuslocked(&xps_needed);
 	if (is_rxqs_map)
-		static_key_slow_inc(&xps_rxqs_needed);
+		static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
 
 	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 	     j < nr_ids;) {
@@ -2448,11 +2451,18 @@ error:
 	kfree(new_dev_maps);
 	return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 			u16 index)
 {
-	return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+	int ret;
+
+	cpus_read_lock();
+	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+	cpus_read_unlock();
+
+	return ret;
 }
 EXPORT_SYMBOL(netif_set_xps_queue);
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 0a95bcf64cdc..bd67c4d0fcfd 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -26,6 +26,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/of.h>
 #include <linux/of_net.h>
+#include <linux/cpu.h>
 
 #include "net-sysfs.h"
 
@@ -1400,7 +1401,10 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
 		return err;
 	}
 
+	cpus_read_lock();
 	err = __netif_set_xps_queue(dev, mask, index, true);
+	cpus_read_unlock();
+
 	kfree(mask);
 	return err ? : len;
 }
-- 
cgit v1.2.3