From f0796d5c73e59786d09a1e617689d1d415f2db44 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 1 Jul 2010 13:21:57 +0000 Subject: net: decreasing real_num_tx_queues needs to flush qdisc Reducing real_num_queues needs to flush the qdisc otherwise skbs with queue_mappings greater then real_num_tx_queues can be sent to the underlying driver. The flow for this is, dev_queue_xmit() dev_pick_tx() skb_tx_hash() => hash using real_num_tx_queues skb_set_queue_mapping() ... qdisc_enqueue_root() => enqueue skb on txq from hash ... dev->real_num_tx_queues -= n ... sch_direct_xmit() dev_hard_start_xmit() ndo_start_xmit(skb,dev) => skb queue set with old hash skbs are enqueued on the qdisc with skb->queue_mapping set 0 < queue_mappings < real_num_tx_queues. When the driver decreases real_num_tx_queues skb's may be dequeued from the qdisc with a queue_mapping greater then real_num_tx_queues. This fixes a case in ixgbe where this was occurring with DCB and FCoE. Because the driver is using queue_mapping to map skbs to tx descriptor rings we can potentially map skbs to rings that no longer exist. Signed-off-by: John Fastabend Tested-by: Ross Brattain Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/core/dev.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 2b3bf53bc687..723a34710ad4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1553,6 +1553,24 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) rcu_read_unlock(); } +/* + * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues + * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. + */ +void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) +{ + unsigned int real_num = dev->real_num_tx_queues; + + if (unlikely(txq > dev->num_tx_queues)) + ; + else if (txq > real_num) + dev->real_num_tx_queues = txq; + else if (txq < real_num) { + dev->real_num_tx_queues = txq; + qdisc_reset_all_tx_gt(dev, txq); + } +} +EXPORT_SYMBOL(netif_set_real_num_tx_queues); static inline void __netif_reschedule(struct Qdisc *q) { -- cgit v1.2.3 From 87fd308cfc6b2e880bf717a740bd5c58d2aed10c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Jul 2010 05:24:20 +0000 Subject: net: skb_tx_hash() fix relative to skb_orphan_try() commit fc6055a5ba31e2 (net: Introduce skb_orphan_try()) added early orphaning of skbs. This unfortunately added a performance regression in skb_tx_hash() in case of stacked devices (bonding, vlans, ...) Since skb->sk is now NULL, we cannot access sk->sk_hash anymore to spread tx packets to multiple NIC queues on multiqueue devices. skb_tx_hash() in this case only uses skb->protocol, same value for all flows. skb_orphan_try() can copy sk->sk_hash into skb->rxhash and skb_tx_hash() can use this saved sk_hash value to compute its internal hash value. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 723a34710ad4..4b05fdf762ab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1911,8 +1911,16 @@ static int dev_gso_segment(struct sk_buff *skb) */ static inline void skb_orphan_try(struct sk_buff *skb) { - if (!skb_tx(skb)->flags) + struct sock *sk = skb->sk; + + if (sk && !skb_tx(skb)->flags) { + /* skb_tx_hash() wont be able to get sk. + * We copy sk_hash into skb->rxhash + */ + if (!skb->rxhash) + skb->rxhash = sk->sk_hash; skb_orphan(skb); + } } int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, @@ -1998,8 +2006,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; else - hash = (__force u16) skb->protocol; - + hash = (__force u16) skb->protocol ^ skb->rxhash; hash = jhash_1word(hash, hashrnd); return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); -- cgit v1.2.3 From b0f77d0eae0c58a5a9691a067ada112ceeae2d00 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 14 Jul 2010 20:50:29 -0700 Subject: net: fix problem in reading sock TX queue Fix problem in reading the tx_queue recorded in a socket. In dev_pick_tx, the TX queue is read by doing a check with sk_tx_queue_recorded on the socket, followed by a sk_tx_queue_get. The problem is that there is not mutual exclusion across these calls in the socket so it it is possible that the queue in the sock can be invalidated after sk_tx_queue_recorded is called so that sk_tx_queue get returns -1, which sets 65535 in queue_index and thus dev_pick_tx returns 65536 which is a bogus queue and can cause crash in dev_queue_xmit. We fix this by only calling sk_tx_queue_get which does the proper checks. The interface is that sk_tx_queue_get returns the TX queue if the sock argument is non-NULL and TX queue is recorded, else it returns -1. sk_tx_queue_recorded is no longer used so it can be completely removed. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/sock.h | 7 +------ net/core/dev.c | 7 +++---- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/net/sock.h b/include/net/sock.h index 731150d52799..0a691ea7654a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1224,12 +1224,7 @@ static inline void sk_tx_queue_clear(struct sock *sk) static inline int sk_tx_queue_get(const struct sock *sk) { - return sk->sk_tx_queue_mapping; -} - -static inline bool sk_tx_queue_recorded(const struct sock *sk) -{ - return (sk && sk->sk_tx_queue_mapping >= 0); + return sk ? sk->sk_tx_queue_mapping : -1; } static inline void sk_set_socket(struct sock *sk, struct socket *sock) diff --git a/net/core/dev.c b/net/core/dev.c index 4b05fdf762ab..0ea10f849be8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2029,12 +2029,11 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { - u16 queue_index; + int queue_index; struct sock *sk = skb->sk; - if (sk_tx_queue_recorded(sk)) { - queue_index = sk_tx_queue_get(sk); - } else { + queue_index = sk_tx_queue_get(sk); + if (queue_index < 0) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) { -- cgit v1.2.3 From c736eefadb71a01a5e61e0de700f28f6952b4444 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 22 Jul 2010 09:54:47 +0000 Subject: net: dev_forward_skb should call nf_reset With conn-track zones and probably with different network namespaces, the netfilter logic needs to be re-calculated on packet receive. If the netfilter logic is not reset, it will not be recalculated properly. This patch adds the nf_reset logic to dev_forward_skb. Signed-off-by: Ben Greear Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 0ea10f849be8..1f466e82ac33 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1488,6 +1488,7 @@ static inline void net_timestamp_check(struct sk_buff *skb) int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { skb_orphan(skb); + nf_reset(skb); if (!(dev->flags & IFF_UP) || (skb->len > (dev->mtu + dev->hard_header_len))) { -- cgit v1.2.3