From e45a8a9e60ff1dd5ad118c794337a1101b46ab0d Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 9 Aug 2016 18:27:08 +0200 Subject: xfrm: constify xfrm_replay structures The xfrm_replay structures are never modified, so declare them as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index adfebd6f243c..d2fdd6d70959 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -187,7 +187,7 @@ struct xfrm_state { struct xfrm_replay_state_esn *preplay_esn; /* The functions for replay detection. */ - struct xfrm_replay *repl; + const struct xfrm_replay *repl; /* internal flag that only holds state for delayed aevent at the * moment -- cgit v1.2.3 From d737a5805581c6f99dad4caa9fdf80965d617d1a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Aug 2016 12:16:09 +0200 Subject: xfrm: state: don't use lock anymore unless acquire operation is needed push the lock down, after earlier patches we can rely on rcu to make sure state struct won't go away. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 6 +++--- net/xfrm/xfrm_state.c | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 24cd3949a9a4..1ab51d188408 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -38,9 +38,9 @@ struct netns_xfrm { * mode. Also, it can be used by ah/esp icmp error handler to find * offending SA. */ - struct hlist_head *state_bydst; - struct hlist_head *state_bysrc; - struct hlist_head *state_byspi; + struct hlist_head __rcu *state_bydst; + struct hlist_head __rcu *state_bysrc; + struct hlist_head __rcu *state_byspi; unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 53e7867f9254..1a15b658a79e 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -799,7 +799,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, sequence = read_seqcount_begin(&xfrm_state_hash_generation); - spin_lock_bh(&net->xfrm.xfrm_state_lock); + rcu_read_lock(); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { if (x->props.family == encap_family && @@ -870,6 +870,7 @@ found: } if (km_query(x, tmpl, pol) == 0) { + spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; list_add(&x->km.all, &net->xfrm.state_all); hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); @@ -883,6 +884,7 @@ found: tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); } else { x->km.state = XFRM_STATE_DEAD; to_put = x; @@ -899,7 +901,7 @@ out: } else { *err = acquire_in_progress ? -EAGAIN : error; } - spin_unlock_bh(&net->xfrm.xfrm_state_lock); + rcu_read_unlock(); if (to_put) xfrm_state_put(to_put); -- cgit v1.2.3 From a7c44247f704e385c77579d65c6ee6d002832529 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Aug 2016 15:17:56 +0200 Subject: xfrm: policy: make xfrm_policy_lookup_bytype lockless side effect: no longer disables BH (should be fine). 
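For reference, the reader pattern these conversions converge on combines an RCU read-side
section, a seqcount retry loop against hash resizes, and a conditional reference grab (a
condensed sketch using the identifiers above, not the exact function body; the real
xfrm_policy_lookup_bytype() retries the whole lookup when the reference grab fails, and
policy_matches() here is a placeholder predicate):

	static struct xfrm_policy *lookup_sketch(struct hlist_head *chain)
	{
		struct xfrm_policy *pol, *ret = NULL;
		unsigned int sequence;

		rcu_read_lock();
		do {
			sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
			hlist_for_each_entry_rcu(pol, chain, bydst) {
				if (policy_matches(pol)) {	/* placeholder */
					ret = pol;
					break;
				}
			}
		} while (!ret &&
			 read_seqcount_retry(&xfrm_policy_hash_generation, sequence));

		/* the entry may be freed after a grace period; keep it only
		 * if a reference can still be taken */
		if (ret && !xfrm_pol_hold_rcu(ret))
			ret = NULL;
		rcu_read_unlock();
		return ret;
	}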
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 +- net/xfrm/xfrm_policy.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 1ab51d188408..3ab828a97e68 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -11,7 +11,7 @@ struct ctl_table_header; struct xfrm_policy_hash { - struct hlist_head *table; + struct hlist_head __rcu *table; unsigned int hmask; u8 dbits4; u8 sbits4; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 09f2e2b38246..9302647f20a0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1123,7 +1123,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if (unlikely(!daddr || !saddr)) return NULL; - read_lock_bh(&net->xfrm.xfrm_policy_lock); + rcu_read_lock(); retry: do { sequence = read_seqcount_begin(&xfrm_policy_hash_generation); @@ -1172,7 +1172,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if (ret && !xfrm_pol_hold_rcu(ret)) goto retry; fail: - read_unlock_bh(&net->xfrm.xfrm_policy_lock); + rcu_read_unlock(); return ret; } -- cgit v1.2.3 From 9d0380df6217e8dd014118fa1c99dda9974f3613 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Aug 2016 15:17:59 +0200 Subject: xfrm: policy: convert policy_lock to spinlock After earlier patches conversions all spots acquire the writer lock and we can now convert this to a normal spinlock. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 +- net/xfrm/xfrm_policy.c | 68 ++++++++++++++++++++++++------------------------ 2 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 3ab828a97e68..177ed444d7b2 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -73,7 +73,7 @@ struct netns_xfrm { struct dst_ops xfrm6_dst_ops; #endif spinlock_t xfrm_state_lock; - rwlock_t xfrm_policy_lock; + spinlock_t xfrm_policy_lock; struct mutex xfrm_cfg_mutex; /* flow cache part */ diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 35b85a9a358c..dd01fd2e55fa 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -484,7 +484,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) if (!ndst) return; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&xfrm_policy_hash_generation); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, @@ -500,7 +500,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) net->xfrm.policy_bydst[dir].hmask = nhashmask; write_seqcount_end(&xfrm_policy_hash_generation); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); synchronize_rcu(); @@ -519,7 +519,7 @@ static void xfrm_byidx_resize(struct net *net, int total) if (!nidx) return; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); @@ -527,7 +527,7 @@ static void xfrm_byidx_resize(struct net *net, int total) net->xfrm.policy_byidx = nidx; net->xfrm.policy_idx_hmask = nhashmask; - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); } @@ -617,7 +617,7 @@ static void 
xfrm_hash_rebuild(struct work_struct *work) rbits6 = net->xfrm.policy_hthresh.rbits6; } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); /* reset the bydst and inexact table in all directions */ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { @@ -659,7 +659,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) hlist_add_head(&policy->bydst, chain); } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); } @@ -770,7 +770,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) struct hlist_head *chain; struct hlist_node *newpos; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); delpol = NULL; newpos = NULL; @@ -781,7 +781,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) { - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return -EEXIST; } delpol = pol; @@ -817,7 +817,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (delpol) xfrm_policy_kill(delpol); @@ -837,7 +837,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, struct hlist_head *chain; *err = 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); ret = NULL; hlist_for_each_entry(pol, chain, bydst) { @@ -850,7 +850,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, *err = security_xfrm_policy_delete( pol->security); if (*err) { - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); @@ -859,7 +859,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, break; } } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); @@ -878,7 +878,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, return NULL; *err = 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = net->xfrm.policy_byidx + idx_hash(net, id); ret = NULL; hlist_for_each_entry(pol, chain, byidx) { @@ -889,7 +889,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, *err = security_xfrm_policy_delete( pol->security); if (*err) { - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); @@ -898,7 +898,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, break; } } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); @@ -956,7 +956,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_policy_flush_secctx_check(net, type, task_valid); 
if (err) @@ -972,14 +972,14 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
 if (pol->type != type) continue; __xfrm_policy_unlink(pol, dir);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol);
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 goto again1; } @@ -991,13 +991,13 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
 if (pol->type != type) continue; __xfrm_policy_unlink(pol, dir);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol);
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 goto again2; } } @@ -1006,7 +1006,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
 if (!cnt) err = -ESRCH; out:
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 return err; } EXPORT_SYMBOL(xfrm_policy_flush); @@ -1026,7 +1026,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
 if (list_empty(&walk->walk.all) && walk->seq != 0) return 0;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else @@ -1054,7 +1054,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
 } list_del_init(&walk->walk.all); out:
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 return error; } EXPORT_SYMBOL(xfrm_policy_walk); @@ -1073,9 +1073,9 @@ void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
 if (list_empty(&walk->walk.all)) return;
- write_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? 
*/ list_del(&walk->walk.all); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_policy_walk_done); @@ -1321,9 +1321,9 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { xfrm_policy_kill(pol); return 0; @@ -1342,7 +1342,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) return -EINVAL; #endif - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); old_pol = rcu_dereference_protected(sk->sk_policy[dir], lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { @@ -1360,7 +1360,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) */ xfrm_sk_policy_unlink(old_pol, dir); } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (old_pol) { xfrm_policy_kill(old_pol); @@ -1390,9 +1390,9 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) newp->type = old->type; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); xfrm_sk_policy_link(newp, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } return newp; @@ -3074,7 +3074,7 @@ static int __net_init xfrm_net_init(struct net *net) /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); - rwlock_init(&net->xfrm.xfrm_policy_lock); + spin_lock_init(&net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex); return 0; @@ -3206,7 +3206,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * struct hlist_head *chain; u32 priority = ~0U; - read_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME*/ + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { if (xfrm_migrate_selector_match(sel, &pol->selector) && @@ -3230,7 +3230,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * xfrm_pol_hold(ret); - read_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return ret; } -- cgit v1.2.3 From 1766e7b3763a0707c2fda9689a7866dceed07b7a Mon Sep 17 00:00:00 2001 From: David Lechner Date: Wed, 10 Aug 2016 18:49:27 +0530 Subject: mfd: da8xx-cfgchip: New header file for CFGCHIP registers Create a new header file for TI DA8XX SoC CFGCHIPx registers. This will be used by a number of planned drivers including a new USB PHY driver and common clock framework drivers. The same defines *will* be removed from the platform_data header, once all the users start using the new syscon device header. This also fixes the following compiler error caused due to a dependent patch not merged. 
drivers/phy/phy-da8xx-usb.c:19:37: fatal error: linux/mfd/da8xx-cfgchip.h: No such file or directory
#include <linux/mfd/da8xx-cfgchip.h>
Signed-off-by: David Lechner Acked-by: Lee Jones Reported-by: Arnd Bergmann Signed-off-by: Kishon Vijay Abraham I ---
 include/linux/mfd/da8xx-cfgchip.h | 153 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+) create mode 100644 include/linux/mfd/da8xx-cfgchip.h (limited to 'include')
diff --git a/include/linux/mfd/da8xx-cfgchip.h b/include/linux/mfd/da8xx-cfgchip.h new file mode 100644 index 000000000000..304985e288d2 --- /dev/null +++ b/include/linux/mfd/da8xx-cfgchip.h @@ -0,0 +1,153 @@
+/*
+ * TI DaVinci DA8xx CHIPCFGx registers for syscon consumers.
+ *
+ * Copyright (C) 2016 David Lechner
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_MFD_DA8XX_CFGCHIP_H
+#define __LINUX_MFD_DA8XX_CFGCHIP_H
+
+#include <linux/bitops.h>
+
+/* register offset (32-bit registers) */
+#define CFGCHIP(n) ((n) * 4)
+
+/* CFGCHIP0 (PLL0/EDMA3_0) register bits */
+#define CFGCHIP0_PLL_MASTER_LOCK BIT(4)
+#define CFGCHIP0_EDMA30TC1DBS(n) ((n) << 2)
+#define CFGCHIP0_EDMA30TC1DBS_MASK CFGCHIP0_EDMA30TC1DBS(0x3)
+#define CFGCHIP0_EDMA30TC1DBS_16 CFGCHIP0_EDMA30TC1DBS(0x0)
+#define CFGCHIP0_EDMA30TC1DBS_32 CFGCHIP0_EDMA30TC1DBS(0x1)
+#define CFGCHIP0_EDMA30TC1DBS_64 CFGCHIP0_EDMA30TC1DBS(0x2)
+#define CFGCHIP0_EDMA30TC0DBS(n) ((n) << 0)
+#define CFGCHIP0_EDMA30TC0DBS_MASK CFGCHIP0_EDMA30TC0DBS(0x3)
+#define CFGCHIP0_EDMA30TC0DBS_16 CFGCHIP0_EDMA30TC0DBS(0x0)
+#define CFGCHIP0_EDMA30TC0DBS_32 CFGCHIP0_EDMA30TC0DBS(0x1)
+#define CFGCHIP0_EDMA30TC0DBS_64 CFGCHIP0_EDMA30TC0DBS(0x2)
+
+/* CFGCHIP1 (eCAP/HPI/EDMA3_1/eHRPWM TBCLK/McASP0 AMUTEIN) register bits */
+#define CFGCHIP1_CAP2SRC(n) ((n) << 27)
+#define CFGCHIP1_CAP2SRC_MASK CFGCHIP1_CAP2SRC(0x1f)
+#define CFGCHIP1_CAP2SRC_ECAP_PIN CFGCHIP1_CAP2SRC(0x0)
+#define CFGCHIP1_CAP2SRC_MCASP0_TX CFGCHIP1_CAP2SRC(0x1)
+#define CFGCHIP1_CAP2SRC_MCASP0_RX CFGCHIP1_CAP2SRC(0x2)
+#define CFGCHIP1_CAP2SRC_EMAC_C0_RX_THRESHOLD CFGCHIP1_CAP2SRC(0x7)
+#define CFGCHIP1_CAP2SRC_EMAC_C0_RX CFGCHIP1_CAP2SRC(0x8)
+#define CFGCHIP1_CAP2SRC_EMAC_C0_TX CFGCHIP1_CAP2SRC(0x9)
+#define CFGCHIP1_CAP2SRC_EMAC_C0_MISC CFGCHIP1_CAP2SRC(0xa)
+#define CFGCHIP1_CAP2SRC_EMAC_C1_RX_THRESHOLD CFGCHIP1_CAP2SRC(0xb)
+#define CFGCHIP1_CAP2SRC_EMAC_C1_RX CFGCHIP1_CAP2SRC(0xc)
+#define CFGCHIP1_CAP2SRC_EMAC_C1_TX CFGCHIP1_CAP2SRC(0xd)
+#define CFGCHIP1_CAP2SRC_EMAC_C1_MISC CFGCHIP1_CAP2SRC(0xe)
+#define CFGCHIP1_CAP2SRC_EMAC_C2_RX_THRESHOLD CFGCHIP1_CAP2SRC(0xf)
+#define CFGCHIP1_CAP2SRC_EMAC_C2_RX CFGCHIP1_CAP2SRC(0x10)
+#define CFGCHIP1_CAP2SRC_EMAC_C2_TX CFGCHIP1_CAP2SRC(0x11)
+#define CFGCHIP1_CAP2SRC_EMAC_C2_MISC CFGCHIP1_CAP2SRC(0x12)
+#define CFGCHIP1_CAP1SRC(n) ((n) << 22)
+#define CFGCHIP1_CAP1SRC_MASK CFGCHIP1_CAP1SRC(0x1f)
+#define CFGCHIP1_CAP1SRC_ECAP_PIN CFGCHIP1_CAP1SRC(0x0)
+#define CFGCHIP1_CAP1SRC_MCASP0_TX CFGCHIP1_CAP1SRC(0x1)
+#define CFGCHIP1_CAP1SRC_MCASP0_RX CFGCHIP1_CAP1SRC(0x2)
+#define CFGCHIP1_CAP1SRC_EMAC_C0_RX_THRESHOLD CFGCHIP1_CAP1SRC(0x7)
+#define CFGCHIP1_CAP1SRC_EMAC_C0_RX 
CFGCHIP1_CAP1SRC(0x8) +#define CFGCHIP1_CAP1SRC_EMAC_C0_TX CFGCHIP1_CAP1SRC(0x9) +#define CFGCHIP1_CAP1SRC_EMAC_C0_MISC CFGCHIP1_CAP1SRC(0xa) +#define CFGCHIP1_CAP1SRC_EMAC_C1_RX_THRESHOLD CFGCHIP1_CAP1SRC(0xb) +#define CFGCHIP1_CAP1SRC_EMAC_C1_RX CFGCHIP1_CAP1SRC(0xc) +#define CFGCHIP1_CAP1SRC_EMAC_C1_TX CFGCHIP1_CAP1SRC(0xd) +#define CFGCHIP1_CAP1SRC_EMAC_C1_MISC CFGCHIP1_CAP1SRC(0xe) +#define CFGCHIP1_CAP1SRC_EMAC_C2_RX_THRESHOLD CFGCHIP1_CAP1SRC(0xf) +#define CFGCHIP1_CAP1SRC_EMAC_C2_RX CFGCHIP1_CAP1SRC(0x10) +#define CFGCHIP1_CAP1SRC_EMAC_C2_TX CFGCHIP1_CAP1SRC(0x11) +#define CFGCHIP1_CAP1SRC_EMAC_C2_MISC CFGCHIP1_CAP1SRC(0x12) +#define CFGCHIP1_CAP0SRC(n) ((n) << 17) +#define CFGCHIP1_CAP0SRC_MASK CFGCHIP1_CAP0SRC(0x1f) +#define CFGCHIP1_CAP0SRC_ECAP_PIN CFGCHIP1_CAP0SRC(0x0) +#define CFGCHIP1_CAP0SRC_MCASP0_TX CFGCHIP1_CAP0SRC(0x1) +#define CFGCHIP1_CAP0SRC_MCASP0_RX CFGCHIP1_CAP0SRC(0x2) +#define CFGCHIP1_CAP0SRC_EMAC_C0_RX_THRESHOLD CFGCHIP1_CAP0SRC(0x7) +#define CFGCHIP1_CAP0SRC_EMAC_C0_RX CFGCHIP1_CAP0SRC(0x8) +#define CFGCHIP1_CAP0SRC_EMAC_C0_TX CFGCHIP1_CAP0SRC(0x9) +#define CFGCHIP1_CAP0SRC_EMAC_C0_MISC CFGCHIP1_CAP0SRC(0xa) +#define CFGCHIP1_CAP0SRC_EMAC_C1_RX_THRESHOLD CFGCHIP1_CAP0SRC(0xb) +#define CFGCHIP1_CAP0SRC_EMAC_C1_RX CFGCHIP1_CAP0SRC(0xc) +#define CFGCHIP1_CAP0SRC_EMAC_C1_TX CFGCHIP1_CAP0SRC(0xd) +#define CFGCHIP1_CAP0SRC_EMAC_C1_MISC CFGCHIP1_CAP0SRC(0xe) +#define CFGCHIP1_CAP0SRC_EMAC_C2_RX_THRESHOLD CFGCHIP1_CAP0SRC(0xf) +#define CFGCHIP1_CAP0SRC_EMAC_C2_RX CFGCHIP1_CAP0SRC(0x10) +#define CFGCHIP1_CAP0SRC_EMAC_C2_TX CFGCHIP1_CAP0SRC(0x11) +#define CFGCHIP1_CAP0SRC_EMAC_C2_MISC CFGCHIP1_CAP0SRC(0x12) +#define CFGCHIP1_HPIBYTEAD BIT(16) +#define CFGCHIP1_HPIENA BIT(15) +#define CFGCHIP0_EDMA31TC0DBS(n) ((n) << 13) +#define CFGCHIP0_EDMA31TC0DBS_MASK CFGCHIP0_EDMA31TC0DBS(0x3) +#define CFGCHIP0_EDMA31TC0DBS_16 CFGCHIP0_EDMA31TC0DBS(0x0) +#define CFGCHIP0_EDMA31TC0DBS_32 CFGCHIP0_EDMA31TC0DBS(0x1) +#define CFGCHIP0_EDMA31TC0DBS_64 CFGCHIP0_EDMA31TC0DBS(0x2) +#define CFGCHIP1_TBCLKSYNC BIT(12) +#define CFGCHIP1_AMUTESEL0(n) ((n) << 0) +#define CFGCHIP1_AMUTESEL0_MASK CFGCHIP1_AMUTESEL0(0xf) +#define CFGCHIP1_AMUTESEL0_LOW CFGCHIP1_AMUTESEL0(0x0) +#define CFGCHIP1_AMUTESEL0_BANK_0 CFGCHIP1_AMUTESEL0(0x1) +#define CFGCHIP1_AMUTESEL0_BANK_1 CFGCHIP1_AMUTESEL0(0x2) +#define CFGCHIP1_AMUTESEL0_BANK_2 CFGCHIP1_AMUTESEL0(0x3) +#define CFGCHIP1_AMUTESEL0_BANK_3 CFGCHIP1_AMUTESEL0(0x4) +#define CFGCHIP1_AMUTESEL0_BANK_4 CFGCHIP1_AMUTESEL0(0x5) +#define CFGCHIP1_AMUTESEL0_BANK_5 CFGCHIP1_AMUTESEL0(0x6) +#define CFGCHIP1_AMUTESEL0_BANK_6 CFGCHIP1_AMUTESEL0(0x7) +#define CFGCHIP1_AMUTESEL0_BANK_7 CFGCHIP1_AMUTESEL0(0x8) + +/* CFGCHIP2 (USB PHY) register bits */ +#define CFGCHIP2_PHYCLKGD BIT(17) +#define CFGCHIP2_VBUSSENSE BIT(16) +#define CFGCHIP2_RESET BIT(15) +#define CFGCHIP2_OTGMODE(n) ((n) << 13) +#define CFGCHIP2_OTGMODE_MASK CFGCHIP2_OTGMODE(0x3) +#define CFGCHIP2_OTGMODE_NO_OVERRIDE CFGCHIP2_OTGMODE(0x0) +#define CFGCHIP2_OTGMODE_FORCE_HOST CFGCHIP2_OTGMODE(0x1) +#define CFGCHIP2_OTGMODE_FORCE_DEVICE CFGCHIP2_OTGMODE(0x2) +#define CFGCHIP2_OTGMODE_FORCE_HOST_VBUS_LOW CFGCHIP2_OTGMODE(0x3) +#define CFGCHIP2_USB1PHYCLKMUX BIT(12) +#define CFGCHIP2_USB2PHYCLKMUX BIT(11) +#define CFGCHIP2_PHYPWRDN BIT(10) +#define CFGCHIP2_OTGPWRDN BIT(9) +#define CFGCHIP2_DATPOL BIT(8) +#define CFGCHIP2_USB1SUSPENDM BIT(7) +#define CFGCHIP2_PHY_PLLON BIT(6) +#define CFGCHIP2_SESENDEN BIT(5) +#define CFGCHIP2_VBDTCTEN BIT(4) +#define CFGCHIP2_REFFREQ(n) ((n) << 0) +#define 
CFGCHIP2_REFFREQ_MASK CFGCHIP2_REFFREQ(0xf)
+#define CFGCHIP2_REFFREQ_12MHZ CFGCHIP2_REFFREQ(0x1)
+#define CFGCHIP2_REFFREQ_24MHZ CFGCHIP2_REFFREQ(0x2)
+#define CFGCHIP2_REFFREQ_48MHZ CFGCHIP2_REFFREQ(0x3)
+#define CFGCHIP2_REFFREQ_19_2MHZ CFGCHIP2_REFFREQ(0x4)
+#define CFGCHIP2_REFFREQ_38_4MHZ CFGCHIP2_REFFREQ(0x5)
+#define CFGCHIP2_REFFREQ_13MHZ CFGCHIP2_REFFREQ(0x6)
+#define CFGCHIP2_REFFREQ_26MHZ CFGCHIP2_REFFREQ(0x7)
+#define CFGCHIP2_REFFREQ_20MHZ CFGCHIP2_REFFREQ(0x8)
+#define CFGCHIP2_REFFREQ_40MHZ CFGCHIP2_REFFREQ(0x9)
+
+/* CFGCHIP3 (EMAC/uPP/PLL1/ASYNC3/PRU/DIV4.5/EMIFA) register bits */
+#define CFGCHIP3_RMII_SEL BIT(8)
+#define CFGCHIP3_UPP_TX_CLKSRC BIT(6)
+#define CFGCHIP3_PLL1_MASTER_LOCK BIT(5)
+#define CFGCHIP3_ASYNC3_CLKSRC BIT(4)
+#define CFGCHIP3_PRUEVTSEL BIT(3)
+#define CFGCHIP3_DIV45PENA BIT(2)
+#define CFGCHIP3_EMA_CLKSRC BIT(1)
+
+/* CFGCHIP4 (McASP0 AMUNTEIN) register bits */
+#define CFGCHIP4_AMUTECLR0 BIT(0)
+
+#endif /* __LINUX_MFD_DA8XX_CFGCHIP_H */ -- cgit v1.2.3
From b2f0c09664b72b2f8c581383a9337ac3092e42c8 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Mon, 11 Jul 2016 13:50:01 +0200 Subject: iio: sw-trigger: Fix config group initialization Use the IS_ENABLED() helper macro to ensure that the configfs group is initialized either when configfs is built-in or when configfs is built as a module. Otherwise software trigger creation will result in undefined behaviour when configfs is built as a module since the configfs group for the trigger is not properly initialized. Fixes: b662f809d410 ("iio: core: Introduce IIO software triggers") Signed-off-by: Lars-Peter Clausen Acked-by: Daniel Baluta Cc: Signed-off-by: Jonathan Cameron ---
 include/linux/iio/sw_trigger.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include')
diff --git a/include/linux/iio/sw_trigger.h b/include/linux/iio/sw_trigger.h index 5198f8ed08a4..c97eab67558f 100644 --- a/include/linux/iio/sw_trigger.h +++ b/include/linux/iio/sw_trigger.h @@ -62,7 +62,7 @@ void iio_swt_group_init_type_name(struct iio_sw_trigger *t, const char *name, struct config_item_type *type) {
-#ifdef CONFIG_CONFIGFS_FS
+#if IS_ENABLED(CONFIG_CONFIGFS_FS)
 config_group_init_type_name(&t->group, name, type); #endif } -- cgit v1.2.3
From c1a23f6d64552b4480208aa584ec7e9c13d6d9c3 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 17 Aug 2016 11:46:16 +0200 Subject: scsi: sas: provide stub implementation for scsi_is_sas_rphy Provide a stub implementation for scsi_is_sas_rphy for kernel configurations which do not have CONFIG_SCSI_SAS_ATTRS defined. Reported-by: kbuild test robot Suggested-by: James Bottomley Reviewed-by: James E.J. Bottomley Signed-off-by: Johannes Thumshirn Signed-off-by: Martin K. 
Petersen --- include/scsi/scsi_transport_sas.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_transport_sas.h b/include/scsi/scsi_transport_sas.h index 13c0b2ba1b6c..31ae074dad9d 100644 --- a/include/scsi/scsi_transport_sas.h +++ b/include/scsi/scsi_transport_sas.h @@ -15,8 +15,14 @@ static inline int is_sas_attached(struct scsi_device *sdev) { return 0; } + +static inline int scsi_is_sas_rphy(const struct device *sdev) +{ + return 0; +} #else extern int is_sas_attached(struct scsi_device *sdev); +extern int scsi_is_sas_rphy(const struct device *); #endif static inline int sas_protocol_ata(enum sas_protocol proto) @@ -202,7 +208,6 @@ extern int sas_rphy_add(struct sas_rphy *); extern void sas_rphy_remove(struct sas_rphy *); extern void sas_rphy_delete(struct sas_rphy *); extern void sas_rphy_unlink(struct sas_rphy *); -extern int scsi_is_sas_rphy(const struct device *); struct sas_port *sas_port_alloc(struct device *, int); struct sas_port *sas_port_alloc_num(struct device *); -- cgit v1.2.3 From a0f81dbeef13aa0aeaa8b955b38735dbf09de392 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 17 Aug 2016 11:46:18 +0200 Subject: scsi: sas: remove is_sas_attached() As there are no more users of is_sas_attached() left, remove it. Signed-off-by: Johannes Thumshirn Reviewed-by: James E.J. Bottomley Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_sas.c | 16 ---------------- include/scsi/scsi_transport_sas.h | 6 ------ 2 files changed, 22 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c index 3f0ff072184b..60b651bfaa01 100644 --- a/drivers/scsi/scsi_transport_sas.c +++ b/drivers/scsi/scsi_transport_sas.c @@ -340,22 +340,6 @@ static int do_sas_phy_delete(struct device *dev, void *data) return 0; } -/** - * is_sas_attached - check if device is SAS attached - * @sdev: scsi device to check - * - * returns true if the device is SAS attached - */ -int is_sas_attached(struct scsi_device *sdev) -{ - struct Scsi_Host *shost = sdev->host; - - return shost->transportt->host_attrs.ac.class == - &sas_host_class.class; -} -EXPORT_SYMBOL(is_sas_attached); - - /** * sas_remove_children - tear down a devices SAS data structures * @dev: device belonging to the sas object diff --git a/include/scsi/scsi_transport_sas.h b/include/scsi/scsi_transport_sas.h index 31ae074dad9d..73d870918939 100644 --- a/include/scsi/scsi_transport_sas.h +++ b/include/scsi/scsi_transport_sas.h @@ -11,17 +11,11 @@ struct sas_rphy; struct request; #if !IS_ENABLED(CONFIG_SCSI_SAS_ATTRS) -static inline int is_sas_attached(struct scsi_device *sdev) -{ - return 0; -} - static inline int scsi_is_sas_rphy(const struct device *sdev) { return 0; } #else -extern int is_sas_attached(struct scsi_device *sdev); extern int scsi_is_sas_rphy(const struct device *); #endif -- cgit v1.2.3 From 7a665d2f60b457c0d77b3e4f01e21c55ffc57069 Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Tue, 28 Jun 2016 11:20:23 -0700 Subject: nvme-fabrics: change NQN UUID to big-endian format NVM Express 1.2.1 section 7.9, NVMe Qualified Names, specifies that the UUID format of NQN uses a UUID based on RFC 4122. RFC 4122 specifies that the UUID is encoded in big-endian byte order. Switch the NVMe over Fabrics host ID field from little-endian UUID to big-endian UUID to match the specification. 
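As an illustration of the byte-order change, the default host NQN generation now pairs the
big-endian type with the %pUb printk specifier, which emits the UUID bytes in RFC 4122
order (a sketch mirroring nvmf_host_default() in the diff below; the local buffer is made
up for the example, and %pUl would instead byte-swap the first three fields):

	uuid_be id;
	char nqn[NVMF_NQN_SIZE];

	uuid_be_gen(&id);
	/* %pUb: big-endian / RFC 4122 byte order */
	snprintf(nqn, sizeof(nqn),
		 "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUb", &id);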
Signed-off-by: Daniel Verkamp Reviewed-by: Jay Freyensee Signed-off-by: Sagi Grimberg ---
 drivers/nvme/host/fabrics.c | 10 +++++-----
 drivers/nvme/host/fabrics.h | 2 +-
 include/linux/nvme.h | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include')
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 020302c6ea57..be0b1067c9fa 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -56,7 +56,7 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn) kref_init(&host->ref); memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE);
- uuid_le_gen(&host->id);
+ uuid_be_gen(&host->id);
 list_add_tail(&host->list, &nvmf_hosts); out_unlock: @@ -73,9 +73,9 @@ static struct nvmf_host *nvmf_host_default(void) return NULL; kref_init(&host->ref);
- uuid_le_gen(&host->id);
+ uuid_be_gen(&host->id);
 snprintf(host->nqn, NVMF_NQN_SIZE,
- "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUl", &host->id);
+ "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUb", &host->id);
 mutex_lock(&nvmf_hosts_mutex); list_add_tail(&host->list, &nvmf_hosts); @@ -382,7 +382,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) if (!data) return -ENOMEM;
- memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_le));
+ memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_be));
 data->cntlid = cpu_to_le16(0xffff); strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); @@ -441,7 +441,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) if (!data) return -ENOMEM;
- memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_le));
+ memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_be));
 data->cntlid = cpu_to_le16(ctrl->cntlid); strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 89df52c8be97..46e460aee52d 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -34,7 +34,7 @@ struct nvmf_host { struct kref ref; struct list_head list; char nqn[NVMF_NQN_SIZE];
- uuid_le id;
+ uuid_be id;
 }; /**
diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d8b37bab2887..7676557ce357 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -794,7 +794,7 @@ struct nvmf_connect_command { }; struct nvmf_connect_data {
- uuid_le hostid;
+ uuid_be hostid;
 __le16 cntlid; char resv4[238]; char subsysnqn[NVMF_NQN_FIELD_LEN]; -- cgit v1.2.3
From c0678b2d6648ab65b68703044709e367799ba9f9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Aug 2016 15:52:23 -0700 Subject: include/linux: fix excess fence.h kernel-doc notation Fix excess fields in kernel-doc notation in <linux/fence.h> after some struct fields were removed. 
Fixes these kernel-doc warnings: ..//include/linux/fence.h:85: warning: Excess struct/union/enum/typedef member 'child_list' description in 'fence' ..//include/linux/fence.h:85: warning: Excess struct/union/enum/typedef member 'active_list' description in 'fence' Fixes: 0431b9065f28 ("staging/android: bring struct sync_pt back") Cc: Daniel Vetter Cc: Sumit Semwal Cc: Luis de Bethencourt Signed-off-by: Randy Dunlap Reviewed-by: Gustavo Padovan Signed-off-by: Greg Kroah-Hartman --- include/linux/fence.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/fence.h b/include/linux/fence.h index 8cc719a63728..2ac6fa5f4712 100644 --- a/include/linux/fence.h +++ b/include/linux/fence.h @@ -49,8 +49,6 @@ struct fence_cb; * @timestamp: Timestamp when the fence was signaled. * @status: Optional, only valid if < 0, must be set before calling * fence_signal, indicates that the fence has completed with an error. - * @child_list: list of children fences - * @active_list: list of active fences * * the flags member must be manipulated and read using the appropriate * atomic ops (bit_*), so taking the spinlock will not be needed most -- cgit v1.2.3 From 7175cce1c3f1d8c8840d2004f78f96a3904249b5 Mon Sep 17 00:00:00 2001 From: Vignesh R Date: Wed, 17 Aug 2016 17:43:01 +0530 Subject: iio: adc: ti_am335x_adc: Increase timeout value waiting for ADC sample Now that open delay and sample delay for each channel is configurable via DT, the default IDLE_TIMEOUT value is not enough as this is calculated based on hardcoded macros. This results in driver returning EBUSY sometimes. Fix this by increasing the timeout value based on maximum value possible to open delay and sample delays for each channel. Fixes: 5dc11e810676e ("iio: adc: ti_am335x_adc: make sample delay, open delay, averaging DT parameters") Signed-off-by: Vignesh R Acked-by: Lee Jones Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti_am335x_adc.c | 2 +- include/linux/mfd/ti_am335x_tscadc.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/iio/adc/ti_am335x_adc.c b/drivers/iio/adc/ti_am335x_adc.c index bed9977a1863..c3cfacca2541 100644 --- a/drivers/iio/adc/ti_am335x_adc.c +++ b/drivers/iio/adc/ti_am335x_adc.c @@ -381,7 +381,7 @@ static int tiadc_read_raw(struct iio_dev *indio_dev, am335x_tsc_se_set_once(adc_dev->mfd_tscadc, step_en); - timeout = jiffies + usecs_to_jiffies + timeout = jiffies + msecs_to_jiffies (IDLE_TIMEOUT * adc_dev->channels); /* Wait for Fifo threshold interrupt */ while (1) { diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index 2567a87872b0..7f55b8b41032 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -138,16 +138,16 @@ /* * time in us for processing a single channel, calculated as follows: * - * num cycles = open delay + (sample delay + conv time) * averaging + * max num cycles = open delay + (sample delay + conv time) * averaging * - * num cycles: 152 + (1 + 13) * 16 = 376 + * max num cycles: 262143 + (255 + 13) * 16 = 266431 * * clock frequency: 26MHz / 8 = 3.25MHz * clock period: 1 / 3.25MHz = 308ns * - * processing time: 376 * 308ns = 116us + * max processing time: 266431 * 308ns = 83ms(approx) */ -#define IDLE_TIMEOUT 116 /* microsec */ +#define IDLE_TIMEOUT 83 /* milliseconds */ #define TSCADC_CELLS 2 -- cgit v1.2.3 From 62148f0930a8e9bd5c5614f8387222f0220d7d47 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 2 Aug 2016 08:11:00 -0300 
Subject: [media] cec: rename cec_devnode fhs_lock to just lock This lock will be used to protect more than just the fhs list. So rename it to just 'lock'. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/cec/cec-adap.c | 12 ++++++------ drivers/staging/media/cec/cec-api.c | 8 ++++---- drivers/staging/media/cec/cec-core.c | 6 +++--- include/media/cec.h | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/staging/media/cec/cec-adap.c b/drivers/staging/media/cec/cec-adap.c index b2393bbacb26..9dcb784b8d6a 100644 --- a/drivers/staging/media/cec/cec-adap.c +++ b/drivers/staging/media/cec/cec-adap.c @@ -124,10 +124,10 @@ static void cec_queue_event(struct cec_adapter *adap, u64 ts = ktime_get_ns(); struct cec_fh *fh; - mutex_lock(&adap->devnode.fhs_lock); + mutex_lock(&adap->devnode.lock); list_for_each_entry(fh, &adap->devnode.fhs, list) cec_queue_event_fh(fh, ev, ts); - mutex_unlock(&adap->devnode.fhs_lock); + mutex_unlock(&adap->devnode.lock); } /* @@ -191,12 +191,12 @@ static void cec_queue_msg_monitor(struct cec_adapter *adap, u32 monitor_mode = valid_la ? CEC_MODE_MONITOR : CEC_MODE_MONITOR_ALL; - mutex_lock(&adap->devnode.fhs_lock); + mutex_lock(&adap->devnode.lock); list_for_each_entry(fh, &adap->devnode.fhs, list) { if (fh->mode_follower >= monitor_mode) cec_queue_msg_fh(fh, msg); } - mutex_unlock(&adap->devnode.fhs_lock); + mutex_unlock(&adap->devnode.lock); } /* @@ -207,12 +207,12 @@ static void cec_queue_msg_followers(struct cec_adapter *adap, { struct cec_fh *fh; - mutex_lock(&adap->devnode.fhs_lock); + mutex_lock(&adap->devnode.lock); list_for_each_entry(fh, &adap->devnode.fhs, list) { if (fh->mode_follower == CEC_MODE_FOLLOWER) cec_queue_msg_fh(fh, msg); } - mutex_unlock(&adap->devnode.fhs_lock); + mutex_unlock(&adap->devnode.lock); } /* Notify userspace of an adapter state change. */ diff --git a/drivers/staging/media/cec/cec-api.c b/drivers/staging/media/cec/cec-api.c index 7be7615a0fdf..4e2696a34ddb 100644 --- a/drivers/staging/media/cec/cec-api.c +++ b/drivers/staging/media/cec/cec-api.c @@ -508,14 +508,14 @@ static int cec_open(struct inode *inode, struct file *filp) filp->private_data = fh; - mutex_lock(&devnode->fhs_lock); + mutex_lock(&devnode->lock); /* Queue up initial state events */ ev_state.state_change.phys_addr = adap->phys_addr; ev_state.state_change.log_addr_mask = adap->log_addrs.log_addr_mask; cec_queue_event_fh(fh, &ev_state, 0); list_add(&fh->list, &devnode->fhs); - mutex_unlock(&devnode->fhs_lock); + mutex_unlock(&devnode->lock); return 0; } @@ -540,9 +540,9 @@ static int cec_release(struct inode *inode, struct file *filp) cec_monitor_all_cnt_dec(adap); mutex_unlock(&adap->lock); - mutex_lock(&devnode->fhs_lock); + mutex_lock(&devnode->lock); list_del(&fh->list); - mutex_unlock(&devnode->fhs_lock); + mutex_unlock(&devnode->lock); /* Unhook pending transmits from this filehandle. 
*/ mutex_lock(&adap->lock); diff --git a/drivers/staging/media/cec/cec-core.c b/drivers/staging/media/cec/cec-core.c index 112a5fae12f5..73792d078462 100644 --- a/drivers/staging/media/cec/cec-core.c +++ b/drivers/staging/media/cec/cec-core.c @@ -117,7 +117,7 @@ static int __must_check cec_devnode_register(struct cec_devnode *devnode, /* Initialization */ INIT_LIST_HEAD(&devnode->fhs); - mutex_init(&devnode->fhs_lock); + mutex_init(&devnode->lock); /* Part 1: Find a free minor number */ mutex_lock(&cec_devnode_lock); @@ -181,10 +181,10 @@ static void cec_devnode_unregister(struct cec_devnode *devnode) if (!devnode->registered || devnode->unregistered) return; - mutex_lock(&devnode->fhs_lock); + mutex_lock(&devnode->lock); list_for_each_entry(fh, &devnode->fhs, list) wake_up_interruptible(&fh->wait); - mutex_unlock(&devnode->fhs_lock); + mutex_unlock(&devnode->lock); devnode->registered = false; devnode->unregistered = true; diff --git a/include/media/cec.h b/include/media/cec.h index dc7854b855f3..fdb5d600e4bb 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -57,8 +57,8 @@ struct cec_devnode { int minor; bool registered; bool unregistered; - struct mutex fhs_lock; struct list_head fhs; + struct mutex lock; }; struct cec_adapter; -- cgit v1.2.3 From 9ebf1945d757433a089ab3ee940673503e3e11ec Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 1 Aug 2016 07:29:34 -0300 Subject: [media] cec-funcs.h: fix typo: && should be & Fix typo where logical AND was used instead of bitwise AND. Reported-by: David Binderman Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/cec-funcs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cec-funcs.h b/include/linux/cec-funcs.h index 82c3d3b7269d..9e054aa168f3 100644 --- a/include/linux/cec-funcs.h +++ b/include/linux/cec-funcs.h @@ -227,7 +227,7 @@ static inline void cec_set_digital_service_id(__u8 *msg, if (digital->service_id_method == CEC_OP_SERVICE_ID_METHOD_BY_CHANNEL) { *msg++ = (digital->channel.channel_number_fmt << 2) | (digital->channel.major >> 8); - *msg++ = digital->channel.major && 0xff; + *msg++ = digital->channel.major & 0xff; *msg++ = digital->channel.minor >> 8; *msg++ = digital->channel.minor & 0xff; *msg++ = 0; @@ -1277,7 +1277,7 @@ static inline void cec_msg_user_control_pressed(struct cec_msg *msg, msg->len += 4; msg->msg[3] = (ui_cmd->channel_identifier.channel_number_fmt << 2) | (ui_cmd->channel_identifier.major >> 8); - msg->msg[4] = ui_cmd->channel_identifier.major && 0xff; + msg->msg[4] = ui_cmd->channel_identifier.major & 0xff; msg->msg[5] = ui_cmd->channel_identifier.minor >> 8; msg->msg[6] = ui_cmd->channel_identifier.minor & 0xff; break; -- cgit v1.2.3 From 31dc8b7302f1e48952ec8e90cd49dca843146cd0 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 10 Aug 2016 08:01:38 -0300 Subject: [media] cec-funcs.h: add reply argument for Record On/Off A reply parameter is added to the cec_msg_record_on/off functions in cec-funcs.h. The standard mandates that Record Status shall be replied to Record On, and it may be replied to Record Off. 
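A usage sketch (a hypothetical caller, assuming the cec_msg_init() helper and the
logical-address constants from linux/cec.h): starting a recording of the recorder's own
source while requesting the mandatory Record Status reply looks like

	struct cec_op_record_src src = { .type = CEC_OP_RECORD_SRC_OWN };
	struct cec_msg msg;

	cec_msg_init(&msg, CEC_LOG_ADDR_PLAYBACK_1, CEC_LOG_ADDR_RECORD_1);
	cec_msg_record_on(&msg, true, &src);
	/* msg.reply is now CEC_MSG_RECORD_STATUS, so the framework waits
	 * for that reply when the message is transmitted */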
Signed-off-by: Johan Fjeldtvedt Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/cec-funcs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cec-funcs.h b/include/linux/cec-funcs.h index 9e054aa168f3..8af613e67633 100644 --- a/include/linux/cec-funcs.h +++ b/include/linux/cec-funcs.h @@ -162,10 +162,11 @@ static inline void cec_msg_standby(struct cec_msg *msg) /* One Touch Record Feature */ -static inline void cec_msg_record_off(struct cec_msg *msg) +static inline void cec_msg_record_off(struct cec_msg *msg, bool reply) { msg->len = 2; msg->msg[1] = CEC_MSG_RECORD_OFF; + msg->reply = reply ? CEC_MSG_RECORD_STATUS : 0; } struct cec_op_arib_data { @@ -323,6 +324,7 @@ static inline void cec_msg_record_on_phys_addr(struct cec_msg *msg, } static inline void cec_msg_record_on(struct cec_msg *msg, + bool reply, const struct cec_op_record_src *rec_src) { switch (rec_src->type) { @@ -346,6 +348,7 @@ static inline void cec_msg_record_on(struct cec_msg *msg, rec_src->ext_phys_addr.phys_addr); break; } + msg->reply = reply ? CEC_MSG_RECORD_STATUS : 0; } static inline void cec_ops_record_on(const struct cec_msg *msg, -- cgit v1.2.3 From dcceb1eaf210096831b14471bc87678375b086ed Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 10 Aug 2016 09:24:45 -0300 Subject: [media] cec: add CEC_LOG_ADDRS_FL_ALLOW_UNREG_FALLBACK flag Currently if none of the requested logical addresses can be claimed, the framework will fall back to the Unregistered logical address. Add a flag to enable this explicitly. By default it will just go back to the unconfigured state. Usually Unregistered is not something you want since the functionality is very limited. Unless the application has support for this, it will fail to work correctly. So require that the application explicitly requests this. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../media/uapi/cec/cec-ioc-adap-g-log-addrs.rst | 21 ++++++++++++++++++++- drivers/staging/media/cec/cec-adap.c | 4 ++++ drivers/staging/media/cec/cec-api.c | 2 +- include/linux/cec.h | 5 ++++- 4 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/media/uapi/cec/cec-ioc-adap-g-log-addrs.rst b/Documentation/media/uapi/cec/cec-ioc-adap-g-log-addrs.rst index 04ee90099676..201d4839931c 100644 --- a/Documentation/media/uapi/cec/cec-ioc-adap-g-log-addrs.rst +++ b/Documentation/media/uapi/cec/cec-ioc-adap-g-log-addrs.rst @@ -144,7 +144,7 @@ logical address types are already defined will return with error ``EBUSY``. - ``flags`` - - Flags. No flags are defined yet, so set this to 0. + - Flags. See :ref:`cec-log-addrs-flags` for a list of available flags. - .. row 7 @@ -201,6 +201,25 @@ logical address types are already defined will return with error ``EBUSY``. give the CEC framework more information about the device type, even though the framework won't use it directly in the CEC message. +.. _cec-log-addrs-flags: + +.. flat-table:: Flags for struct cec_log_addrs + :header-rows: 0 + :stub-columns: 0 + :widths: 3 1 4 + + + - .. _`CEC-LOG-ADDRS-FL-ALLOW-UNREG-FALLBACK`: + + - ``CEC_LOG_ADDRS_FL_ALLOW_UNREG_FALLBACK`` + + - 1 + + - By default if no logical address of the requested type can be claimed, then + it will go back to the unconfigured state. If this flag is set, then it will + fallback to the Unregistered logical address. Note that if the Unregistered + logical address was explicitly requested, then this flag has no effect. + .. 
_cec-versions: .. flat-table:: CEC Versions diff --git a/drivers/staging/media/cec/cec-adap.c b/drivers/staging/media/cec/cec-adap.c index 9dcb784b8d6a..2458a6c87642 100644 --- a/drivers/staging/media/cec/cec-adap.c +++ b/drivers/staging/media/cec/cec-adap.c @@ -1047,6 +1047,10 @@ static int cec_config_thread_func(void *arg) dprintk(1, "could not claim LA %d\n", i); } + if (adap->log_addrs.log_addr_mask == 0 && + !(las->flags & CEC_LOG_ADDRS_FL_ALLOW_UNREG_FALLBACK)) + goto unconfigure; + configured: if (adap->log_addrs.log_addr_mask == 0) { /* Fall back to unregistered */ diff --git a/drivers/staging/media/cec/cec-api.c b/drivers/staging/media/cec/cec-api.c index 4e2696a34ddb..6f58ee85eea4 100644 --- a/drivers/staging/media/cec/cec-api.c +++ b/drivers/staging/media/cec/cec-api.c @@ -162,7 +162,7 @@ static long cec_adap_s_log_addrs(struct cec_adapter *adap, struct cec_fh *fh, return -ENOTTY; if (copy_from_user(&log_addrs, parg, sizeof(log_addrs))) return -EFAULT; - log_addrs.flags = 0; + log_addrs.flags &= CEC_LOG_ADDRS_FL_ALLOW_UNREG_FALLBACK; mutex_lock(&adap->lock); if (!adap->is_configuring && (!log_addrs.num_log_addrs || !adap->is_configured) && diff --git a/include/linux/cec.h b/include/linux/cec.h index b3e22893a002..851968e803fa 100644 --- a/include/linux/cec.h +++ b/include/linux/cec.h @@ -364,7 +364,7 @@ struct cec_caps { * @num_log_addrs: how many logical addresses should be claimed. Set by the * caller. * @vendor_id: the vendor ID of the device. Set by the caller. - * @flags: set to 0. + * @flags: flags. * @osd_name: the OSD name of the device. Set by the caller. * @primary_device_type: the primary device type for each logical address. * Set by the caller. @@ -389,6 +389,9 @@ struct cec_log_addrs { __u8 features[CEC_MAX_LOG_ADDRS][12]; }; +/* Allow a fallback to unregistered */ +#define CEC_LOG_ADDRS_FL_ALLOW_UNREG_FALLBACK (1 << 0) + /* Events */ /* Event that occurs when the adapter state changes */ -- cgit v1.2.3 From 4808f721627c2a23b5d749f9bbd20d4529ea2b8d Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sat, 20 Aug 2016 07:54:38 -0300 Subject: [media] cec-funcs.h: add missing vendor-specific messages The cec-funcs.h header was missing support for these three vendor-specific messages: CEC_MSG_VENDOR_COMMAND CEC_MSG_VENDOR_COMMAND_WITH_ID CEC_MSG_VENDOR_REMOTE_BUTTON_DOWN Add wrappers for these messages. I originally postponed adding these wrappers due to the fact that the argument is just a byte array which cec-ctl couldn't handle at the time, and then I just forgot to add them once the CEC framework was finalized. It wasn't until an attempt to transmit a vendor specific command was made that I realized that these wrappers were missing. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/cec-funcs.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'include') diff --git a/include/linux/cec-funcs.h b/include/linux/cec-funcs.h index 8af613e67633..138bbf721e70 100644 --- a/include/linux/cec-funcs.h +++ b/include/linux/cec-funcs.h @@ -1144,6 +1144,75 @@ static inline void cec_msg_give_device_vendor_id(struct cec_msg *msg, msg->reply = reply ? 
CEC_MSG_DEVICE_VENDOR_ID : 0; } +static inline void cec_msg_vendor_command(struct cec_msg *msg, + __u8 size, const __u8 *vendor_cmd) +{ + if (size > 14) + size = 14; + msg->len = 2 + size; + msg->msg[1] = CEC_MSG_VENDOR_COMMAND; + memcpy(msg->msg + 2, vendor_cmd, size); +} + +static inline void cec_ops_vendor_command(const struct cec_msg *msg, + __u8 *size, + const __u8 **vendor_cmd) +{ + *size = msg->len - 2; + + if (*size > 14) + *size = 14; + *vendor_cmd = msg->msg + 2; +} + +static inline void cec_msg_vendor_command_with_id(struct cec_msg *msg, + __u32 vendor_id, __u8 size, + const __u8 *vendor_cmd) +{ + if (size > 11) + size = 11; + msg->len = 5 + size; + msg->msg[1] = CEC_MSG_VENDOR_COMMAND_WITH_ID; + msg->msg[2] = vendor_id >> 16; + msg->msg[3] = (vendor_id >> 8) & 0xff; + msg->msg[4] = vendor_id & 0xff; + memcpy(msg->msg + 5, vendor_cmd, size); +} + +static inline void cec_ops_vendor_command_with_id(const struct cec_msg *msg, + __u32 *vendor_id, __u8 *size, + const __u8 **vendor_cmd) +{ + *size = msg->len - 5; + + if (*size > 11) + *size = 11; + *vendor_id = (msg->msg[2] << 16) | (msg->msg[3] << 8) | msg->msg[4]; + *vendor_cmd = msg->msg + 5; +} + +static inline void cec_msg_vendor_remote_button_down(struct cec_msg *msg, + __u8 size, + const __u8 *rc_code) +{ + if (size > 14) + size = 14; + msg->len = 2 + size; + msg->msg[1] = CEC_MSG_VENDOR_REMOTE_BUTTON_DOWN; + memcpy(msg->msg + 2, rc_code, size); +} + +static inline void cec_ops_vendor_remote_button_down(const struct cec_msg *msg, + __u8 *size, + const __u8 **rc_code) +{ + *size = msg->len - 2; + + if (*size > 14) + *size = 14; + *rc_code = msg->msg + 2; +} + static inline void cec_msg_vendor_remote_button_up(struct cec_msg *msg) { msg->len = 2; -- cgit v1.2.3 From 35db57bbc4b7ab810bba6e6d6954a0faf5a842cf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Aug 2016 16:00:12 +0200 Subject: xfrm: state: remove per-netns gc task After commit 5b8ef3415a21f173 ("xfrm: Remove ancient sleeping when the SA is in acquire state") gc does not need any per-netns data anymore. As far as gc is concerned all state structs are the same, so we can use a global work struct for it. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 -- net/xfrm/xfrm_state.c | 18 +++++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 177ed444d7b2..27bb9633c69d 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -44,8 +44,6 @@ struct netns_xfrm { unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; - struct hlist_head state_gc_list; - struct work_struct state_gc_work; struct list_head policy_all; struct hlist_head *policy_byidx; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1a15b658a79e..ba8bf518ba14 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -31,6 +31,8 @@ #define xfrm_state_deref_prot(table, net) \ rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) +static void xfrm_state_gc_task(struct work_struct *work); + /* Each xfrm_state may be linked to two tables: 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. 
(input,ctl) @@ -41,6 +43,9 @@ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation); +static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task); +static HLIST_HEAD(xfrm_state_gc_list); + static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x) { return atomic_inc_not_zero(&x->refcnt); @@ -368,13 +373,12 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) static void xfrm_state_gc_task(struct work_struct *work) { - struct net *net = container_of(work, struct net, xfrm.state_gc_work); struct xfrm_state *x; struct hlist_node *tmp; struct hlist_head gc_list; spin_lock_bh(&xfrm_state_gc_lock); - hlist_move_list(&net->xfrm.state_gc_list, &gc_list); + hlist_move_list(&xfrm_state_gc_list, &gc_list); spin_unlock_bh(&xfrm_state_gc_lock); synchronize_rcu(); @@ -515,14 +519,12 @@ EXPORT_SYMBOL(xfrm_state_alloc); void __xfrm_state_destroy(struct xfrm_state *x) { - struct net *net = xs_net(x); - WARN_ON(x->km.state != XFRM_STATE_DEAD); spin_lock_bh(&xfrm_state_gc_lock); - hlist_add_head(&x->gclist, &net->xfrm.state_gc_list); + hlist_add_head(&x->gclist, &xfrm_state_gc_list); spin_unlock_bh(&xfrm_state_gc_lock); - schedule_work(&net->xfrm.state_gc_work); + schedule_work(&xfrm_state_gc_work); } EXPORT_SYMBOL(__xfrm_state_destroy); @@ -2134,8 +2136,6 @@ int __net_init xfrm_state_init(struct net *net) net->xfrm.state_num = 0; INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); - INIT_HLIST_HEAD(&net->xfrm.state_gc_list); - INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task); spin_lock_init(&net->xfrm.xfrm_state_lock); return 0; @@ -2153,7 +2153,7 @@ void xfrm_state_fini(struct net *net) flush_work(&net->xfrm.state_hash_work); xfrm_state_flush(net, IPSEC_PROTO_ANY, false); - flush_work(&net->xfrm.state_gc_work); + flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); -- cgit v1.2.3 From 89e1f6d2b956649fbe0704d543a90b8e0cf872b0 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 22 Aug 2016 01:02:18 +0800 Subject: netfilter: nft_reject: restrict to INPUT/FORWARD/OUTPUT After I add the nft rule "nft add rule filter prerouting reject with tcp reset", kernel panic happened on my system: NULL pointer dereference at ... IP: [] nf_send_reset+0xaf/0x400 Call Trace: [] ? nf_reject_ip_tcphdr_get+0x160/0x160 [] nft_reject_ipv4_eval+0x61/0xb0 [nft_reject_ipv4] [] nft_do_chain+0x1fa/0x890 [nf_tables] [] ? __nft_trace_packet+0x170/0x170 [nf_tables] [] ? nf_ct_invert_tuple+0xb0/0xc0 [nf_conntrack] [] ? nf_nat_setup_info+0x5d4/0x650 [nf_nat] [...] Because in the PREROUTING chain, routing information is not exist, then we will dereference the NULL pointer and oops happen. So we restrict reject expression to INPUT, FORWARD and OUTPUT chain. This is consistent with iptables REJECT target. 
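The shape of the fix generalizes to any expression that needs route or hook context (an
illustrative sketch, not taken from the patch; the "restricted_expr_" names are made up):
the hook mask is checked both when a rule is inserted and when a chain is revalidated, so
a later jump from an unsupported base chain is caught as well:

	static int restricted_expr_validate(const struct nft_ctx *ctx,
					    const struct nft_expr *expr,
					    const struct nft_data **data)
	{
		/* -EOPNOTSUPP unless the chain hangs off one of these hooks */
		return nft_chain_validate_hooks(ctx->chain,
						(1 << NF_INET_LOCAL_IN) |
						(1 << NF_INET_FORWARD) |
						(1 << NF_INET_LOCAL_OUT));
	}

	static int restricted_expr_init(const struct nft_ctx *ctx,
					const struct nft_expr *expr,
					const struct nlattr * const tb[])
	{
		/* catches direct insertion into e.g. a prerouting chain */
		return restricted_expr_validate(ctx, expr, NULL);
	}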
Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_reject.h | 4 ++++ net/ipv4/netfilter/nft_reject_ipv4.c | 1 + net/ipv6/netfilter/nft_reject_ipv6.c | 1 + net/netfilter/nft_reject.c | 16 ++++++++++++++++ net/netfilter/nft_reject_inet.c | 7 ++++++- 5 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 60fa1530006b..02e28c529b29 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -8,6 +8,10 @@ struct nft_reject { extern const struct nla_policy nft_reject_policy[]; +int nft_reject_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data); + int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index c24f41c816b3..2c2553b9026c 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -46,6 +46,7 @@ static const struct nft_expr_ops nft_reject_ipv4_ops = { .eval = nft_reject_ipv4_eval, .init = nft_reject_init, .dump = nft_reject_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index 533cd5719c59..92bda9908bb9 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -47,6 +47,7 @@ static const struct nft_expr_ops nft_reject_ipv6_ops = { .eval = nft_reject_ipv6_eval, .init = nft_reject_init, .dump = nft_reject_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_ipv6_type __read_mostly = { diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 0522fc9bfb0a..c64de3f7379d 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -26,11 +26,27 @@ const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { }; EXPORT_SYMBOL_GPL(nft_reject_policy); +int nft_reject_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT)); +} +EXPORT_SYMBOL_GPL(nft_reject_validate); + int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); + int err; + + err = nft_reject_validate(ctx, expr, NULL); + if (err < 0) + return err; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 759ca5248a3d..e79d9ca2ffee 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -66,7 +66,11 @@ static int nft_reject_inet_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); - int icmp_code; + int icmp_code, err; + + err = nft_reject_validate(ctx, expr, NULL); + if (err < 0) + return err; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; @@ -124,6 +128,7 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .eval = nft_reject_inet_eval, .init = nft_reject_inet_init, .dump = nft_reject_inet_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { -- cgit v1.2.3 From 
960fa72f67f1be6891d63a5518860d1ae4e14b88 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 22 Aug 2016 22:57:56 +0800 Subject: netfilter: nft_meta: improve the validity check of pkttype set expr "meta pkttype set" is only supported on the prerouting chain of the bridge family and the ingress chain of the netdev family. But the validation check is incomplete, and the user can still add such rules on the input chain of the bridge family, for example: # nft add table bridge filter # nft add chain bridge filter input {type filter hook input \ priority 0 \;} # nft add chain bridge filter test # nft add rule bridge filter test meta pkttype set unicast # nft add rule bridge filter input jump test This patch fixes the problem. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 4 ++++ net/bridge/netfilter/nft_meta_bridge.c | 1 + net/netfilter/nft_meta.c | 17 +++++++++++++---- 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index d27588c8dbd9..1139cde0fdc5 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -36,4 +36,8 @@ void nft_meta_set_eval(const struct nft_expr *expr, void nft_meta_set_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr); +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data); + #endif diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 4b901d9f2e7c..ad47a921b701 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -86,6 +86,7 @@ static const struct nft_expr_ops nft_meta_bridge_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .validate = nft_meta_set_validate, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 2863f3493038..8a6bc7630912 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -291,10 +291,16 @@ int nft_meta_get_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_get_init); -static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx) +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { + struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; + if (priv->key != NFT_META_PKTTYPE) + return 0; + switch (ctx->afi->family) { case NFPROTO_BRIDGE: hooks = 1 << NF_BR_PRE_ROUTING; @@ -308,6 +314,7 @@ static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx) return nft_chain_validate_hooks(ctx->chain, hooks); } +EXPORT_SYMBOL_GPL(nft_meta_set_validate); int nft_meta_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -327,15 +334,16 @@ int nft_meta_set_init(const struct nft_ctx *ctx, len = sizeof(u8); break; case NFT_META_PKTTYPE: - err = nft_meta_set_init_pkttype(ctx); - if (err) - return err; len = sizeof(u8); break; default: return -EOPNOTSUPP; } + err = nft_meta_set_validate(ctx, expr, NULL); + if (err < 0) + return err; + priv->sreg = nft_parse_register(tb[NFTA_META_SREG]); err = nft_validate_register_load(priv->sreg, len); if (err < 0) @@ -407,6 +415,7 @@ static const struct nft_expr_ops nft_meta_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .validate = nft_meta_set_validate, }; static const struct nft_expr_ops * -- cgit v1.2.3 From
17de0a9ff3df8f54f2f47746d118112d4e61d973 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Aug 2016 11:33:58 +1000 Subject: iomap: don't set FIEMAP_EXTENT_MERGED for extent based filesystems Filesystems like XFS that use extents should not set the FIEMAP_EXTENT_MERGED flag in the fiemap extent structures. To allow for both behaviors with the upcoming gfs2 usage, split the iomap type field into type and flags, and only set FIEMAP_EXTENT_MERGED if the IOMAP_F_MERGED flag is set. The flags field will also come in handy for future features such as shared extents on reflink-enabled file systems. Reported-by: Andreas Gruenbacher Signed-off-by: Christoph Hellwig Acked-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/iomap.c | 5 ++++- include/linux/iomap.h | 8 +++++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/iomap.c b/fs/iomap.c index 0342254646e3..706270f21b35 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -428,9 +428,12 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, break; } + if (iomap->flags & IOMAP_F_MERGED) + flags |= FIEMAP_EXTENT_MERGED; + return fiemap_fill_next_extent(fi, iomap->offset, iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, - iomap->length, flags | FIEMAP_EXTENT_MERGED); + iomap->length, flags); } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 3267df461012..3d70ece10313 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -18,6 +18,11 @@ struct vm_fault; #define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */ #define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ +/* + * Flags for iomap mappings: + */ +#define IOMAP_F_MERGED 0x01 /* contains multiple blocks/extents */ + /* * Magic value for blkno: */ @@ -27,7 +32,8 @@ struct iomap { sector_t blkno; /* 1st sector of mapping, 512b units */ loff_t offset; /* file offset of mapping, bytes */ u64 length; /* length of mapping, bytes */ - int type; /* type of mapping */ + u16 type; /* type of mapping */ + u16 flags; /* flags for mapping */ struct block_device *bdev; /* block device for I/O */ }; -- cgit v1.2.3 From 61aaa0e8c1c15d9e045f0577f046be50f2f571ab Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Fri, 19 Aug 2016 22:02:48 +0200 Subject: cfg80211: Add stub for cfg80211_get_station() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows modules using this function (currently: batman-adv) to compile even if cfg80211 is not built at all, thus relaxing dependencies.
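As a minimal sketch (hypothetical caller, not from the patch), a module can now call the function unconditionally and treat the stub's -ENOENT as "no station found", with no build-time dependency on cfg80211:

/* hypothetical example: compiles with or without CONFIG_CFG80211 */
static bool peer_is_associated(struct net_device *dev, const u8 *mac)
{
	struct station_info sinfo = {};

	/* resolves to the static inline stub (returning -ENOENT) when
	 * cfg80211 is not built
	 */
	return cfg80211_get_station(dev, mac, &sinfo) == 0;
}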
Signed-off-by: Linus Lüssing Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9c23f4d33e06..beb7610d64e9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1102,6 +1102,7 @@ struct station_info { struct cfg80211_tid_stats pertid[IEEE80211_NUM_TIDS + 1]; }; +#if IS_ENABLED(CONFIG_CFG80211) /** * cfg80211_get_station - retrieve information about a given station * @dev: the device where the station is supposed to be connected to @@ -1114,6 +1115,14 @@ struct station_info { */ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo); +#else +static inline int cfg80211_get_station(struct net_device *dev, + const u8 *mac_addr, + struct station_info *sinfo) +{ + return -ENOENT; +} +#endif /** * enum monitor_flags - monitor flags -- cgit v1.2.3 From 0d025d271e55f3de21f0aaaf54b42d20404d2b23 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 30 Aug 2016 08:04:16 -0500 Subject: mm/usercopy: get rid of CONFIG_DEBUG_STRICT_USER_COPY_CHECKS There are three usercopy warnings which are currently being silenced for gcc 4.6 and newer: 1) "copy_from_user() buffer size is too small" compile warning/error This is a static warning which happens when object size and copy size are both const, and copy size > object size. I didn't see any false positives for this one. So the function warning attribute seems to be working fine here. Note this scenario is always a bug and so I think it should be changed to *always* be an error, regardless of CONFIG_DEBUG_STRICT_USER_COPY_CHECKS. 2) "copy_from_user() buffer size is not provably correct" compile warning This is another static warning which happens when I enable __compiletime_object_size() for new compilers (and CONFIG_DEBUG_STRICT_USER_COPY_CHECKS). It happens when object size is const, but copy size is *not*. In this case there's no way to compare the two at build time, so it gives the warning. (Note the warning is a byproduct of the fact that gcc has no way of knowing whether the overflow function will be called, so the call isn't dead code and the warning attribute is activated.) So this warning seems to only indicate "this is an unusual pattern, maybe you should check it out" rather than "this is a bug". I get 102(!) of these warnings with allyesconfig and the __compiletime_object_size() gcc check removed. I don't know if there are any real bugs hiding in there, but from looking at a small sample, I didn't see any. According to Kees, it does sometimes find real bugs. But the false positive rate seems high. 3) "Buffer overflow detected" runtime warning This is a runtime warning where object size is const, and copy size > object size. All three warnings (both static and runtime) were completely disabled for gcc 4.6 with the following commit: 2fb0815c9ee6 ("gcc4: disable __compiletime_object_size for GCC 4.6+") That commit mistakenly assumed that the false positives were caused by a gcc bug in __compiletime_object_size(). But in fact, __compiletime_object_size() seems to be working fine. The false positives were instead triggered by #2 above. (Though I don't have an explanation for why the warnings supposedly only started showing up in gcc 4.6.) So remove warning #2 to get rid of all the false positives, and re-enable warnings #1 and #3 by reverting the above commit. 
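A minimal sketch of the pattern behind #1 (hypothetical code, for illustration): both the object size and the copy size are compile-time constants and the copy is provably too large, so this is always a bug:

struct cfg { u32 flags; };	/* 4 bytes */

static int get_cfg(void __user *arg)
{
	struct cfg c;

	/* constant 16 > sizeof(c): rejected at compile time */
	if (copy_from_user(&c, arg, 16))
		return -EFAULT;
	return 0;
}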
Furthermore, since #1 is a real bug which is detected at compile time, upgrade it to always be an error. Having done all that, CONFIG_DEBUG_STRICT_USER_COPY_CHECKS is no longer needed. Signed-off-by: Josh Poimboeuf Cc: Kees Cook Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H . Peter Anvin" Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Brian Gerst Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Byungchul Park Cc: Nilay Vaish Signed-off-by: Linus Torvalds --- arch/parisc/Kconfig | 1 - arch/parisc/configs/c8000_defconfig | 1 - arch/parisc/configs/generic-64bit_defconfig | 1 - arch/parisc/include/asm/uaccess.h | 22 ++++----- arch/s390/Kconfig | 1 - arch/s390/configs/default_defconfig | 1 - arch/s390/configs/gcov_defconfig | 1 - arch/s390/configs/performance_defconfig | 1 - arch/s390/defconfig | 1 - arch/s390/include/asm/uaccess.h | 19 +++++--- arch/tile/Kconfig | 1 - arch/tile/include/asm/uaccess.h | 22 +++++---- arch/x86/Kconfig | 1 - arch/x86/include/asm/uaccess.h | 69 ++++------------------------- include/asm-generic/uaccess.h | 1 + include/linux/compiler-gcc.h | 2 +- lib/Kconfig.debug | 18 -------- lib/Makefile | 1 - lib/usercopy.c | 9 ---- 19 files changed, 45 insertions(+), 128 deletions(-) delete mode 100644 lib/usercopy.c (limited to 'include') diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index cd8778103165..af12c2db9bb8 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -1,6 +1,5 @@ config PARISC def_bool y - select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_MIGHT_HAVE_PC_PARPORT select HAVE_IDE select HAVE_OPROFILE diff --git a/arch/parisc/configs/c8000_defconfig b/arch/parisc/configs/c8000_defconfig index 1a8f6f95689e..f6a4c016304b 100644 --- a/arch/parisc/configs/c8000_defconfig +++ b/arch/parisc/configs/c8000_defconfig @@ -245,7 +245,6 @@ CONFIG_DEBUG_RT_MUTEXES=y CONFIG_PROVE_RCU_DELAY=y CONFIG_DEBUG_BLOCK_EXT_DEVT=y CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_KEYS=y # CONFIG_CRYPTO_HW is not set CONFIG_FONTS=y diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig index 7e0792658952..c564e6e1fa23 100644 --- a/arch/parisc/configs/generic-64bit_defconfig +++ b/arch/parisc/configs/generic-64bit_defconfig @@ -291,7 +291,6 @@ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y # CONFIG_SCHED_DEBUG is not set CONFIG_TIMER_STATS=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index 0f59fd9ca205..e9150487e20d 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -208,13 +208,13 @@ unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned lo #define __copy_to_user_inatomic __copy_to_user #define __copy_from_user_inatomic __copy_from_user -extern void copy_from_user_overflow(void) -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS - __compiletime_error("copy_from_user() buffer size is not provably correct") -#else - __compiletime_warning("copy_from_user() buffer size is not provably correct") -#endif -; +extern void __compiletime_error("usercopy buffer size is too small") +__bad_copy_user(void); + +static inline void copy_user_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, @@ -223,10 +223,12 @@ static inline unsigned long __must_check 
copy_from_user(void *to, int sz = __compiletime_object_size(to); int ret = -EFAULT; - if (likely(sz == -1 || !__builtin_constant_p(n) || sz >= n)) + if (likely(sz == -1 || sz >= n)) ret = __copy_from_user(to, from, n); - else - copy_from_user_overflow(); + else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); + else + __bad_copy_user(); return ret; } diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e751fe25d6ab..c109f073d454 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -68,7 +68,6 @@ config DEBUG_RODATA config S390 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index 26e0c7f08814..412b1bd21029 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -602,7 +602,6 @@ CONFIG_FAIL_FUTEX=y CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_IRQSOFF_TRACER=y CONFIG_PREEMPT_TRACER=y CONFIG_SCHED_TRACER=y diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig index 24879dab47bc..bec279eb4b93 100644 --- a/arch/s390/configs/gcov_defconfig +++ b/arch/s390/configs/gcov_defconfig @@ -552,7 +552,6 @@ CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_CPU_NOTIFIER_ERROR_INJECT=m CONFIG_PM_NOTIFIER_ERROR_INJECT=m CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_BLK_DEV_IO_TRACE=y # CONFIG_KPROBE_EVENT is not set CONFIG_TRACE_ENUM_MAP_FILE=y diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index a5c1e5f2a0ca..1751446a5bbb 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -549,7 +549,6 @@ CONFIG_TIMER_STATS=y CONFIG_RCU_TORTURE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_STACK_TRACER=y diff --git a/arch/s390/defconfig b/arch/s390/defconfig index 73610f2e3b4f..2d40ef0a6295 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -172,7 +172,6 @@ CONFIG_DEBUG_NOTIFIERS=y CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_RCU_TRACE=y CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 9b49cf1daa8f..95aefdba4be2 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -311,6 +311,14 @@ int __get_user_bad(void) __attribute__((noreturn)); #define __put_user_unaligned __put_user #define __get_user_unaligned __get_user +extern void __compiletime_error("usercopy buffer size is too small") +__bad_copy_user(void); + +static inline void copy_user_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} + /** * copy_to_user: - Copy a block of data into user space. * @to: Destination address, in user space. 
@@ -332,12 +340,6 @@ copy_to_user(void __user *to, const void *from, unsigned long n) return __copy_to_user(to, from, n); } -void copy_from_user_overflow(void) -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS -__compiletime_warning("copy_from_user() buffer size is not provably correct") -#endif -; - /** * copy_from_user: - Copy a block of data from user space. * @to: Destination address, in kernel space. @@ -362,7 +364,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n) might_fault(); if (unlikely(sz != -1 && sz < n)) { - copy_from_user_overflow(); + if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); + else + __bad_copy_user(); return n; } return __copy_from_user(to, from, n); diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 4820a02838ac..78da75b670bc 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -4,7 +4,6 @@ config TILE def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_WANT_FRAME_POINTERS diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h index 0a9c4265763b..a77369e91e54 100644 --- a/arch/tile/include/asm/uaccess.h +++ b/arch/tile/include/asm/uaccess.h @@ -416,14 +416,13 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS -/* - * There are still unprovable places in the generic code as of 2.6.34, so this - * option is not really compatible with -Werror, which is more useful in - * general. - */ -extern void copy_from_user_overflow(void) - __compiletime_warning("copy_from_user() size is not provably correct"); +extern void __compiletime_error("usercopy buffer size is too small") +__bad_copy_user(void); + +static inline void copy_user_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, @@ -433,14 +432,13 @@ static inline unsigned long __must_check copy_from_user(void *to, if (likely(sz == -1 || sz >= n)) n = _copy_from_user(to, from, n); + else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); else - copy_from_user_overflow(); + __bad_copy_user(); return n; } -#else -#define copy_from_user _copy_from_user -#endif #ifdef __tilegx__ /** diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c580d8c33562..2a1f0ce7c59a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,7 +24,6 @@ config X86 select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a0ae610b9280..c3f291195294 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -697,43 +697,14 @@ unsigned long __must_check _copy_from_user(void *to, const void __user *from, unsigned long __must_check _copy_to_user(void __user *to, const void *from, unsigned n); -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS -# define copy_user_diag __compiletime_error -#else -# define copy_user_diag __compiletime_warning -#endif - -extern void copy_user_diag("copy_from_user() buffer size is too small") -copy_from_user_overflow(void); -extern void copy_user_diag("copy_to_user() buffer size is too small") 
-copy_to_user_overflow(void) __asm__("copy_from_user_overflow"); - -#undef copy_user_diag - -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS - -extern void -__compiletime_warning("copy_from_user() buffer size is not provably correct") -__copy_from_user_overflow(void) __asm__("copy_from_user_overflow"); -#define __copy_from_user_overflow(size, count) __copy_from_user_overflow() - -extern void -__compiletime_warning("copy_to_user() buffer size is not provably correct") -__copy_to_user_overflow(void) __asm__("copy_from_user_overflow"); -#define __copy_to_user_overflow(size, count) __copy_to_user_overflow() - -#else +extern void __compiletime_error("usercopy buffer size is too small") +__bad_copy_user(void); -static inline void -__copy_from_user_overflow(int size, unsigned long count) +static inline void copy_user_overflow(int size, unsigned long count) { WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } -#define __copy_to_user_overflow __copy_from_user_overflow - -#endif - static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { @@ -743,31 +714,13 @@ copy_from_user(void *to, const void __user *from, unsigned long n) kasan_check_write(to, n); - /* - * While we would like to have the compiler do the checking for us - * even in the non-constant size case, any false positives there are - * a problem (especially when DEBUG_STRICT_USER_COPY_CHECKS, but even - * without - the [hopefully] dangerous looking nature of the warning - * would make people go look at the respecitive call sites over and - * over again just to find that there's no problem). - * - * And there are cases where it's just not realistic for the compiler - * to prove the count to be in range. For example when multiple call - * sites of a helper function - perhaps in different source files - - * all doing proper range checking, yet the helper function not doing - * so again. - * - * Therefore limit the compile time checking to the constant size - * case, and do only runtime checking for non-constant sizes. - */ - if (likely(sz < 0 || sz >= n)) { check_object_size(to, n, false); n = _copy_from_user(to, from, n); - } else if (__builtin_constant_p(n)) - copy_from_user_overflow(); + } else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); else - __copy_from_user_overflow(sz, n); + __bad_copy_user(); return n; } @@ -781,21 +734,17 @@ copy_to_user(void __user *to, const void *from, unsigned long n) might_fault(); - /* See the comment in copy_from_user() above. */ if (likely(sz < 0 || sz >= n)) { check_object_size(from, n, true); n = _copy_to_user(to, from, n); - } else if (__builtin_constant_p(n)) - copy_to_user_overflow(); + } else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); else - __copy_to_user_overflow(sz, n); + __bad_copy_user(); return n; } -#undef __copy_from_user_overflow -#undef __copy_to_user_overflow - /* * We rely on the nested NMI work to allow atomic faults from the NMI path; the * nested NMI paths are careful to preserve CR2. diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index 1bfa602958f2..5dea1fb6979c 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -72,6 +72,7 @@ struct exception_table_entry /* Returns 0 if exception not found and fixup otherwise. 
*/ extern unsigned long search_exception_table(unsigned long); + /* * architectures with an MMU should override these two */ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 8dbc8929a6a0..573c5a18908f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -158,7 +158,7 @@ #define __compiler_offsetof(a, b) \ __builtin_offsetof(a, b) -#if GCC_VERSION >= 40100 && GCC_VERSION < 40600 +#if GCC_VERSION >= 40100 # define __compiletime_object_size(obj) __builtin_object_size(obj, 0) #endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2307d7c89dac..2e2cca509231 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1686,24 +1686,6 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. -config ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS - bool - -config DEBUG_STRICT_USER_COPY_CHECKS - bool "Strict user copy size checks" - depends on ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS - depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING - help - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time failures. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, say N. - source kernel/trace/Kconfig menu "Runtime Testing" diff --git a/lib/Makefile b/lib/Makefile index cfa68eb269e4..5dc77a8ec297 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -24,7 +24,6 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o -obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o lib-$(CONFIG_HAS_DMA) += dma-noop.o diff --git a/lib/usercopy.c b/lib/usercopy.c deleted file mode 100644 index 4f5b1ddbcd25..000000000000 --- a/lib/usercopy.c +++ /dev/null @@ -1,9 +0,0 @@ -#include -#include -#include - -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); -- cgit v1.2.3 From 5db4f7f80d165fc9725f356e99feec409e446baa Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 16 Aug 2016 15:06:54 +0300 Subject: Revert "tty/serial/8250: use mctrl_gpio helpers" Serial console is broken in v4.8-rcX. Mika and I independently bisected down to commit 4ef03d328769 ("tty/serial/8250: use mctrl_gpio helpers"). Since neither the author nor anyone else has proposed a solution, we revert it for now. This reverts commit 4ef03d328769eddbfeca1f1c958fdb181a69c341.
Link: https://lkml.kernel.org/r/20160809130229.GN1729@lahna.fi.intel.com Signed-off-by: Andy Shevchenko Tested-by: Heikki Krogerus Tested-by: Mika Westerberg Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/serial/8250.txt | 19 ------------ drivers/tty/serial/8250/8250.h | 35 +---------------------- drivers/tty/serial/8250/8250_core.c | 9 ------ drivers/tty/serial/8250/8250_omap.c | 31 +++++++++----------- drivers/tty/serial/8250/8250_port.c | 7 +---- drivers/tty/serial/8250/Kconfig | 1 - include/linux/serial_8250.h | 1 - 7 files changed, 15 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/Documentation/devicetree/bindings/serial/8250.txt b/Documentation/devicetree/bindings/serial/8250.txt index f5561ac7e17e..936ab5b87324 100644 --- a/Documentation/devicetree/bindings/serial/8250.txt +++ b/Documentation/devicetree/bindings/serial/8250.txt @@ -42,9 +42,6 @@ Optional properties: - auto-flow-control: one way to enable automatic flow control support. The driver is allowed to detect support for the capability even without this property. -- {rts,cts,dtr,dsr,rng,dcd}-gpios: specify a GPIO for RTS/CTS/DTR/DSR/RI/DCD - line respectively. It will use specified GPIO instead of the peripheral - function pin for the UART feature. If unsure, don't specify this property. Note: * fsl,ns16550: @@ -66,19 +63,3 @@ Example: interrupts = <10>; reg-shift = <2>; }; - -Example for OMAP UART using GPIO-based modem control signals: - - uart4: serial@49042000 { - compatible = "ti,omap3-uart"; - reg = <0x49042000 0x400>; - interrupts = <80>; - ti,hwmods = "uart4"; - clock-frequency = <48000000>; - cts-gpios = <&gpio3 5 GPIO_ACTIVE_LOW>; - rts-gpios = <&gpio3 6 GPIO_ACTIVE_LOW>; - dtr-gpios = <&gpio1 12 GPIO_ACTIVE_LOW>; - dsr-gpios = <&gpio1 13 GPIO_ACTIVE_LOW>; - dcd-gpios = <&gpio1 14 GPIO_ACTIVE_LOW>; - rng-gpios = <&gpio1 15 GPIO_ACTIVE_LOW>; - }; diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h index 122e0e4029fe..1a16feac9a36 100644 --- a/drivers/tty/serial/8250/8250.h +++ b/drivers/tty/serial/8250/8250.h @@ -15,8 +15,6 @@ #include #include -#include "../serial_mctrl_gpio.h" - struct uart_8250_dma { int (*tx_dma)(struct uart_8250_port *p); int (*rx_dma)(struct uart_8250_port *p); @@ -133,43 +131,12 @@ void serial8250_em485_destroy(struct uart_8250_port *p); static inline void serial8250_out_MCR(struct uart_8250_port *up, int value) { - int mctrl_gpio = 0; - serial_out(up, UART_MCR, value); - - if (value & UART_MCR_RTS) - mctrl_gpio |= TIOCM_RTS; - if (value & UART_MCR_DTR) - mctrl_gpio |= TIOCM_DTR; - - mctrl_gpio_set(up->gpios, mctrl_gpio); } static inline int serial8250_in_MCR(struct uart_8250_port *up) { - int mctrl, mctrl_gpio = 0; - - mctrl = serial_in(up, UART_MCR); - - /* save current MCR values */ - if (mctrl & UART_MCR_RTS) - mctrl_gpio |= TIOCM_RTS; - if (mctrl & UART_MCR_DTR) - mctrl_gpio |= TIOCM_DTR; - - mctrl_gpio = mctrl_gpio_get_outputs(up->gpios, &mctrl_gpio); - - if (mctrl_gpio & TIOCM_RTS) - mctrl |= UART_MCR_RTS; - else - mctrl &= ~UART_MCR_RTS; - - if (mctrl_gpio & TIOCM_DTR) - mctrl |= UART_MCR_DTR; - else - mctrl &= ~UART_MCR_DTR; - - return mctrl; + return serial_in(up, UART_MCR); } #if defined(__alpha__) && !defined(CONFIG_PCI) diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 13ad5c3d2e68..dcf43f66404f 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -974,8 +974,6 @@ int serial8250_register_8250_port(struct uart_8250_port *up) uart 
= serial8250_find_match_or_unused(&up->port); if (uart && uart->port.type != PORT_8250_CIR) { - struct mctrl_gpios *gpios; - if (uart->port.dev) uart_remove_one_port(&serial8250_reg, &uart->port); @@ -1013,13 +1011,6 @@ int serial8250_register_8250_port(struct uart_8250_port *up) if (up->port.flags & UPF_FIXED_TYPE) uart->port.type = up->port.type; - gpios = mctrl_gpio_init(&uart->port, 0); - if (IS_ERR(gpios)) { - if (PTR_ERR(gpios) != -ENOSYS) - return PTR_ERR(gpios); - } else - uart->gpios = gpios; - serial8250_set_defaults(uart); /* Possibly override default I/O functions. */ diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c index e14982f36a04..61ad6c3b20a0 100644 --- a/drivers/tty/serial/8250/8250_omap.c +++ b/drivers/tty/serial/8250/8250_omap.c @@ -134,21 +134,18 @@ static void omap8250_set_mctrl(struct uart_port *port, unsigned int mctrl) serial8250_do_set_mctrl(port, mctrl); - if (IS_ERR_OR_NULL(mctrl_gpio_to_gpiod(up->gpios, - UART_GPIO_RTS))) { - /* - * Turn off autoRTS if RTS is lowered and restore autoRTS - * setting if RTS is raised - */ - lcr = serial_in(up, UART_LCR); - serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); - if ((mctrl & TIOCM_RTS) && (port->status & UPSTAT_AUTORTS)) - priv->efr |= UART_EFR_RTS; - else - priv->efr &= ~UART_EFR_RTS; - serial_out(up, UART_EFR, priv->efr); - serial_out(up, UART_LCR, lcr); - } + /* + * Turn off autoRTS if RTS is lowered and restore autoRTS setting + * if RTS is raised + */ + lcr = serial_in(up, UART_LCR); + serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); + if ((mctrl & TIOCM_RTS) && (port->status & UPSTAT_AUTORTS)) + priv->efr |= UART_EFR_RTS; + else + priv->efr &= ~UART_EFR_RTS; + serial_out(up, UART_EFR, priv->efr); + serial_out(up, UART_LCR, lcr); } /* @@ -449,9 +446,7 @@ static void omap_8250_set_termios(struct uart_port *port, priv->efr = 0; up->port.status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS | UPSTAT_AUTOXOFF); - if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW - && IS_ERR_OR_NULL(mctrl_gpio_to_gpiod(up->gpios, - UART_GPIO_RTS))) { + if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW) { /* Enable AUTOCTS (autoRTS is enabled when RTS is raised) */ up->port.status |= UPSTAT_AUTOCTS | UPSTAT_AUTORTS; priv->efr |= UART_EFR_CTS; diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 7481b95c6d84..bdfa659b9606 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1618,8 +1618,6 @@ static void serial8250_disable_ms(struct uart_port *port) if (up->bugs & UART_BUG_NOMSR) return; - mctrl_gpio_disable_ms(up->gpios); - up->ier &= ~UART_IER_MSI; serial_port_out(port, UART_IER, up->ier); } @@ -1632,8 +1630,6 @@ static void serial8250_enable_ms(struct uart_port *port) if (up->bugs & UART_BUG_NOMSR) return; - mctrl_gpio_enable_ms(up->gpios); - up->ier |= UART_IER_MSI; serial8250_rpm_get(up); @@ -1917,8 +1913,7 @@ unsigned int serial8250_do_get_mctrl(struct uart_port *port) ret |= TIOCM_DSR; if (status & UART_MSR_CTS) ret |= TIOCM_CTS; - - return mctrl_gpio_get(up->gpios, &ret); + return ret; } EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl); diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig index c9ec839a5ddf..7c6f7afca5dd 100644 --- a/drivers/tty/serial/8250/Kconfig +++ b/drivers/tty/serial/8250/Kconfig @@ -6,7 +6,6 @@ config SERIAL_8250 tristate "8250/16550 and compatible serial support" select SERIAL_CORE - select SERIAL_MCTRL_GPIO if GPIOLIB ---help--- This selects whether 
you want to include the driver for the standard serial ports. The standard answer is Y. People who might say N diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 923266cd294a..48ec7651989b 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -111,7 +111,6 @@ struct uart_8250_port { * if no_console_suspend */ unsigned char probe; - struct mctrl_gpios *gpios; #define UART_PROBE_RSA (1 << 0) /* -- cgit v1.2.3 From cd81a9170e69e018bbaba547c1fd85a585f5697a Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 23 Aug 2016 16:20:38 +0200 Subject: mm: introduce get_task_exe_file For more convenient access if one has a pointer to the task. As a minor nit take advantage of the fact that only task lock + rcu are needed to safely grab ->exe_file. This saves mm refcount dance. Use the helper in proc_exe_link. Signed-off-by: Mateusz Guzik Acked-by: Konstantin Khlebnikov Acked-by: Richard Guy Briggs Cc: # 4.3.x Signed-off-by: Paul Moore --- fs/proc/base.c | 7 +------ include/linux/mm.h | 1 + kernel/fork.c | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0d163a84082d..da8b1943ba04 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1552,18 +1552,13 @@ static const struct file_operations proc_pid_set_comm_operations = { static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; - struct mm_struct *mm; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; - mm = get_task_mm(task); + exe_file = get_task_exe_file(task); put_task_struct(task); - if (!mm) - return -ENOENT; - exe_file = get_mm_exe_file(mm); - mmput(mm); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f468e0d2534..004c73a988b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1987,6 +1987,7 @@ extern void mm_drop_all_locks(struct mm_struct *mm); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); +extern struct file *get_task_exe_file(struct task_struct *task); extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); diff --git a/kernel/fork.c b/kernel/fork.c index d277e83ed3e0..42451aeb245f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -773,6 +773,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm) } EXPORT_SYMBOL(get_mm_exe_file); +/** + * get_task_exe_file - acquire a reference to the task's executable file + * + * Returns %NULL if task's mm (if any) has no associated executable file or + * this is a kernel thread with borrowed mm (see the comment above get_task_mm). + * User must release file via fput(). 
+ */ +struct file *get_task_exe_file(struct task_struct *task) +{ + struct file *exe_file = NULL; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (!(task->flags & PF_KTHREAD)) + exe_file = get_mm_exe_file(mm); + } + task_unlock(task); + return exe_file; +} +EXPORT_SYMBOL(get_task_exe_file); + /** * get_task_mm - acquire a reference to the task's mm * -- cgit v1.2.3 From 2a3a2a3f35249412e35fbb48b743348c40373409 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 1 Sep 2016 11:11:59 +0200 Subject: ovl: don't cache acl on overlay layer Some operations (setxattr/chmod) can make the cached acl stale. We either need to clear overlay's acl cache for the affected inode or prevent acl caching on the overlay altogether. Preventing caching has the following advantages: - no double caching, less memory used - overlay cache doesn't go stale when fs clears it's own cache Possible disadvantage is performance loss. If that becomes a problem get_acl() can be optimized for overlayfs. This patch disables caching by pre setting i_*acl to a value that - has bit 0 set, so is_uncached_acl() will return true - is not equal to ACL_NOT_CACHED, so get_acl() will not overwrite it The constant -3 was chosen for this purpose. Fixes: 39a25b2b3762 ("ovl: define ->get_acl() for overlay inodes") Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 3 +++ include/linux/fs.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index d50d1ead1b6f..47a4f33df47b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -416,6 +416,9 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode) inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_flags |= S_NOCMTIME; +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; +#endif mode &= S_IFMT; switch (mode) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 3523bf62f328..901e25d495cc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -574,6 +574,7 @@ static inline void mapping_allow_writable(struct address_space *mapping) struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) +#define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * uncached_acl_sentinel(struct task_struct *task) -- cgit v1.2.3 From 6aa303defb7454a2520c4ddcdf6b081f62a15890 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 1 Sep 2016 16:14:55 -0700 Subject: mm, vmscan: only allocate and reclaim from zones with pages managed by the buddy allocator Firmware Assisted Dump (FA_DUMP) on ppc64 reserves substantial amounts of memory when booting a secondary kernel. Srikar Dronamraju reported that multiple nodes may have no memory managed by the buddy allocator but still return true for populated_zone(). Commit 1d82de618ddd ("mm, vmscan: make kswapd reclaim in terms of nodes") was reported to cause kswapd to spin at 100% CPU usage when fadump was enabled. The old code happened to deal with the situation of a populated node with zero free pages by co-incidence but the current code tries to reclaim populated zones without realising that is impossible. We cannot just convert populated_zone() as many existing users really need to check for present_pages. This patch introduces a managed_zone() helper and uses it in the few cases where it is critical that the check is made for managed pages -- zonelist construction and page reclaim. 
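As an illustration (hypothetical helper, not from the patch), the distinction the new helper draws looks roughly like this:

static bool zone_has_reclaimable_pages(struct zone *zone)
{
	/* On an fadump secondary kernel, present_pages can be large
	 * while managed_pages is 0 (everything reserved at boot), so
	 * populated_zone() is true although the buddy allocator -- and
	 * hence reclaim -- has nothing to work with in the zone.
	 */
	return managed_zone(zone);
}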
Link: http://lkml.kernel.org/r/20160831195104.GB8119@techsingularity.net Signed-off-by: Mel Gorman Reported-by: Srikar Dronamraju Tested-by: Srikar Dronamraju Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 16 ++++++++++++++-- mm/page_alloc.c | 4 ++-- mm/vmscan.c | 22 +++++++++++----------- 3 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d572b78b65e1..7f2ae99e5daf 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -828,9 +828,21 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) -static inline int populated_zone(struct zone *zone) +/* + * Returns true if a zone has pages managed by the buddy allocator. + * All the reclaim decisions have to use this function rather than + * populated_zone(). If the whole zone is reserved then we can easily + * end up with populated_zone() && !managed_zone(). + */ +static inline bool managed_zone(struct zone *zone) +{ + return zone->managed_pages; +} + +/* Returns true if a zone has memory */ +static inline bool populated_zone(struct zone *zone) { - return (!!zone->present_pages); + return zone->present_pages; } extern int movable_zone; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7791a03f8deb..a2214c64ed3c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4360,7 +4360,7 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, do { zone_type--; zone = pgdat->node_zones + zone_type; - if (populated_zone(zone)) { + if (managed_zone(zone)) { zoneref_set_zone(zone, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); @@ -4598,7 +4598,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) for (j = 0; j < nr_nodes; j++) { node = node_order[j]; z = &NODE_DATA(node)->node_zones[zone_type]; - if (populated_zone(z)) { + if (managed_zone(z)) { zoneref_set_zone(z, &zonelist->_zonerefs[pos++]); check_highest_zone(zone_type); diff --git a/mm/vmscan.c b/mm/vmscan.c index 374d95d04178..b1e12a1ea9cf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1665,7 +1665,7 @@ static bool inactive_reclaimable_pages(struct lruvec *lruvec, for (zid = sc->reclaim_idx; zid >= 0; zid--) { zone = &pgdat->node_zones[zid]; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE + @@ -2036,7 +2036,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, struct zone *zone = &pgdat->node_zones[zid]; unsigned long inactive_zone, active_zone; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; inactive_zone = zone_page_state(zone, @@ -2171,7 +2171,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = &pgdat->node_zones[z]; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; total_high_wmark += high_wmark_pages(zone); @@ -2510,7 +2510,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, /* If compaction would go ahead or the allocation would succeed, stop */ for (z = 0; z <= sc->reclaim_idx; z++) { struct zone *zone = &pgdat->node_zones[z]; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { @@ -2840,7 +2840,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 
0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!populated_zone(zone) || + if (!managed_zone(zone) || pgdat_reclaimable_pages(pgdat) == 0) continue; @@ -3141,7 +3141,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; if (!zone_balanced(zone, order, classzone_idx)) @@ -3169,7 +3169,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, sc->nr_to_reclaim = 0; for (z = 0; z <= sc->reclaim_idx; z++) { zone = pgdat->node_zones + z; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); @@ -3242,7 +3242,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) if (buffer_heads_over_limit) { for (i = MAX_NR_ZONES - 1; i >= 0; i--) { zone = pgdat->node_zones + i; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; sc.reclaim_idx = i; @@ -3262,7 +3262,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) */ for (i = classzone_idx; i >= 0; i--) { zone = pgdat->node_zones + i; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; if (zone_balanced(zone, sc.order, classzone_idx)) @@ -3508,7 +3508,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) pg_data_t *pgdat; int z; - if (!populated_zone(zone)) + if (!managed_zone(zone)) return; if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) @@ -3522,7 +3522,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) /* Only wake kswapd if all zones are unbalanced */ for (z = 0; z <= classzone_idx; z++) { zone = pgdat->node_zones + z; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; if (zone_balanced(zone, order, classzone_idx)) -- cgit v1.2.3 From 7e932159901183283cd82d797bc9a7c681e48e9c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 1 Sep 2016 16:15:01 -0700 Subject: treewide: remove references to the now unnecessary DEFINE_PCI_DEVICE_TABLE It's been eliminated from the sources, remove it from everywhere else. Link: http://lkml.kernel.org/r/076eff466fd7edb550c25c8b25d76924ca0eba62.1472660229.git.joe@perches.com Signed-off-by: Joe Perches Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: Bjorn Helgaas Cc: Andy Whitcroft Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/PCI/pci.txt | 1 - include/linux/pci.h | 9 --------- scripts/checkpatch.pl | 9 --------- scripts/tags.sh | 1 - 4 files changed, 20 deletions(-) (limited to 'include') diff --git a/Documentation/PCI/pci.txt b/Documentation/PCI/pci.txt index 123881f62219..77f49dc5be23 100644 --- a/Documentation/PCI/pci.txt +++ b/Documentation/PCI/pci.txt @@ -124,7 +124,6 @@ initialization with a pointer to a structure describing the driver The ID table is an array of struct pci_device_id entries ending with an all-zero entry. Definitions with static const are generally preferred. -Use of the deprecated macro DEFINE_PCI_DEVICE_TABLE should be avoided. 
Each entry consists of: diff --git a/include/linux/pci.h b/include/linux/pci.h index fbc1fa625c3e..0ab835965669 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -682,15 +682,6 @@ struct pci_driver { #define to_pci_driver(drv) container_of(drv, struct pci_driver, driver) -/** - * DEFINE_PCI_DEVICE_TABLE - macro used to describe a pci device table - * @_table: device table name - * - * This macro is deprecated and should not be used in new code. - */ -#define DEFINE_PCI_DEVICE_TABLE(_table) \ - const struct pci_device_id _table[] - /** * PCI_DEVICE - macro used to describe a specific pci device * @vend: the 16 bit PCI Vendor ID diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4de3cc42fc50..206a6b346a8d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3570,15 +3570,6 @@ sub process { } } -# check for uses of DEFINE_PCI_DEVICE_TABLE - if ($line =~ /\bDEFINE_PCI_DEVICE_TABLE\s*\(\s*(\w+)\s*\)\s*=/) { - if (WARN("DEFINE_PCI_DEVICE_TABLE", - "Prefer struct pci_device_id over deprecated DEFINE_PCI_DEVICE_TABLE\n" . $herecurr) && - $fix) { - $fixed[$fixlinenr] =~ s/\b(?:static\s+|)DEFINE_PCI_DEVICE_TABLE\s*\(\s*(\w+)\s*\)\s*=\s*/static const struct pci_device_id $1\[\] = /; - } - } - # check for new typedefs, only function parameters and sparse annotations # make sense. if ($line =~ /\btypedef\s/ && diff --git a/scripts/tags.sh b/scripts/tags.sh index ed7eef24ef89..b3775a9604ea 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -206,7 +206,6 @@ regex_c=( '/\ Date: Thu, 1 Sep 2016 16:15:07 -0700 Subject: mm, mempolicy: task->mempolicy must be NULL before dropping final reference KASAN allocates memory from the page allocator as part of kmem_cache_free(), and that can reference current->mempolicy through any number of allocation functions. It needs to be NULL'd out before the final reference is dropped to prevent a use-after-free bug: BUG: KASAN: use-after-free in alloc_pages_current+0x363/0x370 at addr ffff88010b48102c CPU: 0 PID: 15425 Comm: trinity-c2 Not tainted 4.8.0-rc2+ #140 ... Call Trace: dump_stack kasan_object_err kasan_report_error __asan_report_load2_noabort alloc_pages_current <-- use after free depot_save_stack save_stack kasan_slab_free kmem_cache_free __mpol_put <-- free do_exit This patch sets current->mempolicy to NULL before dropping the final reference. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1608301442180.63329@chino.kir.corp.google.com Fixes: cd11016e5f52 ("mm, kasan: stackdepot implementation. 
Enable stackdepot for SLAB") Signed-off-by: David Rientjes Reported-by: Vegard Nossum Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: [4.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 4 ++++ kernel/exit.c | 7 +------ mm/mempolicy.c | 17 +++++++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 4429d255c8ab..5e5b2969d931 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -195,6 +195,7 @@ static inline bool vma_migratable(struct vm_area_struct *vma) } extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); +extern void mpol_put_task_policy(struct task_struct *); #else @@ -297,5 +298,8 @@ static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, return -1; /* no node preference */ } +static inline void mpol_put_task_policy(struct task_struct *task) +{ +} #endif /* CONFIG_NUMA */ #endif diff --git a/kernel/exit.c b/kernel/exit.c index 2f974ae042a6..091a78be3b09 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -848,12 +848,7 @@ void do_exit(long code) TASKS_RCU(preempt_enable()); exit_notify(tsk, group_dead); proc_exit_connector(tsk); -#ifdef CONFIG_NUMA - task_lock(tsk); - mpol_put(tsk->mempolicy); - tsk->mempolicy = NULL; - task_unlock(tsk); -#endif + mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d8c4e38fb5f4..2da72a5b6ecc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2336,6 +2336,23 @@ out: return ret; } +/* + * Drop the (possibly final) reference to task->mempolicy. It needs to be + * dropped after task->mempolicy is set to NULL so that any allocation done as + * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed + * policy. + */ +void mpol_put_task_policy(struct task_struct *task) +{ + struct mempolicy *pol; + + task_lock(task); + pol = task->mempolicy; + task->mempolicy = NULL; + task_unlock(task); + mpol_put(pol); +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); -- cgit v1.2.3 From 3feab13c919f99b0a17d0ca22ae00cf90f5d3fd1 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 16 Aug 2016 16:59:52 +0100 Subject: ACPI / drivers: fix typo in ACPI_DECLARE_PROBE_ENTRY macro When the ACPI_DECLARE_PROBE_ENTRY macro was added in commit e647b532275b ("ACPI: Add early device probing infrastructure"), a stub macro adding an unused entry was added for the !CONFIG_ACPI Kconfig option case to make sure kernel code making use of the macro did not require to be guarded within CONFIG_ACPI in order to be compiled. The stub macro was never used since all kernel code that defines ACPI_DECLARE_PROBE_ENTRY entries is currently guarded within CONFIG_ACPI; it contains a typo that should be nonetheless fixed. Fix the typo in the stub (ie !CONFIG_ACPI) ACPI_DECLARE_PROBE_ENTRY() macro so that it can actually be used if needed. Signed-off-by: Lorenzo Pieralisi Fixes: e647b532275b (ACPI: Add early device probing infrastructure) Cc: 4.4+ # 4.4+ Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4d8452c2384b..c5eaf2f80a4c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1056,7 +1056,7 @@ static inline struct fwnode_handle *acpi_get_next_subnode(struct device *dev, return NULL; } -#define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, validate, data, fn) \ +#define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \ static const void * __acpi_table_##name[] \ __attribute__((unused)) \ = { (void *) table_id, \ -- cgit v1.2.3 From 3f37ec79dd21fbdbbab8143a48a87272b22fef22 Mon Sep 17 00:00:00 2001 From: RafaÅ‚ MiÅ‚ecki Date: Mon, 25 Jul 2016 20:33:56 +0200 Subject: bcma: support BCM53573 series of wireless SoCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCM53573 seems to be the first series of Northstar family with wireless on the chip. The base models are BCM53573-s (A0, A1) and there is also BCM47189B0 which seems to be some small modification. The only problem with these chipsets seems to be watchdog. It's totally unavailable on 53573A0 / 53573A1 and preferable PMU watchdog is broken on 53573B0 / 53573B1. Signed-off-by: RafaÅ‚ MiÅ‚ecki Signed-off-by: Kalle Valo --- drivers/bcma/driver_chipcommon.c | 32 +++++++++++++++++++++++++++++--- include/linux/bcma/bcma.h | 3 +++ 2 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/bcma/driver_chipcommon.c b/drivers/bcma/driver_chipcommon.c index 921ce1834673..b4f6520e74f0 100644 --- a/drivers/bcma/driver_chipcommon.c +++ b/drivers/bcma/driver_chipcommon.c @@ -36,12 +36,31 @@ u32 bcma_chipco_get_alp_clock(struct bcma_drv_cc *cc) } EXPORT_SYMBOL_GPL(bcma_chipco_get_alp_clock); +static bool bcma_core_cc_has_pmu_watchdog(struct bcma_drv_cc *cc) +{ + struct bcma_bus *bus = cc->core->bus; + + if (cc->capabilities & BCMA_CC_CAP_PMU) { + if (bus->chipinfo.id == BCMA_CHIP_ID_BCM53573) { + WARN(bus->chipinfo.rev <= 1, "No watchdog available\n"); + /* 53573B0 and 53573B1 have bugged PMU watchdog. It can + * be enabled but timer can't be bumped. Use CC one + * instead. 
+ */ + return false; + } + return true; + } else { + return false; + } +} + static u32 bcma_chipco_watchdog_get_max_timer(struct bcma_drv_cc *cc) { struct bcma_bus *bus = cc->core->bus; u32 nb; - if (cc->capabilities & BCMA_CC_CAP_PMU) { + if (bcma_core_cc_has_pmu_watchdog(cc)) { if (bus->chipinfo.id == BCMA_CHIP_ID_BCM4706) nb = 32; else if (cc->core->id.rev < 26) @@ -95,9 +114,16 @@ static int bcma_chipco_watchdog_ticks_per_ms(struct bcma_drv_cc *cc) int bcma_chipco_watchdog_register(struct bcma_drv_cc *cc) { + struct bcma_bus *bus = cc->core->bus; struct bcm47xx_wdt wdt = {}; struct platform_device *pdev; + if (bus->chipinfo.id == BCMA_CHIP_ID_BCM53573 && + bus->chipinfo.rev <= 1) { + pr_debug("No watchdog on 53573A0 / 53573A1\n"); + return 0; + } + wdt.driver_data = cc; wdt.timer_set = bcma_chipco_watchdog_timer_set_wdt; wdt.timer_set_ms = bcma_chipco_watchdog_timer_set_ms_wdt; @@ -105,7 +131,7 @@ int bcma_chipco_watchdog_register(struct bcma_drv_cc *cc) bcma_chipco_watchdog_get_max_timer(cc) / cc->ticks_per_ms; pdev = platform_device_register_data(NULL, "bcm47xx-wdt", - cc->core->bus->num, &wdt, + bus->num, &wdt, sizeof(wdt)); if (IS_ERR(pdev)) return PTR_ERR(pdev); @@ -217,7 +243,7 @@ u32 bcma_chipco_watchdog_timer_set(struct bcma_drv_cc *cc, u32 ticks) u32 maxt; maxt = bcma_chipco_watchdog_get_max_timer(cc); - if (cc->capabilities & BCMA_CC_CAP_PMU) { + if (bcma_core_cc_has_pmu_watchdog(cc)) { if (ticks == 1) ticks = 2; else if (ticks > maxt) diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 3db25df396cb..8eeedb2db924 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -205,6 +205,9 @@ struct bcma_host_ops { #define BCMA_PKG_ID_BCM4709 0 #define BCMA_CHIP_ID_BCM47094 53030 #define BCMA_CHIP_ID_BCM53018 53018 +#define BCMA_CHIP_ID_BCM53573 53573 +#define BCMA_PKG_ID_BCM53573 0 +#define BCMA_PKG_ID_BCM47189 1 /* Board types (on PCI usually equals to the subsystem dev id) */ /* BCM4313 */ -- cgit v1.2.3 From 24b27fc4cdf9e10c5e79e5923b6b7c2c5c95096c Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Thu, 1 Sep 2016 22:18:34 -0700 Subject: bonding: Fix bonding crash Following few steps will crash kernel - (a) Create bonding master > modprobe bonding miimon=50 (b) Create macvlan bridge on eth2 > ip link add link eth2 dev mvl0 address aa:0:0:0:0:01 \ type macvlan (c) Now try adding eth2 into the bond > echo +eth2 > /sys/class/net/bond0/bonding/slaves Bonding does lots of things before checking if the device enslaved is busy or not. In this case when the notifier call-chain sends notifications, the bond_netdev_event() assumes that the rx_handler /rx_handler_data is registered while the bond_enslave() hasn't progressed far enough to register rx_handler for the new slave. This patch adds a rx_handler check that can be performed right at the beginning of the enslave code to avoid getting into this situation. Signed-off-by: Mahesh Bandewar Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 7 ++++--- include/linux/netdevice.h | 1 + net/core/dev.c | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 217e8da0628c..9599ed6f1213 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1341,9 +1341,10 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) slave_dev->name); } - /* already enslaved */ - if (slave_dev->flags & IFF_SLAVE) { - netdev_dbg(bond_dev, "Error: Device was already enslaved\n"); + /* already in-use? */ + if (netdev_is_rx_handler_busy(slave_dev)) { + netdev_err(bond_dev, + "Error: Device is in use and cannot be enslaved\n"); return -EBUSY; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3a788bf0affd..e8d79d4ebcfe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3267,6 +3267,7 @@ static inline void napi_free_frags(struct napi_struct *napi) napi->skb = NULL; } +bool netdev_is_rx_handler_busy(struct net_device *dev); int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data); diff --git a/net/core/dev.c b/net/core/dev.c index dd6ce598de89..ea6312057a71 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3974,6 +3974,22 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, return skb; } +/** + * netdev_is_rx_handler_busy - check if receive handler is registered + * @dev: device to check + * + * Check if a receive handler is already registered for a given device. + * Return true if there is one. + * + * The caller must hold the rtnl_mutex. + */ +bool netdev_is_rx_handler_busy(struct net_device *dev) +{ + ASSERT_RTNL(); + return dev && rtnl_dereference(dev->rx_handler); +} +EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); + /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for -- cgit v1.2.3 From 6e1ce3c3451291142a57c4f3f6f999a29fb5b3bc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 1 Sep 2016 14:43:53 -0700 Subject: af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock' Right now we use the 'readlock' both for protecting some of the af_unix IO path and for making the bind be single-threaded. The two are independent, but using the same lock makes for a nasty deadlock due to ordering with regards to filesystem locking. The bind locking would want to nest outside the VFS pathname locking, but the IO locking wants to nest inside some of those same locks. We tried to fix this earlier with commit c845acb324aa ("af_unix: Fix splice-bind deadlock") which moved the readlock inside the vfs locks, but that caused problems with overlayfs that will then call back into filesystem routines that take the lock in the wrong order anyway. Splitting the locks means that we can go back to having the bind lock be the outermost lock, and we don't have any deadlocks with lock ordering. Acked-by: Rainer Weikusat Acked-by: Al Viro Signed-off-by: Linus Torvalds Acked-by: Hannes Frederic Sowa Signed-off-by: David S.
Miller --- include/net/af_unix.h | 2 +- net/unix/af_unix.c | 45 +++++++++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 9b4c418bebd8..fd60eccb59a6 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -52,7 +52,7 @@ struct unix_sock { struct sock sk; struct unix_address *addr; struct path path; - struct mutex readlock; + struct mutex iolock, bindlock; struct sock *peer; struct list_head link; atomic_long_t inflight; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 433ae1bbef97..8309687a56b0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -661,11 +661,11 @@ static int unix_set_peek_off(struct sock *sk, int val) { struct unix_sock *u = unix_sk(sk); - if (mutex_lock_interruptible(&u->readlock)) + if (mutex_lock_interruptible(&u->iolock)) return -EINTR; sk->sk_peek_off = val; - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); return 0; } @@ -779,7 +779,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) spin_lock_init(&u->lock); atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); - mutex_init(&u->readlock); /* single task reading lock */ + mutex_init(&u->iolock); /* single task reading lock */ + mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); unix_insert_socket(unix_sockets_unbound(sk), sk); @@ -848,7 +849,7 @@ static int unix_autobind(struct socket *sock) int err; unsigned int retries = 0; - err = mutex_lock_interruptible(&u->readlock); + err = mutex_lock_interruptible(&u->bindlock); if (err) return err; @@ -895,7 +896,7 @@ retry: spin_unlock(&unix_table_lock); err = 0; -out: mutex_unlock(&u->readlock); +out: mutex_unlock(&u->bindlock); return err; } @@ -1009,7 +1010,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; addr_len = err; - err = mutex_lock_interruptible(&u->readlock); + err = mutex_lock_interruptible(&u->bindlock); if (err) goto out; @@ -1063,7 +1064,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out_unlock: spin_unlock(&unix_table_lock); out_up: - mutex_unlock(&u->readlock); + mutex_unlock(&u->bindlock); out: return err; } @@ -1955,17 +1956,17 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, if (false) { alloc_skb: unix_state_unlock(other); - mutex_unlock(&unix_sk(other)->readlock); + mutex_unlock(&unix_sk(other)->iolock); newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, &err, 0); if (!newskb) goto err; } - /* we must acquire readlock as we modify already present + /* we must acquire iolock as we modify already present * skbs in the sk_receive_queue and mess with skb->len */ - err = mutex_lock_interruptible(&unix_sk(other)->readlock); + err = mutex_lock_interruptible(&unix_sk(other)->iolock); if (err) { err = flags & MSG_DONTWAIT ? 
-EAGAIN : -ERESTARTSYS; goto err; @@ -2032,7 +2033,7 @@ alloc_skb: } unix_state_unlock(other); - mutex_unlock(&unix_sk(other)->readlock); + mutex_unlock(&unix_sk(other)->iolock); other->sk_data_ready(other); scm_destroy(&scm); @@ -2041,7 +2042,7 @@ alloc_skb: err_state_unlock: unix_state_unlock(other); err_unlock: - mutex_unlock(&unix_sk(other)->readlock); + mutex_unlock(&unix_sk(other)->iolock); err: kfree_skb(newskb); if (send_sigpipe && !(flags & MSG_NOSIGNAL)) @@ -2109,7 +2110,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err, @@ -2117,14 +2118,14 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, if (skb) break; - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); if (err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, &err, &timeo, last)); - if (!skb) { /* implies readlock unlocked */ + if (!skb) { /* implies iolock unlocked */ unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && @@ -2189,7 +2190,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, out_free: skb_free_datagram(sk, skb); - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); out: return err; } @@ -2284,7 +2285,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); if (flags & MSG_PEEK) skip = sk_peek_offset(sk, flags); @@ -2326,7 +2327,7 @@ again: break; } - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); timeo = unix_stream_data_wait(sk, timeo, last, last_len); @@ -2337,7 +2338,7 @@ again: goto out; } - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); goto redo; unlock: unix_state_unlock(sk); @@ -2440,7 +2441,7 @@ unlock: } } while (size); - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); if (state->msg) scm_recv(sock, state->msg, &scm, flags); else @@ -2481,9 +2482,9 @@ static ssize_t skb_unix_socket_splice(struct sock *sk, int ret; struct unix_sock *u = unix_sk(sk); - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); ret = splice_to_pipe(pipe, spd); - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); return ret; } -- cgit v1.2.3 From d7127b5e5fa0551be21b86640f1648b224e36d43 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Aug 2016 08:16:00 +0200 Subject: locking/barriers: Don't use sizeof(void) in lockless_dereference() My previous commit: 112dc0c8069e ("locking/barriers: Suppress sparse warnings in lockless_dereference()") caused sparse to complain that (in radix-tree.h) we use sizeof(void) since that rcu_dereference()s a void *. Really, all we need is to have the expression *p in here somewhere to make sure p is a pointer type, and sizeof(*p) was the thing that came to my mind first to make sure that's done without really doing anything at runtime. Another thing I had considered was using typeof(*p), but obviously we can't just declare a typeof(*p) variable either, since that may end up being void. Declaring a variable as typeof(*p)* gets around that, and still checks that typeof(*p) is valid, so do that. 
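For illustration, the same idiom as a freestanding macro (a hedged sketch with invented names, relying only on the GCC typeof extension):

	/* Compiles only if 'p' has pointer type: for 'void *p' the unused
	 * variable becomes a 'void *', while a non-pointer 'p' makes
	 * 'typeof(*(p))' ill-formed and the build fails. */
	#define assert_is_pointer(p) \
		do { typeof(*(p)) *__tp __attribute__((unused)); } while (0)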
This type construction can't be done for _________p1 because that will actually be used and causes sparse address space warnings, so keep a separate unused variable for it. Reported-by: Fengguang Wu Signed-off-by: Johannes Berg Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kbuild-all@01.org Fixes: 112dc0c8069e ("locking/barriers: Suppress sparse warnings in lockless_dereference()") Link: http://lkml.kernel.org/r/1472192160-4049-1-git-send-email-johannes@sipsolutions.net Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 436aa4e42221..668569844d37 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -527,13 +527,14 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s * object's lifetime is managed by something other than RCU. That * "something other" might be reference counting or simple immortality. * - * The seemingly unused size_t variable is to validate @p is indeed a pointer - * type by making sure it can be dereferenced. + * The seemingly unused variable ___typecheck_p validates that @p is + * indeed a pointer type by using a pointer to typeof(*p) as the type. + * Taking a pointer to typeof(*p) again is needed in case p is void *. */ #define lockless_dereference(p) \ ({ \ typeof(p) _________p1 = READ_ONCE(p); \ - size_t __maybe_unused __size_of_ptr = sizeof(*(p)); \ + typeof(*(p)) *___typecheck_p __maybe_unused; \ smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ (_________p1); \ }) -- cgit v1.2.3 From d4c4fed08f31f3746000c46cb1b20bed2959547a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 15 Aug 2016 09:05:45 -0600 Subject: efi: Make for_each_efi_memory_desc_in_map() cope with running on Xen While commit 55f1ea15216 ("efi: Fix for_each_efi_memory_desc_in_map() for empty memmaps") made an attempt to deal with empty memory maps, it didn't address the case where the map field never gets set, as is apparently the case when running under Xen. Reported-by: Tested-by: Cc: Vitaly Kuznetsov Cc: Jiri Slaby Cc: Mark Rutland Cc: # v4.7+ Signed-off-by: Jan Beulich [ Guard the loop with a NULL check instead of pointer underflow ] Signed-off-by: Matt Fleming --- include/linux/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index 7f5a58225385..23cd3ced8c1a 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -946,7 +946,7 @@ extern int efi_memattr_apply_permissions(struct mm_struct *mm, /* Iterate through an efi_memory_map */ #define for_each_efi_memory_desc_in_map(m, md) \ for ((md) = (m)->map; \ - ((void *)(md) + (m)->desc_size) <= (m)->map_end; \ + (md) && ((void *)(md) + (m)->desc_size) <= (m)->map_end; \ (md) = (void *)(md) + (m)->desc_size) /** -- cgit v1.2.3 From dadb57abc37499f565b23933dbf49b435c3ba8af Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Mon, 29 Aug 2016 14:38:51 -0600 Subject: efi/libstub: Allocate headspace in efi_get_memory_map() efi_get_memory_map() allocates a buffer to store the memory map that it retrieves. This buffer may need to be reused by the client after ExitBootServices() is called, at which point allocations are no longer permitted.
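For reference, the underlying UEFI grow-and-retry pattern that efi_get_memory_map() wraps looks roughly like this (a simplified sketch with abbreviated call names, not the patch itself):

	status = get_memory_map(&map_size, m, &key, &desc_size, &desc_ver);
	while (status == EFI_BUFFER_TOO_SMALL) {
		/* too small: free, grow by a couple of descriptors, retry */
		free_pool(m);
		map_size += 2 * desc_size;
		if (allocate_pool(EFI_LOADER_DATA, map_size, (void **)&m) != EFI_SUCCESS)
			break;
		status = get_memory_map(&map_size, m, &key, &desc_size, &desc_ver);
	}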
To support this use case, provide the allocated buffer size back to the client, and allocate some additional headroom to account for any reasonable growth in the map that is likely to happen between the call to efi_get_memory_map() and the client reusing the buffer. Signed-off-by: Jeffrey Hugo Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Leif Lindholm Cc: Ingo Molnar Cc: Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 20 ++++-- drivers/firmware/efi/libstub/efi-stub-helper.c | 96 ++++++++++++++++++-------- drivers/firmware/efi/libstub/fdt.c | 17 +++-- drivers/firmware/efi/libstub/random.c | 12 +++- include/linux/efi.h | 15 ++-- 5 files changed, 111 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index ff574dad95cc..c5b7c7b4f0d7 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1008,7 +1008,7 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle, bool is64) { struct efi_info *efi = &boot_params->efi_info; - unsigned long map_sz, key, desc_size; + unsigned long map_sz, key, desc_size, buff_size; efi_memory_desc_t *mem_map; struct setup_data *e820ext; const char *signature; @@ -1019,14 +1019,20 @@ static efi_status_t exit_boot(struct boot_params *boot_params, bool called_exit = false; u8 nr_entries; int i; - - nr_desc = 0; - e820ext = NULL; - e820ext_size = 0; + struct efi_boot_memmap map; + + nr_desc = 0; + e820ext = NULL; + e820ext_size = 0; + map.map = &mem_map; + map.map_size = &map_sz; + map.desc_size = &desc_size; + map.desc_ver = &desc_version; + map.key_ptr = &key; + map.buff_size = &buff_size; get_map: - status = efi_get_memory_map(sys_table, &mem_map, &map_sz, &desc_size, - &desc_version, &key); + status = efi_get_memory_map(sys_table, &map); if (status != EFI_SUCCESS) return status; diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 3bd127f95315..29368ac69221 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -41,6 +41,8 @@ static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE; #define EFI_ALLOC_ALIGN EFI_PAGE_SIZE #endif +#define EFI_MMAP_NR_SLACK_SLOTS 8 + struct file_info { efi_file_handle_t *handle; u64 size; @@ -63,49 +65,62 @@ void efi_printk(efi_system_table_t *sys_table_arg, char *str) } } +static inline bool mmap_has_headroom(unsigned long buff_size, + unsigned long map_size, + unsigned long desc_size) +{ + unsigned long slack = buff_size - map_size; + + return slack / desc_size >= EFI_MMAP_NR_SLACK_SLOTS; +} + efi_status_t efi_get_memory_map(efi_system_table_t *sys_table_arg, - efi_memory_desc_t **map, - unsigned long *map_size, - unsigned long *desc_size, - u32 *desc_ver, - unsigned long *key_ptr) + struct efi_boot_memmap *map) { efi_memory_desc_t *m = NULL; efi_status_t status; unsigned long key; u32 desc_version; - *map_size = sizeof(*m) * 32; + *map->desc_size = sizeof(*m); + *map->map_size = *map->desc_size * 32; + *map->buff_size = *map->map_size; again: - /* - * Add an additional efi_memory_desc_t because we're doing an - * allocation which may be in a new descriptor region.
- */ - *map_size += sizeof(*m); status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - *map_size, (void **)&m); + *map->map_size, (void **)&m); if (status != EFI_SUCCESS) goto fail; - *desc_size = 0; + *map->desc_size = 0; key = 0; - status = efi_call_early(get_memory_map, map_size, m, - &key, desc_size, &desc_version); - if (status == EFI_BUFFER_TOO_SMALL) { + status = efi_call_early(get_memory_map, map->map_size, m, + &key, map->desc_size, &desc_version); + if (status == EFI_BUFFER_TOO_SMALL || + !mmap_has_headroom(*map->buff_size, *map->map_size, + *map->desc_size)) { efi_call_early(free_pool, m); + /* + * Make sure there are some entries of headroom so that the + * buffer can be reused for a new map after allocations are + * no longer permitted. It's unlikely that the map will grow to + * exceed this headroom once we are ready to trigger + * ExitBootServices(). + */ + *map->map_size += *map->desc_size * EFI_MMAP_NR_SLACK_SLOTS; + *map->buff_size = *map->map_size; goto again; } if (status != EFI_SUCCESS) efi_call_early(free_pool, m); - if (key_ptr && status == EFI_SUCCESS) - *key_ptr = key; - if (desc_ver && status == EFI_SUCCESS) - *desc_ver = desc_version; + if (map->key_ptr && status == EFI_SUCCESS) + *map->key_ptr = key; + if (map->desc_ver && status == EFI_SUCCESS) + *map->desc_ver = desc_version; fail: - *map = m; + *map->map = m; return status; } @@ -113,13 +128,20 @@ fail: unsigned long get_dram_base(efi_system_table_t *sys_table_arg) { efi_status_t status; - unsigned long map_size; + unsigned long map_size, buff_size; unsigned long membase = EFI_ERROR; struct efi_memory_map map; efi_memory_desc_t *md; + struct efi_boot_memmap boot_map; - status = efi_get_memory_map(sys_table_arg, (efi_memory_desc_t **)&map.map, - &map_size, &map.desc_size, NULL, NULL); + boot_map.map = (efi_memory_desc_t **)&map.map; + boot_map.map_size = &map_size; + boot_map.desc_size = &map.desc_size; + boot_map.desc_ver = NULL; + boot_map.key_ptr = NULL; + boot_map.buff_size = &buff_size; + + status = efi_get_memory_map(sys_table_arg, &boot_map); if (status != EFI_SUCCESS) return membase; @@ -144,15 +166,22 @@ efi_status_t efi_high_alloc(efi_system_table_t *sys_table_arg, unsigned long size, unsigned long align, unsigned long *addr, unsigned long max) { - unsigned long map_size, desc_size; + unsigned long map_size, desc_size, buff_size; efi_memory_desc_t *map; efi_status_t status; unsigned long nr_pages; u64 max_addr = 0; int i; + struct efi_boot_memmap boot_map; + + boot_map.map = &map; + boot_map.map_size = &map_size; + boot_map.desc_size = &desc_size; + boot_map.desc_ver = NULL; + boot_map.key_ptr = NULL; + boot_map.buff_size = &buff_size; - status = efi_get_memory_map(sys_table_arg, &map, &map_size, &desc_size, - NULL, NULL); + status = efi_get_memory_map(sys_table_arg, &boot_map); if (status != EFI_SUCCESS) goto fail; @@ -230,14 +259,21 @@ efi_status_t efi_low_alloc(efi_system_table_t *sys_table_arg, unsigned long size, unsigned long align, unsigned long *addr) { - unsigned long map_size, desc_size; + unsigned long map_size, desc_size, buff_size; efi_memory_desc_t *map; efi_status_t status; unsigned long nr_pages; int i; + struct efi_boot_memmap boot_map; + + boot_map.map = &map; + boot_map.map_size = &map_size; + boot_map.desc_size = &desc_size; + boot_map.desc_ver = NULL; + boot_map.key_ptr = NULL; + boot_map.buff_size = &buff_size; - status = efi_get_memory_map(sys_table_arg, &map, &map_size, &desc_size, - NULL, NULL); + status = efi_get_memory_map(sys_table_arg, &boot_map); if (status != EFI_SUCCESS) goto
fail; diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index e58abfa953cc..bec0fa8d8746 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -175,13 +175,21 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, unsigned long fdt_addr, unsigned long fdt_size) { - unsigned long map_size, desc_size; + unsigned long map_size, desc_size, buff_size; u32 desc_ver; unsigned long mmap_key; efi_memory_desc_t *memory_map, *runtime_map; unsigned long new_fdt_size; efi_status_t status; int runtime_entry_count = 0; + struct efi_boot_memmap map; + + map.map = &runtime_map; + map.map_size = &map_size; + map.desc_size = &desc_size; + map.desc_ver = &desc_ver; + map.key_ptr = &mmap_key; + map.buff_size = &buff_size; /* * Get a copy of the current memory map that we will use to prepare @@ -189,8 +197,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, * subsequent allocations adding entries, since they could not affect * the number of EFI_MEMORY_RUNTIME regions. */ - status = efi_get_memory_map(sys_table, &runtime_map, &map_size, - &desc_size, &desc_ver, &mmap_key); + status = efi_get_memory_map(sys_table, &map); if (status != EFI_SUCCESS) { pr_efi_err(sys_table, "Unable to retrieve UEFI memory map.\n"); return status; @@ -199,6 +206,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, pr_efi(sys_table, "Exiting boot services and installing virtual address map...\n"); + map.map = &memory_map; /* * Estimate size of new FDT, and allocate memory for it. We * will allocate a bigger buffer if this ends up being too @@ -218,8 +226,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, * we can get the memory map key needed for * exit_boot_services(). 
*/ - status = efi_get_memory_map(sys_table, &memory_map, &map_size, - &desc_size, &desc_ver, &mmap_key); + status = efi_get_memory_map(sys_table, &map); if (status != EFI_SUCCESS) goto fail_free_new_fdt; diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c index 53f6d3fe6d86..0c9f58c5ba50 100644 --- a/drivers/firmware/efi/libstub/random.c +++ b/drivers/firmware/efi/libstub/random.c @@ -73,12 +73,20 @@ efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg, unsigned long random_seed) { unsigned long map_size, desc_size, total_slots = 0, target_slot; + unsigned long buff_size; efi_status_t status; efi_memory_desc_t *memory_map; int map_offset; + struct efi_boot_memmap map; - status = efi_get_memory_map(sys_table_arg, &memory_map, &map_size, - &desc_size, NULL, NULL); + map.map = &memory_map; + map.map_size = &map_size; + map.desc_size = &desc_size; + map.desc_ver = NULL; + map.key_ptr = NULL; + map.buff_size = &buff_size; + + status = efi_get_memory_map(sys_table_arg, &map); if (status != EFI_SUCCESS) return status; diff --git a/include/linux/efi.h b/include/linux/efi.h index 23cd3ced8c1a..943fee524176 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -118,6 +118,15 @@ typedef struct { u32 imagesize; } efi_capsule_header_t; +struct efi_boot_memmap { + efi_memory_desc_t **map; + unsigned long *map_size; + unsigned long *desc_size; + u32 *desc_ver; + unsigned long *key_ptr; + unsigned long *buff_size; +}; + /* * EFI capsule flags */ @@ -1371,11 +1380,7 @@ char *efi_convert_cmdline(efi_system_table_t *sys_table_arg, efi_loaded_image_t *image, int *cmd_line_len); efi_status_t efi_get_memory_map(efi_system_table_t *sys_table_arg, - efi_memory_desc_t **map, - unsigned long *map_size, - unsigned long *desc_size, - u32 *desc_ver, - unsigned long *key_ptr); + struct efi_boot_memmap *map); efi_status_t efi_low_alloc(efi_system_table_t *sys_table_arg, unsigned long size, unsigned long align, -- cgit v1.2.3 From fc07716ba803483be91bc4b2344f9c84985e6f07 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Mon, 29 Aug 2016 14:38:52 -0600 Subject: efi/libstub: Introduce ExitBootServices helper The spec allows ExitBootServices to fail with EFI_INVALID_PARAMETER if a race condition has occurred where the EFI has updated the memory map after the stub grabbed a reference to the map. The spec defines a retry procedure with specific requirements to handle this scenario. This scenario was previously observed on x86 - commit d3768d885c6c ("x86, efi: retry ExitBootServices() on failure") but the current fix is not spec-compliant and the scenario is now observed on the Qualcomm Technologies QDF2432 via the FDT stub which does not handle the error and thus causes boot failures. The user will notice the boot failure as the kernel is not executed and the system may drop back to a UEFI shell, but will be unresponsive to input and the system will require a power cycle to recover. Add a helper to the stub library that correctly adheres to the spec in the case of EFI_INVALID_PARAMETER from ExitBootServices and can be universally used across all stub implementations.
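In outline, the spec-mandated sequence looks like this (a hedged sketch; the helper added below is the authoritative version):

	status = exit_boot_services(handle, key);
	if (status == EFI_INVALID_PARAMETER) {
		/* The map changed under us: refresh it into the same buffer,
		 * since no allocations are permitted now, then retry once. */
		map_size = buff_size;
		get_memory_map(&map_size, map, &key, &desc_size, &desc_ver);
		status = exit_boot_services(handle, key);
	}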
Signed-off-by: Jeffrey Hugo Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Leif Lindholm Cc: Ingo Molnar Cc: Signed-off-by: Matt Fleming --- drivers/firmware/efi/libstub/efi-stub-helper.c | 73 ++++++++++++++++++++++++++ include/linux/efi.h | 10 ++++ 2 files changed, 83 insertions(+) (limited to 'include') diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 29368ac69221..aded10662020 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -740,3 +740,76 @@ char *efi_convert_cmdline(efi_system_table_t *sys_table_arg, *cmd_line_len = options_bytes; return (char *)cmdline_addr; } + +/* + * Handle calling ExitBootServices according to the requirements set out by the + * spec. Obtains the current memory map, and returns that info after calling + * ExitBootServices. The client must specify a function to perform any + * processing of the memory map data prior to ExitBootServices. A client- + * specific structure may be passed to the function via priv. The client + * function may be called multiple times. + */ +efi_status_t efi_exit_boot_services(efi_system_table_t *sys_table_arg, + void *handle, + struct efi_boot_memmap *map, + void *priv, + efi_exit_boot_map_processing priv_func) +{ + efi_status_t status; + + status = efi_get_memory_map(sys_table_arg, map); + + if (status != EFI_SUCCESS) + goto fail; + + status = priv_func(sys_table_arg, map, priv); + if (status != EFI_SUCCESS) + goto free_map; + + status = efi_call_early(exit_boot_services, handle, *map->key_ptr); + + if (status == EFI_INVALID_PARAMETER) { + /* + * The memory map changed between efi_get_memory_map() and + * exit_boot_services(). Per the UEFI Spec v2.6, Section 6.4: + * EFI_BOOT_SERVICES.ExitBootServices we need to get the + * updated map, and try again. The spec implies one retry + * should be sufficient, which is confirmed against the EDK2 + * implementation. Per the spec, we can only invoke + * get_memory_map() and exit_boot_services() - we cannot alloc + * so efi_get_memory_map() cannot be used, and we must reuse + * the buffer. For all practical purposes, the headroom in the + * buffer should account for any changes in the map so the call + * to get_memory_map() is expected to succeed here.
+ */ + *map->map_size = *map->buff_size; + status = efi_call_early(get_memory_map, + map->map_size, + *map->map, + map->key_ptr, + map->desc_size, + map->desc_ver); + + /* exit_boot_services() was called, thus cannot free */ + if (status != EFI_SUCCESS) + goto fail; + + status = priv_func(sys_table_arg, map, priv); + /* exit_boot_services() was called, thus cannot free */ + if (status != EFI_SUCCESS) + goto fail; + + status = efi_call_early(exit_boot_services, handle, *map->key_ptr); + } + + /* exit_boot_services() was called, thus cannot free */ + if (status != EFI_SUCCESS) + goto fail; + + return EFI_SUCCESS; + +free_map: + efi_call_early(free_pool, *map->map); +fail: + return status; +} diff --git a/include/linux/efi.h b/include/linux/efi.h index 943fee524176..0148a3046b48 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1462,4 +1462,14 @@ extern void efi_call_virt_check_flags(unsigned long flags, const char *call); arch_efi_call_virt_teardown(); \ }) +typedef efi_status_t (*efi_exit_boot_map_processing)( + efi_system_table_t *sys_table_arg, + struct efi_boot_memmap *map, + void *priv); + +efi_status_t efi_exit_boot_services(efi_system_table_t *sys_table, + void *handle, + struct efi_boot_memmap *map, + void *priv, + efi_exit_boot_map_processing priv_func); #endif /* _LINUX_EFI_H */ -- cgit v1.2.3 From 81409e9e28058811c9ea865345e1753f8f677e44 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 31 Aug 2016 16:04:21 -0700 Subject: usercopy: fold builtin_const check into inline function Instead of having each caller of check_object_size() need to remember to check for a const size parameter, move the check into check_object_size() itself. This actually matches the original implementation in PaX, though this commit cleans up the now-redundant builtin_const() calls in the various architectures. 
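In a freestanding sketch (stand-in declarations, not the kernel sources), the folded check looks like:

	#include <stdbool.h>

	void __check_object_size(const void *ptr, unsigned long n, bool to_user);

	static inline void check_object_size(const void *ptr, unsigned long n,
					     bool to_user)
	{
		/* Compile-time-constant sizes are validated statically, so only
		 * runtime-variable sizes reach the dynamic check. */
		if (!__builtin_constant_p(n))
			__check_object_size(ptr, n, to_user);
	}

so the per-arch copy helpers can now call check_object_size() unconditionally.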
Signed-off-by: Kees Cook --- arch/ia64/include/asm/uaccess.h | 12 ++++-------- arch/powerpc/include/asm/uaccess.h | 19 +++++++------------ arch/sparc/include/asm/uaccess_32.h | 9 +++------ arch/sparc/include/asm/uaccess_64.h | 7 +++---- include/linux/thread_info.h | 3 ++- 5 files changed, 19 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h index 465c70982f40..0472927ebb9b 100644 --- a/arch/ia64/include/asm/uaccess.h +++ b/arch/ia64/include/asm/uaccess.h @@ -241,8 +241,7 @@ extern unsigned long __must_check __copy_user (void __user *to, const void __use static inline unsigned long __copy_to_user (void __user *to, const void *from, unsigned long count) { - if (!__builtin_constant_p(count)) - check_object_size(from, count, true); + check_object_size(from, count, true); return __copy_user(to, (__force void __user *) from, count); } @@ -250,8 +249,7 @@ __copy_to_user (void __user *to, const void *from, unsigned long count) static inline unsigned long __copy_from_user (void *to, const void __user *from, unsigned long count) { - if (!__builtin_constant_p(count)) - check_object_size(to, count, false); + check_object_size(to, count, false); return __copy_user((__force void __user *) to, from, count); } @@ -265,8 +263,7 @@ __copy_from_user (void *to, const void __user *from, unsigned long count) long __cu_len = (n); \ \ if (__access_ok(__cu_to, __cu_len, get_fs())) { \ - if (!__builtin_constant_p(n)) \ - check_object_size(__cu_from, __cu_len, true); \ + check_object_size(__cu_from, __cu_len, true); \ __cu_len = __copy_user(__cu_to, (__force void __user *) __cu_from, __cu_len); \ } \ __cu_len; \ @@ -280,8 +277,7 @@ __copy_from_user (void *to, const void __user *from, unsigned long count) \ __chk_user_ptr(__cu_from); \ if (__access_ok(__cu_from, __cu_len, get_fs())) { \ - if (!__builtin_constant_p(n)) \ - check_object_size(__cu_to, __cu_len, false); \ + check_object_size(__cu_to, __cu_len, false); \ __cu_len = __copy_user((__force void __user *) __cu_to, __cu_from, __cu_len); \ } \ __cu_len; \ diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index c1dc6c14deb8..f1e382498bbb 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -311,14 +311,12 @@ static inline unsigned long copy_from_user(void *to, unsigned long over; if (access_ok(VERIFY_READ, from, n)) { - if (!__builtin_constant_p(n)) - check_object_size(to, n, false); + check_object_size(to, n, false); return __copy_tofrom_user((__force void __user *)to, from, n); } if ((unsigned long)from < TASK_SIZE) { over = (unsigned long)from + n - TASK_SIZE; - if (!__builtin_constant_p(n - over)) - check_object_size(to, n - over, false); + check_object_size(to, n - over, false); return __copy_tofrom_user((__force void __user *)to, from, n - over) + over; } @@ -331,14 +329,12 @@ static inline unsigned long copy_to_user(void __user *to, unsigned long over; if (access_ok(VERIFY_WRITE, to, n)) { - if (!__builtin_constant_p(n)) - check_object_size(from, n, true); + check_object_size(from, n, true); return __copy_tofrom_user(to, (__force void __user *)from, n); } if ((unsigned long)to < TASK_SIZE) { over = (unsigned long)to + n - TASK_SIZE; - if (!__builtin_constant_p(n)) - check_object_size(from, n - over, true); + check_object_size(from, n - over, true); return __copy_tofrom_user(to, (__force void __user *)from, n - over) + over; } @@ -383,8 +379,7 @@ static inline unsigned long 
__copy_from_user_inatomic(void *to, return 0; } - if (!__builtin_constant_p(n)) - check_object_size(to, n, false); + check_object_size(to, n, false); return __copy_tofrom_user((__force void __user *)to, from, n); } @@ -412,8 +407,8 @@ static inline unsigned long __copy_to_user_inatomic(void __user *to, if (ret == 0) return 0; } - if (!__builtin_constant_p(n)) - check_object_size(from, n, true); + + check_object_size(from, n, true); return __copy_tofrom_user(to, (__force const void __user *)from, n); } diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h index 341a5a133f48..e722c510bb1b 100644 --- a/arch/sparc/include/asm/uaccess_32.h +++ b/arch/sparc/include/asm/uaccess_32.h @@ -249,8 +249,7 @@ unsigned long __copy_user(void __user *to, const void __user *from, unsigned lon static inline unsigned long copy_to_user(void __user *to, const void *from, unsigned long n) { if (n && __access_ok((unsigned long) to, n)) { - if (!__builtin_constant_p(n)) - check_object_size(from, n, true); + check_object_size(from, n, true); return __copy_user(to, (__force void __user *) from, n); } else return n; @@ -258,16 +257,14 @@ static inline unsigned long copy_to_user(void __user *to, const void *from, unsi static inline unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { - if (!__builtin_constant_p(n)) - check_object_size(from, n, true); + check_object_size(from, n, true); return __copy_user(to, (__force void __user *) from, n); } static inline unsigned long copy_from_user(void *to, const void __user *from, unsigned long n) { if (n && __access_ok((unsigned long) from, n)) { - if (!__builtin_constant_p(n)) - check_object_size(to, n, false); + check_object_size(to, n, false); return __copy_user((__force void __user *) to, from, n); } else return n; diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index 8bda94fab8e8..37a315d0ddd4 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -212,8 +212,7 @@ copy_from_user(void *to, const void __user *from, unsigned long size) { unsigned long ret; - if (!__builtin_constant_p(size)) - check_object_size(to, size, false); + check_object_size(to, size, false); ret = ___copy_from_user(to, from, size); if (unlikely(ret)) @@ -233,8 +232,8 @@ copy_to_user(void __user *to, const void *from, unsigned long size) { unsigned long ret; - if (!__builtin_constant_p(size)) - check_object_size(from, size, true); + check_object_size(from, size, true); + ret = ___copy_to_user(to, from, size); if (unlikely(ret)) ret = copy_to_user_fixup(to, from, size); diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index cbd8990e2e77..10c9e601398b 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -121,7 +121,8 @@ extern void __check_object_size(const void *ptr, unsigned long n, static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { - __check_object_size(ptr, n, to_user); + if (!__builtin_constant_p(n)) + __check_object_size(ptr, n, to_user); } #else static inline void check_object_size(const void *ptr, unsigned long n, -- cgit v1.2.3 From 5a56a0b3a45dd0cc5b2f7bec6afd053a474ed9f5 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Mon, 5 Sep 2016 10:20:20 +1200 Subject: net: Don't delete routes in different VRFs When deleting an IP address from an interface, there is a clean-up of routes which refer to this local address. However, there was no check to see that the VRF matched. 
This meant that deletion wasn't confined to the VRF it should have been. To solve this, a new field has been added to fib_info to hold a table id. When removing fib entries corresponding to a local ip address, this table id is also used in the comparison. The table id is populated when the fib_info is created. This was already done in some places, but not in ip_rt_ioctl(). This has now been fixed. Fixes: 021dd3b8a142 ("net: Add routes to the table associated with the device") Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: Mark Tomlinson Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_frontend.c | 3 ++- net/ipv4/fib_semantics.c | 8 ++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 4079fc18ffe4..7d4a72e75f33 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -111,6 +111,7 @@ struct fib_info { unsigned char fib_scope; unsigned char fib_type; __be32 fib_prefsrc; + u32 fib_tb_id; u32 fib_priority; u32 *fib_metrics; #define fib_mtu fib_metrics[RTAX_MTU-1] @@ -319,7 +320,7 @@ void fib_flush_external(struct net *net); /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); -int fib_sync_down_addr(struct net *net, __be32 local); +int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned int nh_flags); extern u32 fib_multipath_secret __read_mostly; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ef2ebeb89d0f..1b25daf8c7f1 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -509,6 +509,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; + cfg->fc_table = l3mdev_fib_table(dev); if (colon) { struct in_ifaddr *ifa; struct in_device *in_dev = __in_dev_get_rtnl(dev); @@ -1027,7 +1028,7 @@ no_promotions: * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ - if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) + if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 539fa264e67d..e9f56225e53f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1057,6 +1057,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; fi->fib_type = cfg->fc_type; + fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; change_nexthops(fi) { @@ -1337,18 +1338,21 @@ nla_put_failure: * referring to it. * - device went down -> we must shutdown all nexthops going via it. 
*/ -int fib_sync_down_addr(struct net *net, __be32 local) +int fib_sync_down_addr(struct net_device *dev, __be32 local) { int ret = 0; unsigned int hash = fib_laddr_hashfn(local); struct hlist_head *head = &fib_info_laddrhash[hash]; + struct net *net = dev_net(dev); + int tb_id = l3mdev_fib_table(dev); struct fib_info *fi; if (!fib_info_laddrhash || local == 0) return 0; hlist_for_each_entry(fi, head, fib_lhash) { - if (!net_eq(fi->fib_net, net)) + if (!net_eq(fi->fib_net, net) || + fi->fib_tb_id != tb_id) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; -- cgit v1.2.3 From fff72429c2e83bdbe32dc7f1ad6398dfe50750c6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Sep 2016 14:34:21 +0100 Subject: rxrpc: Improve the call tracking tracepoint Improve the call tracking tracepoint by showing more differentiation between some of the put and get events, including: (1) Getting and putting refs for the socket call user ID tree. (2) Getting and putting refs for queueing and failing to queue the call processor work item. Note that these aren't necessarily used in this patch, but will be taken advantage of in future patches. An enum is added for the event subtype numbers rather than coding them directly as decimal numbers and a table of 3-letter strings is provided rather than a sequence of ?: operators. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 11 +++------- net/rxrpc/af_rxrpc.c | 2 +- net/rxrpc/ar-internal.h | 22 ++++++++++++++++++-- net/rxrpc/call_accept.c | 10 ++++----- net/rxrpc/call_event.c | 2 +- net/rxrpc/call_object.c | 48 ++++++++++++++++++++++++++++---------------- net/rxrpc/input.c | 6 +++--- net/rxrpc/recvmsg.c | 23 +++++++++++---------- net/rxrpc/sendmsg.c | 4 ++-- net/rxrpc/skbuff.c | 2 +- 10 files changed, 79 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index cbe574ea674b..30164896f1f6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -17,7 +17,8 @@ #include TRACE_EVENT(rxrpc_call, - TP_PROTO(struct rxrpc_call *call, int op, int usage, int nskb, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_call_trace op, + int usage, int nskb, const void *where, const void *aux), TP_ARGS(call, op, usage, nskb, where, aux), @@ -42,13 +43,7 @@ TRACE_EVENT(rxrpc_call, TP_printk("c=%p %s u=%d s=%d p=%pSR a=%p", __entry->call, - (__entry->op == 0 ? "NWc" : - __entry->op == 1 ? "NWs" : - __entry->op == 2 ? "SEE" : - __entry->op == 3 ? "GET" : - __entry->op == 4 ? "Gsb" : - __entry->op == 5 ? 
"PUT" : - "Psb"), + rxrpc_call_traces[__entry->op], __entry->usage, __entry->nskb, __entry->where, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index b66a9e6f8d04..8356cd003d51 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -296,7 +296,7 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); rxrpc_remove_user_ID(rxrpc_sk(sock->sk), call); rxrpc_purge_queue(&call->knlrecv_queue); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); } EXPORT_SYMBOL(rxrpc_kernel_end_call); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ad702f9f8d1f..913255a53564 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -508,6 +508,24 @@ struct rxrpc_call { unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; }; +enum rxrpc_call_trace { + rxrpc_call_new_client, + rxrpc_call_new_service, + rxrpc_call_queued, + rxrpc_call_queued_ref, + rxrpc_call_seen, + rxrpc_call_got, + rxrpc_call_got_skb, + rxrpc_call_got_userid, + rxrpc_call_put, + rxrpc_call_put_skb, + rxrpc_call_put_userid, + rxrpc_call_put_noqueue, + rxrpc_call__nr_trace +}; + +extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; + #include /* @@ -555,8 +573,8 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, void rxrpc_release_call(struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); void rxrpc_see_call(struct rxrpc_call *); -void rxrpc_get_call(struct rxrpc_call *); -void rxrpc_put_call(struct rxrpc_call *); +void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); +void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_get_call_for_skb(struct rxrpc_call *, struct sk_buff *); void rxrpc_put_call_for_skb(struct rxrpc_call *, struct sk_buff *); void __exit rxrpc_destroy_all_calls(void); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 68a439e30df1..487ae7aa86db 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -115,7 +115,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, write_lock(&rx->call_lock); if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); spin_lock(&call->conn->state_lock); if (sp->hdr.securityIndex > 0 && @@ -155,7 +155,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, _debug("done"); read_unlock_bh(&local->services_lock); rxrpc_free_skb(notification); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); _leave(" = 0"); return 0; @@ -166,11 +166,11 @@ invalid_service: read_lock_bh(&call->state_lock); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); rxrpc_queue_call(call); } read_unlock_bh(&call->state_lock); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); ret = -ECONNREFUSED; error: rxrpc_free_skb(notification); @@ -341,6 +341,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, } /* formalise the acceptance */ + rxrpc_get_call(call, rxrpc_call_got_userid); call->notify_rx = notify_rx; call->user_call_ID = user_call_ID; rb_link_node(&call->sock_node, parent, pp); @@ -351,7 +352,6 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, BUG(); rxrpc_queue_call(call); - rxrpc_get_call(call); write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); _leave(" = %p{%d}", call, 
call->debug_id); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 4754c7fb6242..fee8b6ddb334 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -1246,7 +1246,7 @@ send_message_2: kill_ACKs: del_timer_sync(&call->ack_timer); if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); clear_bit(RXRPC_CALL_EV_ACK, &call->events); maybe_reschedule: diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 65691742199b..3166b5222435 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -55,6 +55,21 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { [RXRPC_CALL_NETWORK_ERROR] = "NetError", }; +const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { + [rxrpc_call_new_client] = "NWc", + [rxrpc_call_new_service] = "NWs", + [rxrpc_call_queued] = "QUE", + [rxrpc_call_queued_ref] = "QUR", + [rxrpc_call_seen] = "SEE", + [rxrpc_call_got] = "GOT", + [rxrpc_call_got_skb] = "Gsk", + [rxrpc_call_got_userid] = "Gus", + [rxrpc_call_put] = "PUT", + [rxrpc_call_put_skb] = "Psk", + [rxrpc_call_put_userid] = "Pus", + [rxrpc_call_put_noqueue] = "PNQ", +}; + struct kmem_cache *rxrpc_call_jar; LIST_HEAD(rxrpc_calls); DEFINE_RWLOCK(rxrpc_call_lock); @@ -96,7 +111,7 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx, return NULL; found_extant_call: - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); read_unlock(&rx->call_lock); _leave(" = %p [%d]", call, atomic_read(&call->usage)); return call; @@ -252,8 +267,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, goto found_user_ID_now_present; } - rxrpc_get_call(call); - + rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); write_unlock(&rx->call_lock); @@ -275,7 +289,7 @@ error: write_lock(&rx->call_lock); rb_erase(&call->sock_node, &rx->calls); write_unlock(&rx->call_lock); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put_userid); write_lock_bh(&rxrpc_call_lock); list_del_init(&call->link); @@ -283,7 +297,7 @@ error: set_bit(RXRPC_CALL_RELEASED, &call->flags); call->state = RXRPC_CALL_DEAD; - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ERR_PTR(ret); @@ -296,7 +310,7 @@ found_user_ID_now_present: write_unlock(&rx->call_lock); set_bit(RXRPC_CALL_RELEASED, &call->flags); call->state = RXRPC_CALL_DEAD; - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); _leave(" = -EEXIST [%p]", call); return ERR_PTR(-EEXIST); } @@ -322,8 +336,8 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, if (!candidate) return ERR_PTR(-EBUSY); - trace_rxrpc_call(candidate, 1, atomic_read(&candidate->usage), - 0, here, NULL); + trace_rxrpc_call(candidate, rxrpc_call_new_service, + atomic_read(&candidate->usage), 0, here, NULL); chan = sp->hdr.cid & RXRPC_CHANNELMASK; candidate->socket = rx; @@ -358,7 +372,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, read_unlock(&call->state_lock); goto aborted_call; default: - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); read_unlock(&call->state_lock); goto extant_call; } @@ -447,20 +461,20 @@ void rxrpc_see_call(struct rxrpc_call *call) int n = atomic_read(&call->usage); int m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, 2, n, m, here, 0); + trace_rxrpc_call(call, rxrpc_call_seen, n, m, here, NULL); } } /* * Note the addition of a ref on a 
call. */ -void rxrpc_get_call(struct rxrpc_call *call) +void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); int m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, 3, n, m, here, 0); + trace_rxrpc_call(call, op, n, m, here, NULL); } /* @@ -472,7 +486,7 @@ void rxrpc_get_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) int n = atomic_inc_return(&call->usage); int m = atomic_inc_return(&call->skb_count); - trace_rxrpc_call(call, 4, n, m, here, skb); + trace_rxrpc_call(call, rxrpc_call_got_skb, n, m, here, skb); } /* @@ -575,7 +589,7 @@ static void rxrpc_dead_call_expired(unsigned long _call) write_lock_bh(&call->state_lock); call->state = RXRPC_CALL_DEAD; write_unlock_bh(&call->state_lock); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); } /* @@ -632,7 +646,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) /* * release a call */ -void rxrpc_put_call(struct rxrpc_call *call) +void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { const void *here = __builtin_return_address(0); int n, m; @@ -641,7 +655,7 @@ void rxrpc_put_call(struct rxrpc_call *call) n = atomic_dec_return(&call->usage); m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, 5, n, m, here, NULL); + trace_rxrpc_call(call, op, n, m, here, NULL); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); @@ -661,7 +675,7 @@ void rxrpc_put_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) n = atomic_dec_return(&call->usage); m = atomic_dec_return(&call->skb_count); - trace_rxrpc_call(call, 6, n, m, here, skb); + trace_rxrpc_call(call, rxrpc_call_put_skb, n, m, here, skb); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 72f016cfaaf5..f7239a6f9181 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -537,7 +537,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call, } read_unlock(&call->state_lock); - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && sp->hdr.flags & RXRPC_JUMBO_PACKET) @@ -545,12 +545,12 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call, else rxrpc_fast_process_packet(call, skb); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); goto done; resend_final_ack: _debug("final ack again"); - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); rxrpc_queue_call(call); goto free_unlock; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 0ab7b334bab1..97f8ee76c67c 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -79,7 +79,8 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) { release_sock(&rx->sk); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, + rxrpc_call_put); return -ENODATA; } } @@ -137,13 +138,13 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (call != continue_call || skb->mark != RXRPC_SKB_MARK_DATA) { release_sock(&rx->sk); - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); _leave(" = %d [noncont]", copied); return copied; } } - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); /* copy the peer address and timestamp */ if (!continue_call) { @@ -233,7 +234,7 @@ int 
rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (!continue_call) continue_call = sp->call; else - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); call = NULL; if (flags & MSG_PEEK) { @@ -255,9 +256,9 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, out: release_sock(&rx->sk); if (call) - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); _leave(" = %d [data]", copied); return copied; @@ -341,18 +342,18 @@ terminal_message: } release_sock(&rx->sk); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); _leave(" = %d", ret); return ret; copy_error: _debug("copy error"); release_sock(&rx->sk); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); _leave(" = %d", ret); return ret; @@ -361,7 +362,7 @@ wait_interrupted: wait_error: finish_wait(sk_sleep(&rx->sk), &wait); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); if (copied) copied = ret; _leave(" = %d [waitfail %d]", copied, ret); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 7376794a0308..803078bea507 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -534,7 +534,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) call = rxrpc_accept_call(rx, user_call_ID, NULL); if (IS_ERR(call)) return PTR_ERR(call); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); return 0; } @@ -573,7 +573,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_send_data(rx, call, msg, len); } - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ret; } diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index a546a2ba6341..c0613ab6d2d5 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -35,7 +35,7 @@ static void rxrpc_request_final_ACK(struct rxrpc_call *call) /* get an extra ref on the call for the final-ACK generator to * release */ - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); if (try_to_del_timer_sync(&call->ack_timer) >= 0) rxrpc_queue_call(call); -- cgit v1.2.3 From 5a42976d4fe5d7fddce133de995c742c87b1b7e3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 6 Sep 2016 22:19:51 +0100 Subject: rxrpc: Add tracepoint for working out where aborts happen Add a tracepoint for working out where local aborts happen. Each tracepoint call is labelled with a 3-letter code so that they can be distinguished - and the DATA sequence number is added too where available. rxrpc_kernel_abort_call() also takes a 3-letter code so that AFS can indicate the circumstances when it aborts a call. 
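A call site then reads, for example (this one appears in the AFS changes below):

	rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD");

where "KSD" is the 3-letter tag that the rxrpc_abort tracepoint prints alongside the abort code and error.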
Signed-off-by: David Howells --- fs/afs/rxrpc.c | 17 ++++--- include/net/af_rxrpc.h | 3 +- include/trace/events/rxrpc.h | 29 ++++++++++++ net/rxrpc/ar-internal.h | 14 ++++-- net/rxrpc/call_event.c | 7 +-- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_event.c | 6 +++ net/rxrpc/input.c | 7 +-- net/rxrpc/insecure.c | 19 ++++---- net/rxrpc/rxkad.c | 108 +++++++++++++++++++------------------------ net/rxrpc/sendmsg.c | 18 ++++---- 11 files changed, 132 insertions(+), 98 deletions(-) (limited to 'include') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 37608be52abd..53750dece80e 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -377,7 +377,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, return wait_mode->wait(call); error_do_abort: - rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD"); error_kill_call: afs_end_call(call); _leave(" = %d", ret); @@ -425,12 +425,12 @@ static void afs_deliver_to_call(struct afs_call *call) case -ENOTCONN: abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code); + abort_code, -ret, "KNC"); goto do_abort; case -ENOTSUPP: abort_code = RX_INVALID_OPERATION; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code); + abort_code, -ret, "KIV"); goto do_abort; case -ENODATA: case -EBADMSG: @@ -440,7 +440,7 @@ static void afs_deliver_to_call(struct afs_call *call) if (call->state != AFS_CALL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code); + abort_code, EBADMSG, "KUM"); goto do_abort; } } @@ -463,6 +463,7 @@ do_abort: */ static int afs_wait_for_call_to_complete(struct afs_call *call) { + const char *abort_why; int ret; DECLARE_WAITQUEUE(myself, current); @@ -481,9 +482,11 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) continue; } + abort_why = "KWC"; ret = call->error; if (call->state == AFS_CALL_COMPLETE) break; + abort_why = "KWI"; ret = -EINTR; if (signal_pending(current)) break; @@ -497,7 +500,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) if (call->state < AFS_CALL_COMPLETE) { _debug("call incomplete"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_CALL_DEAD); + RX_CALL_DEAD, -ret, abort_why); } _debug("call complete"); @@ -695,7 +698,7 @@ void afs_send_empty_reply(struct afs_call *call) case -ENOMEM: _debug("oom"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT); + RX_USER_ABORT, ENOMEM, "KOO"); default: afs_end_call(call); _leave(" [error]"); @@ -734,7 +737,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT); + RX_USER_ABORT, ENOMEM, "KOO"); } afs_end_call(call); _leave(" [error]"); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index b4b6a3664dda..08ed8729126c 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -35,7 +35,8 @@ int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t); int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, void *, size_t, size_t *, bool, u32 *); -void rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32); +void rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, + u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned 
long, rxrpc_notify_rx_t); diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 30164896f1f6..85ee035774ae 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -84,6 +84,35 @@ TRACE_EVENT(rxrpc_skb, __entry->where) ); +TRACE_EVENT(rxrpc_abort, + TP_PROTO(const char *why, u32 cid, u32 call_id, rxrpc_seq_t seq, + int abort_code, int error), + + TP_ARGS(why, cid, call_id, seq, abort_code, error), + + TP_STRUCT__entry( + __array(char, why, 4 ) + __field(u32, cid ) + __field(u32, call_id ) + __field(rxrpc_seq_t, seq ) + __field(int, abort_code ) + __field(int, error ) + ), + + TP_fast_assign( + memcpy(__entry->why, why, 4); + __entry->cid = cid; + __entry->call_id = call_id; + __entry->abort_code = abort_code; + __entry->error = error; + __entry->seq = seq; + ), + + TP_printk("%08x:%08x s=%u a=%d e=%d %s", + __entry->cid, __entry->call_id, __entry->seq, + __entry->abort_code, __entry->error, __entry->why) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 0353399792b6..dbfb9ed17483 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -155,7 +155,8 @@ struct rxrpc_security { void *); /* verify the security on a received packet */ - int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, u32 *); + int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, + rxrpc_seq_t, u16); /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); @@ -637,9 +638,12 @@ static inline bool rxrpc_call_completed(struct rxrpc_call *call) /* * Record that a call is locally aborted. */ -static inline bool __rxrpc_abort_call(struct rxrpc_call *call, +static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, + rxrpc_seq_t seq, u32 abort_code, int error) { + trace_rxrpc_abort(why, call->cid, call->call_id, seq, + abort_code, error); if (__rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, abort_code, error)) { @@ -649,13 +653,13 @@ static inline bool __rxrpc_abort_call(struct rxrpc_call *call, return false; } -static inline bool rxrpc_abort_call(struct rxrpc_call *call, - u32 abort_code, int error) +static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, + rxrpc_seq_t seq, u32 abort_code, int error) { bool ret; write_lock_bh(&call->state_lock); - ret = __rxrpc_abort_call(call, abort_code, error); + ret = __rxrpc_abort_call(why, call, seq, abort_code, error); write_unlock_bh(&call->state_lock); return ret; } diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 8365d3366114..af88ad7d2cf9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -598,7 +598,8 @@ process_further: /* secured packets must be verified and possibly decrypted */ if (call->conn->security->verify_packet(call, skb, - _abort_code) < 0) + sp->hdr.seq, + sp->hdr.cksum) < 0) goto protocol_error; rxrpc_insert_oos_packet(call, skb); @@ -982,7 +983,7 @@ skip_msg_init: } if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) { - rxrpc_abort_call(call, RX_CALL_TIMEOUT, ETIME); + rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); _debug("post timeout"); if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, @@ -1005,7 +1006,7 @@ skip_msg_init: case -EKEYEXPIRED: case -EKEYREJECTED: case -EPROTO: - rxrpc_abort_call(call, abort_code, -ret); + rxrpc_abort_call("PRO", call, 0, abort_code, -ret); goto kill_ACKs; } } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 
be5733d55794..9efd9b0b0bdf 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -563,7 +563,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) if (call->state < RXRPC_CALL_COMPLETE) { _debug("+++ ABORTING STATE %d +++\n", call->state); - __rxrpc_abort_call(call, RX_CALL_DEAD, ECONNRESET); + __rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET); clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 9db90f4f768d..8c7938ba6a84 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -158,6 +158,11 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, lockdep_is_held(&conn->channel_lock)); if (call) { rxrpc_see_call(call); + if (compl == RXRPC_CALL_LOCALLY_ABORTED) + trace_rxrpc_abort("CON", call->cid, + call->call_id, 0, + abort_code, error); + write_lock_bh(&call->state_lock); if (rxrpc_set_call_completion(call, compl, abort_code, error)) { @@ -167,6 +172,7 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, write_unlock_bh(&call->state_lock); if (queue) rxrpc_queue_call(call); + } } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 79f3f585cdc3..8e624109750a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -429,7 +429,7 @@ protocol_error: _debug("protocol error"); write_lock_bh(&call->state_lock); protocol_error_locked: - if (__rxrpc_abort_call(call, RX_PROTOCOL_ERROR, EPROTO)) + if (__rxrpc_abort_call("FPR", call, 0, RX_PROTOCOL_ERROR, EPROTO)) rxrpc_queue_call(call); free_packet_unlock: write_unlock_bh(&call->state_lock); @@ -495,9 +495,10 @@ static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, protocol_error: _debug("protocol error"); rxrpc_free_skb(part); - rxrpc_free_skb(jumbo); - if (rxrpc_abort_call(call, RX_PROTOCOL_ERROR, EPROTO)) + if (rxrpc_abort_call("PJP", call, sp->hdr.seq, + RX_PROTOCOL_ERROR, EPROTO)) rxrpc_queue_call(call); + rxrpc_free_skb(jumbo); _leave(""); } diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index c21ad213b337..a4aba0246731 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -23,31 +23,32 @@ static int none_prime_packet_security(struct rxrpc_connection *conn) } static int none_secure_packet(struct rxrpc_call *call, - struct sk_buff *skb, - size_t data_size, - void *sechdr) + struct sk_buff *skb, + size_t data_size, + void *sechdr) { return 0; } static int none_verify_packet(struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb, + rxrpc_seq_t seq, + u16 expected_cksum) { return 0; } static int none_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb, + u32 *_abort_code) { *_abort_code = RX_PROTOCOL_ERROR; return -EPROTO; } static int none_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb, + u32 *_abort_code) { *_abort_code = RX_PROTOCOL_ERROR; return -EPROTO; diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 89f475febfd7..3777432df10b 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -316,12 +316,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call, /* * decrypt partial encryption on a packet (level 1 security) */ -static int rxkad_verify_packet_auth(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, + rxrpc_seq_t seq) { 
struct rxkad_level1_hdr sechdr; - struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[16]; @@ -332,7 +330,10 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, _enter(""); - sp = rxrpc_skb(skb); + if (skb->len < 8) { + rxrpc_abort_call("V1H", call, seq, RXKADSEALEDINCON, EPROTO); + goto protocol_error; + } /* we want to decrypt the skbuff in-place */ nsg = skb_cow_data(skb, 0, &trailer); @@ -351,9 +352,11 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, crypto_skcipher_decrypt(req); skcipher_request_zero(req); - /* remove the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) - goto datalen_error; + /* Extract the decrypted packet length */ + if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) { + rxrpc_abort_call("XV1", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } if (!skb_pull(skb, sizeof(sechdr))) BUG(); @@ -361,24 +364,24 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, data_size = buf & 0xffff; check = buf >> 16; - check ^= sp->hdr.seq ^ sp->hdr.callNumber; + check ^= seq ^ call->call_id; check &= 0xffff; if (check != 0) { - *_abort_code = RXKADSEALEDINCON; + rxrpc_abort_call("V1C", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } /* shorten the packet to remove the padding */ - if (data_size > skb->len) - goto datalen_error; - else if (data_size < skb->len) + if (data_size > skb->len) { + rxrpc_abort_call("V1L", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } + if (data_size < skb->len) skb->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; -datalen_error: - *_abort_code = RXKADDATALEN; protocol_error: _leave(" = -EPROTO"); return -EPROTO; @@ -391,13 +394,11 @@ nomem: /* * wholly decrypt a packet (level 2 security) */ -static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, + rxrpc_seq_t seq) { const struct rxrpc_key_token *token; struct rxkad_level2_hdr sechdr; - struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist _sg[4], *sg; @@ -408,7 +409,10 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, _enter(",{%d}", skb->len); - sp = rxrpc_skb(skb); + if (skb->len < 8) { + rxrpc_abort_call("V2H", call, seq, RXKADSEALEDINCON, EPROTO); + goto protocol_error; + } /* we want to decrypt the skbuff in-place */ nsg = skb_cow_data(skb, 0, &trailer); @@ -437,9 +441,11 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, if (sg != _sg) kfree(sg); - /* remove the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) - goto datalen_error; + /* Extract the decrypted packet length */ + if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) { + rxrpc_abort_call("XV2", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } if (!skb_pull(skb, sizeof(sechdr))) BUG(); @@ -447,24 +453,23 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, data_size = buf & 0xffff; check = buf >> 16; - check ^= sp->hdr.seq ^ sp->hdr.callNumber; + check ^= seq ^ call->call_id; check &= 0xffff; if (check != 0) { - *_abort_code = RXKADSEALEDINCON; + rxrpc_abort_call("V2C", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } - /* shorten the packet to remove the padding */ - if (data_size > 
skb->len) - goto datalen_error; - else if (data_size < skb->len) + if (data_size > skb->len) { + rxrpc_abort_call("V2L", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } + if (data_size < skb->len) skb->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; -datalen_error: - *_abort_code = RXKADDATALEN; protocol_error: _leave(" = -EPROTO"); return -EPROTO; @@ -475,40 +480,30 @@ nomem: } /* - * verify the security on a received packet + * Verify the security on a received packet or subpacket (if part of a + * jumbo packet). */ -static int rxkad_verify_packet(struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + rxrpc_seq_t seq, u16 expected_cksum) { SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); - struct rxrpc_skb_priv *sp; struct rxrpc_crypt iv; struct scatterlist sg; u16 cksum; u32 x, y; - int ret; - - sp = rxrpc_skb(skb); _enter("{%d{%x}},{#%u}", - call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq); + call->debug_id, key_serial(call->conn->params.key), seq); if (!call->conn->cipher) return 0; - if (sp->hdr.securityIndex != RXRPC_SECURITY_RXKAD) { - *_abort_code = RXKADINCONSISTENCY; - _leave(" = -EPROTO [not rxkad]"); - return -EPROTO; - } - /* continue encrypting from where we left off */ memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); /* validate the security checksum */ x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); - x |= sp->hdr.seq & 0x3fffffff; + x |= seq & 0x3fffffff; call->crypto_buf[0] = htonl(call->call_id); call->crypto_buf[1] = htonl(x); @@ -524,29 +519,22 @@ static int rxkad_verify_packet(struct rxrpc_call *call, if (cksum == 0) cksum = 1; /* zero checksums are not permitted */ - if (sp->hdr.cksum != cksum) { - *_abort_code = RXKADSEALEDINCON; + if (cksum != expected_cksum) { + rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO); _leave(" = -EPROTO [csum failed]"); return -EPROTO; } switch (call->conn->params.security_level) { case RXRPC_SECURITY_PLAIN: - ret = 0; - break; + return 0; case RXRPC_SECURITY_AUTH: - ret = rxkad_verify_packet_auth(call, skb, _abort_code); - break; + return rxkad_verify_packet_1(call, skb, seq); case RXRPC_SECURITY_ENCRYPT: - ret = rxkad_verify_packet_encrypt(call, skb, _abort_code); - break; + return rxkad_verify_packet_2(call, skb, seq); default: - ret = -ENOANO; - break; + return -ENOANO; } - - _leave(" = %d", ret); - return ret; } /* diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 2439aff131c7..9a4af992fcdf 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -454,14 +454,15 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, /* * abort a call, sending an ABORT packet to the peer */ -static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) +static void rxrpc_send_abort(struct rxrpc_call *call, const char *why, + u32 abort_code, int error) { if (call->state >= RXRPC_CALL_COMPLETE) return; write_lock_bh(&call->state_lock); - if (__rxrpc_abort_call(call, abort_code, ECONNABORTED)) { + if (__rxrpc_abort_call(why, call, 0, abort_code, error)) { del_timer_sync(&call->resend_timer); del_timer_sync(&call->ack_timer); clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); @@ -556,7 +557,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* it's too late for this call */ ret = -ESHUTDOWN; } else if (cmd == RXRPC_CMD_SEND_ABORT) { - rxrpc_send_abort(call, abort_code); + rxrpc_send_abort(call, "CMD", abort_code, ECONNABORTED); ret = 0; 
} else if (cmd != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; @@ -626,20 +627,19 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data); * @sock: The socket the call is on * @call: The call to be aborted * @abort_code: The abort code to stick into the ABORT packet + * @error: Local error value + * @why: 3-char string indicating why. * * Allow a kernel service to abort a call, if it's still in an abortable state. */ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, - u32 abort_code) + u32 abort_code, int error, const char *why) { - _enter("{%d},%d", call->debug_id, abort_code); + _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); lock_sock(sock->sk); - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, call->state, call->conn); - - rxrpc_send_abort(call, abort_code); + rxrpc_send_abort(call, why, abort_code, error); release_sock(sock->sk); _leave(""); -- cgit v1.2.3 From a85d6b8242dc78ef3f4542a0f979aebcbe77fc4e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Sep 2016 09:39:32 -0700 Subject: usercopy: force check_object_size() inline Just for good measure, make sure that check_object_size() is always inlined too, as already done for copy_*_user() and __copy_*_user(). Suggested-by: Linus Torvalds Signed-off-by: Kees Cook --- include/linux/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 10c9e601398b..2b5b10eed74f 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -118,8 +118,8 @@ static inline int arch_within_stack_frames(const void * const stack, extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); -static inline void check_object_size(const void *ptr, unsigned long n, - bool to_user) +static __always_inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) { if (!__builtin_constant_p(n)) __check_object_size(ptr, n, to_user); -- cgit v1.2.3 From c965db44462919f613973aa618271f6c3f5a1e64 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 7 Sep 2016 16:36:24 +0300 Subject: qed: Add support for debug data collection This patch adds support for dumping and formatting the HW/FW debug data. Signed-off-by: Tomer Tayar Signed-off-by: Yuval Mintz Signed-off-by: David S.
Miller --- drivers/net/ethernet/qlogic/qed/Makefile | 2 +- drivers/net/ethernet/qlogic/qed/qed.h | 20 + drivers/net/ethernet/qlogic/qed/qed_debug.c | 6898 ++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_debug.h | 54 + drivers/net/ethernet/qlogic/qed/qed_hsi.h | 1056 +++- drivers/net/ethernet/qlogic/qed/qed_main.c | 4 + drivers/net/ethernet/qlogic/qed/qed_reg_addr.h | 894 +++ include/linux/qed/common_hsi.h | 3 + 8 files changed, 8919 insertions(+), 12 deletions(-) create mode 100644 drivers/net/ethernet/qlogic/qed/qed_debug.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_debug.h (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile index d1f157e439cf..86a5b4f5f870 100644 --- a/drivers/net/ethernet/qlogic/qed/Makefile +++ b/drivers/net/ethernet/qlogic/qed/Makefile @@ -2,5 +2,5 @@ obj-$(CONFIG_QED) := qed.o qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \ qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o \ - qed_selftest.o qed_dcbx.o + qed_selftest.o qed_dcbx.o qed_debug.o qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index 2d67469eb8f6..0929582fc82b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -23,6 +23,7 @@ #include #include #include +#include "qed_debug.h" #include "qed_hsi.h" extern const struct qed_common_ops qed_common_ops_pass; @@ -395,6 +396,8 @@ struct qed_hwfn { /* Buffer for unzipping firmware data */ void *unzip_buf; + struct dbg_tools_data dbg_info; + struct qed_simd_fp_handler simd_proto_handler[64]; #ifdef CONFIG_QED_SRIOV @@ -430,6 +433,19 @@ struct qed_int_params { u8 fp_msix_cnt; }; +struct qed_dbg_feature { + struct dentry *dentry; + u8 *dump_buf; + u32 buf_size; + u32 dumped_dwords; +}; + +struct qed_dbg_params { + struct qed_dbg_feature features[DBG_FEATURE_NUM]; + u8 engine_for_debug; + bool print_data; +}; + struct qed_dev { u32 dp_module; u8 dp_level; @@ -444,6 +460,8 @@ struct qed_dev { CHIP_REV_IS_A0(dev)) #define QED_IS_BB_B0(dev) (QED_IS_BB(dev) && \ CHIP_REV_IS_B0(dev)) +#define QED_IS_AH(dev) ((dev)->type == QED_DEV_TYPE_AH) +#define QED_IS_K2(dev) QED_IS_AH(dev) #define QED_GET_TYPE(dev) (QED_IS_BB_A0(dev) ? CHIP_BB_A0 : \ QED_IS_BB_B0(dev) ? CHIP_BB_B0 : CHIP_K2) @@ -544,6 +562,8 @@ struct qed_dev { } protocol_ops; void *ops_cookie; + struct qed_dbg_params dbg_params; + const struct firmware *firmware; }; diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c new file mode 100644 index 000000000000..88e7d5bef909 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -0,0 +1,6898 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015 QLogic Corporation + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ + +#include +#include +#include +#include "qed.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" + +/* Chip IDs enum */ +enum chip_ids { + CHIP_RESERVED, + CHIP_BB_B0, + CHIP_K2, + MAX_CHIP_IDS +}; + +/* Memory groups enum */ +enum mem_groups { + MEM_GROUP_PXP_MEM, + MEM_GROUP_DMAE_MEM, + MEM_GROUP_CM_MEM, + MEM_GROUP_QM_MEM, + MEM_GROUP_TM_MEM, + MEM_GROUP_BRB_RAM, + MEM_GROUP_BRB_MEM, + MEM_GROUP_PRS_MEM, + MEM_GROUP_SDM_MEM, + MEM_GROUP_PBUF, + MEM_GROUP_IOR, + MEM_GROUP_RAM, + MEM_GROUP_BTB_RAM, + MEM_GROUP_RDIF_CTX, + MEM_GROUP_TDIF_CTX, + MEM_GROUP_CONN_CFC_MEM, + MEM_GROUP_TASK_CFC_MEM, + MEM_GROUP_CAU_PI, + MEM_GROUP_CAU_MEM, + MEM_GROUP_PXP_ILT, + MEM_GROUP_MULD_MEM, + MEM_GROUP_BTB_MEM, + MEM_GROUP_IGU_MEM, + MEM_GROUP_IGU_MSIX, + MEM_GROUP_CAU_SB, + MEM_GROUP_BMB_RAM, + MEM_GROUP_BMB_MEM, + MEM_GROUPS_NUM +}; + +/* Memory groups names */ +static const char * const s_mem_group_names[] = { + "PXP_MEM", + "DMAE_MEM", + "CM_MEM", + "QM_MEM", + "TM_MEM", + "BRB_RAM", + "BRB_MEM", + "PRS_MEM", + "SDM_MEM", + "PBUF", + "IOR", + "RAM", + "BTB_RAM", + "RDIF_CTX", + "TDIF_CTX", + "CONN_CFC_MEM", + "TASK_CFC_MEM", + "CAU_PI", + "CAU_MEM", + "PXP_ILT", + "MULD_MEM", + "BTB_MEM", + "IGU_MEM", + "IGU_MSIX", + "CAU_SB", + "BMB_RAM", + "BMB_MEM", +}; + +/* Idle check conditions */ +static u32 cond4(const u32 *r, const u32 *imm) +{ + return ((r[0] & imm[0]) != imm[1]) && ((r[1] & imm[2]) != imm[3]); +} + +static u32 cond6(const u32 *r, const u32 *imm) +{ + return ((r[0] >> imm[0]) & imm[1]) != imm[2]; +} + +static u32 cond5(const u32 *r, const u32 *imm) +{ + return (r[0] & imm[0]) != imm[1]; +} + +static u32 cond8(const u32 *r, const u32 *imm) +{ + return ((r[0] & imm[0]) >> imm[1]) != + (((r[0] & imm[2]) >> imm[3]) | ((r[1] & imm[4]) << imm[5])); +} + +static u32 cond9(const u32 *r, const u32 *imm) +{ + return ((r[0] & imm[0]) >> imm[1]) != (r[0] & imm[2]); +} + +static u32 cond1(const u32 *r, const u32 *imm) +{ + return (r[0] & ~imm[0]) != imm[1]; +} + +static u32 cond0(const u32 *r, const u32 *imm) +{ + return r[0] != imm[0]; +} + +static u32 cond10(const u32 *r, const u32 *imm) +{ + return r[0] != r[1] && r[2] == imm[0]; +} + +static u32 cond11(const u32 *r, const u32 *imm) +{ + return r[0] != r[1] && r[2] > imm[0]; +} + +static u32 cond3(const u32 *r, const u32 *imm) +{ + return r[0] != r[1]; +} + +static u32 cond12(const u32 *r, const u32 *imm) +{ + return r[0] & imm[0]; +} + +static u32 cond7(const u32 *r, const u32 *imm) +{ + return r[0] < (r[1] - imm[0]); +} + +static u32 cond2(const u32 *r, const u32 *imm) +{ + return r[0] > imm[0]; +} + +/* Array of Idle Check conditions */ +static u32(*cond_arr[]) (const u32 *r, const u32 *imm) = { + cond0, + cond1, + cond2, + cond3, + cond4, + cond5, + cond6, + cond7, + cond8, + cond9, + cond10, + cond11, + cond12, +}; + +/******************************* Data Types **********************************/ + +enum platform_ids { + PLATFORM_ASIC, + PLATFORM_RESERVED, + PLATFORM_RESERVED2, + PLATFORM_RESERVED3, + MAX_PLATFORM_IDS +}; + +struct dbg_array { + const u32 *ptr; + u32 size_in_dwords; +}; + +/* Chip constant definitions */ +struct chip_defs { + const char *name; + struct { + u8 num_ports; + u8 num_pfs; + } per_platform[MAX_PLATFORM_IDS]; +}; + +/* Platform constant definitions */ +struct platform_defs { + const char *name; + u32 delay_factor; +}; + +/* Storm constant definitions */ +struct storm_defs { + char letter; + enum block_id block_id; + enum dbg_bus_clients dbg_client_id[MAX_CHIP_IDS]; + 
bool has_vfc; + u32 sem_fast_mem_addr; + u32 sem_frame_mode_addr; + u32 sem_slow_enable_addr; + u32 sem_slow_mode_addr; + u32 sem_slow_mode1_conf_addr; + u32 sem_sync_dbg_empty_addr; + u32 sem_slow_dbg_empty_addr; + u32 cm_ctx_wr_addr; + u32 cm_conn_ag_ctx_lid_size; /* In quad-regs */ + u32 cm_conn_ag_ctx_rd_addr; + u32 cm_conn_st_ctx_lid_size; /* In quad-regs */ + u32 cm_conn_st_ctx_rd_addr; + u32 cm_task_ag_ctx_lid_size; /* In quad-regs */ + u32 cm_task_ag_ctx_rd_addr; + u32 cm_task_st_ctx_lid_size; /* In quad-regs */ + u32 cm_task_st_ctx_rd_addr; +}; + +/* Block constant definitions */ +struct block_defs { + const char *name; + bool has_dbg_bus[MAX_CHIP_IDS]; + bool associated_to_storm; + u32 storm_id; /* Valid only if associated_to_storm is true */ + enum dbg_bus_clients dbg_client_id[MAX_CHIP_IDS]; + u32 dbg_select_addr; + u32 dbg_cycle_enable_addr; + u32 dbg_shift_addr; + u32 dbg_force_valid_addr; + u32 dbg_force_frame_addr; + bool has_reset_bit; + bool unreset; /* If true, the block is taken out of reset before dump */ + enum dbg_reset_regs reset_reg; + u8 reset_bit_offset; /* Bit offset in reset register */ +}; + +/* Reset register definitions */ +struct reset_reg_defs { + u32 addr; + u32 unreset_val; + bool exists[MAX_CHIP_IDS]; +}; + +struct grc_param_defs { + u32 default_val[MAX_CHIP_IDS]; + u32 min; + u32 max; + bool is_preset; + u32 exclude_all_preset_val; + u32 crash_preset_val; +}; + +struct rss_mem_defs { + const char *mem_name; + const char *type_name; + u32 addr; /* In 128b units */ + u32 num_entries[MAX_CHIP_IDS]; + u32 entry_width[MAX_CHIP_IDS]; /* In bits */ +}; + +struct vfc_ram_defs { + const char *mem_name; + const char *type_name; + u32 base_row; + u32 num_rows; +}; + +struct big_ram_defs { + const char *instance_name; + enum mem_groups mem_group_id; + enum mem_groups ram_mem_group_id; + enum dbg_grc_params grc_param; + u32 addr_reg_addr; + u32 data_reg_addr; + u32 num_of_blocks[MAX_CHIP_IDS]; +}; + +struct phy_defs { + const char *phy_name; + u32 base_addr; + u32 tbus_addr_lo_addr; + u32 tbus_addr_hi_addr; + u32 tbus_data_lo_addr; + u32 tbus_data_hi_addr; +}; + +/******************************** Constants **********************************/ + +#define MAX_LCIDS 320 +#define MAX_LTIDS 320 +#define NUM_IOR_SETS 2 +#define IORS_PER_SET 176 +#define IOR_SET_OFFSET(set_id) ((set_id) * 256) +#define BYTES_IN_DWORD sizeof(u32) + +/* In the macros below, size and offset are specified in bits */ +#define CEIL_DWORDS(size) DIV_ROUND_UP(size, 32) +#define FIELD_BIT_OFFSET(type, field) type ## _ ## field ## _ ## OFFSET +#define FIELD_BIT_SIZE(type, field) type ## _ ## field ## _ ## SIZE +#define FIELD_DWORD_OFFSET(type, field) \ + (int)(FIELD_BIT_OFFSET(type, field) / 32) +#define FIELD_DWORD_SHIFT(type, field) (FIELD_BIT_OFFSET(type, field) % 32) +#define FIELD_BIT_MASK(type, field) \ + (((1 << FIELD_BIT_SIZE(type, field)) - 1) << \ + FIELD_DWORD_SHIFT(type, field)) +#define SET_VAR_FIELD(var, type, field, val) \ + do { \ + var[FIELD_DWORD_OFFSET(type, field)] &= \ + (~FIELD_BIT_MASK(type, field)); \ + var[FIELD_DWORD_OFFSET(type, field)] |= \ + (val) << FIELD_DWORD_SHIFT(type, field); \ + } while (0) +#define ARR_REG_WR(dev, ptt, addr, arr, arr_size) \ + do { \ + for (i = 0; i < (arr_size); i++) \ + qed_wr(dev, ptt, addr, (arr)[i]); \ + } while (0) +#define ARR_REG_RD(dev, ptt, addr, arr, arr_size) \ + do { \ + for (i = 0; i < (arr_size); i++) \ + (arr)[i] = qed_rd(dev, ptt, addr); \ + } while (0) + +#define DWORDS_TO_BYTES(dwords) ((dwords) * BYTES_IN_DWORD) +#define 
BYTES_TO_DWORDS(bytes) ((bytes) / BYTES_IN_DWORD) +#define RAM_LINES_TO_DWORDS(lines) ((lines) * 2) +#define RAM_LINES_TO_BYTES(lines) \ + DWORDS_TO_BYTES(RAM_LINES_TO_DWORDS(lines)) +#define REG_DUMP_LEN_SHIFT 24 +#define MEM_DUMP_ENTRY_SIZE_DWORDS \ + BYTES_TO_DWORDS(sizeof(struct dbg_dump_mem)) +#define IDLE_CHK_RULE_SIZE_DWORDS \ + BYTES_TO_DWORDS(sizeof(struct dbg_idle_chk_rule)) +#define IDLE_CHK_RESULT_HDR_DWORDS \ + BYTES_TO_DWORDS(sizeof(struct dbg_idle_chk_result_hdr)) +#define IDLE_CHK_RESULT_REG_HDR_DWORDS \ + BYTES_TO_DWORDS(sizeof(struct dbg_idle_chk_result_reg_hdr)) +#define IDLE_CHK_MAX_ENTRIES_SIZE 32 + +/* The sizes and offsets below are specified in bits */ +#define VFC_CAM_CMD_STRUCT_SIZE 64 +#define VFC_CAM_CMD_ROW_OFFSET 48 +#define VFC_CAM_CMD_ROW_SIZE 9 +#define VFC_CAM_ADDR_STRUCT_SIZE 16 +#define VFC_CAM_ADDR_OP_OFFSET 0 +#define VFC_CAM_ADDR_OP_SIZE 4 +#define VFC_CAM_RESP_STRUCT_SIZE 256 +#define VFC_RAM_ADDR_STRUCT_SIZE 16 +#define VFC_RAM_ADDR_OP_OFFSET 0 +#define VFC_RAM_ADDR_OP_SIZE 2 +#define VFC_RAM_ADDR_ROW_OFFSET 2 +#define VFC_RAM_ADDR_ROW_SIZE 10 +#define VFC_RAM_RESP_STRUCT_SIZE 256 +#define VFC_CAM_CMD_DWORDS CEIL_DWORDS(VFC_CAM_CMD_STRUCT_SIZE) +#define VFC_CAM_ADDR_DWORDS CEIL_DWORDS(VFC_CAM_ADDR_STRUCT_SIZE) +#define VFC_CAM_RESP_DWORDS CEIL_DWORDS(VFC_CAM_RESP_STRUCT_SIZE) +#define VFC_RAM_CMD_DWORDS VFC_CAM_CMD_DWORDS +#define VFC_RAM_ADDR_DWORDS CEIL_DWORDS(VFC_RAM_ADDR_STRUCT_SIZE) +#define VFC_RAM_RESP_DWORDS CEIL_DWORDS(VFC_RAM_RESP_STRUCT_SIZE) +#define NUM_VFC_RAM_TYPES 4 +#define VFC_CAM_NUM_ROWS 512 +#define VFC_OPCODE_CAM_RD 14 +#define VFC_OPCODE_RAM_RD 0 +#define NUM_RSS_MEM_TYPES 5 +#define NUM_BIG_RAM_TYPES 3 +#define BIG_RAM_BLOCK_SIZE_BYTES 128 +#define BIG_RAM_BLOCK_SIZE_DWORDS \ + BYTES_TO_DWORDS(BIG_RAM_BLOCK_SIZE_BYTES) +#define NUM_PHY_TBUS_ADDRESSES 2048 +#define PHY_DUMP_SIZE_DWORDS (NUM_PHY_TBUS_ADDRESSES / 2) +#define RESET_REG_UNRESET_OFFSET 4 +#define STALL_DELAY_MS 500 +#define STATIC_DEBUG_LINE_DWORDS 9 +#define NUM_DBG_BUS_LINES 256 +#define NUM_COMMON_GLOBAL_PARAMS 8 +#define FW_IMG_MAIN 1 +#define REG_FIFO_DEPTH_ELEMENTS 32 +#define REG_FIFO_ELEMENT_DWORDS 2 +#define REG_FIFO_DEPTH_DWORDS \ + (REG_FIFO_ELEMENT_DWORDS * REG_FIFO_DEPTH_ELEMENTS) +#define IGU_FIFO_DEPTH_ELEMENTS 64 +#define IGU_FIFO_ELEMENT_DWORDS 4 +#define IGU_FIFO_DEPTH_DWORDS \ + (IGU_FIFO_ELEMENT_DWORDS * IGU_FIFO_DEPTH_ELEMENTS) +#define PROTECTION_OVERRIDE_DEPTH_ELEMENTS 20 +#define PROTECTION_OVERRIDE_ELEMENT_DWORDS 2 +#define PROTECTION_OVERRIDE_DEPTH_DWORDS \ + (PROTECTION_OVERRIDE_DEPTH_ELEMENTS * \ + PROTECTION_OVERRIDE_ELEMENT_DWORDS) +#define MCP_SPAD_TRACE_OFFSIZE_ADDR \ + (MCP_REG_SCRATCH + \ + offsetof(struct static_init, sections[SPAD_SECTION_TRACE])) +#define MCP_TRACE_META_IMAGE_SIGNATURE 0x669955aa +#define EMPTY_FW_VERSION_STR "???_???_???_???" +#define EMPTY_FW_IMAGE_STR "???????????????" 
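+/* Worked example for the FIELD_ and SET_VAR_FIELD macros above (an
+ * illustration only, not part of the original patch): a field with
+ * FIELD_BIT_OFFSET = 48 and FIELD_BIT_SIZE = 9 (such as VFC_CAM_CMD_ROW)
+ * gives FIELD_DWORD_OFFSET = 1, FIELD_DWORD_SHIFT = 16 and
+ * FIELD_BIT_MASK = 0x01ff0000, so SET_VAR_FIELD(var, type, field, val)
+ * read-modify-writes bits 16..24 of var[1].
+ */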
+ +/***************************** Constant Arrays *******************************/ + +/* Debug arrays */ +static struct dbg_array s_dbg_arrays[MAX_BIN_DBG_BUFFER_TYPE] = { {0} }; + +/* Chip constant definitions array */ +static struct chip_defs s_chip_defs[MAX_CHIP_IDS] = { + { "reserved", { {0, 0}, {0, 0}, {0, 0}, {0, 0} } }, + { "bb_b0", + { {MAX_NUM_PORTS_BB, MAX_NUM_PFS_BB}, {0, 0}, {0, 0}, {0, 0} } }, + { "k2", { {MAX_NUM_PORTS_K2, MAX_NUM_PFS_K2}, {0, 0}, {0, 0}, {0, 0} } } +}; + +/* Storm constant definitions array */ +static struct storm_defs s_storm_defs[] = { + /* Tstorm */ + {'T', BLOCK_TSEM, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, + DBG_BUS_CLIENT_RBCT}, true, + TSEM_REG_FAST_MEMORY, + TSEM_REG_DBG_FRAME_MODE, TSEM_REG_SLOW_DBG_ACTIVE, + TSEM_REG_SLOW_DBG_MODE, TSEM_REG_DBG_MODE1_CFG, + TSEM_REG_SYNC_DBG_EMPTY, TSEM_REG_SLOW_DBG_EMPTY, + TCM_REG_CTX_RBC_ACCS, + 4, TCM_REG_AGG_CON_CTX, + 16, TCM_REG_SM_CON_CTX, + 2, TCM_REG_AGG_TASK_CTX, + 4, TCM_REG_SM_TASK_CTX}, + /* Mstorm */ + {'M', BLOCK_MSEM, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, + DBG_BUS_CLIENT_RBCM}, false, + MSEM_REG_FAST_MEMORY, + MSEM_REG_DBG_FRAME_MODE, MSEM_REG_SLOW_DBG_ACTIVE, + MSEM_REG_SLOW_DBG_MODE, MSEM_REG_DBG_MODE1_CFG, + MSEM_REG_SYNC_DBG_EMPTY, MSEM_REG_SLOW_DBG_EMPTY, + MCM_REG_CTX_RBC_ACCS, + 1, MCM_REG_AGG_CON_CTX, + 10, MCM_REG_SM_CON_CTX, + 2, MCM_REG_AGG_TASK_CTX, + 7, MCM_REG_SM_TASK_CTX}, + /* Ustorm */ + {'U', BLOCK_USEM, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, + DBG_BUS_CLIENT_RBCU}, false, + USEM_REG_FAST_MEMORY, + USEM_REG_DBG_FRAME_MODE, USEM_REG_SLOW_DBG_ACTIVE, + USEM_REG_SLOW_DBG_MODE, USEM_REG_DBG_MODE1_CFG, + USEM_REG_SYNC_DBG_EMPTY, USEM_REG_SLOW_DBG_EMPTY, + UCM_REG_CTX_RBC_ACCS, + 2, UCM_REG_AGG_CON_CTX, + 13, UCM_REG_SM_CON_CTX, + 3, UCM_REG_AGG_TASK_CTX, + 3, UCM_REG_SM_TASK_CTX}, + /* Xstorm */ + {'X', BLOCK_XSEM, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, + DBG_BUS_CLIENT_RBCX}, false, + XSEM_REG_FAST_MEMORY, + XSEM_REG_DBG_FRAME_MODE, XSEM_REG_SLOW_DBG_ACTIVE, + XSEM_REG_SLOW_DBG_MODE, XSEM_REG_DBG_MODE1_CFG, + XSEM_REG_SYNC_DBG_EMPTY, XSEM_REG_SLOW_DBG_EMPTY, + XCM_REG_CTX_RBC_ACCS, + 9, XCM_REG_AGG_CON_CTX, + 15, XCM_REG_SM_CON_CTX, + 0, 0, + 0, 0}, + /* Ystorm */ + {'Y', BLOCK_YSEM, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, + DBG_BUS_CLIENT_RBCY}, false, + YSEM_REG_FAST_MEMORY, + YSEM_REG_DBG_FRAME_MODE, YSEM_REG_SLOW_DBG_ACTIVE, + YSEM_REG_SLOW_DBG_MODE, YSEM_REG_DBG_MODE1_CFG, + YSEM_REG_SYNC_DBG_EMPTY, TSEM_REG_SLOW_DBG_EMPTY, + YCM_REG_CTX_RBC_ACCS, + 2, YCM_REG_AGG_CON_CTX, + 3, YCM_REG_SM_CON_CTX, + 2, YCM_REG_AGG_TASK_CTX, + 12, YCM_REG_SM_TASK_CTX}, + /* Pstorm */ + {'P', BLOCK_PSEM, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, + DBG_BUS_CLIENT_RBCS}, true, + PSEM_REG_FAST_MEMORY, + PSEM_REG_DBG_FRAME_MODE, PSEM_REG_SLOW_DBG_ACTIVE, + PSEM_REG_SLOW_DBG_MODE, PSEM_REG_DBG_MODE1_CFG, + PSEM_REG_SYNC_DBG_EMPTY, PSEM_REG_SLOW_DBG_EMPTY, + PCM_REG_CTX_RBC_ACCS, + 0, 0, + 10, PCM_REG_SM_CON_CTX, + 0, 0, + 0, 0} +}; + +/* Block definitions array */ +static struct block_defs block_grc_defs = { + "grc", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN}, + GRC_REG_DBG_SELECT, GRC_REG_DBG_DWORD_ENABLE, + GRC_REG_DBG_SHIFT, GRC_REG_DBG_FORCE_VALID, + GRC_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_UA, 1 +}; + +static struct block_defs block_miscs_defs = { + "miscs", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, 
MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_misc_defs = { + "misc", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_dbu_defs = { + "dbu", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_pglue_b_defs = { + "pglue_b", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCH, DBG_BUS_CLIENT_RBCH, DBG_BUS_CLIENT_RBCH}, + PGLUE_B_REG_DBG_SELECT, PGLUE_B_REG_DBG_DWORD_ENABLE, + PGLUE_B_REG_DBG_SHIFT, PGLUE_B_REG_DBG_FORCE_VALID, + PGLUE_B_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_HV, 1 +}; + +static struct block_defs block_cnig_defs = { + "cnig", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCW}, + CNIG_REG_DBG_SELECT_K2, CNIG_REG_DBG_DWORD_ENABLE_K2, + CNIG_REG_DBG_SHIFT_K2, CNIG_REG_DBG_FORCE_VALID_K2, + CNIG_REG_DBG_FORCE_FRAME_K2, + true, false, DBG_RESET_REG_MISCS_PL_HV, 0 +}; + +static struct block_defs block_cpmu_defs = { + "cpmu", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_HV, 8 +}; + +static struct block_defs block_ncsi_defs = { + "ncsi", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ}, + NCSI_REG_DBG_SELECT, NCSI_REG_DBG_DWORD_ENABLE, + NCSI_REG_DBG_SHIFT, NCSI_REG_DBG_FORCE_VALID, + NCSI_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_HV, 5 +}; + +static struct block_defs block_opte_defs = { + "opte", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_HV, 4 +}; + +static struct block_defs block_bmb_defs = { + "bmb", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCB}, + BMB_REG_DBG_SELECT, BMB_REG_DBG_DWORD_ENABLE, + BMB_REG_DBG_SHIFT, BMB_REG_DBG_FORCE_VALID, + BMB_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_UA, 7 +}; + +static struct block_defs block_pcie_defs = { + "pcie", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, + PCIE_REG_DBG_COMMON_SELECT, PCIE_REG_DBG_COMMON_DWORD_ENABLE, + PCIE_REG_DBG_COMMON_SHIFT, PCIE_REG_DBG_COMMON_FORCE_VALID, + PCIE_REG_DBG_COMMON_FORCE_FRAME, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_mcp_defs = { + "mcp", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_mcp2_defs = { + "mcp2", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ}, + MCP2_REG_DBG_SELECT, MCP2_REG_DBG_DWORD_ENABLE, + MCP2_REG_DBG_SHIFT, MCP2_REG_DBG_FORCE_VALID, + MCP2_REG_DBG_FORCE_FRAME, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_pswhst_defs = { + "pswhst", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWHST_REG_DBG_SELECT, PSWHST_REG_DBG_DWORD_ENABLE, + PSWHST_REG_DBG_SHIFT, PSWHST_REG_DBG_FORCE_VALID, + PSWHST_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 0 +}; + +static struct block_defs block_pswhst2_defs = { + "pswhst2", 
{true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWHST2_REG_DBG_SELECT, PSWHST2_REG_DBG_DWORD_ENABLE, + PSWHST2_REG_DBG_SHIFT, PSWHST2_REG_DBG_FORCE_VALID, + PSWHST2_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 0 +}; + +static struct block_defs block_pswrd_defs = { + "pswrd", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWRD_REG_DBG_SELECT, PSWRD_REG_DBG_DWORD_ENABLE, + PSWRD_REG_DBG_SHIFT, PSWRD_REG_DBG_FORCE_VALID, + PSWRD_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 2 +}; + +static struct block_defs block_pswrd2_defs = { + "pswrd2", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWRD2_REG_DBG_SELECT, PSWRD2_REG_DBG_DWORD_ENABLE, + PSWRD2_REG_DBG_SHIFT, PSWRD2_REG_DBG_FORCE_VALID, + PSWRD2_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 2 +}; + +static struct block_defs block_pswwr_defs = { + "pswwr", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWWR_REG_DBG_SELECT, PSWWR_REG_DBG_DWORD_ENABLE, + PSWWR_REG_DBG_SHIFT, PSWWR_REG_DBG_FORCE_VALID, + PSWWR_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 3 +}; + +static struct block_defs block_pswwr2_defs = { + "pswwr2", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISC_PL_HV, 3 +}; + +static struct block_defs block_pswrq_defs = { + "pswrq", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWRQ_REG_DBG_SELECT, PSWRQ_REG_DBG_DWORD_ENABLE, + PSWRQ_REG_DBG_SHIFT, PSWRQ_REG_DBG_FORCE_VALID, + PSWRQ_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 1 +}; + +static struct block_defs block_pswrq2_defs = { + "pswrq2", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWRQ2_REG_DBG_SELECT, PSWRQ2_REG_DBG_DWORD_ENABLE, + PSWRQ2_REG_DBG_SHIFT, PSWRQ2_REG_DBG_FORCE_VALID, + PSWRQ2_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 1 +}; + +static struct block_defs block_pglcs_defs = { + "pglcs", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, + PGLCS_REG_DBG_SELECT, PGLCS_REG_DBG_DWORD_ENABLE, + PGLCS_REG_DBG_SHIFT, PGLCS_REG_DBG_FORCE_VALID, + PGLCS_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_HV, 2 +}; + +static struct block_defs block_ptu_defs = { + "ptu", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PTU_REG_DBG_SELECT, PTU_REG_DBG_DWORD_ENABLE, + PTU_REG_DBG_SHIFT, PTU_REG_DBG_FORCE_VALID, + PTU_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 20 +}; + +static struct block_defs block_dmae_defs = { + "dmae", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + DMAE_REG_DBG_SELECT, DMAE_REG_DBG_DWORD_ENABLE, + DMAE_REG_DBG_SHIFT, DMAE_REG_DBG_FORCE_VALID, + DMAE_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 28 +}; + +static struct block_defs block_tcm_defs = { + "tcm", {true, true, true}, true, DBG_TSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + TCM_REG_DBG_SELECT, TCM_REG_DBG_DWORD_ENABLE, + TCM_REG_DBG_SHIFT, TCM_REG_DBG_FORCE_VALID, + TCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 5 +}; + +static 
struct block_defs block_mcm_defs = { + "mcm", {true, true, true}, true, DBG_MSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + MCM_REG_DBG_SELECT, MCM_REG_DBG_DWORD_ENABLE, + MCM_REG_DBG_SHIFT, MCM_REG_DBG_FORCE_VALID, + MCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 3 +}; + +static struct block_defs block_ucm_defs = { + "ucm", {true, true, true}, true, DBG_USTORM_ID, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + UCM_REG_DBG_SELECT, UCM_REG_DBG_DWORD_ENABLE, + UCM_REG_DBG_SHIFT, UCM_REG_DBG_FORCE_VALID, + UCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 8 +}; + +static struct block_defs block_xcm_defs = { + "xcm", {true, true, true}, true, DBG_XSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + XCM_REG_DBG_SELECT, XCM_REG_DBG_DWORD_ENABLE, + XCM_REG_DBG_SHIFT, XCM_REG_DBG_FORCE_VALID, + XCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 19 +}; + +static struct block_defs block_ycm_defs = { + "ycm", {true, true, true}, true, DBG_YSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, + YCM_REG_DBG_SELECT, YCM_REG_DBG_DWORD_ENABLE, + YCM_REG_DBG_SHIFT, YCM_REG_DBG_FORCE_VALID, + YCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 5 +}; + +static struct block_defs block_pcm_defs = { + "pcm", {true, true, true}, true, DBG_PSTORM_ID, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + PCM_REG_DBG_SELECT, PCM_REG_DBG_DWORD_ENABLE, + PCM_REG_DBG_SHIFT, PCM_REG_DBG_FORCE_VALID, + PCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 4 +}; + +static struct block_defs block_qm_defs = { + "qm", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCQ}, + QM_REG_DBG_SELECT, QM_REG_DBG_DWORD_ENABLE, + QM_REG_DBG_SHIFT, QM_REG_DBG_FORCE_VALID, + QM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 16 +}; + +static struct block_defs block_tm_defs = { + "tm", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + TM_REG_DBG_SELECT, TM_REG_DBG_DWORD_ENABLE, + TM_REG_DBG_SHIFT, TM_REG_DBG_FORCE_VALID, + TM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 17 +}; + +static struct block_defs block_dorq_defs = { + "dorq", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, + DORQ_REG_DBG_SELECT, DORQ_REG_DBG_DWORD_ENABLE, + DORQ_REG_DBG_SHIFT, DORQ_REG_DBG_FORCE_VALID, + DORQ_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 18 +}; + +static struct block_defs block_brb_defs = { + "brb", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR}, + BRB_REG_DBG_SELECT, BRB_REG_DBG_DWORD_ENABLE, + BRB_REG_DBG_SHIFT, BRB_REG_DBG_FORCE_VALID, + BRB_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 0 +}; + +static struct block_defs block_src_defs = { + "src", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + SRC_REG_DBG_SELECT, SRC_REG_DBG_DWORD_ENABLE, + SRC_REG_DBG_SHIFT, SRC_REG_DBG_FORCE_VALID, + SRC_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 2 +}; + +static struct block_defs block_prs_defs = { + "prs", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR}, + PRS_REG_DBG_SELECT, PRS_REG_DBG_DWORD_ENABLE, + PRS_REG_DBG_SHIFT, 
PRS_REG_DBG_FORCE_VALID, + PRS_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 1 +}; + +static struct block_defs block_tsdm_defs = { + "tsdm", {true, true, true}, true, DBG_TSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + TSDM_REG_DBG_SELECT, TSDM_REG_DBG_DWORD_ENABLE, + TSDM_REG_DBG_SHIFT, TSDM_REG_DBG_FORCE_VALID, + TSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 3 +}; + +static struct block_defs block_msdm_defs = { + "msdm", {true, true, true}, true, DBG_MSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + MSDM_REG_DBG_SELECT, MSDM_REG_DBG_DWORD_ENABLE, + MSDM_REG_DBG_SHIFT, MSDM_REG_DBG_FORCE_VALID, + MSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 6 +}; + +static struct block_defs block_usdm_defs = { + "usdm", {true, true, true}, true, DBG_USTORM_ID, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + USDM_REG_DBG_SELECT, USDM_REG_DBG_DWORD_ENABLE, + USDM_REG_DBG_SHIFT, USDM_REG_DBG_FORCE_VALID, + USDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 7 +}; + +static struct block_defs block_xsdm_defs = { + "xsdm", {true, true, true}, true, DBG_XSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + XSDM_REG_DBG_SELECT, XSDM_REG_DBG_DWORD_ENABLE, + XSDM_REG_DBG_SHIFT, XSDM_REG_DBG_FORCE_VALID, + XSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 20 +}; + +static struct block_defs block_ysdm_defs = { + "ysdm", {true, true, true}, true, DBG_YSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, + YSDM_REG_DBG_SELECT, YSDM_REG_DBG_DWORD_ENABLE, + YSDM_REG_DBG_SHIFT, YSDM_REG_DBG_FORCE_VALID, + YSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 8 +}; + +static struct block_defs block_psdm_defs = { + "psdm", {true, true, true}, true, DBG_PSTORM_ID, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + PSDM_REG_DBG_SELECT, PSDM_REG_DBG_DWORD_ENABLE, + PSDM_REG_DBG_SHIFT, PSDM_REG_DBG_FORCE_VALID, + PSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 7 +}; + +static struct block_defs block_tsem_defs = { + "tsem", {true, true, true}, true, DBG_TSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + TSEM_REG_DBG_SELECT, TSEM_REG_DBG_DWORD_ENABLE, + TSEM_REG_DBG_SHIFT, TSEM_REG_DBG_FORCE_VALID, + TSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 4 +}; + +static struct block_defs block_msem_defs = { + "msem", {true, true, true}, true, DBG_MSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + MSEM_REG_DBG_SELECT, MSEM_REG_DBG_DWORD_ENABLE, + MSEM_REG_DBG_SHIFT, MSEM_REG_DBG_FORCE_VALID, + MSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 9 +}; + +static struct block_defs block_usem_defs = { + "usem", {true, true, true}, true, DBG_USTORM_ID, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + USEM_REG_DBG_SELECT, USEM_REG_DBG_DWORD_ENABLE, + USEM_REG_DBG_SHIFT, USEM_REG_DBG_FORCE_VALID, + USEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 9 +}; + +static struct block_defs block_xsem_defs = { + "xsem", {true, true, true}, true, DBG_XSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + XSEM_REG_DBG_SELECT, XSEM_REG_DBG_DWORD_ENABLE, + XSEM_REG_DBG_SHIFT, XSEM_REG_DBG_FORCE_VALID, + XSEM_REG_DBG_FORCE_FRAME, + true, true, 
DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 21 +}; + +static struct block_defs block_ysem_defs = { + "ysem", {true, true, true}, true, DBG_YSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, + YSEM_REG_DBG_SELECT, YSEM_REG_DBG_DWORD_ENABLE, + YSEM_REG_DBG_SHIFT, YSEM_REG_DBG_FORCE_VALID, + YSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 11 +}; + +static struct block_defs block_psem_defs = { + "psem", {true, true, true}, true, DBG_PSTORM_ID, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + PSEM_REG_DBG_SELECT, PSEM_REG_DBG_DWORD_ENABLE, + PSEM_REG_DBG_SHIFT, PSEM_REG_DBG_FORCE_VALID, + PSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 10 +}; + +static struct block_defs block_rss_defs = { + "rss", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + RSS_REG_DBG_SELECT, RSS_REG_DBG_DWORD_ENABLE, + RSS_REG_DBG_SHIFT, RSS_REG_DBG_FORCE_VALID, + RSS_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 18 +}; + +static struct block_defs block_tmld_defs = { + "tmld", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + TMLD_REG_DBG_SELECT, TMLD_REG_DBG_DWORD_ENABLE, + TMLD_REG_DBG_SHIFT, TMLD_REG_DBG_FORCE_VALID, + TMLD_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 13 +}; + +static struct block_defs block_muld_defs = { + "muld", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + MULD_REG_DBG_SELECT, MULD_REG_DBG_DWORD_ENABLE, + MULD_REG_DBG_SHIFT, MULD_REG_DBG_FORCE_VALID, + MULD_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 14 +}; + +static struct block_defs block_yuld_defs = { + "yuld", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + YULD_REG_DBG_SELECT, YULD_REG_DBG_DWORD_ENABLE, + YULD_REG_DBG_SHIFT, YULD_REG_DBG_FORCE_VALID, + YULD_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 15 +}; + +static struct block_defs block_xyld_defs = { + "xyld", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + XYLD_REG_DBG_SELECT, XYLD_REG_DBG_DWORD_ENABLE, + XYLD_REG_DBG_SHIFT, XYLD_REG_DBG_FORCE_VALID, + XYLD_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 12 +}; + +static struct block_defs block_prm_defs = { + "prm", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + PRM_REG_DBG_SELECT, PRM_REG_DBG_DWORD_ENABLE, + PRM_REG_DBG_SHIFT, PRM_REG_DBG_FORCE_VALID, + PRM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 21 +}; + +static struct block_defs block_pbf_pb1_defs = { + "pbf_pb1", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV}, + PBF_PB1_REG_DBG_SELECT, PBF_PB1_REG_DBG_DWORD_ENABLE, + PBF_PB1_REG_DBG_SHIFT, PBF_PB1_REG_DBG_FORCE_VALID, + PBF_PB1_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, + 11 +}; + +static struct block_defs block_pbf_pb2_defs = { + "pbf_pb2", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV}, + PBF_PB2_REG_DBG_SELECT, PBF_PB2_REG_DBG_DWORD_ENABLE, + PBF_PB2_REG_DBG_SHIFT, PBF_PB2_REG_DBG_FORCE_VALID, + PBF_PB2_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, + 12 +}; + +static struct block_defs block_rpb_defs = { + "rpb", {true, true, true}, false, 0, + 
{DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + RPB_REG_DBG_SELECT, RPB_REG_DBG_DWORD_ENABLE, + RPB_REG_DBG_SHIFT, RPB_REG_DBG_FORCE_VALID, + RPB_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 13 +}; + +static struct block_defs block_btb_defs = { + "btb", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCV}, + BTB_REG_DBG_SELECT, BTB_REG_DBG_DWORD_ENABLE, + BTB_REG_DBG_SHIFT, BTB_REG_DBG_FORCE_VALID, + BTB_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 10 +}; + +static struct block_defs block_pbf_defs = { + "pbf", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV}, + PBF_REG_DBG_SELECT, PBF_REG_DBG_DWORD_ENABLE, + PBF_REG_DBG_SHIFT, PBF_REG_DBG_FORCE_VALID, + PBF_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 15 +}; + +static struct block_defs block_rdif_defs = { + "rdif", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + RDIF_REG_DBG_SELECT, RDIF_REG_DBG_DWORD_ENABLE, + RDIF_REG_DBG_SHIFT, RDIF_REG_DBG_FORCE_VALID, + RDIF_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 16 +}; + +static struct block_defs block_tdif_defs = { + "tdif", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + TDIF_REG_DBG_SELECT, TDIF_REG_DBG_DWORD_ENABLE, + TDIF_REG_DBG_SHIFT, TDIF_REG_DBG_FORCE_VALID, + TDIF_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 17 +}; + +static struct block_defs block_cdu_defs = { + "cdu", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + CDU_REG_DBG_SELECT, CDU_REG_DBG_DWORD_ENABLE, + CDU_REG_DBG_SHIFT, CDU_REG_DBG_FORCE_VALID, + CDU_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 23 +}; + +static struct block_defs block_ccfc_defs = { + "ccfc", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + CCFC_REG_DBG_SELECT, CCFC_REG_DBG_DWORD_ENABLE, + CCFC_REG_DBG_SHIFT, CCFC_REG_DBG_FORCE_VALID, + CCFC_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 24 +}; + +static struct block_defs block_tcfc_defs = { + "tcfc", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + TCFC_REG_DBG_SELECT, TCFC_REG_DBG_DWORD_ENABLE, + TCFC_REG_DBG_SHIFT, TCFC_REG_DBG_FORCE_VALID, + TCFC_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 25 +}; + +static struct block_defs block_igu_defs = { + "igu", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + IGU_REG_DBG_SELECT, IGU_REG_DBG_DWORD_ENABLE, + IGU_REG_DBG_SHIFT, IGU_REG_DBG_FORCE_VALID, + IGU_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 27 +}; + +static struct block_defs block_cau_defs = { + "cau", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + CAU_REG_DBG_SELECT, CAU_REG_DBG_DWORD_ENABLE, + CAU_REG_DBG_SHIFT, CAU_REG_DBG_FORCE_VALID, + CAU_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 19 +}; + +static struct block_defs block_umac_defs = { + "umac", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCZ}, + UMAC_REG_DBG_SELECT, UMAC_REG_DBG_DWORD_ENABLE, + UMAC_REG_DBG_SHIFT, UMAC_REG_DBG_FORCE_VALID, + UMAC_REG_DBG_FORCE_FRAME, + true, false, 
DBG_RESET_REG_MISCS_PL_HV, 6 +}; + +static struct block_defs block_xmac_defs = { + "xmac", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_dbg_defs = { + "dbg", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, true, DBG_RESET_REG_MISC_PL_PDA_VAUX, 3 +}; + +static struct block_defs block_nig_defs = { + "nig", {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN}, + NIG_REG_DBG_SELECT, NIG_REG_DBG_DWORD_ENABLE, + NIG_REG_DBG_SHIFT, NIG_REG_DBG_FORCE_VALID, + NIG_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VAUX, 0 +}; + +static struct block_defs block_wol_defs = { + "wol", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCZ}, + WOL_REG_DBG_SELECT, WOL_REG_DBG_DWORD_ENABLE, + WOL_REG_DBG_SHIFT, WOL_REG_DBG_FORCE_VALID, + WOL_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VAUX, 7 +}; + +static struct block_defs block_bmbn_defs = { + "bmbn", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCB}, + BMBN_REG_DBG_SELECT, BMBN_REG_DBG_DWORD_ENABLE, + BMBN_REG_DBG_SHIFT, BMBN_REG_DBG_FORCE_VALID, + BMBN_REG_DBG_FORCE_FRAME, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_ipc_defs = { + "ipc", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_UA, 8 +}; + +static struct block_defs block_nwm_defs = { + "nwm", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCW}, + NWM_REG_DBG_SELECT, NWM_REG_DBG_DWORD_ENABLE, + NWM_REG_DBG_SHIFT, NWM_REG_DBG_FORCE_VALID, + NWM_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_HV_2, 0 +}; + +static struct block_defs block_nws_defs = { + "nws", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_HV, 12 +}; + +static struct block_defs block_ms_defs = { + "ms", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_HV, 13 +}; + +static struct block_defs block_phy_pcie_defs = { + "phy_pcie", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, + PCIE_REG_DBG_COMMON_SELECT, PCIE_REG_DBG_COMMON_DWORD_ENABLE, + PCIE_REG_DBG_COMMON_SHIFT, PCIE_REG_DBG_COMMON_FORCE_VALID, + PCIE_REG_DBG_COMMON_FORCE_FRAME, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_led_defs = { + "led", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, true, DBG_RESET_REG_MISCS_PL_HV, 14 +}; + +static struct block_defs block_misc_aeu_defs = { + "misc_aeu", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_bar0_map_defs = { + "bar0_map", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs *s_block_defs[MAX_BLOCK_ID] = { + 
&block_grc_defs, + &block_miscs_defs, + &block_misc_defs, + &block_dbu_defs, + &block_pglue_b_defs, + &block_cnig_defs, + &block_cpmu_defs, + &block_ncsi_defs, + &block_opte_defs, + &block_bmb_defs, + &block_pcie_defs, + &block_mcp_defs, + &block_mcp2_defs, + &block_pswhst_defs, + &block_pswhst2_defs, + &block_pswrd_defs, + &block_pswrd2_defs, + &block_pswwr_defs, + &block_pswwr2_defs, + &block_pswrq_defs, + &block_pswrq2_defs, + &block_pglcs_defs, + &block_dmae_defs, + &block_ptu_defs, + &block_tcm_defs, + &block_mcm_defs, + &block_ucm_defs, + &block_xcm_defs, + &block_ycm_defs, + &block_pcm_defs, + &block_qm_defs, + &block_tm_defs, + &block_dorq_defs, + &block_brb_defs, + &block_src_defs, + &block_prs_defs, + &block_tsdm_defs, + &block_msdm_defs, + &block_usdm_defs, + &block_xsdm_defs, + &block_ysdm_defs, + &block_psdm_defs, + &block_tsem_defs, + &block_msem_defs, + &block_usem_defs, + &block_xsem_defs, + &block_ysem_defs, + &block_psem_defs, + &block_rss_defs, + &block_tmld_defs, + &block_muld_defs, + &block_yuld_defs, + &block_xyld_defs, + &block_prm_defs, + &block_pbf_pb1_defs, + &block_pbf_pb2_defs, + &block_rpb_defs, + &block_btb_defs, + &block_pbf_defs, + &block_rdif_defs, + &block_tdif_defs, + &block_cdu_defs, + &block_ccfc_defs, + &block_tcfc_defs, + &block_igu_defs, + &block_cau_defs, + &block_umac_defs, + &block_xmac_defs, + &block_dbg_defs, + &block_nig_defs, + &block_wol_defs, + &block_bmbn_defs, + &block_ipc_defs, + &block_nwm_defs, + &block_nws_defs, + &block_ms_defs, + &block_phy_pcie_defs, + &block_led_defs, + &block_misc_aeu_defs, + &block_bar0_map_defs, +}; + +static struct platform_defs s_platform_defs[] = { + {"asic", 1}, + {"reserved", 0}, + {"reserved2", 0}, + {"reserved3", 0} +}; + +static struct grc_param_defs s_grc_param_defs[] = { + {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_TSTORM */ + {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_MSTORM */ + {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_USTORM */ + {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_XSTORM */ + {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_YSTORM */ + {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_PSTORM */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_REGS */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_RAM */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_PBUF */ + {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_IOR */ + {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_VFC */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CM_CTX */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_ILT */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_RSS */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CAU */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_QM */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_MCP */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_RESERVED */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CFC */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_IGU */ + {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_BRB */ + {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_BTB */ + {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_BMB */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_NIG */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_MULD */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_PRS */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_DMAE */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* 
DBG_GRC_PARAM_DUMP_TM */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_SDM */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_DIF */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_STATIC */ + {{0, 0, 0}, 0, 1, false, 0, 0}, /* DBG_GRC_PARAM_UNSTALL */ + {{MAX_LCIDS, MAX_LCIDS, MAX_LCIDS}, 1, MAX_LCIDS, false, MAX_LCIDS, + MAX_LCIDS}, /* DBG_GRC_PARAM_NUM_LCIDS */ + {{MAX_LTIDS, MAX_LTIDS, MAX_LTIDS}, 1, MAX_LTIDS, false, MAX_LTIDS, + MAX_LTIDS}, /* DBG_GRC_PARAM_NUM_LTIDS */ + {{0, 0, 0}, 0, 1, true, 0, 0}, /* DBG_GRC_PARAM_EXCLUDE_ALL */ + {{0, 0, 0}, 0, 1, true, 0, 0}, /* DBG_GRC_PARAM_CRASH */ + {{0, 0, 0}, 0, 1, false, 1, 0}, /* DBG_GRC_PARAM_PARITY_SAFE */ + {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CM */ + {{1, 1, 1}, 0, 1, false, 0, 1} /* DBG_GRC_PARAM_DUMP_PHY */ +}; + +static struct rss_mem_defs s_rss_mem_defs[] = { + { "rss_mem_cid", "rss_cid", 0, + {256, 256, 320}, + {32, 32, 32} }, + { "rss_mem_key_msb", "rss_key", 1024, + {128, 128, 208}, + {256, 256, 256} }, + { "rss_mem_key_lsb", "rss_key", 2048, + {128, 128, 208}, + {64, 64, 64} }, + { "rss_mem_info", "rss_info", 3072, + {128, 128, 208}, + {16, 16, 16} }, + { "rss_mem_ind", "rss_ind", 4096, + {(128 * 128), (128 * 128), (128 * 208)}, + {16, 16, 16} } +}; + +static struct vfc_ram_defs s_vfc_ram_defs[] = { + {"vfc_ram_tt1", "vfc_ram", 0, 512}, + {"vfc_ram_mtt2", "vfc_ram", 512, 128}, + {"vfc_ram_stt2", "vfc_ram", 640, 32}, + {"vfc_ram_ro_vect", "vfc_ram", 672, 32} +}; + +static struct big_ram_defs s_big_ram_defs[] = { + { "BRB", MEM_GROUP_BRB_MEM, MEM_GROUP_BRB_RAM, DBG_GRC_PARAM_DUMP_BRB, + BRB_REG_BIG_RAM_ADDRESS, BRB_REG_BIG_RAM_DATA, + {4800, 4800, 5632} }, + { "BTB", MEM_GROUP_BTB_MEM, MEM_GROUP_BTB_RAM, DBG_GRC_PARAM_DUMP_BTB, + BTB_REG_BIG_RAM_ADDRESS, BTB_REG_BIG_RAM_DATA, + {2880, 2880, 3680} }, + { "BMB", MEM_GROUP_BMB_MEM, MEM_GROUP_BMB_RAM, DBG_GRC_PARAM_DUMP_BMB, + BMB_REG_BIG_RAM_ADDRESS, BMB_REG_BIG_RAM_DATA, + {1152, 1152, 1152} } +}; + +static struct reset_reg_defs s_reset_regs_defs[] = { + { MISCS_REG_RESET_PL_UA, 0x0, + {true, true, true} }, /* DBG_RESET_REG_MISCS_PL_UA */ + { MISCS_REG_RESET_PL_HV, 0x0, + {true, true, true} }, /* DBG_RESET_REG_MISCS_PL_HV */ + { MISCS_REG_RESET_PL_HV_2, 0x0, + {false, false, true} }, /* DBG_RESET_REG_MISCS_PL_HV_2 */ + { MISC_REG_RESET_PL_UA, 0x0, + {true, true, true} }, /* DBG_RESET_REG_MISC_PL_UA */ + { MISC_REG_RESET_PL_HV, 0x0, + {true, true, true} }, /* DBG_RESET_REG_MISC_PL_HV */ + { MISC_REG_RESET_PL_PDA_VMAIN_1, 0x4404040, + {true, true, true} }, /* DBG_RESET_REG_MISC_PL_PDA_VMAIN_1 */ + { MISC_REG_RESET_PL_PDA_VMAIN_2, 0x7c00007, + {true, true, true} }, /* DBG_RESET_REG_MISC_PL_PDA_VMAIN_2 */ + { MISC_REG_RESET_PL_PDA_VAUX, 0x2, + {true, true, true} }, /* DBG_RESET_REG_MISC_PL_PDA_VAUX */ +}; + +static struct phy_defs s_phy_defs[] = { + {"nw_phy", NWS_REG_NWS_CMU, PHY_NW_IP_REG_PHY0_TOP_TBUS_ADDR_7_0, + PHY_NW_IP_REG_PHY0_TOP_TBUS_ADDR_15_8, + PHY_NW_IP_REG_PHY0_TOP_TBUS_DATA_7_0, + PHY_NW_IP_REG_PHY0_TOP_TBUS_DATA_11_8}, + {"sgmii_phy", MS_REG_MS_CMU, PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X132, + PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X133, + PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X130, + PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X131}, + {"pcie_phy0", PHY_PCIE_REG_PHY0, PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X132, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X133, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X130, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X131}, + {"pcie_phy1", PHY_PCIE_REG_PHY1, PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X132, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X133, + 
PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X130, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X131}, +}; + +/**************************** Private Functions ******************************/ + +/* Reads and returns a single dword from the specified unaligned buffer */ +static u32 qed_read_unaligned_dword(u8 *buf) +{ + u32 dword; + + memcpy((u8 *)&dword, buf, sizeof(dword)); + return dword; +} + +/* Initializes debug data for the specified device */ +static enum dbg_status qed_dbg_dev_init(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + if (dev_data->initialized) + return DBG_STATUS_OK; + + if (QED_IS_K2(p_hwfn->cdev)) { + dev_data->chip_id = CHIP_K2; + dev_data->mode_enable[MODE_K2] = 1; + } else if (QED_IS_BB_B0(p_hwfn->cdev)) { + dev_data->chip_id = CHIP_BB_B0; + dev_data->mode_enable[MODE_BB_B0] = 1; + } else { + return DBG_STATUS_UNKNOWN_CHIP; + } + + dev_data->platform_id = PLATFORM_ASIC; + dev_data->mode_enable[MODE_ASIC] = 1; + dev_data->initialized = true; + return DBG_STATUS_OK; +} + +/* Reads the FW info structure for the specified Storm from the chip, + * and writes it to the specified fw_info pointer. + */ +static void qed_read_fw_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u8 storm_id, struct fw_info *fw_info) +{ + /* Read first the address that points to fw_info location. + * The address is located in the last line of the Storm RAM. + */ + u32 addr = s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_INT_RAM + + DWORDS_TO_BYTES(SEM_FAST_REG_INT_RAM_SIZE) - + sizeof(struct fw_info_location); + struct fw_info_location fw_info_location; + u32 *dest = (u32 *)&fw_info_location; + u32 i; + + memset(&fw_info_location, 0, sizeof(fw_info_location)); + memset(fw_info, 0, sizeof(*fw_info)); + for (i = 0; i < BYTES_TO_DWORDS(sizeof(fw_info_location)); + i++, addr += BYTES_IN_DWORD) + dest[i] = qed_rd(p_hwfn, p_ptt, addr); + if (fw_info_location.size > 0 && fw_info_location.size <= + sizeof(*fw_info)) { + /* Read FW version info from Storm RAM */ + addr = fw_info_location.grc_addr; + dest = (u32 *)fw_info; + for (i = 0; i < BYTES_TO_DWORDS(fw_info_location.size); + i++, addr += BYTES_IN_DWORD) + dest[i] = qed_rd(p_hwfn, p_ptt, addr); + } +} + +/* Dumps the specified string to the specified buffer. Returns the dumped size + * in bytes (actual length + 1 for the null character termination). + */ +static u32 qed_dump_str(char *dump_buf, bool dump, const char *str) +{ + if (dump) + strcpy(dump_buf, str); + return (u32)strlen(str) + 1; +} + +/* Dumps zeros to align the specified buffer to dwords. Returns the dumped size + * in bytes. + */ +static u32 qed_dump_align(char *dump_buf, bool dump, u32 byte_offset) +{ + u8 offset_in_dword = (u8)(byte_offset & 0x3), align_size; + + align_size = offset_in_dword ? BYTES_IN_DWORD - offset_in_dword : 0; + + if (dump && align_size) + memset(dump_buf, 0, align_size); + return align_size; +} + +/* Writes the specified string param to the specified buffer. + * Returns the dumped size in dwords. 
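 *
 * For illustration, a host-side reader could decode one such param as
 * follows (a sketch, not part of this patch; str_val/num_val stand for
 * wherever the caller stores the result, and the layout assumed is
 * exactly what the two helpers below produce: NUL-terminated name, one
 * type byte, then either a NUL-terminated string value or a
 * dword-aligned numeric value):
 *
 *	const char *p = (const char *)buf;	// buf = start of the param
 *	const char *name = p;
 *
 *	p += strlen(p) + 1;			// skip param name + NUL
 *	if (*p++ == 1)				// type byte: 1 = string
 *		str_val = p;			// NUL-terminated value
 *	else					// type byte: 0 = numeric
 *		num_val = buf[ALIGN(p - (const char *)buf, 4) / 4];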
+ */ +static u32 qed_dump_str_param(u32 *dump_buf, + bool dump, + const char *param_name, const char *param_val) +{ + char *char_buf = (char *)dump_buf; + u32 offset = 0; + + /* Dump param name */ + offset += qed_dump_str(char_buf + offset, dump, param_name); + + /* Indicate a string param value */ + if (dump) + *(char_buf + offset) = 1; + offset++; + + /* Dump param value */ + offset += qed_dump_str(char_buf + offset, dump, param_val); + + /* Align buffer to next dword */ + offset += qed_dump_align(char_buf + offset, dump, offset); + return BYTES_TO_DWORDS(offset); +} + +/* Writes the specified numeric param to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_num_param(u32 *dump_buf, + bool dump, const char *param_name, u32 param_val) +{ + char *char_buf = (char *)dump_buf; + u32 offset = 0; + + /* Dump param name */ + offset += qed_dump_str(char_buf + offset, dump, param_name); + + /* Indicate a numeric param value */ + if (dump) + *(char_buf + offset) = 0; + offset++; + + /* Align buffer to next dword */ + offset += qed_dump_align(char_buf + offset, dump, offset); + + /* Dump param value (and change offset from bytes to dwords) */ + offset = BYTES_TO_DWORDS(offset); + if (dump) + *(dump_buf + offset) = param_val; + offset++; + return offset; +} + +/* Reads the FW version and writes it as a param to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_fw_ver_param(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + char fw_ver_str[16] = EMPTY_FW_VERSION_STR; + char fw_img_str[16] = EMPTY_FW_IMAGE_STR; + struct fw_info fw_info = { {0}, {0} }; + int printed_chars; + u32 offset = 0; + + if (dump) { + /* Read FW image/version from PRAM in a non-reset SEMI */ + bool found = false; + u8 storm_id; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS && !found; + storm_id++) { + /* Read FW version/image */ + if (!dev_data->block_in_reset + [s_storm_defs[storm_id].block_id]) { + /* read FW info for the current Storm */ + qed_read_fw_info(p_hwfn, + p_ptt, storm_id, &fw_info); + + /* Create FW version/image strings */ + printed_chars = + snprintf(fw_ver_str, + sizeof(fw_ver_str), + "%d_%d_%d_%d", + fw_info.ver.num.major, + fw_info.ver.num.minor, + fw_info.ver.num.rev, + fw_info.ver.num.eng); + if (printed_chars < 0 || printed_chars >= + sizeof(fw_ver_str)) + DP_NOTICE(p_hwfn, + "Unexpected debug error: invalid FW version string\n"); + switch (fw_info.ver.image_id) { + case FW_IMG_MAIN: + strcpy(fw_img_str, "main"); + break; + default: + strcpy(fw_img_str, "unknown"); + break; + } + + found = true; + } + } + } + + /* Dump FW version, image and timestamp */ + offset += qed_dump_str_param(dump_buf + offset, + dump, "fw-version", fw_ver_str); + offset += qed_dump_str_param(dump_buf + offset, + dump, "fw-image", fw_img_str); + offset += qed_dump_num_param(dump_buf + offset, + dump, + "fw-timestamp", fw_info.ver.timestamp); + return offset; +} + +/* Reads the MFW version and writes it as a param to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_mfw_ver_param(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + char mfw_ver_str[16] = EMPTY_FW_VERSION_STR; + + if (dump) { + u32 global_section_offsize, global_section_addr, mfw_ver; + u32 public_data_addr, global_section_offsize_addr; + int printed_chars; + + /* Find MCP public data GRC address. 
+ * Needs to be ORed with MCP_REG_SCRATCH due to a HW bug. + */ + public_data_addr = qed_rd(p_hwfn, p_ptt, + MISC_REG_SHARED_MEM_ADDR) | + MCP_REG_SCRATCH; + + /* Find MCP public global section offset */ + global_section_offsize_addr = public_data_addr + + offsetof(struct mcp_public_data, + sections) + + sizeof(offsize_t) * PUBLIC_GLOBAL; + global_section_offsize = qed_rd(p_hwfn, p_ptt, + global_section_offsize_addr); + global_section_addr = MCP_REG_SCRATCH + + (global_section_offsize & + OFFSIZE_OFFSET_MASK) * 4; + + /* Read MFW version from MCP public global section */ + mfw_ver = qed_rd(p_hwfn, p_ptt, + global_section_addr + + offsetof(struct public_global, mfw_ver)); + + /* Dump MFW version param */ + printed_chars = snprintf(mfw_ver_str, sizeof(mfw_ver_str), + "%d_%d_%d_%d", + (u8) (mfw_ver >> 24), + (u8) (mfw_ver >> 16), + (u8) (mfw_ver >> 8), + (u8) mfw_ver); + if (printed_chars < 0 || printed_chars >= sizeof(mfw_ver_str)) + DP_NOTICE(p_hwfn, + "Unexpected debug error: invalid MFW version string\n"); + } + + return qed_dump_str_param(dump_buf, dump, "mfw-version", mfw_ver_str); +} + +/* Writes a section header to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_section_hdr(u32 *dump_buf, + bool dump, const char *name, u32 num_params) +{ + return qed_dump_num_param(dump_buf, dump, name, num_params); +} + +/* Writes the common global params to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_common_global_params(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + u8 num_specific_global_params) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 offset = 0; + + /* Find platform string and dump global params section header */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, + "global_params", + NUM_COMMON_GLOBAL_PARAMS + + num_specific_global_params); + + /* Store params */ + offset += qed_dump_fw_ver_param(p_hwfn, p_ptt, dump_buf + offset, dump); + offset += qed_dump_mfw_ver_param(p_hwfn, + p_ptt, dump_buf + offset, dump); + offset += qed_dump_num_param(dump_buf + offset, + dump, "tools-version", TOOLS_VERSION); + offset += qed_dump_str_param(dump_buf + offset, + dump, + "chip", + s_chip_defs[dev_data->chip_id].name); + offset += qed_dump_str_param(dump_buf + offset, + dump, + "platform", + s_platform_defs[dev_data->platform_id]. + name); + offset += + qed_dump_num_param(dump_buf + offset, dump, "pci-func", + p_hwfn->abs_pf_id); + return offset; +} + +/* Writes the last section to the specified buffer at the given offset. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_last_section(u32 *dump_buf, u32 offset, bool dump) +{ + u32 start_offset = offset, crc = ~0; + + /* Dump CRC section header */ + offset += qed_dump_section_hdr(dump_buf + offset, dump, "last", 0); + + /* Calculate CRC32 and add it to the dword following the "last" section. 
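 * A consumer can verify the dump with the mirror check (a sketch, not
 * part of this patch; it assumes the CRC dword is the very last dword of
 * the dump and covers every dword before it, including this "last"
 * section header):
 *
 *	valid = (dump_buf[size_dwords - 1] ==
 *		 ~crc32(~(u32)0, (const u8 *)dump_buf,
 *			DWORDS_TO_BYTES(size_dwords - 1)));
 *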
+ */ + if (dump) + *(dump_buf + offset) = ~crc32(crc, (u8 *)dump_buf, + DWORDS_TO_BYTES(offset)); + offset++; + return offset - start_offset; +} + +/* Update blocks reset state */ +static void qed_update_blocks_reset_state(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 reg_val[MAX_DBG_RESET_REGS] = { 0 }; + u32 i; + + /* Read reset registers */ + for (i = 0; i < MAX_DBG_RESET_REGS; i++) + if (s_reset_regs_defs[i].exists[dev_data->chip_id]) + reg_val[i] = qed_rd(p_hwfn, + p_ptt, s_reset_regs_defs[i].addr); + + /* Check if blocks are in reset */ + for (i = 0; i < MAX_BLOCK_ID; i++) + dev_data->block_in_reset[i] = + s_block_defs[i]->has_reset_bit && + !(reg_val[s_block_defs[i]->reset_reg] & + BIT(s_block_defs[i]->reset_bit_offset)); +} + +/* Enable / disable the Debug block */ +static void qed_bus_enable_dbg_block(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool enable) +{ + qed_wr(p_hwfn, p_ptt, DBG_REG_DBG_BLOCK_ON, enable ? 1 : 0); +} + +/* Resets the Debug block */ +static void qed_bus_reset_dbg_block(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 dbg_reset_reg_addr, old_reset_reg_val, new_reset_reg_val; + + dbg_reset_reg_addr = + s_reset_regs_defs[s_block_defs[BLOCK_DBG]->reset_reg].addr; + old_reset_reg_val = qed_rd(p_hwfn, p_ptt, dbg_reset_reg_addr); + new_reset_reg_val = old_reset_reg_val & + ~BIT(s_block_defs[BLOCK_DBG]->reset_bit_offset); + + qed_wr(p_hwfn, p_ptt, dbg_reset_reg_addr, new_reset_reg_val); + qed_wr(p_hwfn, p_ptt, dbg_reset_reg_addr, old_reset_reg_val); +} + +static void qed_bus_set_framing_mode(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum dbg_bus_frame_modes mode) +{ + qed_wr(p_hwfn, p_ptt, DBG_REG_FRAMING_MODE, (u8)mode); +} + +/* Enable / disable Debug Bus clients according to the specified mask. + * (1 = enable, 0 = disable) + */ +static void qed_bus_enable_clients(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 client_mask) +{ + qed_wr(p_hwfn, p_ptt, DBG_REG_CLIENT_ENABLE, client_mask); +} + +static bool qed_is_mode_match(struct qed_hwfn *p_hwfn, u16 *modes_buf_offset) +{ + const u32 *ptr = s_dbg_arrays[BIN_BUF_DBG_MODE_TREE].ptr; + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u8 tree_val = ((u8 *)ptr)[(*modes_buf_offset)++]; + bool arg1, arg2; + + switch (tree_val) { + case INIT_MODE_OP_NOT: + return !qed_is_mode_match(p_hwfn, modes_buf_offset); + case INIT_MODE_OP_OR: + case INIT_MODE_OP_AND: + arg1 = qed_is_mode_match(p_hwfn, modes_buf_offset); + arg2 = qed_is_mode_match(p_hwfn, modes_buf_offset); + return (tree_val == INIT_MODE_OP_OR) ? 
(arg1 || + arg2) : (arg1 && arg2); + default: + return dev_data->mode_enable[tree_val - MAX_INIT_MODE_OPS] > 0; + } +} + +/* Returns the value of the specified GRC param */ +static u32 qed_grc_get_param(struct qed_hwfn *p_hwfn, + enum dbg_grc_params grc_param) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + return dev_data->grc.param_val[grc_param]; +} + +/* Clear all GRC params */ +static void qed_dbg_grc_clear_params(struct qed_hwfn *p_hwfn) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 i; + + for (i = 0; i < MAX_DBG_GRC_PARAMS; i++) + dev_data->grc.param_set_by_user[i] = 0; +} + +/* Assign default GRC param values */ +static void qed_dbg_grc_set_params_default(struct qed_hwfn *p_hwfn) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 i; + + for (i = 0; i < MAX_DBG_GRC_PARAMS; i++) + if (!dev_data->grc.param_set_by_user[i]) + dev_data->grc.param_val[i] = + s_grc_param_defs[i].default_val[dev_data->chip_id]; +} + +/* Returns true if the specified entity (indicated by GRC param) should be + * included in the dump, false otherwise. + */ +static bool qed_grc_is_included(struct qed_hwfn *p_hwfn, + enum dbg_grc_params grc_param) +{ + return qed_grc_get_param(p_hwfn, grc_param) > 0; +} + +/* Returns true if the specified Storm should be included in the dump, false + * otherwise. + */ +static bool qed_grc_is_storm_included(struct qed_hwfn *p_hwfn, + enum dbg_storms storm) +{ + return qed_grc_get_param(p_hwfn, (enum dbg_grc_params)storm) > 0; +} + +/* Returns true if the specified memory should be included in the dump, false + * otherwise. + */ +static bool qed_grc_is_mem_included(struct qed_hwfn *p_hwfn, + enum block_id block_id, u8 mem_group_id) +{ + u8 i; + + /* Check Storm match */ + if (s_block_defs[block_id]->associated_to_storm && + !qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)s_block_defs[block_id]->storm_id)) + return false; + + for (i = 0; i < NUM_BIG_RAM_TYPES; i++) + if (mem_group_id == s_big_ram_defs[i].mem_group_id || + mem_group_id == s_big_ram_defs[i].ram_mem_group_id) + return qed_grc_is_included(p_hwfn, + s_big_ram_defs[i].grc_param); + if (mem_group_id == MEM_GROUP_PXP_ILT || mem_group_id == + MEM_GROUP_PXP_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_PXP); + if (mem_group_id == MEM_GROUP_RAM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_RAM); + if (mem_group_id == MEM_GROUP_PBUF) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_PBUF); + if (mem_group_id == MEM_GROUP_CAU_MEM || + mem_group_id == MEM_GROUP_CAU_SB || + mem_group_id == MEM_GROUP_CAU_PI) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CAU); + if (mem_group_id == MEM_GROUP_QM_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_QM); + if (mem_group_id == MEM_GROUP_CONN_CFC_MEM || + mem_group_id == MEM_GROUP_TASK_CFC_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CFC); + if (mem_group_id == MEM_GROUP_IGU_MEM || mem_group_id == + MEM_GROUP_IGU_MSIX) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_IGU); + if (mem_group_id == MEM_GROUP_MULD_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_MULD); + if (mem_group_id == MEM_GROUP_PRS_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_PRS); + if (mem_group_id == MEM_GROUP_DMAE_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_DMAE); + if (mem_group_id == MEM_GROUP_TM_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_TM); + if (mem_group_id == MEM_GROUP_SDM_MEM) + return qed_grc_is_included(p_hwfn,
DBG_GRC_PARAM_DUMP_SDM); + if (mem_group_id == MEM_GROUP_TDIF_CTX || mem_group_id == + MEM_GROUP_RDIF_CTX) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_DIF); + if (mem_group_id == MEM_GROUP_CM_MEM) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CM); + if (mem_group_id == MEM_GROUP_IOR) + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_IOR); + + return true; +} + +/* Stalls all Storms */ +static void qed_grc_stall_storms(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool stall) +{ + u8 reg_val = stall ? 1 : 0; + u8 storm_id; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + if (qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id)) { + u32 reg_addr = + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_STALL_0; + + qed_wr(p_hwfn, p_ptt, reg_addr, reg_val); + } + } + + msleep(STALL_DELAY_MS); +} + +/* Takes all blocks out of reset */ +static void qed_grc_unreset_blocks(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 reg_val[MAX_DBG_RESET_REGS] = { 0 }; + u32 i; + + /* Fill reset regs values */ + for (i = 0; i < MAX_BLOCK_ID; i++) + if (s_block_defs[i]->has_reset_bit && s_block_defs[i]->unreset) + reg_val[s_block_defs[i]->reset_reg] |= + BIT(s_block_defs[i]->reset_bit_offset); + + /* Write reset registers */ + for (i = 0; i < MAX_DBG_RESET_REGS; i++) { + if (s_reset_regs_defs[i].exists[dev_data->chip_id]) { + reg_val[i] |= s_reset_regs_defs[i].unreset_val; + if (reg_val[i]) + qed_wr(p_hwfn, + p_ptt, + s_reset_regs_defs[i].addr + + RESET_REG_UNRESET_OFFSET, reg_val[i]); + } + } +} + +/* Returns the attention name offsets of the specified block */ +static const struct dbg_attn_block_type_data * +qed_get_block_attn_data(enum block_id block_id, enum dbg_attn_type attn_type) +{ + const struct dbg_attn_block *base_attn_block_arr = + (const struct dbg_attn_block *) + s_dbg_arrays[BIN_BUF_DBG_ATTN_BLOCKS].ptr; + + return &base_attn_block_arr[block_id].per_type_data[attn_type]; +} + +/* Returns the attention registers of the specified block */ +static const struct dbg_attn_reg * +qed_get_block_attn_regs(enum block_id block_id, enum dbg_attn_type attn_type, + u8 *num_attn_regs) +{ + const struct dbg_attn_block_type_data *block_type_data = + qed_get_block_attn_data(block_id, attn_type); + + *num_attn_regs = block_type_data->num_regs; + return &((const struct dbg_attn_reg *) + s_dbg_arrays[BIN_BUF_DBG_ATTN_REGS].ptr)[block_type_data-> + regs_offset]; +} + +/* For each block, clear the status of all parities */ +static void qed_grc_clear_all_prty(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u8 reg_idx, num_attn_regs; + u32 block_id; + + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + const struct dbg_attn_reg *attn_reg_arr; + + if (dev_data->block_in_reset[block_id]) + continue; + + attn_reg_arr = qed_get_block_attn_regs((enum block_id)block_id, + ATTN_TYPE_PARITY, + &num_attn_regs); + for (reg_idx = 0; reg_idx < num_attn_regs; reg_idx++) { + const struct dbg_attn_reg *reg_data = + &attn_reg_arr[reg_idx]; + + /* Check mode */ + bool eval_mode = GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + u16 modes_buf_offset = + GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + + if (!eval_mode || + qed_is_mode_match(p_hwfn, &modes_buf_offset)) + /* Mode match - read parity status read-clear + * register. 
+ */ + qed_rd(p_hwfn, p_ptt, + DWORDS_TO_BYTES(reg_data-> + sts_clr_address)); + } + } +} + +/* Dumps GRC registers section header. Returns the dumped size in dwords. + * The following parameters are dumped: + * - 'count' = num_dumped_entries + * - 'split' = split_type + * - 'id' = split_id (dumped only if split_id >= 0) + * - 'param_name' = param_val (user param, dumped only if param_name != NULL and + * param_val != NULL) + */ +static u32 qed_grc_dump_regs_hdr(u32 *dump_buf, + bool dump, + u32 num_reg_entries, + const char *split_type, + int split_id, + const char *param_name, const char *param_val) +{ + u8 num_params = 2 + (split_id >= 0 ? 1 : 0) + (param_name ? 1 : 0); + u32 offset = 0; + + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "grc_regs", num_params); + offset += qed_dump_num_param(dump_buf + offset, + dump, "count", num_reg_entries); + offset += qed_dump_str_param(dump_buf + offset, + dump, "split", split_type); + if (split_id >= 0) + offset += qed_dump_num_param(dump_buf + offset, + dump, "id", split_id); + if (param_name && param_val) + offset += qed_dump_str_param(dump_buf + offset, + dump, param_name, param_val); + return offset; +} + +/* Dumps GRC register/memory. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_reg_entry(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, + bool dump, u32 addr, u32 len) +{ + u32 offset = 0, i; + + if (dump) { + *(dump_buf + offset++) = addr | (len << REG_DUMP_LEN_SHIFT); + for (i = 0; i < len; i++, addr++, offset++) + *(dump_buf + offset) = qed_rd(p_hwfn, + p_ptt, + DWORDS_TO_BYTES(addr)); + } else { + offset += len + 1; + } + + return offset; +} + +/* Dumps GRC registers entries. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_regs_entries(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct dbg_array input_regs_arr, + u32 *dump_buf, + bool dump, + bool block_enable[MAX_BLOCK_ID], + u32 *num_dumped_reg_entries) +{ + u32 i, offset = 0, input_offset = 0; + bool mode_match = true; + + *num_dumped_reg_entries = 0; + while (input_offset < input_regs_arr.size_in_dwords) { + const struct dbg_dump_cond_hdr *cond_hdr = + (const struct dbg_dump_cond_hdr *) + &input_regs_arr.ptr[input_offset++]; + bool eval_mode = GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + + /* Check mode/block */ + if (eval_mode) { + u16 modes_buf_offset = + GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + mode_match = qed_is_mode_match(p_hwfn, + &modes_buf_offset); + } + + if (mode_match && block_enable[cond_hdr->block_id]) { + for (i = 0; i < cond_hdr->data_size; + i++, input_offset++) { + const struct dbg_dump_reg *reg = + (const struct dbg_dump_reg *) + &input_regs_arr.ptr[input_offset]; + + offset += + qed_grc_dump_reg_entry(p_hwfn, p_ptt, + dump_buf + offset, dump, + GET_FIELD(reg->data, + DBG_DUMP_REG_ADDRESS), + GET_FIELD(reg->data, + DBG_DUMP_REG_LENGTH)); + (*num_dumped_reg_entries)++; + } + } else { + input_offset += cond_hdr->data_size; + } + } + + return offset; +} + +/* Dumps a GRC registers split: a section header followed by the register + * entries for that split. Returns the dumped size in dwords.
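 *
 * The entry count is only known after the entries are dumped, so the
 * header is emitted twice: a dump=false pass reserves space for it, the
 * entries are dumped after that reserved space, and the header is then
 * rewritten with the real count. The pattern, condensed (emit_hdr() and
 * emit_entries() are hypothetical stand-ins for qed_grc_dump_regs_hdr()
 * and qed_grc_dump_regs_entries()):
 *
 *	hdr_len = emit_hdr(buf, false, 0);	// sizing pass only
 *	body_len = emit_entries(buf + hdr_len, true, &count);
 *	if (count)
 *		emit_hdr(buf, true, count);	// rewrite with real count
 *	return count ? hdr_len + body_len : 0;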
*/ +static u32 qed_grc_dump_split_data(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct dbg_array input_regs_arr, + u32 *dump_buf, + bool dump, + bool block_enable[MAX_BLOCK_ID], + const char *split_type_name, + u32 split_id, + const char *param_name, + const char *param_val) +{ + u32 num_dumped_reg_entries, offset; + + /* Calculate register dump header size (and skip it for now) */ + offset = qed_grc_dump_regs_hdr(dump_buf, + false, + 0, + split_type_name, + split_id, param_name, param_val); + + /* Dump registers */ + offset += qed_grc_dump_regs_entries(p_hwfn, + p_ptt, + input_regs_arr, + dump_buf + offset, + dump, + block_enable, + &num_dumped_reg_entries); + + /* Write register dump header */ + if (dump && num_dumped_reg_entries > 0) + qed_grc_dump_regs_hdr(dump_buf, + dump, + num_dumped_reg_entries, + split_type_name, + split_id, param_name, param_val); + + return num_dumped_reg_entries > 0 ? offset : 0; +} + +/* Dumps registers according to the input registers array. + * Returns the dumped size in dwords. + */ +static u32 qed_grc_dump_registers(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + bool block_enable[MAX_BLOCK_ID], + const char *param_name, const char *param_val) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 offset = 0, input_offset = 0; + u8 port_id, pf_id; + + if (dump) + DP_VERBOSE(p_hwfn, QED_MSG_DEBUG, "Dumping registers...\n"); + while (input_offset < + s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].size_in_dwords) { + const struct dbg_dump_split_hdr *split_hdr = + (const struct dbg_dump_split_hdr *) + &s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].ptr[input_offset++]; + u8 split_type_id = GET_FIELD(split_hdr->hdr, + DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID); + u32 split_data_size = GET_FIELD(split_hdr->hdr, + DBG_DUMP_SPLIT_HDR_DATA_SIZE); + struct dbg_array curr_input_regs_arr = { + &s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].ptr[input_offset], + split_data_size}; + + switch (split_type_id) { + case SPLIT_TYPE_NONE: + case SPLIT_TYPE_VF: + offset += qed_grc_dump_split_data(p_hwfn, + p_ptt, + curr_input_regs_arr, + dump_buf + offset, + dump, + block_enable, + "eng", + (u32)(-1), + param_name, + param_val); + break; + case SPLIT_TYPE_PORT: + for (port_id = 0; + port_id < + s_chip_defs[dev_data->chip_id]. + per_platform[dev_data->platform_id].num_ports; + port_id++) { + if (dump) + qed_port_pretend(p_hwfn, p_ptt, + port_id); + offset += + qed_grc_dump_split_data(p_hwfn, p_ptt, + curr_input_regs_arr, + dump_buf + offset, + dump, block_enable, + "port", port_id, + param_name, + param_val); + } + break; + case SPLIT_TYPE_PF: + case SPLIT_TYPE_PORT_PF: + for (pf_id = 0; + pf_id < + s_chip_defs[dev_data->chip_id]. + per_platform[dev_data->platform_id].num_pfs; + pf_id++) { + if (dump) + qed_fid_pretend(p_hwfn, p_ptt, pf_id); + offset += qed_grc_dump_split_data(p_hwfn, + p_ptt, + curr_input_regs_arr, + dump_buf + offset, + dump, block_enable, + "pf", pf_id, param_name, + param_val); + } + break; + default: + break; + } + + input_offset += split_data_size; + } + + /* Pretend to original PF */ + if (dump) + qed_fid_pretend(p_hwfn, p_ptt, p_hwfn->rel_pf_id); + return offset; +} + +/* Dump reset registers. Returns the dumped size in dwords. 
*/ +static u32 qed_grc_dump_reset_regs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 i, offset = 0, num_regs = 0; + + /* Calculate header size */ + offset += qed_grc_dump_regs_hdr(dump_buf, + false, 0, "eng", -1, NULL, NULL); + + /* Write reset registers */ + for (i = 0; i < MAX_DBG_RESET_REGS; i++) { + if (s_reset_regs_defs[i].exists[dev_data->chip_id]) { + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + BYTES_TO_DWORDS + (s_reset_regs_defs + [i].addr), 1); + num_regs++; + } + } + + /* Write header */ + if (dump) + qed_grc_dump_regs_hdr(dump_buf, + true, num_regs, "eng", -1, NULL, NULL); + return offset; +} + +/* Dump registers that are modified during GRC Dump and therefore must be dumped + * first. Returns the dumped size in dwords. + */ +static u32 qed_grc_dump_modified_regs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 offset = 0, num_reg_entries = 0, block_id; + u8 storm_id, reg_idx, num_attn_regs; + + /* Calculate header size */ + offset += qed_grc_dump_regs_hdr(dump_buf, + false, 0, "eng", -1, NULL, NULL); + + /* Write parity registers */ + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + const struct dbg_attn_reg *attn_reg_arr; + + if (dev_data->block_in_reset[block_id] && dump) + continue; + + attn_reg_arr = qed_get_block_attn_regs((enum block_id)block_id, + ATTN_TYPE_PARITY, + &num_attn_regs); + for (reg_idx = 0; reg_idx < num_attn_regs; reg_idx++) { + const struct dbg_attn_reg *reg_data = + &attn_reg_arr[reg_idx]; + u16 modes_buf_offset; + bool eval_mode; + + /* Check mode */ + eval_mode = GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + modes_buf_offset = + GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + if (!eval_mode || + qed_is_mode_match(p_hwfn, &modes_buf_offset)) { + /* Mode match - read and dump registers */ + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + reg_data->mask_address, + 1); + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + GET_FIELD(reg_data->data, + DBG_ATTN_REG_STS_ADDRESS), + 1); + num_reg_entries += 2; + } + } + } + + /* Write storm stall status registers */ + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + if (dev_data->block_in_reset[s_storm_defs[storm_id].block_id] && + dump) + continue; + + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + BYTES_TO_DWORDS(s_storm_defs[storm_id]. + sem_fast_mem_addr + + SEM_FAST_REG_STALLED), + 1); + num_reg_entries++; + } + + /* Write header */ + if (dump) + qed_grc_dump_regs_hdr(dump_buf, + true, + num_reg_entries, "eng", -1, NULL, NULL); + return offset; +} + +/* Dumps a GRC memory header (section and params). + * The following parameters are dumped: + * name - name is dumped only if it's not NULL. + * addr - byte_addr is dumped only if name is NULL. + * len - dword_len is always dumped. + * width - bit_width is dumped if it's not zero. + * packed - packed=1 is dumped if it's not false. + * mem_group - mem_group is always dumped. + * is_storm - true only if the memory is related to a Storm. + * storm_letter - storm letter (valid only if is_storm is true). + * Returns the dumped size in dwords. 
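 * Example (illustrative): for a Storm memory with storm_letter 'T', the
 * "name" param is emitted as "TSTORM_<name>" and the "type" param as
 * "TSTORM_<mem_group>"; the leading '?' of the "?STORM_" template used
 * below is overwritten with the storm letter.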
+ */ +static u32 qed_grc_dump_mem_hdr(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + bool dump, + const char *name, + u32 byte_addr, + u32 dword_len, + u32 bit_width, + bool packed, + const char *mem_group, + bool is_storm, char storm_letter) +{ + u8 num_params = 3; + u32 offset = 0; + char buf[64]; + + if (!dword_len) + DP_NOTICE(p_hwfn, + "Unexpected GRC Dump error: dumped memory size must be non-zero\n"); + if (bit_width) + num_params++; + if (packed) + num_params++; + + /* Dump section header */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "grc_mem", num_params); + if (name) { + /* Dump name */ + if (is_storm) { + strcpy(buf, "?STORM_"); + buf[0] = storm_letter; + strcpy(buf + strlen(buf), name); + } else { + strcpy(buf, name); + } + + offset += qed_dump_str_param(dump_buf + offset, + dump, "name", buf); + if (dump) + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "Dumping %d registers from %s...\n", + dword_len, buf); + } else { + /* Dump address */ + offset += qed_dump_num_param(dump_buf + offset, + dump, "addr", byte_addr); + if (dump && dword_len > 64) + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "Dumping %d registers from address 0x%x...\n", + dword_len, byte_addr); + } + + /* Dump len */ + offset += qed_dump_num_param(dump_buf + offset, dump, "len", dword_len); + + /* Dump bit width */ + if (bit_width) + offset += qed_dump_num_param(dump_buf + offset, + dump, "width", bit_width); + + /* Dump packed */ + if (packed) + offset += qed_dump_num_param(dump_buf + offset, + dump, "packed", 1); + + /* Dump reg type */ + if (is_storm) { + strcpy(buf, "?STORM_"); + buf[0] = storm_letter; + strcpy(buf + strlen(buf), mem_group); + } else { + strcpy(buf, mem_group); + } + + offset += qed_dump_str_param(dump_buf + offset, dump, "type", buf); + return offset; +} + +/* Dumps a single GRC memory. If name is NULL, the memory is stored by address. + * Returns the dumped size in dwords. + */ +static u32 qed_grc_dump_mem(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + const char *name, + u32 byte_addr, + u32 dword_len, + u32 bit_width, + bool packed, + const char *mem_group, + bool is_storm, char storm_letter) +{ + u32 offset = 0; + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + name, + byte_addr, + dword_len, + bit_width, + packed, + mem_group, is_storm, storm_letter); + if (dump) { + u32 i; + + for (i = 0; i < dword_len; + i++, byte_addr += BYTES_IN_DWORD, offset++) + *(dump_buf + offset) = qed_rd(p_hwfn, p_ptt, byte_addr); + } else { + offset += dword_len; + } + + return offset; +} + +/* Dumps GRC memories entries. Returns the dumped size in dwords. 
*/ +static u32 qed_grc_dump_mem_entries(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct dbg_array input_mems_arr, + u32 *dump_buf, bool dump) +{ + u32 i, offset = 0, input_offset = 0; + bool mode_match = true; + + while (input_offset < input_mems_arr.size_in_dwords) { + const struct dbg_dump_cond_hdr *cond_hdr; + u32 num_entries; + bool eval_mode; + + cond_hdr = (const struct dbg_dump_cond_hdr *) + &input_mems_arr.ptr[input_offset++]; + eval_mode = GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + + /* Check required mode */ + if (eval_mode) { + u16 modes_buf_offset = + GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + + mode_match = qed_is_mode_match(p_hwfn, + &modes_buf_offset); + } + + if (!mode_match) { + input_offset += cond_hdr->data_size; + continue; + } + + num_entries = cond_hdr->data_size / MEM_DUMP_ENTRY_SIZE_DWORDS; + for (i = 0; i < num_entries; + i++, input_offset += MEM_DUMP_ENTRY_SIZE_DWORDS) { + const struct dbg_dump_mem *mem = + (const struct dbg_dump_mem *) + &input_mems_arr.ptr[input_offset]; + u8 mem_group_id; + + mem_group_id = GET_FIELD(mem->dword0, + DBG_DUMP_MEM_MEM_GROUP_ID); + if (mem_group_id >= MEM_GROUPS_NUM) { + DP_NOTICE(p_hwfn, "Invalid mem_group_id\n"); + return 0; + } + + if (qed_grc_is_mem_included(p_hwfn, + (enum block_id)cond_hdr->block_id, + mem_group_id)) { + u32 mem_byte_addr = + DWORDS_TO_BYTES(GET_FIELD(mem->dword0, + DBG_DUMP_MEM_ADDRESS)); + u32 mem_len = GET_FIELD(mem->dword1, + DBG_DUMP_MEM_LENGTH); + char storm_letter = 'a'; + bool is_storm = false; + + /* Update memory length for CCFC/TCFC memories + * according to number of LCIDs/LTIDs. + */ + if (mem_group_id == MEM_GROUP_CONN_CFC_MEM) + mem_len = qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LCIDS) + * (mem_len / MAX_LCIDS); + else if (mem_group_id == MEM_GROUP_TASK_CFC_MEM) + mem_len = qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LTIDS) + * (mem_len / MAX_LTIDS); + + /* If memory is associated with Storm, update + * Storm details. + */ + if (s_block_defs[cond_hdr->block_id]-> + associated_to_storm) { + is_storm = true; + storm_letter = + s_storm_defs[s_block_defs[ + cond_hdr->block_id]-> + storm_id].letter; + } + + /* Dump memory */ + offset += qed_grc_dump_mem(p_hwfn, p_ptt, + dump_buf + offset, dump, NULL, + mem_byte_addr, mem_len, 0, + false, + s_mem_group_names[mem_group_id], + is_storm, storm_letter); + } + } + } + + return offset; +} + +/* Dumps GRC memories according to the input array dump_mem. + * Returns the dumped size in dwords. 
+ */ +static u32 qed_grc_dump_memories(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + u32 offset = 0, input_offset = 0; + + while (input_offset < + s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].size_in_dwords) { + const struct dbg_dump_split_hdr *split_hdr = + (const struct dbg_dump_split_hdr *) + &s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].ptr[input_offset++]; + u8 split_type_id = GET_FIELD(split_hdr->hdr, + DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID); + u32 split_data_size = GET_FIELD(split_hdr->hdr, + DBG_DUMP_SPLIT_HDR_DATA_SIZE); + struct dbg_array curr_input_mems_arr = { + &s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].ptr[input_offset], + split_data_size}; + + switch (split_type_id) { + case SPLIT_TYPE_NONE: + offset += qed_grc_dump_mem_entries(p_hwfn, + p_ptt, + curr_input_mems_arr, + dump_buf + offset, + dump); + break; + default: + DP_NOTICE(p_hwfn, + "Dumping split memories is currently not supported\n"); + break; + } + + input_offset += split_data_size; + } + + return offset; +} + +/* Dumps GRC context data for the specified Storm. + * Returns the dumped size in dwords. + */ +static u32 qed_grc_dump_ctx_data(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + const char *name, + u32 num_lids, + u32 lid_size, + u32 rd_reg_addr, + u8 storm_id) +{ + u32 i, lid, total_size; + u32 offset = 0; + + if (!lid_size) + return 0; + lid_size *= BYTES_IN_DWORD; + total_size = num_lids * lid_size; + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + name, + 0, + total_size, + lid_size * 32, + false, + name, + true, s_storm_defs[storm_id].letter); + + /* Dump context data */ + if (dump) { + for (lid = 0; lid < num_lids; lid++) { + for (i = 0; i < lid_size; i++, offset++) { + qed_wr(p_hwfn, + p_ptt, + s_storm_defs[storm_id].cm_ctx_wr_addr, + BIT(9) | lid); + *(dump_buf + offset) = qed_rd(p_hwfn, + p_ptt, + rd_reg_addr); + } + } + } else { + offset += total_size; + } + + return offset; +} + +/* Dumps GRC contexts. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_ctx(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + u32 offset = 0; + u8 storm_id; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + if (!qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id)) + continue; + + /* Dump Conn AG context size */ + offset += + qed_grc_dump_ctx_data(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + "CONN_AG_CTX", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LCIDS), + s_storm_defs[storm_id]. + cm_conn_ag_ctx_lid_size, + s_storm_defs[storm_id]. + cm_conn_ag_ctx_rd_addr, + storm_id); + + /* Dump Conn ST context size */ + offset += + qed_grc_dump_ctx_data(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + "CONN_ST_CTX", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LCIDS), + s_storm_defs[storm_id]. + cm_conn_st_ctx_lid_size, + s_storm_defs[storm_id]. + cm_conn_st_ctx_rd_addr, + storm_id); + + /* Dump Task AG context size */ + offset += + qed_grc_dump_ctx_data(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + "TASK_AG_CTX", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LTIDS), + s_storm_defs[storm_id]. + cm_task_ag_ctx_lid_size, + s_storm_defs[storm_id]. + cm_task_ag_ctx_rd_addr, + storm_id); + + /* Dump Task ST context size */ + offset += + qed_grc_dump_ctx_data(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + "TASK_ST_CTX", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LTIDS), + s_storm_defs[storm_id]. + cm_task_st_ctx_lid_size, + s_storm_defs[storm_id]. 
+ cm_task_st_ctx_rd_addr, + storm_id); + } + + return offset; +} + +/* Dumps GRC IORs data. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_iors(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + char buf[10] = "IOR_SET_?"; + u8 storm_id, set_id; + u32 offset = 0; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + if (qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id)) { + for (set_id = 0; set_id < NUM_IOR_SETS; set_id++) { + u32 addr = + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_STORM_REG_FILE + + DWORDS_TO_BYTES(IOR_SET_OFFSET(set_id)); + + buf[strlen(buf) - 1] = '0' + set_id; + offset += qed_grc_dump_mem(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + buf, + addr, + IORS_PER_SET, + 32, + false, + "ior", + true, + s_storm_defs + [storm_id].letter); + } + } + } + + return offset; +} + +/* Dump VFC CAM. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_vfc_cam(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump, u8 storm_id) +{ + u32 total_size = VFC_CAM_NUM_ROWS * VFC_CAM_RESP_DWORDS; + u32 cam_addr[VFC_CAM_ADDR_DWORDS] = { 0 }; + u32 cam_cmd[VFC_CAM_CMD_DWORDS] = { 0 }; + u32 offset = 0; + u32 row, i; + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + "vfc_cam", + 0, + total_size, + 256, + false, + "vfc_cam", + true, s_storm_defs[storm_id].letter); + if (dump) { + /* Prepare CAM address */ + SET_VAR_FIELD(cam_addr, VFC_CAM_ADDR, OP, VFC_OPCODE_CAM_RD); + for (row = 0; row < VFC_CAM_NUM_ROWS; + row++, offset += VFC_CAM_RESP_DWORDS) { + /* Write VFC CAM command */ + SET_VAR_FIELD(cam_cmd, VFC_CAM_CMD, ROW, row); + ARR_REG_WR(p_hwfn, + p_ptt, + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_VFC_DATA_WR, + cam_cmd, VFC_CAM_CMD_DWORDS); + + /* Write VFC CAM address */ + ARR_REG_WR(p_hwfn, + p_ptt, + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_VFC_ADDR, + cam_addr, VFC_CAM_ADDR_DWORDS); + + /* Read VFC CAM read response */ + ARR_REG_RD(p_hwfn, + p_ptt, + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_VFC_DATA_RD, + dump_buf + offset, VFC_CAM_RESP_DWORDS); + } + } else { + offset += total_size; + } + + return offset; +} + +/* Dump VFC RAM. Returns the dumped size in dwords. 
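 * Note that the CAM read above and the RAM read below use the same
 * indirect handshake: write the command dwords to
 * SEM_FAST_REG_VFC_DATA_WR, write the address dwords to
 * SEM_FAST_REG_VFC_ADDR, then read the response dwords back from
 * SEM_FAST_REG_VFC_DATA_RD.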
*/ +static u32 qed_grc_dump_vfc_ram(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + u8 storm_id, struct vfc_ram_defs *ram_defs) +{ + u32 total_size = ram_defs->num_rows * VFC_RAM_RESP_DWORDS; + u32 ram_addr[VFC_RAM_ADDR_DWORDS] = { 0 }; + u32 ram_cmd[VFC_RAM_CMD_DWORDS] = { 0 }; + u32 offset = 0; + u32 row, i; + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + ram_defs->mem_name, + 0, + total_size, + 256, + false, + ram_defs->type_name, + true, s_storm_defs[storm_id].letter); + + /* Prepare RAM address */ + SET_VAR_FIELD(ram_addr, VFC_RAM_ADDR, OP, VFC_OPCODE_RAM_RD); + + if (!dump) + return offset + total_size; + + for (row = ram_defs->base_row; + row < ram_defs->base_row + ram_defs->num_rows; + row++, offset += VFC_RAM_RESP_DWORDS) { + /* Write VFC RAM command */ + ARR_REG_WR(p_hwfn, + p_ptt, + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_VFC_DATA_WR, + ram_cmd, VFC_RAM_CMD_DWORDS); + + /* Write VFC RAM address */ + SET_VAR_FIELD(ram_addr, VFC_RAM_ADDR, ROW, row); + ARR_REG_WR(p_hwfn, + p_ptt, + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_VFC_ADDR, + ram_addr, VFC_RAM_ADDR_DWORDS); + + /* Read VFC RAM read response */ + ARR_REG_RD(p_hwfn, + p_ptt, + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_VFC_DATA_RD, + dump_buf + offset, VFC_RAM_RESP_DWORDS); + } + + return offset; +} + +/* Dumps GRC VFC data. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_vfc(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u8 storm_id, i; + u32 offset = 0; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + if (qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id) && + s_storm_defs[storm_id].has_vfc && + (storm_id != DBG_PSTORM_ID || + dev_data->platform_id == PLATFORM_ASIC)) { + /* Read CAM */ + offset += qed_grc_dump_vfc_cam(p_hwfn, + p_ptt, + dump_buf + offset, + dump, storm_id); + + /* Read RAM */ + for (i = 0; i < NUM_VFC_RAM_TYPES; i++) + offset += qed_grc_dump_vfc_ram(p_hwfn, + p_ptt, + dump_buf + + offset, + dump, + storm_id, + &s_vfc_ram_defs + [i]); + } + } + + return offset; +} + +/* Dumps GRC RSS data. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_rss(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 offset = 0; + u8 rss_mem_id; + + for (rss_mem_id = 0; rss_mem_id < NUM_RSS_MEM_TYPES; rss_mem_id++) { + struct rss_mem_defs *rss_defs = &s_rss_mem_defs[rss_mem_id]; + u32 num_entries = rss_defs->num_entries[dev_data->chip_id]; + u32 entry_width = rss_defs->entry_width[dev_data->chip_id]; + u32 total_size = (num_entries * entry_width) / 32; + bool packed = (entry_width == 16); + u32 addr = rss_defs->addr; + u32 i, j; + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + rss_defs->mem_name, + addr, + total_size, + entry_width, + packed, + rss_defs->type_name, false, 0); + + if (!dump) { + offset += total_size; + continue; + } + + /* Dump RSS data */ + for (i = 0; i < BYTES_TO_DWORDS(total_size); i++, addr++) { + qed_wr(p_hwfn, p_ptt, RSS_REG_RSS_RAM_ADDR, addr); + for (j = 0; j < BYTES_IN_DWORD; j++, offset++) + *(dump_buf + offset) = + qed_rd(p_hwfn, p_ptt, + RSS_REG_RSS_RAM_DATA + + DWORDS_TO_BYTES(j)); + } + } + + return offset; +} + +/* Dumps GRC Big RAM. Returns the dumped size in dwords. 
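 * Each write to the Big RAM address register exposes a window of two
 * blocks, so the loop below issues total_blocks / 2 address writes and
 * reads 2 * BIG_RAM_BLOCK_SIZE_DWORDS data dwords per window.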
*/ +static u32 qed_grc_dump_big_ram(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump, u8 big_ram_id) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + char mem_name[12] = "???_BIG_RAM"; + char type_name[8] = "???_RAM"; + u32 ram_size, total_blocks; + u32 offset = 0, i, j; + + total_blocks = + s_big_ram_defs[big_ram_id].num_of_blocks[dev_data->chip_id]; + ram_size = total_blocks * BIG_RAM_BLOCK_SIZE_DWORDS; + + strncpy(type_name, s_big_ram_defs[big_ram_id].instance_name, + strlen(s_big_ram_defs[big_ram_id].instance_name)); + strncpy(mem_name, s_big_ram_defs[big_ram_id].instance_name, + strlen(s_big_ram_defs[big_ram_id].instance_name)); + + /* Dump memory header */ + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + mem_name, + 0, + ram_size, + BIG_RAM_BLOCK_SIZE_BYTES * 8, + false, type_name, false, 0); + + if (!dump) + return offset + ram_size; + + /* Read and dump Big RAM data */ + for (i = 0; i < total_blocks / 2; i++) { + qed_wr(p_hwfn, p_ptt, s_big_ram_defs[big_ram_id].addr_reg_addr, + i); + for (j = 0; j < 2 * BIG_RAM_BLOCK_SIZE_DWORDS; j++, offset++) + *(dump_buf + offset) = qed_rd(p_hwfn, p_ptt, + s_big_ram_defs[big_ram_id]. + data_reg_addr + + DWORDS_TO_BYTES(j)); + } + + return offset; +} + +static u32 qed_grc_dump_mcp(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + bool block_enable[MAX_BLOCK_ID] = { 0 }; + bool halted = false; + u32 offset = 0; + + /* Halt MCP */ + if (dump) { + halted = !qed_mcp_halt(p_hwfn, p_ptt); + if (!halted) + DP_NOTICE(p_hwfn, "MCP halt failed!\n"); + } + + /* Dump MCP scratchpad */ + offset += qed_grc_dump_mem(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + NULL, + MCP_REG_SCRATCH, + MCP_REG_SCRATCH_SIZE, + 0, false, "MCP", false, 0); + + /* Dump MCP cpu_reg_file */ + offset += qed_grc_dump_mem(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + NULL, + MCP_REG_CPU_REG_FILE, + MCP_REG_CPU_REG_FILE_SIZE, + 0, false, "MCP", false, 0); + + /* Dump MCP registers */ + block_enable[BLOCK_MCP] = true; + offset += qed_grc_dump_registers(p_hwfn, + p_ptt, + dump_buf + offset, + dump, block_enable, "block", "MCP"); + + /* Dump required non-MCP registers */ + offset += qed_grc_dump_regs_hdr(dump_buf + offset, + dump, 1, "eng", -1, "block", "MCP"); + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + BYTES_TO_DWORDS + (MISC_REG_SHARED_MEM_ADDR), 1); + + /* Release MCP */ + if (halted && qed_mcp_resume(p_hwfn, p_ptt)) + DP_NOTICE(p_hwfn, "Failed to resume MCP after halt!\n"); + return offset; +} + +/* Dumps the tbus indirect memory for all PHYs. 
*/ +static u32 qed_grc_dump_phy(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + u32 offset = 0, tbus_lo_offset, tbus_hi_offset; + char mem_name[32]; + u8 phy_id; + + for (phy_id = 0; phy_id < ARRAY_SIZE(s_phy_defs); phy_id++) { + struct phy_defs *phy_defs = &s_phy_defs[phy_id]; + int printed_chars; + + printed_chars = snprintf(mem_name, sizeof(mem_name), "tbus_%s", + phy_defs->phy_name); + if (printed_chars < 0 || printed_chars >= sizeof(mem_name)) + DP_NOTICE(p_hwfn, + "Unexpected debug error: invalid PHY memory name\n"); + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + mem_name, + 0, + PHY_DUMP_SIZE_DWORDS, + 16, true, mem_name, false, 0); + if (dump) { + u32 addr_lo_addr = phy_defs->base_addr + + phy_defs->tbus_addr_lo_addr; + u32 addr_hi_addr = phy_defs->base_addr + + phy_defs->tbus_addr_hi_addr; + u32 data_lo_addr = phy_defs->base_addr + + phy_defs->tbus_data_lo_addr; + u32 data_hi_addr = phy_defs->base_addr + + phy_defs->tbus_data_hi_addr; + u8 *bytes_buf = (u8 *)(dump_buf + offset); + + for (tbus_hi_offset = 0; + tbus_hi_offset < (NUM_PHY_TBUS_ADDRESSES >> 8); + tbus_hi_offset++) { + qed_wr(p_hwfn, + p_ptt, addr_hi_addr, tbus_hi_offset); + for (tbus_lo_offset = 0; tbus_lo_offset < 256; + tbus_lo_offset++) { + qed_wr(p_hwfn, + p_ptt, + addr_lo_addr, tbus_lo_offset); + *(bytes_buf++) = + (u8)qed_rd(p_hwfn, p_ptt, + data_lo_addr); + *(bytes_buf++) = + (u8)qed_rd(p_hwfn, p_ptt, + data_hi_addr); + } + } + } + + offset += PHY_DUMP_SIZE_DWORDS; + } + + return offset; +} + +static void qed_config_dbg_line(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum block_id block_id, + u8 line_id, + u8 cycle_en, + u8 right_shift, u8 force_valid, u8 force_frame) +{ + struct block_defs *p_block_defs = s_block_defs[block_id]; + + qed_wr(p_hwfn, p_ptt, p_block_defs->dbg_select_addr, line_id); + qed_wr(p_hwfn, p_ptt, p_block_defs->dbg_cycle_enable_addr, cycle_en); + qed_wr(p_hwfn, p_ptt, p_block_defs->dbg_shift_addr, right_shift); + qed_wr(p_hwfn, p_ptt, p_block_defs->dbg_force_valid_addr, force_valid); + qed_wr(p_hwfn, p_ptt, p_block_defs->dbg_force_frame_addr, force_frame); +} + +/* Dumps Static Debug data. Returns the dumped size in dwords. 
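 * Every block that has a debug bus contributes a fixed-size section of
 * NUM_DBG_BUS_LINES * STATIC_DEBUG_LINE_DWORDS dwords; blocks that are
 * in reset get a zero-filled section instead of being read.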
*/ +static u32 qed_grc_dump_static_debug(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + u32 block_dwords = NUM_DBG_BUS_LINES * STATIC_DEBUG_LINE_DWORDS; + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 offset = 0, block_id, line_id, addr, i; + struct block_defs *p_block_defs; + + if (dump) { + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, "Dumping static debug data...\n"); + + /* Disable all blocks debug output */ + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + p_block_defs = s_block_defs[block_id]; + + if (p_block_defs->has_dbg_bus[dev_data->chip_id]) + qed_wr(p_hwfn, p_ptt, + p_block_defs->dbg_cycle_enable_addr, 0); + } + + qed_bus_reset_dbg_block(p_hwfn, p_ptt); + qed_bus_set_framing_mode(p_hwfn, + p_ptt, DBG_BUS_FRAME_MODE_8HW_0ST); + qed_wr(p_hwfn, + p_ptt, DBG_REG_DEBUG_TARGET, DBG_BUS_TARGET_ID_INT_BUF); + qed_wr(p_hwfn, p_ptt, DBG_REG_FULL_MODE, 1); + qed_bus_enable_dbg_block(p_hwfn, p_ptt, true); + } + + /* Dump all static debug lines for each relevant block */ + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + p_block_defs = s_block_defs[block_id]; + + if (!p_block_defs->has_dbg_bus[dev_data->chip_id]) + continue; + + /* Dump static section params */ + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + p_block_defs->name, 0, + block_dwords, 32, false, + "STATIC", false, 0); + + if (dump && !dev_data->block_in_reset[block_id]) { + u8 dbg_client_id = + p_block_defs->dbg_client_id[dev_data->chip_id]; + + /* Enable block's client */ + qed_bus_enable_clients(p_hwfn, p_ptt, + BIT(dbg_client_id)); + + for (line_id = 0; line_id < NUM_DBG_BUS_LINES; + line_id++) { + /* Configure debug line ID */ + qed_config_dbg_line(p_hwfn, + p_ptt, + (enum block_id)block_id, + (u8)line_id, + 0xf, 0, 0, 0); + + /* Read debug line info */ + for (i = 0, addr = DBG_REG_CALENDAR_OUT_DATA; + i < STATIC_DEBUG_LINE_DWORDS; + i++, offset++, addr += BYTES_IN_DWORD) + dump_buf[offset] = qed_rd(p_hwfn, p_ptt, + addr); + } + + /* Disable block's client and debug output */ + qed_bus_enable_clients(p_hwfn, p_ptt, 0); + qed_wr(p_hwfn, p_ptt, + p_block_defs->dbg_cycle_enable_addr, 0); + } else { + /* All lines are invalid - dump zeros */ + if (dump) + memset(dump_buf + offset, 0, + DWORDS_TO_BYTES(block_dwords)); + offset += block_dwords; + } + } + + if (dump) { + qed_bus_enable_dbg_block(p_hwfn, p_ptt, false); + qed_bus_enable_clients(p_hwfn, p_ptt, 0); + } + + return offset; +} + +/* Performs GRC Dump to the specified buffer. + * Returns the dumped size in dwords. + */ +static enum dbg_status qed_grc_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + bool parities_masked = false; + u8 i, port_mode = 0; + u32 offset = 0; + + /* Check if emulation platform */ + *num_dumped_dwords = 0; + + /* Fill GRC parameters that were not set by the user with their default + * value. 
+ */ + qed_dbg_grc_set_params_default(p_hwfn); + + /* Find port mode */ + if (dump) { + switch (qed_rd(p_hwfn, p_ptt, MISC_REG_PORT_MODE)) { + case 0: + port_mode = 1; + break; + case 1: + port_mode = 2; + break; + case 2: + port_mode = 4; + break; + } + } + + /* Update reset state */ + if (dump) + qed_update_blocks_reset_state(p_hwfn, p_ptt); + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 4); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "grc-dump"); + offset += qed_dump_num_param(dump_buf + offset, + dump, + "num-lcids", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LCIDS)); + offset += qed_dump_num_param(dump_buf + offset, + dump, + "num-ltids", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LTIDS)); + offset += qed_dump_num_param(dump_buf + offset, + dump, "num-ports", port_mode); + + /* Dump reset registers (dumped before taking blocks out of reset ) */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_REGS)) + offset += qed_grc_dump_reset_regs(p_hwfn, + p_ptt, + dump_buf + offset, dump); + + /* Take all blocks out of reset (using reset registers) */ + if (dump) { + qed_grc_unreset_blocks(p_hwfn, p_ptt); + qed_update_blocks_reset_state(p_hwfn, p_ptt); + } + + /* Disable all parities using MFW command */ + if (dump) { + parities_masked = !qed_mcp_mask_parities(p_hwfn, p_ptt, 1); + if (!parities_masked) { + if (qed_grc_get_param + (p_hwfn, DBG_GRC_PARAM_PARITY_SAFE)) + return DBG_STATUS_MCP_COULD_NOT_MASK_PRTY; + else + DP_NOTICE(p_hwfn, + "Failed to mask parities using MFW\n"); + } + } + + /* Dump modified registers (dumped before modifying them) */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_REGS)) + offset += qed_grc_dump_modified_regs(p_hwfn, + p_ptt, + dump_buf + offset, dump); + + /* Stall storms */ + if (dump && + (qed_grc_is_included(p_hwfn, + DBG_GRC_PARAM_DUMP_IOR) || + qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_VFC))) + qed_grc_stall_storms(p_hwfn, p_ptt, true); + + /* Dump all regs */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_REGS)) { + /* Dump all blocks except MCP */ + bool block_enable[MAX_BLOCK_ID]; + + for (i = 0; i < MAX_BLOCK_ID; i++) + block_enable[i] = true; + block_enable[BLOCK_MCP] = false; + offset += qed_grc_dump_registers(p_hwfn, + p_ptt, + dump_buf + + offset, + dump, + block_enable, NULL, NULL); + } + + /* Dump memories */ + offset += qed_grc_dump_memories(p_hwfn, p_ptt, dump_buf + offset, dump); + + /* Dump MCP */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_MCP)) + offset += qed_grc_dump_mcp(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump context */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CM_CTX)) + offset += qed_grc_dump_ctx(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump RSS memories */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_RSS)) + offset += qed_grc_dump_rss(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump Big RAM */ + for (i = 0; i < NUM_BIG_RAM_TYPES; i++) + if (qed_grc_is_included(p_hwfn, s_big_ram_defs[i].grc_param)) + offset += qed_grc_dump_big_ram(p_hwfn, + p_ptt, + dump_buf + offset, + dump, i); + + /* Dump IORs */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_IOR)) + offset += qed_grc_dump_iors(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump VFC */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_VFC)) + offset += qed_grc_dump_vfc(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump PHY tbus */ + if (qed_grc_is_included(p_hwfn, + 
DBG_GRC_PARAM_DUMP_PHY) && dev_data->chip_id == + CHIP_K2 && dev_data->platform_id == PLATFORM_ASIC) + offset += qed_grc_dump_phy(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump static debug data */ + if (qed_grc_is_included(p_hwfn, + DBG_GRC_PARAM_DUMP_STATIC) && + dev_data->bus.state == DBG_BUS_STATE_IDLE) + offset += qed_grc_dump_static_debug(p_hwfn, + p_ptt, + dump_buf + offset, dump); + + /* Dump last section */ + offset += qed_dump_last_section(dump_buf, offset, dump); + if (dump) { + /* Unstall storms */ + if (qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_UNSTALL)) + qed_grc_stall_storms(p_hwfn, p_ptt, false); + + /* Clear parity status */ + qed_grc_clear_all_prty(p_hwfn, p_ptt); + + /* Enable all parities using MFW command */ + if (parities_masked) + qed_mcp_mask_parities(p_hwfn, p_ptt, 0); + } + + *num_dumped_dwords = offset; + + return DBG_STATUS_OK; +} + +/* Writes the specified failing Idle Check rule to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_idle_chk_dump_failure(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 * + dump_buf, + bool dump, + u16 rule_id, + const struct dbg_idle_chk_rule *rule, + u16 fail_entry_id, u32 *cond_reg_values) +{ + const union dbg_idle_chk_reg *regs = &((const union dbg_idle_chk_reg *) + s_dbg_arrays + [BIN_BUF_DBG_IDLE_CHK_REGS]. + ptr)[rule->reg_offset]; + const struct dbg_idle_chk_cond_reg *cond_regs = ®s[0].cond_reg; + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + struct dbg_idle_chk_result_hdr *hdr = + (struct dbg_idle_chk_result_hdr *)dump_buf; + const struct dbg_idle_chk_info_reg *info_regs = + ®s[rule->num_cond_regs].info_reg; + u32 next_reg_offset = 0, i, offset = 0; + u8 reg_id; + + /* Dump rule data */ + if (dump) { + memset(hdr, 0, sizeof(*hdr)); + hdr->rule_id = rule_id; + hdr->mem_entry_id = fail_entry_id; + hdr->severity = rule->severity; + hdr->num_dumped_cond_regs = rule->num_cond_regs; + } + + offset += IDLE_CHK_RESULT_HDR_DWORDS; + + /* Dump condition register values */ + for (reg_id = 0; reg_id < rule->num_cond_regs; reg_id++) { + const struct dbg_idle_chk_cond_reg *reg = &cond_regs[reg_id]; + + /* Write register header */ + if (dump) { + struct dbg_idle_chk_result_reg_hdr *reg_hdr = + (struct dbg_idle_chk_result_reg_hdr *)(dump_buf + + offset); + offset += IDLE_CHK_RESULT_REG_HDR_DWORDS; + memset(reg_hdr, 0, + sizeof(struct dbg_idle_chk_result_reg_hdr)); + reg_hdr->start_entry = reg->start_entry; + reg_hdr->size = reg->entry_size; + SET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_IS_MEM, + reg->num_entries > 1 || reg->start_entry > 0 + ? 
1 : 0); + SET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID, reg_id); + + /* Write register values */ + for (i = 0; i < reg_hdr->size; + i++, next_reg_offset++, offset++) + dump_buf[offset] = + cond_reg_values[next_reg_offset]; + } else { + offset += IDLE_CHK_RESULT_REG_HDR_DWORDS + + reg->entry_size; + } + } + + /* Dump info register values */ + for (reg_id = 0; reg_id < rule->num_info_regs; reg_id++) { + const struct dbg_idle_chk_info_reg *reg = &info_regs[reg_id]; + u32 block_id; + + if (!dump) { + offset += IDLE_CHK_RESULT_REG_HDR_DWORDS + reg->size; + continue; + } + + /* Check if register's block is in reset */ + block_id = GET_FIELD(reg->data, DBG_IDLE_CHK_INFO_REG_BLOCK_ID); + if (block_id >= MAX_BLOCK_ID) { + DP_NOTICE(p_hwfn, "Invalid block_id\n"); + return 0; + } + + if (!dev_data->block_in_reset[block_id]) { + bool eval_mode = GET_FIELD(reg->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + bool mode_match = true; + + /* Check mode */ + if (eval_mode) { + u16 modes_buf_offset = + GET_FIELD(reg->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + mode_match = + qed_is_mode_match(p_hwfn, + &modes_buf_offset); + } + + if (mode_match) { + u32 grc_addr = + DWORDS_TO_BYTES(GET_FIELD(reg->data, + DBG_IDLE_CHK_INFO_REG_ADDRESS)); + + /* Write register header */ + struct dbg_idle_chk_result_reg_hdr *reg_hdr = + (struct dbg_idle_chk_result_reg_hdr *) + (dump_buf + offset); + + offset += IDLE_CHK_RESULT_REG_HDR_DWORDS; + hdr->num_dumped_info_regs++; + memset(reg_hdr, 0, sizeof(*reg_hdr)); + reg_hdr->size = reg->size; + SET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID, + rule->num_cond_regs + reg_id); + + /* Write register values */ + for (i = 0; i < reg->size; + i++, offset++, grc_addr += 4) + dump_buf[offset] = + qed_rd(p_hwfn, p_ptt, grc_addr); + } + } + } + + return offset; +} + +/* Dumps idle check rule entries. Returns the dumped size in dwords. */ +static u32 +qed_idle_chk_dump_rule_entries(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump, + const struct dbg_idle_chk_rule *input_rules, + u32 num_input_rules, u32 *num_failing_rules) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 cond_reg_values[IDLE_CHK_MAX_ENTRIES_SIZE]; + u32 i, j, offset = 0; + u16 entry_id; + u8 reg_id; + + *num_failing_rules = 0; + for (i = 0; i < num_input_rules; i++) { + const struct dbg_idle_chk_cond_reg *cond_regs; + const struct dbg_idle_chk_rule *rule; + const union dbg_idle_chk_reg *regs; + u16 num_reg_entries = 1; + bool check_rule = true; + const u32 *imm_values; + + rule = &input_rules[i]; + regs = &((const union dbg_idle_chk_reg *) + s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_REGS].ptr) + [rule->reg_offset]; + cond_regs = ®s[0].cond_reg; + imm_values = &s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_IMMS].ptr + [rule->imm_offset]; + + /* Check if all condition register blocks are out of reset, and + * find maximal number of entries (all condition registers that + * are memories must have the same size, which is > 1). 
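+ * For example, a rule with one scalar condition register
+ * (num_entries == 1) and one 4-entry memory is evaluated four times
+ * below, once per entry, with the scalar register simply re-read for
+ * each entry.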
+ */ + for (reg_id = 0; reg_id < rule->num_cond_regs && check_rule; + reg_id++) { + u32 block_id = GET_FIELD(cond_regs[reg_id].data, + DBG_IDLE_CHK_COND_REG_BLOCK_ID); + + if (block_id >= MAX_BLOCK_ID) { + DP_NOTICE(p_hwfn, "Invalid block_id\n"); + return 0; + } + + check_rule = !dev_data->block_in_reset[block_id]; + if (cond_regs[reg_id].num_entries > num_reg_entries) + num_reg_entries = cond_regs[reg_id].num_entries; + } + + if (!check_rule && dump) + continue; + + /* Go over all register entries (number of entries is the same + * for all condition registers). + */ + for (entry_id = 0; entry_id < num_reg_entries; entry_id++) { + /* Read current entry of all condition registers */ + if (dump) { + u32 next_reg_offset = 0; + + for (reg_id = 0; + reg_id < rule->num_cond_regs; + reg_id++) { + const struct dbg_idle_chk_cond_reg + *reg = &cond_regs[reg_id]; + + /* Find GRC address (if it's a memory, + * the address of the specific entry is + * calculated). + */ + u32 grc_addr = + DWORDS_TO_BYTES( + GET_FIELD(reg->data, + DBG_IDLE_CHK_COND_REG_ADDRESS)); + + if (reg->num_entries > 1 || + reg->start_entry > 0) { + u32 padded_entry_size = + reg->entry_size > 1 ? + roundup_pow_of_two + (reg->entry_size) : 1; + + grc_addr += + DWORDS_TO_BYTES( + (reg->start_entry + + entry_id) + * padded_entry_size); + } + + /* Read registers */ + if (next_reg_offset + reg->entry_size >= + IDLE_CHK_MAX_ENTRIES_SIZE) { + DP_NOTICE(p_hwfn, + "idle check registers entry is too large\n"); + return 0; + } + + for (j = 0; j < reg->entry_size; + j++, next_reg_offset++, + grc_addr += 4) + cond_reg_values[next_reg_offset] = + qed_rd(p_hwfn, p_ptt, grc_addr); + } + } + + /* Call rule's condition function - a return value of + * true indicates failure. + */ + if ((*cond_arr[rule->cond_id])(cond_reg_values, + imm_values) || !dump) { + offset += + qed_idle_chk_dump_failure(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + rule->rule_id, + rule, + entry_id, + cond_reg_values); + (*num_failing_rules)++; + break; + } + } + } + + return offset; +} + +/* Performs Idle Check Dump to the specified buffer. + * Returns the dumped size in dwords. 
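+ *
+ * Resulting buffer layout (illustrative):
+ *	- global_params section ("dump-type: idle-chk", ...)
+ *	- "idle_chk" section header with a "num_rules" param, written as 0
+ *	  first and overwritten at the end with the failing-rule count
+ *	- one failure record per failing rule entry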
+ */ +static u32 qed_idle_chk_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + u32 offset = 0, input_offset = 0, num_failing_rules = 0; + u32 num_failing_rules_offset; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "idle-chk"); + + /* Dump idle check section header with a single parameter */ + offset += qed_dump_section_hdr(dump_buf + offset, dump, "idle_chk", 1); + num_failing_rules_offset = offset; + offset += qed_dump_num_param(dump_buf + offset, dump, "num_rules", 0); + while (input_offset < + s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_RULES].size_in_dwords) { + const struct dbg_idle_chk_cond_hdr *cond_hdr = + (const struct dbg_idle_chk_cond_hdr *) + &s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_RULES].ptr + [input_offset++]; + bool eval_mode = GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + bool mode_match = true; + + /* Check mode */ + if (eval_mode) { + u16 modes_buf_offset = + GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + + mode_match = qed_is_mode_match(p_hwfn, + &modes_buf_offset); + } + + if (mode_match) { + u32 curr_failing_rules; + + offset += + qed_idle_chk_dump_rule_entries(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + (const struct dbg_idle_chk_rule *) + &s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_RULES]. + ptr[input_offset], + cond_hdr->data_size / IDLE_CHK_RULE_SIZE_DWORDS, + &curr_failing_rules); + num_failing_rules += curr_failing_rules; + } + + input_offset += cond_hdr->data_size; + } + + /* Overwrite num_rules parameter */ + if (dump) + qed_dump_num_param(dump_buf + num_failing_rules_offset, + dump, "num_rules", num_failing_rules); + + return offset; +} + +/* Finds the meta data image in NVRAM. */ +static enum dbg_status qed_find_nvram_image(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 image_type, + u32 *nvram_offset_bytes, + u32 *nvram_size_bytes) +{ + u32 ret_mcp_resp, ret_mcp_param, ret_txn_size; + struct mcp_file_att file_att; + + /* Call NVRAM get file command */ + if (qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_NVM_GET_FILE_ATT, + image_type, &ret_mcp_resp, &ret_mcp_param, + &ret_txn_size, (u32 *)&file_att) != 0) + return DBG_STATUS_NVRAM_GET_IMAGE_FAILED; + + /* Check response */ + if ((ret_mcp_resp & FW_MSG_CODE_MASK) != FW_MSG_CODE_NVM_OK) + return DBG_STATUS_NVRAM_GET_IMAGE_FAILED; + + /* Update return values */ + *nvram_offset_bytes = file_att.nvm_start_addr; + *nvram_size_bytes = file_att.len; + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "find_nvram_image: found NVRAM image of type %d in NVRAM offset %d bytes with size %d bytes\n", + image_type, *nvram_offset_bytes, *nvram_size_bytes); + + /* Check alignment */ + if (*nvram_size_bytes & 0x3) + return DBG_STATUS_NON_ALIGNED_NVRAM_IMAGE; + return DBG_STATUS_OK; +} + +static enum dbg_status qed_nvram_read(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 nvram_offset_bytes, + u32 nvram_size_bytes, u32 *ret_buf) +{ + u32 ret_mcp_resp, ret_mcp_param, ret_read_size; + u32 bytes_to_copy, read_offset = 0; + s32 bytes_left = nvram_size_bytes; + + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "nvram_read: reading image of size %d bytes from NVRAM\n", + nvram_size_bytes); + do { + bytes_to_copy = + (bytes_left > + MCP_DRV_NVM_BUF_LEN) ? 
MCP_DRV_NVM_BUF_LEN : bytes_left; + + /* Call NVRAM read command */ + if (qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt, + DRV_MSG_CODE_NVM_READ_NVRAM, + (nvram_offset_bytes + + read_offset) | + (bytes_to_copy << + DRV_MB_PARAM_NVM_LEN_SHIFT), + &ret_mcp_resp, &ret_mcp_param, + &ret_read_size, + (u32 *)((u8 *)ret_buf + + read_offset)) != 0) + return DBG_STATUS_NVRAM_READ_FAILED; + + /* Check response */ + if ((ret_mcp_resp & FW_MSG_CODE_MASK) != FW_MSG_CODE_NVM_OK) + return DBG_STATUS_NVRAM_READ_FAILED; + + /* Update read offset */ + read_offset += ret_read_size; + bytes_left -= ret_read_size; + } while (bytes_left > 0); + + return DBG_STATUS_OK; +} + +/* Get info on the MCP Trace data in the scratchpad: + * - trace_data_grc_addr - the GRC address of the trace data + * - trace_data_size_bytes - the size in bytes of the MCP Trace data (without + * the header) + */ +static enum dbg_status qed_mcp_trace_get_data_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *trace_data_grc_addr, + u32 *trace_data_size_bytes) +{ + /* Read MCP trace section offsize structure from MCP scratchpad */ + u32 spad_trace_offsize = qed_rd(p_hwfn, + p_ptt, + MCP_SPAD_TRACE_OFFSIZE_ADDR); + u32 signature; + + /* Extract MCP trace section GRC address from offsize structure (within + * scratchpad). + */ + *trace_data_grc_addr = + MCP_REG_SCRATCH + SECTION_OFFSET(spad_trace_offsize); + + /* Read signature from MCP trace section */ + signature = qed_rd(p_hwfn, p_ptt, + *trace_data_grc_addr + + offsetof(struct mcp_trace, signature)); + if (signature != MFW_TRACE_SIGNATURE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + /* Read trace size from MCP trace section */ + *trace_data_size_bytes = qed_rd(p_hwfn, + p_ptt, + *trace_data_grc_addr + + offsetof(struct mcp_trace, size)); + return DBG_STATUS_OK; +} + +/* Reads MCP trace meta data image from NVRAM. + * - running_bundle_id (OUT) - the running bundle ID (invalid when loaded from + * file) + * - trace_meta_offset_bytes (OUT) - the NVRAM offset in bytes in which the MCP + * Trace meta data starts (invalid when loaded from file) + * - trace_meta_size_bytes (OUT) - the size in bytes of the MCP Trace meta data + */ +static enum dbg_status qed_mcp_trace_get_meta_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 trace_data_size_bytes, + u32 *running_bundle_id, + u32 *trace_meta_offset_bytes, + u32 *trace_meta_size_bytes) +{ + /* Read MCP trace section offsize structure from MCP scratchpad */ + u32 spad_trace_offsize = qed_rd(p_hwfn, + p_ptt, + MCP_SPAD_TRACE_OFFSIZE_ADDR); + + /* Find running bundle ID */ + u32 running_mfw_addr = + MCP_REG_SCRATCH + SECTION_OFFSET(spad_trace_offsize) + + QED_SECTION_SIZE(spad_trace_offsize) + trace_data_size_bytes; + enum dbg_status status; + u32 nvram_image_type; + + *running_bundle_id = qed_rd(p_hwfn, p_ptt, running_mfw_addr); + if (*running_bundle_id > 1) + return DBG_STATUS_INVALID_NVRAM_BUNDLE; + + /* Find image in NVRAM */ + nvram_image_type = + (*running_bundle_id == + DIR_ID_1) ? NVM_TYPE_MFW_TRACE1 : NVM_TYPE_MFW_TRACE2; + status = qed_find_nvram_image(p_hwfn, + p_ptt, + nvram_image_type, + trace_meta_offset_bytes, + trace_meta_size_bytes); + + return status; +} + +/* Reads the MCP Trace data from the specified GRC address into the specified + * buffer. 
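+ * Note that the trace data region in the scratchpad begins with a
+ * struct mcp_trace header followed by the cyclic data itself, which is
+ * why the caller below sizes this read as
+ * DIV_ROUND_UP(trace_data_size_bytes + sizeof(struct mcp_trace),
+ * BYTES_IN_DWORD) dwords - the header is captured along with the data.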
+ */ +static void qed_mcp_trace_read_data(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 grc_addr, u32 size_in_dwords, u32 *buf) +{ + u32 i; + + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "mcp_trace_read_data: reading trace data of size %d dwords from GRC address 0x%x\n", + size_in_dwords, grc_addr); + for (i = 0; i < size_in_dwords; i++, grc_addr += BYTES_IN_DWORD) + buf[i] = qed_rd(p_hwfn, p_ptt, grc_addr); +} + +/* Reads the MCP Trace meta data (from NVRAM or buffer) into the specified + * buffer. + */ +static enum dbg_status qed_mcp_trace_read_meta(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 nvram_offset_in_bytes, + u32 size_in_bytes, u32 *buf) +{ + u8 *byte_buf = (u8 *)buf; + u8 modules_num, i; + u32 signature; + + /* Read meta data from NVRAM */ + enum dbg_status status = qed_nvram_read(p_hwfn, + p_ptt, + nvram_offset_in_bytes, + size_in_bytes, + buf); + + if (status != DBG_STATUS_OK) + return status; + + /* Extract and check first signature */ + signature = qed_read_unaligned_dword(byte_buf); + byte_buf += sizeof(u32); + if (signature != MCP_TRACE_META_IMAGE_SIGNATURE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + /* Extract number of modules */ + modules_num = *(byte_buf++); + + /* Skip all modules */ + for (i = 0; i < modules_num; i++) { + u8 module_len = *(byte_buf++); + + byte_buf += module_len; + } + + /* Extract and check second signature */ + signature = qed_read_unaligned_dword(byte_buf); + byte_buf += sizeof(u32); + if (signature != MCP_TRACE_META_IMAGE_SIGNATURE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + return DBG_STATUS_OK; +} + +/* Dump MCP Trace */ +enum dbg_status qed_mcp_trace_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + u32 trace_data_grc_addr, trace_data_size_bytes, trace_data_size_dwords; + u32 trace_meta_size_dwords, running_bundle_id, offset = 0; + u32 trace_meta_offset_bytes, trace_meta_size_bytes; + enum dbg_status status; + int halted = 0; + + *num_dumped_dwords = 0; + + /* Get trace data info */ + status = qed_mcp_trace_get_data_info(p_hwfn, + p_ptt, + &trace_data_grc_addr, + &trace_data_size_bytes); + if (status != DBG_STATUS_OK) + return status; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "mcp-trace"); + + /* Halt MCP while reading from scratchpad so the read data will be + * consistent. If halt fails, MCP trace is taken anyway, with a small + * risk that it may be corrupt. 
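+ *
+ * Schematically, the halt/resume pairing used below is:
+ *
+ *	halted = !qed_mcp_halt(p_hwfn, p_ptt);
+ *	... read trace data from the scratchpad ...
+ *	if (halted && qed_mcp_resume(p_hwfn, p_ptt) != 0)
+ *		DP_NOTICE(p_hwfn, "Failed to resume MCP after halt!\n");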
+ */ + if (dump) { + halted = !qed_mcp_halt(p_hwfn, p_ptt); + if (!halted) + DP_NOTICE(p_hwfn, "MCP halt failed!\n"); + } + + /* Find trace data size */ + trace_data_size_dwords = + DIV_ROUND_UP(trace_data_size_bytes + sizeof(struct mcp_trace), + BYTES_IN_DWORD); + + /* Dump trace data section header and param */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "mcp_trace_data", 1); + offset += qed_dump_num_param(dump_buf + offset, + dump, "size", trace_data_size_dwords); + + /* Read trace data from scratchpad into dump buffer */ + if (dump) + qed_mcp_trace_read_data(p_hwfn, + p_ptt, + trace_data_grc_addr, + trace_data_size_dwords, + dump_buf + offset); + offset += trace_data_size_dwords; + + /* Resume MCP (only if halt succeeded) */ + if (halted && qed_mcp_resume(p_hwfn, p_ptt) != 0) + DP_NOTICE(p_hwfn, "Failed to resume MCP after halt!\n"); + + /* Dump trace meta section header */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "mcp_trace_meta", 1); + + /* Read trace meta info */ + status = qed_mcp_trace_get_meta_info(p_hwfn, + p_ptt, + trace_data_size_bytes, + &running_bundle_id, + &trace_meta_offset_bytes, + &trace_meta_size_bytes); + if (status != DBG_STATUS_OK) + return status; + + /* Dump trace meta size param (trace_meta_size_bytes is always + * dword-aligned). + */ + trace_meta_size_dwords = BYTES_TO_DWORDS(trace_meta_size_bytes); + offset += qed_dump_num_param(dump_buf + offset, dump, "size", + trace_meta_size_dwords); + + /* Read trace meta image into dump buffer */ + if (dump) { + status = qed_mcp_trace_read_meta(p_hwfn, + p_ptt, + trace_meta_offset_bytes, + trace_meta_size_bytes, + dump_buf + offset); + if (status != DBG_STATUS_OK) + return status; + } + + offset += trace_meta_size_dwords; + + *num_dumped_dwords = offset; + + return DBG_STATUS_OK; +} + +/* Dump GRC FIFO */ +enum dbg_status qed_reg_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + u32 offset = 0, dwords_read, size_param_offset; + bool fifo_has_data; + + *num_dumped_dwords = 0; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "reg-fifo"); + + /* Dump fifo data section header and param. The size param is 0 for now, + * and is overwritten after reading the FIFO. + */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "reg_fifo_data", 1); + size_param_offset = offset; + offset += qed_dump_num_param(dump_buf + offset, dump, "size", 0); + + if (!dump) { + /* FIFO max size is REG_FIFO_DEPTH_DWORDS. There is no way to + * test how much data is available, except for reading it. + */ + offset += REG_FIFO_DEPTH_DWORDS; + *num_dumped_dwords = offset; + return DBG_STATUS_OK; + } + + fifo_has_data = qed_rd(p_hwfn, p_ptt, + GRC_REG_TRACE_FIFO_VALID_DATA) > 0; + + /* Pull available data from fifo. Use DMAE since this is widebus memory + * and must be accessed atomically. Test for dwords_read not passing + * buffer size since more entries could be added to the buffer as we are + * emptying it. 
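+ *
+ * (A reg_fifo_element is 64 bits wide, so a pair of plain qed_rd()
+ * dword reads could tear an element as the FIFO drains; copying
+ * REG_FIFO_ELEMENT_DWORDS dwords per DMAE transaction keeps each
+ * element intact.)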
+ */ + for (dwords_read = 0; + fifo_has_data && dwords_read < REG_FIFO_DEPTH_DWORDS; + dwords_read += REG_FIFO_ELEMENT_DWORDS, offset += + REG_FIFO_ELEMENT_DWORDS) { + if (qed_dmae_grc2host(p_hwfn, p_ptt, GRC_REG_TRACE_FIFO, + (u64)(uintptr_t)(&dump_buf[offset]), + REG_FIFO_ELEMENT_DWORDS, 0)) + return DBG_STATUS_DMAE_FAILED; + fifo_has_data = qed_rd(p_hwfn, p_ptt, + GRC_REG_TRACE_FIFO_VALID_DATA) > 0; + } + + qed_dump_num_param(dump_buf + size_param_offset, dump, "size", + dwords_read); + + *num_dumped_dwords = offset; + return DBG_STATUS_OK; +} + +/* Dump IGU FIFO */ +enum dbg_status qed_igu_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + u32 offset = 0, dwords_read, size_param_offset; + bool fifo_has_data; + + *num_dumped_dwords = 0; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "igu-fifo"); + + /* Dump fifo data section header and param. The size param is 0 for now, + * and is overwritten after reading the FIFO. + */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "igu_fifo_data", 1); + size_param_offset = offset; + offset += qed_dump_num_param(dump_buf + offset, dump, "size", 0); + + if (!dump) { + /* FIFO max size is IGU_FIFO_DEPTH_DWORDS. There is no way to + * test how much data is available, except for reading it. + */ + offset += IGU_FIFO_DEPTH_DWORDS; + *num_dumped_dwords = offset; + return DBG_STATUS_OK; + } + + fifo_has_data = qed_rd(p_hwfn, p_ptt, + IGU_REG_ERROR_HANDLING_DATA_VALID) > 0; + + /* Pull available data from fifo. Use DMAE since this is widebus memory + * and must be accessed atomically. Test for dwords_read not passing + * buffer size since more entries could be added to the buffer as we are + * emptying it. + */ + for (dwords_read = 0; + fifo_has_data && dwords_read < IGU_FIFO_DEPTH_DWORDS; + dwords_read += IGU_FIFO_ELEMENT_DWORDS, offset += + IGU_FIFO_ELEMENT_DWORDS) { + if (qed_dmae_grc2host(p_hwfn, p_ptt, + IGU_REG_ERROR_HANDLING_MEMORY, + (u64)(uintptr_t)(&dump_buf[offset]), + IGU_FIFO_ELEMENT_DWORDS, 0)) + return DBG_STATUS_DMAE_FAILED; + fifo_has_data = qed_rd(p_hwfn, p_ptt, + IGU_REG_ERROR_HANDLING_DATA_VALID) > 0; + } + + qed_dump_num_param(dump_buf + size_param_offset, dump, "size", + dwords_read); + + *num_dumped_dwords = offset; + return DBG_STATUS_OK; +} + +/* Protection Override dump */ +enum dbg_status qed_protection_override_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + u32 offset = 0, size_param_offset, override_window_dwords; + + *num_dumped_dwords = 0; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "protection-override"); + + /* Dump data section header and param. The size param is 0 for now, and + * is overwritten after reading the data. 
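+ * The placeholder pattern is the same one used for the FIFO dumps
+ * above; schematically (actual_dwords is an illustrative name):
+ *
+ *	size_param_offset = offset;
+ *	offset += qed_dump_num_param(dump_buf + offset, dump, "size", 0);
+ *	... DMAE the data, learning its actual length ...
+ *	qed_dump_num_param(dump_buf + size_param_offset, dump, "size",
+ *			   actual_dwords);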
+ */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "protection_override_data", 1); + size_param_offset = offset; + offset += qed_dump_num_param(dump_buf + offset, dump, "size", 0); + + if (!dump) { + offset += PROTECTION_OVERRIDE_DEPTH_DWORDS; + *num_dumped_dwords = offset; + return DBG_STATUS_OK; + } + + /* Add override window info to buffer */ + override_window_dwords = + qed_rd(p_hwfn, p_ptt, + GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) * + PROTECTION_OVERRIDE_ELEMENT_DWORDS; + if (qed_dmae_grc2host(p_hwfn, p_ptt, + GRC_REG_PROTECTION_OVERRIDE_WINDOW, + (u64)(uintptr_t)(dump_buf + offset), + override_window_dwords, 0)) + return DBG_STATUS_DMAE_FAILED; + offset += override_window_dwords; + qed_dump_num_param(dump_buf + size_param_offset, dump, "size", + override_window_dwords); + + *num_dumped_dwords = offset; + return DBG_STATUS_OK; +} + +/* Performs FW Asserts Dump to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_fw_asserts_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + char storm_letter_str[2] = "?"; + struct fw_info fw_info; + u32 offset = 0, i; + u8 storm_id; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "fw-asserts"); + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + u32 fw_asserts_section_addr, next_list_idx_addr, next_list_idx, + last_list_idx, element_addr; + + if (dev_data->block_in_reset[s_storm_defs[storm_id].block_id]) + continue; + + /* Read FW info for the current Storm */ + qed_read_fw_info(p_hwfn, p_ptt, storm_id, &fw_info); + + /* Dump FW Asserts section header and params */ + storm_letter_str[0] = s_storm_defs[storm_id].letter; + offset += qed_dump_section_hdr(dump_buf + offset, dump, + "fw_asserts", 2); + offset += qed_dump_str_param(dump_buf + offset, dump, "storm", + storm_letter_str); + offset += qed_dump_num_param(dump_buf + offset, dump, "size", + fw_info.fw_asserts_section. + list_element_dword_size); + + if (!dump) { + offset += fw_info.fw_asserts_section. + list_element_dword_size; + continue; + } + + /* Read and dump FW Asserts data */ + fw_asserts_section_addr = + s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_INT_RAM + + RAM_LINES_TO_BYTES(fw_info.fw_asserts_section. + section_ram_line_offset); + next_list_idx_addr = + fw_asserts_section_addr + + DWORDS_TO_BYTES(fw_info.fw_asserts_section. + list_next_index_dword_offset); + next_list_idx = qed_rd(p_hwfn, p_ptt, next_list_idx_addr); + last_list_idx = (next_list_idx > 0 + ? next_list_idx + : fw_info.fw_asserts_section.list_num_elements) + - 1; + element_addr = + fw_asserts_section_addr + + DWORDS_TO_BYTES(fw_info.fw_asserts_section. + list_dword_offset) + + last_list_idx * + DWORDS_TO_BYTES(fw_info.fw_asserts_section. 
+ list_element_dword_size); + for (i = 0; + i < fw_info.fw_asserts_section.list_element_dword_size; + i++, offset++, element_addr += BYTES_IN_DWORD) + dump_buf[offset] = qed_rd(p_hwfn, p_ptt, element_addr); + } + + /* Dump last section */ + offset += qed_dump_section_hdr(dump_buf + offset, dump, "last", 0); + return offset; +} + +/***************************** Public Functions *******************************/ + +enum dbg_status qed_dbg_set_bin_ptr(const u8 * const bin_ptr) +{ + /* Convert binary data to debug arrays */ + u32 num_of_buffers = *(u32 *)bin_ptr; + struct bin_buffer_hdr *buf_array; + u8 buf_id; + + buf_array = (struct bin_buffer_hdr *)((u32 *)bin_ptr + 1); + + for (buf_id = 0; buf_id < num_of_buffers; buf_id++) { + s_dbg_arrays[buf_id].ptr = + (u32 *)(bin_ptr + buf_array[buf_id].offset); + s_dbg_arrays[buf_id].size_in_dwords = + BYTES_TO_DWORDS(buf_array[buf_id].length); + } + + return DBG_STATUS_OK; +} + +enum dbg_status qed_dbg_grc_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + if (!s_dbg_arrays[BIN_BUF_DBG_MODE_TREE].ptr || + !s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].ptr || + !s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].ptr || + !s_dbg_arrays[BIN_BUF_DBG_ATTN_BLOCKS].ptr || + !s_dbg_arrays[BIN_BUF_DBG_ATTN_REGS].ptr) + return DBG_STATUS_DBG_ARRAY_NOT_SET; + return qed_grc_dump(p_hwfn, p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_grc_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_grc_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + *num_dumped_dwords = 0; + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* GRC Dump */ + status = qed_grc_dump(p_hwfn, p_ptt, dump_buf, true, num_dumped_dwords); + + /* Clear all GRC params */ + qed_dbg_grc_clear_params(p_hwfn); + return status; +} + +enum dbg_status qed_dbg_idle_chk_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + if (!s_dbg_arrays[BIN_BUF_DBG_MODE_TREE].ptr || + !s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_REGS].ptr || + !s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_IMMS].ptr || + !s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_RULES].ptr) + return DBG_STATUS_DBG_ARRAY_NOT_SET; + if (!dev_data->idle_chk.buf_size_set) { + dev_data->idle_chk.buf_size = qed_idle_chk_dump(p_hwfn, + p_ptt, + NULL, false); + dev_data->idle_chk.buf_size_set = true; + } + + *buf_size = dev_data->idle_chk.buf_size; + return DBG_STATUS_OK; +} + +enum dbg_status qed_dbg_idle_chk_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_idle_chk_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + *num_dumped_dwords = 0; + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + + /* Idle Check Dump */ + 
*num_dumped_dwords = qed_idle_chk_dump(p_hwfn, p_ptt, dump_buf, true); + return DBG_STATUS_OK; +} + +enum dbg_status qed_dbg_mcp_trace_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + return qed_mcp_trace_dump(p_hwfn, p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_mcp_trace_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_mcp_trace_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + + /* Perform dump */ + return qed_mcp_trace_dump(p_hwfn, + p_ptt, dump_buf, true, num_dumped_dwords); +} + +enum dbg_status qed_dbg_reg_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + return qed_reg_fifo_dump(p_hwfn, p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_reg_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_reg_fifo_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + *num_dumped_dwords = 0; + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + return qed_reg_fifo_dump(p_hwfn, + p_ptt, dump_buf, true, num_dumped_dwords); +} + +enum dbg_status qed_dbg_igu_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + return qed_igu_fifo_dump(p_hwfn, p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_igu_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_igu_fifo_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + *num_dumped_dwords = 0; + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + return qed_igu_fifo_dump(p_hwfn, + p_ptt, dump_buf, true, num_dumped_dwords); +} + +enum dbg_status +qed_dbg_protection_override_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + return qed_protection_override_dump(p_hwfn, + p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_protection_override_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + 
enum dbg_status status; + + status = qed_dbg_protection_override_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + *num_dumped_dwords = 0; + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + return qed_protection_override_dump(p_hwfn, + p_ptt, + dump_buf, true, num_dumped_dwords); +} + +enum dbg_status qed_dbg_fw_asserts_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + if (status != DBG_STATUS_OK) + return status; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + *buf_size = qed_fw_asserts_dump(p_hwfn, p_ptt, NULL, false); + return DBG_STATUS_OK; +} + +enum dbg_status qed_dbg_fw_asserts_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = qed_dbg_fw_asserts_get_dump_buf_size(p_hwfn, p_ptt, + &needed_buf_size_in_dwords); + + *num_dumped_dwords = 0; + if (status != DBG_STATUS_OK) + return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + *num_dumped_dwords = qed_fw_asserts_dump(p_hwfn, p_ptt, dump_buf, true); + return DBG_STATUS_OK; +} + +/******************************* Data Types **********************************/ + +struct mcp_trace_format { + u32 data; +#define MCP_TRACE_FORMAT_MODULE_MASK 0x0000ffff +#define MCP_TRACE_FORMAT_MODULE_SHIFT 0 +#define MCP_TRACE_FORMAT_LEVEL_MASK 0x00030000 +#define MCP_TRACE_FORMAT_LEVEL_SHIFT 16 +#define MCP_TRACE_FORMAT_P1_SIZE_MASK 0x000c0000 +#define MCP_TRACE_FORMAT_P1_SIZE_SHIFT 18 +#define MCP_TRACE_FORMAT_P2_SIZE_MASK 0x00300000 +#define MCP_TRACE_FORMAT_P2_SIZE_SHIFT 20 +#define MCP_TRACE_FORMAT_P3_SIZE_MASK 0x00c00000 +#define MCP_TRACE_FORMAT_P3_SIZE_SHIFT 22 +#define MCP_TRACE_FORMAT_LEN_MASK 0xff000000 +#define MCP_TRACE_FORMAT_LEN_SHIFT 24 + char *format_str; +}; + +struct mcp_trace_meta { + u32 modules_num; + char **modules; + u32 formats_num; + struct mcp_trace_format *formats; +}; + +/* Reg fifo element */ +struct reg_fifo_element { + u64 data; +#define REG_FIFO_ELEMENT_ADDRESS_SHIFT 0 +#define REG_FIFO_ELEMENT_ADDRESS_MASK 0x7fffff +#define REG_FIFO_ELEMENT_ACCESS_SHIFT 23 +#define REG_FIFO_ELEMENT_ACCESS_MASK 0x1 +#define REG_FIFO_ELEMENT_PF_SHIFT 24 +#define REG_FIFO_ELEMENT_PF_MASK 0xf +#define REG_FIFO_ELEMENT_VF_SHIFT 28 +#define REG_FIFO_ELEMENT_VF_MASK 0xff +#define REG_FIFO_ELEMENT_PORT_SHIFT 36 +#define REG_FIFO_ELEMENT_PORT_MASK 0x3 +#define REG_FIFO_ELEMENT_PRIVILEGE_SHIFT 38 +#define REG_FIFO_ELEMENT_PRIVILEGE_MASK 0x3 +#define REG_FIFO_ELEMENT_PROTECTION_SHIFT 40 +#define REG_FIFO_ELEMENT_PROTECTION_MASK 0x7 +#define REG_FIFO_ELEMENT_MASTER_SHIFT 43 +#define REG_FIFO_ELEMENT_MASTER_MASK 0xf +#define REG_FIFO_ELEMENT_ERROR_SHIFT 47 +#define REG_FIFO_ELEMENT_ERROR_MASK 0x1f +}; + +/* IGU fifo element */ +struct igu_fifo_element { + u32 dword0; +#define IGU_FIFO_ELEMENT_DWORD0_FID_SHIFT 0 +#define IGU_FIFO_ELEMENT_DWORD0_FID_MASK 0xff +#define IGU_FIFO_ELEMENT_DWORD0_IS_PF_SHIFT 8 +#define IGU_FIFO_ELEMENT_DWORD0_IS_PF_MASK 0x1 +#define IGU_FIFO_ELEMENT_DWORD0_SOURCE_SHIFT 9 +#define IGU_FIFO_ELEMENT_DWORD0_SOURCE_MASK 0xf +#define IGU_FIFO_ELEMENT_DWORD0_ERR_TYPE_SHIFT 13 +#define 
IGU_FIFO_ELEMENT_DWORD0_ERR_TYPE_MASK 0xf +#define IGU_FIFO_ELEMENT_DWORD0_CMD_ADDR_SHIFT 17 +#define IGU_FIFO_ELEMENT_DWORD0_CMD_ADDR_MASK 0x7fff + u32 dword1; + u32 dword2; +#define IGU_FIFO_ELEMENT_DWORD12_IS_WR_CMD_SHIFT 0 +#define IGU_FIFO_ELEMENT_DWORD12_IS_WR_CMD_MASK 0x1 +#define IGU_FIFO_ELEMENT_DWORD12_WR_DATA_SHIFT 1 +#define IGU_FIFO_ELEMENT_DWORD12_WR_DATA_MASK 0xffffffff + u32 reserved; +}; + +struct igu_fifo_wr_data { + u32 data; +#define IGU_FIFO_WR_DATA_PROD_CONS_SHIFT 0 +#define IGU_FIFO_WR_DATA_PROD_CONS_MASK 0xffffff +#define IGU_FIFO_WR_DATA_UPDATE_FLAG_SHIFT 24 +#define IGU_FIFO_WR_DATA_UPDATE_FLAG_MASK 0x1 +#define IGU_FIFO_WR_DATA_EN_DIS_INT_FOR_SB_SHIFT 25 +#define IGU_FIFO_WR_DATA_EN_DIS_INT_FOR_SB_MASK 0x3 +#define IGU_FIFO_WR_DATA_SEGMENT_SHIFT 27 +#define IGU_FIFO_WR_DATA_SEGMENT_MASK 0x1 +#define IGU_FIFO_WR_DATA_TIMER_MASK_SHIFT 28 +#define IGU_FIFO_WR_DATA_TIMER_MASK_MASK 0x1 +#define IGU_FIFO_WR_DATA_CMD_TYPE_SHIFT 31 +#define IGU_FIFO_WR_DATA_CMD_TYPE_MASK 0x1 +}; + +struct igu_fifo_cleanup_wr_data { + u32 data; +#define IGU_FIFO_CLEANUP_WR_DATA_RESERVED_SHIFT 0 +#define IGU_FIFO_CLEANUP_WR_DATA_RESERVED_MASK 0x7ffffff +#define IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_VAL_SHIFT 27 +#define IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_VAL_MASK 0x1 +#define IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_TYPE_SHIFT 28 +#define IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_TYPE_MASK 0x7 +#define IGU_FIFO_CLEANUP_WR_DATA_CMD_TYPE_SHIFT 31 +#define IGU_FIFO_CLEANUP_WR_DATA_CMD_TYPE_MASK 0x1 +}; + +/* Protection override element */ +struct protection_override_element { + u64 data; +#define PROTECTION_OVERRIDE_ELEMENT_ADDRESS_SHIFT 0 +#define PROTECTION_OVERRIDE_ELEMENT_ADDRESS_MASK 0x7fffff +#define PROTECTION_OVERRIDE_ELEMENT_WINDOW_SIZE_SHIFT 23 +#define PROTECTION_OVERRIDE_ELEMENT_WINDOW_SIZE_MASK 0xffffff +#define PROTECTION_OVERRIDE_ELEMENT_READ_SHIFT 47 +#define PROTECTION_OVERRIDE_ELEMENT_READ_MASK 0x1 +#define PROTECTION_OVERRIDE_ELEMENT_WRITE_SHIFT 48 +#define PROTECTION_OVERRIDE_ELEMENT_WRITE_MASK 0x1 +#define PROTECTION_OVERRIDE_ELEMENT_READ_PROTECTION_SHIFT 49 +#define PROTECTION_OVERRIDE_ELEMENT_READ_PROTECTION_MASK 0x7 +#define PROTECTION_OVERRIDE_ELEMENT_WRITE_PROTECTION_SHIFT 52 +#define PROTECTION_OVERRIDE_ELEMENT_WRITE_PROTECTION_MASK 0x7 +}; + +enum igu_fifo_sources { + IGU_SRC_PXP0, + IGU_SRC_PXP1, + IGU_SRC_PXP2, + IGU_SRC_PXP3, + IGU_SRC_PXP4, + IGU_SRC_PXP5, + IGU_SRC_PXP6, + IGU_SRC_PXP7, + IGU_SRC_CAU, + IGU_SRC_ATTN, + IGU_SRC_GRC +}; + +enum igu_fifo_addr_types { + IGU_ADDR_TYPE_MSIX_MEM, + IGU_ADDR_TYPE_WRITE_PBA, + IGU_ADDR_TYPE_WRITE_INT_ACK, + IGU_ADDR_TYPE_WRITE_ATTN_BITS, + IGU_ADDR_TYPE_READ_INT, + IGU_ADDR_TYPE_WRITE_PROD_UPDATE, + IGU_ADDR_TYPE_RESERVED +}; + +struct igu_fifo_addr_data { + u16 start_addr; + u16 end_addr; + char *desc; + char *vf_desc; + enum igu_fifo_addr_types type; +}; + +/******************************** Constants **********************************/ + +#define MAX_MSG_LEN 1024 +#define MCP_TRACE_MAX_MODULE_LEN 8 +#define MCP_TRACE_FORMAT_MAX_PARAMS 3 +#define MCP_TRACE_FORMAT_PARAM_WIDTH \ + (MCP_TRACE_FORMAT_P2_SIZE_SHIFT - MCP_TRACE_FORMAT_P1_SIZE_SHIFT) +#define REG_FIFO_ELEMENT_ADDR_FACTOR 4 +#define REG_FIFO_ELEMENT_IS_PF_VF_VAL 127 +#define PROTECTION_OVERRIDE_ELEMENT_ADDR_FACTOR 4 + +/********************************* Macros ************************************/ + +#define BYTES_TO_DWORDS(bytes) ((bytes) / BYTES_IN_DWORD) + +/***************************** Constant Arrays *******************************/ + +/* Status string array */ +static const 
char * const s_status_str[] = { + "Operation completed successfully", + "Debug application version wasn't set", + "Unsupported debug application version", + "The debug block wasn't reset since the last recording", + "Invalid arguments", + "The debug output was already set", + "Invalid PCI buffer size", + "PCI buffer allocation failed", + "A PCI buffer wasn't allocated", + "Too many inputs were enabled. Enable fewer inputs, or set 'unifyInputs' to true", + "GRC/Timestamp input overlap in cycle dword 0", + "Cannot record Storm data since the entire recording cycle is used by HW", + "The Storm was already enabled", + "The specified Storm wasn't enabled", + "The block was already enabled", + "The specified block wasn't enabled", + "No input was enabled for recording", + "Filters and triggers are not allowed when recording in 64b units", + "The filter was already enabled", + "The trigger was already enabled", + "The trigger wasn't enabled", + "A constraint can be added only after a filter was enabled or a trigger state was added", + "Cannot add more than 3 trigger states", + "Cannot add more than 4 constraints per filter or trigger state", + "The recording wasn't started", + "A trigger was configured, but it didn't trigger", + "No data was recorded", + "Dump buffer is too small", + "Dumped data is not aligned to chunks", + "Unknown chip", + "Failed allocating virtual memory", + "The input block is in reset", + "Invalid MCP trace signature found in NVRAM", + "Invalid bundle ID found in NVRAM", + "Failed getting NVRAM image", + "NVRAM image is not dword-aligned", + "Failed reading from NVRAM", + "Idle check parsing failed", + "MCP Trace data is corrupt", + "Dump doesn't contain meta data - it must be provided in an image file", + "Failed to halt MCP", + "Failed to resume MCP after halt", + "DMAE transaction failed", + "Failed to empty SEMI sync FIFO", + "IGU FIFO data is corrupt", + "MCP failed to mask parities", + "FW Asserts parsing failed", + "GRC FIFO data is corrupt", + "Protection Override data is corrupt", + "Debug arrays were not set (when using binary files, dbg_set_bin_ptr must be called)", + "When a block is filtered, no other blocks can be recorded unless inputs are unified (due to a HW bug)" +}; + +/* Idle check severity names array */ +static const char * const s_idle_chk_severity_str[] = { + "Error", + "Error if no traffic", + "Warning" +}; + +/* MCP Trace level names array */ +static const char * const s_mcp_trace_level_str[] = { + "ERROR", + "TRACE", + "DEBUG" +}; + +/* Parsing strings */ +static const char * const s_access_strs[] = { + "read", + "write" +}; + +static const char * const s_privilege_strs[] = { + "VF", + "PDA", + "HV", + "UA" +}; + +static const char * const s_protection_strs[] = { + "(default)", + "(default)", + "(default)", + "(default)", + "override VF", + "override PDA", + "override HV", + "override UA" +}; + +static const char * const s_master_strs[] = { + "???", + "pxp", + "mcp", + "msdm", + "psdm", + "ysdm", + "usdm", + "tsdm", + "xsdm", + "dbu", + "dmae", + "???", + "???", + "???", + "???", + "???" 
+}; + +static const char * const s_reg_fifo_error_strs[] = { + "grc timeout", + "address doesn't belong to any block", + "reserved address in block or write to read-only address", + "privilege/protection mismatch", + "path isolation error" +}; + +static const char * const s_igu_fifo_source_strs[] = { + "TSTORM", + "MSTORM", + "USTORM", + "XSTORM", + "YSTORM", + "PSTORM", + "PCIE", + "NIG_QM_PBF", + "CAU", + "ATTN", + "GRC", +}; + +static const char * const s_igu_fifo_error_strs[] = { + "no error", + "length error", + "function disabled", + "VF sent command to attention address", + "host sent prod update command", + "read of during interrupt register while in MIMD mode", + "access to PXP BAR reserved address", + "producer update command to attention index", + "unknown error", + "SB index not valid", + "SB relative index and FID not found", + "FID not match", + "command with error flag asserted (PCI error or CAU discard)", + "VF sent cleanup and RF cleanup is disabled", + "cleanup command on type bigger than 4" +}; + +/* IGU FIFO address data */ +static const struct igu_fifo_addr_data s_igu_fifo_addr_data[] = { + {0x0, 0x101, "MSI-X Memory", NULL, IGU_ADDR_TYPE_MSIX_MEM}, + {0x102, 0x1ff, "reserved", NULL, IGU_ADDR_TYPE_RESERVED}, + {0x200, 0x200, "Write PBA[0:63]", NULL, IGU_ADDR_TYPE_WRITE_PBA}, + {0x201, 0x201, "Write PBA[64:127]", "reserved", + IGU_ADDR_TYPE_WRITE_PBA}, + {0x202, 0x202, "Write PBA[128]", "reserved", IGU_ADDR_TYPE_WRITE_PBA}, + {0x203, 0x3ff, "reserved", NULL, IGU_ADDR_TYPE_RESERVED}, + {0x400, 0x5ef, "Write interrupt acknowledgment", NULL, + IGU_ADDR_TYPE_WRITE_INT_ACK}, + {0x5f0, 0x5f0, "Attention bits update", NULL, + IGU_ADDR_TYPE_WRITE_ATTN_BITS}, + {0x5f1, 0x5f1, "Attention bits set", NULL, + IGU_ADDR_TYPE_WRITE_ATTN_BITS}, + {0x5f2, 0x5f2, "Attention bits clear", NULL, + IGU_ADDR_TYPE_WRITE_ATTN_BITS}, + {0x5f3, 0x5f3, "Read interrupt 0:63 with mask", NULL, + IGU_ADDR_TYPE_READ_INT}, + {0x5f4, 0x5f4, "Read interrupt 0:31 with mask", NULL, + IGU_ADDR_TYPE_READ_INT}, + {0x5f5, 0x5f5, "Read interrupt 32:63 with mask", NULL, + IGU_ADDR_TYPE_READ_INT}, + {0x5f6, 0x5f6, "Read interrupt 0:63 without mask", NULL, + IGU_ADDR_TYPE_READ_INT}, + {0x5f7, 0x5ff, "reserved", NULL, IGU_ADDR_TYPE_RESERVED}, + {0x600, 0x7ff, "Producer update", NULL, IGU_ADDR_TYPE_WRITE_PROD_UPDATE} +}; + +/******************************** Variables **********************************/ + +/* MCP Trace meta data - used in case the dump doesn't contain the meta data + * (e.g. due to no NVRAM access). 
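+ * The meta image itself is a byte stream: a signature dword
+ * (MCP_TRACE_META_IMAGE_SIGNATURE), a module-count byte followed by
+ * length-prefixed module name strings, a second signature dword, and
+ * then the format entries - see qed_mcp_trace_alloc_meta() below, which
+ * parses exactly this layout.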
+ */ +static struct dbg_array s_mcp_trace_meta = { NULL, 0 }; + +/* Temporary buffer, used for print size calculations */ +static char s_temp_buf[MAX_MSG_LEN]; + +/***************************** Public Functions *******************************/ + +enum dbg_status qed_dbg_user_set_bin_ptr(const u8 * const bin_ptr) +{ + /* Convert binary data to debug arrays */ + u32 num_of_buffers = *(u32 *)bin_ptr; + struct bin_buffer_hdr *buf_array; + u8 buf_id; + + buf_array = (struct bin_buffer_hdr *)((u32 *)bin_ptr + 1); + + for (buf_id = 0; buf_id < num_of_buffers; buf_id++) { + s_dbg_arrays[buf_id].ptr = + (u32 *)(bin_ptr + buf_array[buf_id].offset); + s_dbg_arrays[buf_id].size_in_dwords = + BYTES_TO_DWORDS(buf_array[buf_id].length); + } + + return DBG_STATUS_OK; +} + +static u32 qed_cyclic_add(u32 a, u32 b, u32 size) +{ + return (a + b) % size; +} + +static u32 qed_cyclic_sub(u32 a, u32 b, u32 size) +{ + return (size + a - b) % size; +} + +/* Reads the specified number of bytes from the specified cyclic buffer (up to 4 + * bytes) and returns them as a dword value. The specified buffer offset is + * updated. + */ +static u32 qed_read_from_cyclic_buf(void *buf, + u32 *offset, + u32 buf_size, u8 num_bytes_to_read) +{ + u8 *bytes_buf = (u8 *)buf; + u8 *val_ptr; + u32 val = 0; + u8 i; + + val_ptr = (u8 *)&val; + + for (i = 0; i < num_bytes_to_read; i++) { + val_ptr[i] = bytes_buf[*offset]; + *offset = qed_cyclic_add(*offset, 1, buf_size); + } + + return val; +} + +/* Reads and returns the next byte from the specified buffer. + * The specified buffer offset is updated. + */ +static u8 qed_read_byte_from_buf(void *buf, u32 *offset) +{ + return ((u8 *)buf)[(*offset)++]; +} + +/* Reads and returns the next dword from the specified buffer. + * The specified buffer offset is updated. + */ +static u32 qed_read_dword_from_buf(void *buf, u32 *offset) +{ + u32 dword_val = *(u32 *)&((u8 *)buf)[*offset]; + + *offset += 4; + return dword_val; +} + +/* Reads the next string from the specified buffer, and copies it to the + * specified pointer. The specified buffer offset is updated. + */ +static void qed_read_str_from_buf(void *buf, u32 *offset, u32 size, char *dest) +{ + const char *source_str = &((const char *)buf)[*offset]; + + strncpy(dest, source_str, size); + dest[size - 1] = '\0'; + *offset += size; +} + +/* Returns a pointer to the specified offset (in bytes) of the specified buffer. + * If the specified buffer is NULL, a temporary buffer pointer is returned. + */ +static char *qed_get_buf_ptr(void *buf, u32 offset) +{ + return buf ? (char *)buf + offset : s_temp_buf; +} + +/* Reads a param from the specified buffer. Returns the number of dwords read. + * If the returned str_param is NULL, the param is numeric and its value is + * returned in num_param. + * Otherwise, the param is a string and its pointer is returned in str_param. 
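+ *
+ * The encoded layout is a NUL-terminated param name, one type byte
+ * (zero means numeric), then either a NUL-terminated string value or a
+ * dword-aligned u32. For example, the 12 bytes
+ *
+ *	's' 'i' 'z' 'e' '\0' 0x00 <2 pad bytes> <u32 0x40>
+ *
+ * decode as the numeric param "size" = 64, and the function returns 3
+ * (dwords consumed).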
+ */ +static u32 qed_read_param(u32 *dump_buf, + const char **param_name, + const char **param_str_val, u32 *param_num_val) +{ + char *char_buf = (char *)dump_buf; + u32 offset = 0; /* In bytes */ + + /* Extract param name */ + *param_name = char_buf; + offset += strlen(*param_name) + 1; + + /* Check param type */ + if (*(char_buf + offset++)) { + /* String param */ + *param_str_val = char_buf + offset; + offset += strlen(*param_str_val) + 1; + if (offset & 0x3) + offset += (4 - (offset & 0x3)); + } else { + /* Numeric param */ + *param_str_val = NULL; + if (offset & 0x3) + offset += (4 - (offset & 0x3)); + *param_num_val = *(u32 *)(char_buf + offset); + offset += 4; + } + + return offset / 4; +} + +/* Reads a section header from the specified buffer. + * Returns the number of dwords read. + */ +static u32 qed_read_section_hdr(u32 *dump_buf, + const char **section_name, + u32 *num_section_params) +{ + const char *param_str_val; + + return qed_read_param(dump_buf, + section_name, ¶m_str_val, num_section_params); +} + +/* Reads section params from the specified buffer and prints them to the results + * buffer. Returns the number of dwords read. + */ +static u32 qed_print_section_params(u32 *dump_buf, + u32 num_section_params, + char *results_buf, u32 *num_chars_printed) +{ + u32 i, dump_offset = 0, results_offset = 0; + + for (i = 0; i < num_section_params; i++) { + const char *param_name; + const char *param_str_val; + u32 param_num_val = 0; + + dump_offset += qed_read_param(dump_buf + dump_offset, + ¶m_name, + ¶m_str_val, ¶m_num_val); + if (param_str_val) + /* String param */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "%s: %s\n", param_name, param_str_val); + else if (strcmp(param_name, "fw-timestamp")) + /* Numeric param */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "%s: %d\n", param_name, param_num_val); + } + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, results_offset), "\n"); + *num_chars_printed = results_offset; + return dump_offset; +} + +const char *qed_dbg_get_status_str(enum dbg_status status) +{ + return (status < + MAX_DBG_STATUS) ? s_status_str[status] : "Invalid debug status"; +} + +/* Parses the idle check rules and returns the number of characters printed. + * In case of parsing error, returns 0. + */ +static u32 qed_parse_idle_chk_dump_rules(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 *dump_buf_end, + u32 num_rules, + bool print_fw_idle_chk, + char *results_buf, + u32 *num_errors, u32 *num_warnings) +{ + u32 rule_idx, results_offset = 0; /* Offset in results_buf in bytes */ + u16 i, j; + + *num_errors = 0; + *num_warnings = 0; + + /* Go over dumped results */ + for (rule_idx = 0; rule_idx < num_rules && dump_buf < dump_buf_end; + rule_idx++) { + const struct dbg_idle_chk_rule_parsing_data *rule_parsing_data; + struct dbg_idle_chk_result_hdr *hdr; + const char *parsing_str; + u32 parsing_str_offset; + const char *lsi_msg; + u8 curr_reg_id = 0; + bool has_fw_msg; + + hdr = (struct dbg_idle_chk_result_hdr *)dump_buf; + rule_parsing_data = + (const struct dbg_idle_chk_rule_parsing_data *) + &s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_PARSING_DATA]. 
+ ptr[hdr->rule_id]; + parsing_str_offset = + GET_FIELD(rule_parsing_data->data, + DBG_IDLE_CHK_RULE_PARSING_DATA_STR_OFFSET); + has_fw_msg = + GET_FIELD(rule_parsing_data->data, + DBG_IDLE_CHK_RULE_PARSING_DATA_HAS_FW_MSG) > 0; + parsing_str = &((const char *) + s_dbg_arrays[BIN_BUF_DBG_PARSING_STRINGS].ptr) + [parsing_str_offset]; + lsi_msg = parsing_str; + + if (hdr->severity >= MAX_DBG_IDLE_CHK_SEVERITY_TYPES) + return 0; + + /* Skip rule header */ + dump_buf += (sizeof(struct dbg_idle_chk_result_hdr) / 4); + + /* Update errors/warnings count */ + if (hdr->severity == IDLE_CHK_SEVERITY_ERROR || + hdr->severity == IDLE_CHK_SEVERITY_ERROR_NO_TRAFFIC) + (*num_errors)++; + else + (*num_warnings)++; + + /* Print rule severity */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "%s: ", + s_idle_chk_severity_str[hdr->severity]); + + /* Print rule message */ + if (has_fw_msg) + parsing_str += strlen(parsing_str) + 1; + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "%s.", + has_fw_msg && + print_fw_idle_chk ? parsing_str : lsi_msg); + parsing_str += strlen(parsing_str) + 1; + + /* Print register values */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), " Registers:"); + for (i = 0; + i < hdr->num_dumped_cond_regs + hdr->num_dumped_info_regs; + i++) { + struct dbg_idle_chk_result_reg_hdr *reg_hdr + = (struct dbg_idle_chk_result_reg_hdr *) + dump_buf; + bool is_mem = + GET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_IS_MEM); + u8 reg_id = + GET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID); + + /* Skip reg header */ + dump_buf += + (sizeof(struct dbg_idle_chk_result_reg_hdr) / 4); + + /* Skip register names until the required reg_id is + * reached. + */ + for (; reg_id > curr_reg_id; + curr_reg_id++, + parsing_str += strlen(parsing_str) + 1); + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), " %s", + parsing_str); + if (i < hdr->num_dumped_cond_regs && is_mem) + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "[%d]", hdr->mem_entry_id + + reg_hdr->start_entry); + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "="); + for (j = 0; j < reg_hdr->size; j++, dump_buf++) { + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "0x%x", *dump_buf); + if (j < reg_hdr->size - 1) + results_offset += + sprintf(qed_get_buf_ptr + (results_buf, + results_offset), ","); + } + } + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, results_offset), "\n"); + } + + /* Check if end of dump buffer was exceeded */ + if (dump_buf > dump_buf_end) + return 0; + return results_offset; +} + +/* Parses an idle check dump buffer. + * If result_buf is not NULL, the idle check results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. 
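+ *
+ * Intended calling pattern, mirrored by the two public wrappers below:
+ * first call with results_buf == NULL to learn the needed size, then
+ * call again with a buffer of at least that size:
+ *
+ *	qed_get_idle_chk_results_buf_size(p_hwfn, buf, len, &sz);
+ *	... allocate sz bytes for 'results' ...
+ *	qed_print_idle_chk_results(p_hwfn, buf, len, results,
+ *				   &num_errors, &num_warnings);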
+ */ +static enum dbg_status qed_parse_idle_chk_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes, + u32 *num_errors, + u32 *num_warnings) +{ + const char *section_name, *param_name, *param_str_val; + u32 *dump_buf_end = dump_buf + num_dumped_dwords; + u32 num_section_params = 0, num_rules; + u32 results_offset = 0; /* Offset in results_buf in bytes */ + + *parsed_results_bytes = 0; + *num_errors = 0; + *num_warnings = 0; + if (!s_dbg_arrays[BIN_BUF_DBG_PARSING_STRINGS].ptr || + !s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_PARSING_DATA].ptr) + return DBG_STATUS_DBG_ARRAY_NOT_SET; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read idle_chk section */ + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, &num_section_params); + if (strcmp(section_name, "idle_chk") || num_section_params != 1) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + + dump_buf += qed_read_param(dump_buf, + &param_name, &param_str_val, &num_rules); + if (strcmp(param_name, "num_rules") != 0) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + + if (num_rules) { + u32 rules_print_size; + + /* Print FW output */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "FW_IDLE_CHECK:\n"); + rules_print_size = + qed_parse_idle_chk_dump_rules(p_hwfn, dump_buf, + dump_buf_end, num_rules, + true, + results_buf ? + results_buf + + results_offset : NULL, + num_errors, num_warnings); + results_offset += rules_print_size; + if (rules_print_size == 0) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + + /* Print LSI output */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\nLSI_IDLE_CHECK:\n"); + rules_print_size = + qed_parse_idle_chk_dump_rules(p_hwfn, dump_buf, + dump_buf_end, num_rules, + false, + results_buf ? + results_buf + + results_offset : NULL, + num_errors, num_warnings); + results_offset += rules_print_size; + if (rules_print_size == 0) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + } + + /* Print errors/warnings count */ + if (*num_errors) { + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\nIdle Check failed!!! 
(with %d errors and %d warnings)\n", + *num_errors, *num_warnings); + } else if (*num_warnings) { + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\nIdle Check completed successfully (with %d warnings)\n", + *num_warnings); + } else { + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\nIdle Check completed successfully\n"); + } + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + return DBG_STATUS_OK; +} + +enum dbg_status qed_get_idle_chk_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + u32 num_errors, num_warnings; + + return qed_parse_idle_chk_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + NULL, + results_buf_size, + &num_errors, &num_warnings); +} + +enum dbg_status qed_print_idle_chk_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *num_errors, u32 *num_warnings) +{ + u32 parsed_buf_size; + + return qed_parse_idle_chk_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + results_buf, + &parsed_buf_size, + num_errors, num_warnings); +} + +/* Frees the specified MCP Trace meta data */ +static void qed_mcp_trace_free_meta(struct qed_hwfn *p_hwfn, + struct mcp_trace_meta *meta) +{ + u32 i; + + /* Release modules */ + if (meta->modules) { + for (i = 0; i < meta->modules_num; i++) + kfree(meta->modules[i]); + kfree(meta->modules); + } + + /* Release formats */ + if (meta->formats) { + for (i = 0; i < meta->formats_num; i++) + kfree(meta->formats[i].format_str); + kfree(meta->formats); + } +} + +/* Allocates and fills MCP Trace meta data based on the specified meta data + * dump buffer. + * Returns debug status code. + */ +static enum dbg_status qed_mcp_trace_alloc_meta(struct qed_hwfn *p_hwfn, + const u32 *meta_buf, + struct mcp_trace_meta *meta) +{ + u8 *meta_buf_bytes = (u8 *)meta_buf; + u32 offset = 0, signature, i; + + memset(meta, 0, sizeof(*meta)); + + /* Read first signature */ + signature = qed_read_dword_from_buf(meta_buf_bytes, &offset); + if (signature != MCP_TRACE_META_IMAGE_SIGNATURE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + /* Read number of modules and allocate memory for all the modules + * pointers. + */ + meta->modules_num = qed_read_byte_from_buf(meta_buf_bytes, &offset); + meta->modules = kzalloc(meta->modules_num * sizeof(char *), GFP_KERNEL); + if (!meta->modules) + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + + /* Allocate and read all module strings */ + for (i = 0; i < meta->modules_num; i++) { + u8 module_len = qed_read_byte_from_buf(meta_buf_bytes, &offset); + + *(meta->modules + i) = kzalloc(module_len, GFP_KERNEL); + if (!(*(meta->modules + i))) { + /* Update number of modules to be released */ + meta->modules_num = i ? 
i - 1 : 0; + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + } + + qed_read_str_from_buf(meta_buf_bytes, &offset, module_len, + *(meta->modules + i)); + if (module_len > MCP_TRACE_MAX_MODULE_LEN) + (*(meta->modules + i))[MCP_TRACE_MAX_MODULE_LEN] = '\0'; + } + + /* Read second signature */ + signature = qed_read_dword_from_buf(meta_buf_bytes, &offset); + if (signature != MCP_TRACE_META_IMAGE_SIGNATURE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + /* Read number of formats and allocate memory for all formats */ + meta->formats_num = qed_read_dword_from_buf(meta_buf_bytes, &offset); + meta->formats = kzalloc(meta->formats_num * + sizeof(struct mcp_trace_format), + GFP_KERNEL); + if (!meta->formats) + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + + /* Allocate and read all strings */ + for (i = 0; i < meta->formats_num; i++) { + struct mcp_trace_format *format_ptr = &meta->formats[i]; + u8 format_len; + + format_ptr->data = qed_read_dword_from_buf(meta_buf_bytes, + &offset); + format_len = + (format_ptr->data & + MCP_TRACE_FORMAT_LEN_MASK) >> MCP_TRACE_FORMAT_LEN_SHIFT; + format_ptr->format_str = kzalloc(format_len, GFP_KERNEL); + if (!format_ptr->format_str) { + /* Update number of formats to be released */ + meta->formats_num = i ? i - 1 : 0; + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + } + + qed_read_str_from_buf(meta_buf_bytes, + &offset, + format_len, format_ptr->format_str); + } + + return DBG_STATUS_OK; +} + +/* Parses an MCP Trace dump buffer. + * If result_buf is not NULL, the MCP Trace results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. + */ +static enum dbg_status qed_parse_mcp_trace_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes) +{ + u32 results_offset = 0, param_mask, param_shift, param_num_val; + u32 num_section_params, offset, end_offset, bytes_left; + const char *section_name, *param_name, *param_str_val; + u32 trace_data_dwords, trace_meta_dwords; + struct mcp_trace_meta meta; + struct mcp_trace *trace; + enum dbg_status status; + const u32 *meta_buf; + u8 *trace_buf; + + *parsed_results_bytes = 0; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read trace_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, &num_section_params); + if (strcmp(section_name, "mcp_trace_data") || num_section_params != 1) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + &param_name, &param_str_val, &param_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + trace_data_dwords = param_num_val; + + /* Prepare trace info */ + trace = (struct mcp_trace *)dump_buf; + trace_buf = (u8 *)dump_buf + sizeof(struct mcp_trace); + offset = trace->trace_oldest; + end_offset = trace->trace_prod; + bytes_left = qed_cyclic_sub(end_offset, offset, trace->size); + dump_buf += trace_data_dwords; + + /* Read meta_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, &num_section_params); + if (strcmp(section_name, "mcp_trace_meta")) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + &param_name, &param_str_val, &param_num_val); + if 
(strcmp(param_name, "size") != 0) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + trace_meta_dwords = param_num_val; + + /* Choose meta data buffer */ + if (!trace_meta_dwords) { + /* Dump doesn't include meta data */ + if (!s_mcp_trace_meta.ptr) + return DBG_STATUS_MCP_TRACE_NO_META; + meta_buf = s_mcp_trace_meta.ptr; + } else { + /* Dump includes meta data */ + meta_buf = dump_buf; + } + + /* Allocate meta data memory */ + status = qed_mcp_trace_alloc_meta(p_hwfn, meta_buf, &meta); + if (status != DBG_STATUS_OK) + goto free_mem; + + /* Ignore the level and modules masks - just print everything that is + * already in the buffer. + */ + while (bytes_left) { + struct mcp_trace_format *format_ptr; + u8 format_level, format_module; + u32 params[3] = { 0, 0, 0 }; + u32 header, format_idx, i; + + if (bytes_left < MFW_TRACE_ENTRY_SIZE) { + status = DBG_STATUS_MCP_TRACE_BAD_DATA; + goto free_mem; + } + + header = qed_read_from_cyclic_buf(trace_buf, + &offset, + trace->size, + MFW_TRACE_ENTRY_SIZE); + bytes_left -= MFW_TRACE_ENTRY_SIZE; + format_idx = header & MFW_TRACE_EVENTID_MASK; + + /* Skip message if its index doesn't exist in the meta data */ + if (format_idx > meta.formats_num) { + u8 format_size = + (u8)((header & + MFW_TRACE_PRM_SIZE_MASK) >> + MFW_TRACE_PRM_SIZE_SHIFT); + + if (bytes_left < format_size) { + status = DBG_STATUS_MCP_TRACE_BAD_DATA; + goto free_mem; + } + + offset = qed_cyclic_add(offset, + format_size, trace->size); + bytes_left -= format_size; + continue; + } + + format_ptr = &meta.formats[format_idx]; + for (i = 0, + param_mask = MCP_TRACE_FORMAT_P1_SIZE_MASK, param_shift = + MCP_TRACE_FORMAT_P1_SIZE_SHIFT; + i < MCP_TRACE_FORMAT_MAX_PARAMS; + i++, param_mask <<= MCP_TRACE_FORMAT_PARAM_WIDTH, + param_shift += MCP_TRACE_FORMAT_PARAM_WIDTH) { + /* Extract param size (0..3) */ + u8 param_size = + (u8)((format_ptr->data & + param_mask) >> param_shift); + + /* If the param size is zero, there are no other + * parameters. + */ + if (!param_size) + break; + + /* Size is encoded using 2 bits, where 3 is used to + * encode 4. 
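+ * For example, an encoded value of 1 or 2 means a 1- or 2-byte parameter + * is read from the cyclic buffer, 3 means a 4-byte parameter, and 0 + * (handled above) means there are no further parameters.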
+ */ + if (param_size == 3) + param_size = 4; + if (bytes_left < param_size) { + status = DBG_STATUS_MCP_TRACE_BAD_DATA; + goto free_mem; + } + + params[i] = qed_read_from_cyclic_buf(trace_buf, + &offset, + trace->size, + param_size); + bytes_left -= param_size; + } + + format_level = + (u8)((format_ptr->data & + MCP_TRACE_FORMAT_LEVEL_MASK) >> + MCP_TRACE_FORMAT_LEVEL_SHIFT); + format_module = + (u8)((format_ptr->data & + MCP_TRACE_FORMAT_MODULE_MASK) >> + MCP_TRACE_FORMAT_MODULE_SHIFT); + if (format_level >= ARRAY_SIZE(s_mcp_trace_level_str)) { + status = DBG_STATUS_MCP_TRACE_BAD_DATA; + goto free_mem; + } + + /* Print current message to results buffer */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "%s %-8s: ", + s_mcp_trace_level_str[format_level], + meta.modules[format_module]); + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + format_ptr->format_str, params[0], params[1], + params[2]); + } + +free_mem: + *parsed_results_bytes = results_offset + 1; + qed_mcp_trace_free_meta(p_hwfn, &meta); + return status; +} + +enum dbg_status qed_get_mcp_trace_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_mcp_trace_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + NULL, results_buf_size); +} + +enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_mcp_trace_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + results_buf, &parsed_buf_size); +} + +/* Parses a Reg FIFO dump buffer. + * If result_buf is not NULL, the Reg FIFO results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. 
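+ * + * For illustration, the layout consumed below is: a "global_params" section, + * then a "reg_fifo_data" section whose "size" parameter holds the number of + * dumped dwords (a multiple of REG_FIFO_ELEMENT_DWORDS), followed by the raw + * FIFO elements, each decoded into one output line.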
+ */ +static enum dbg_status qed_parse_reg_fifo_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes) +{ + u32 results_offset = 0, param_num_val, num_section_params, num_elements; + const char *section_name, *param_name, *param_str_val; + struct reg_fifo_element *elements; + u8 i, j, err_val, vf_val; + char vf_str[4]; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_REG_FIFO_BAD_DATA; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read reg_fifo_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, &num_section_params); + if (strcmp(section_name, "reg_fifo_data")) + return DBG_STATUS_REG_FIFO_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + &param_name, &param_str_val, &param_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_REG_FIFO_BAD_DATA; + if (param_num_val % REG_FIFO_ELEMENT_DWORDS) + return DBG_STATUS_REG_FIFO_BAD_DATA; + num_elements = param_num_val / REG_FIFO_ELEMENT_DWORDS; + elements = (struct reg_fifo_element *)dump_buf; + + /* Decode elements */ + for (i = 0; i < num_elements; i++) { + bool err_printed = false; + + /* Discover if element belongs to a VF or a PF */ + vf_val = GET_FIELD(elements[i].data, REG_FIFO_ELEMENT_VF); + if (vf_val == REG_FIFO_ELEMENT_IS_PF_VF_VAL) + sprintf(vf_str, "%s", "N/A"); + else + sprintf(vf_str, "%d", vf_val); + + /* Add parsed element to parsed buffer */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "raw: 0x%016llx, address: 0x%07llx, access: %-5s, pf: %2lld, vf: %s, port: %lld, privilege: %-3s, protection: %-12s, master: %-4s, errors: ", + elements[i].data, + GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_ADDRESS) * + REG_FIFO_ELEMENT_ADDR_FACTOR, + s_access_strs[GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_ACCESS)], + GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_PF), vf_str, + GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_PORT), + s_privilege_strs[GET_FIELD(elements[i].
+ data, + REG_FIFO_ELEMENT_PRIVILEGE)], + s_protection_strs[GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_PROTECTION)], + s_master_strs[GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_MASTER)]); + + /* Print errors */ + for (j = 0, + err_val = GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_ERROR); + j < ARRAY_SIZE(s_reg_fifo_error_strs); + j++, err_val >>= 1) { + if (!(err_val & 0x1)) + continue; + if (err_printed) + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + ", "); + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "%s", + s_reg_fifo_error_strs[j]); + err_printed = true; + } + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, results_offset), "\n"); + } + + results_offset += sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "fifo contained %d elements", num_elements); + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + return DBG_STATUS_OK; +} + +enum dbg_status qed_get_reg_fifo_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_reg_fifo_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + NULL, results_buf_size); +} + +enum dbg_status qed_print_reg_fifo_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_reg_fifo_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + results_buf, &parsed_buf_size); +} + +/* Parses an IGU FIFO dump buffer. + * If result_buf is not NULL, the IGU FIFO results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. + */ +static enum dbg_status qed_parse_igu_fifo_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes) +{ + u32 results_offset = 0, param_num_val, num_section_params, num_elements; + const char *section_name, *param_name, *param_str_val; + struct igu_fifo_element *elements; + char parsed_addr_data[32]; + char parsed_wr_data[256]; + u8 i, j; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read igu_fifo_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, &num_section_params); + if (strcmp(section_name, "igu_fifo_data")) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + &param_name, &param_str_val, &param_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + if (param_num_val % IGU_FIFO_ELEMENT_DWORDS) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + num_elements = param_num_val / IGU_FIFO_ELEMENT_DWORDS; + elements = (struct igu_fifo_element *)dump_buf; + + /* Decode elements */ + for (i = 0; i < num_elements; i++) { + /* dword12 (dword index 1 and 2) contains bits 32..95 of the + * FIFO element. 
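+ * For example, dword1 = 0x11111111 and dword2 = 0x22222222 yield + * dword12 = 0x2222222211111111.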
+ */ + u64 dword12 = + ((u64)elements[i].dword2 << 32) | elements[i].dword1; + bool is_wr_cmd = GET_FIELD(dword12, + IGU_FIFO_ELEMENT_DWORD12_IS_WR_CMD); + bool is_pf = GET_FIELD(elements[i].dword0, + IGU_FIFO_ELEMENT_DWORD0_IS_PF); + u16 cmd_addr = GET_FIELD(elements[i].dword0, + IGU_FIFO_ELEMENT_DWORD0_CMD_ADDR); + u8 source = GET_FIELD(elements[i].dword0, + IGU_FIFO_ELEMENT_DWORD0_SOURCE); + u8 err_type = GET_FIELD(elements[i].dword0, + IGU_FIFO_ELEMENT_DWORD0_ERR_TYPE); + const struct igu_fifo_addr_data *addr_data = NULL; + + if (source >= ARRAY_SIZE(s_igu_fifo_source_strs)) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + if (err_type >= ARRAY_SIZE(s_igu_fifo_error_strs)) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + + /* Find address data */ + for (j = 0; j < ARRAY_SIZE(s_igu_fifo_addr_data) && !addr_data; + j++) + if (cmd_addr >= s_igu_fifo_addr_data[j].start_addr && + cmd_addr <= s_igu_fifo_addr_data[j].end_addr) + addr_data = &s_igu_fifo_addr_data[j]; + if (!addr_data) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + + /* Prepare parsed address data */ + switch (addr_data->type) { + case IGU_ADDR_TYPE_MSIX_MEM: + sprintf(parsed_addr_data, + " vector_num=0x%x", cmd_addr / 2); + break; + case IGU_ADDR_TYPE_WRITE_INT_ACK: + case IGU_ADDR_TYPE_WRITE_PROD_UPDATE: + sprintf(parsed_addr_data, + " SB=0x%x", cmd_addr - addr_data->start_addr); + break; + default: + parsed_addr_data[0] = '\0'; + } + + /* Prepare parsed write data */ + if (is_wr_cmd) { + u32 wr_data = GET_FIELD(dword12, + IGU_FIFO_ELEMENT_DWORD12_WR_DATA); + u32 prod_cons = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_PROD_CONS); + u8 is_cleanup = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_CMD_TYPE); + + if (source == IGU_SRC_ATTN) { + sprintf(parsed_wr_data, + "prod: 0x%x, ", prod_cons); + } else { + if (is_cleanup) { + u8 cleanup_val = GET_FIELD(wr_data, + IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_VAL); + u8 cleanup_type = GET_FIELD(wr_data, + IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_TYPE); + + sprintf(parsed_wr_data, + "cmd_type: cleanup, cleanup_val: %s, cleanup_type: %d, ", + cleanup_val ? "set" : "clear", + cleanup_type); + } else { + u8 update_flag = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_UPDATE_FLAG); + u8 en_dis_int_for_sb = + GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_EN_DIS_INT_FOR_SB); + u8 segment = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_SEGMENT); + u8 timer_mask = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_TIMER_MASK); + + sprintf(parsed_wr_data, + "cmd_type: prod/cons update, prod/cons: 0x%x, update_flag: %s, en_dis_int_for_sb: %s, segment: %s, timer_mask=%d, ", + prod_cons, + update_flag ? "update" : "nop", + en_dis_int_for_sb + ? (en_dis_int_for_sb == + 1 ? "disable" : "nop") : + "enable", + segment ? "attn" : "regular", + timer_mask); + } + } + } else { + parsed_wr_data[0] = '\0'; + } + + /* Add parsed element to parsed buffer */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "raw: 0x%01x%08x%08x, %s: %d, source: %s, type: %s, cmd_addr: 0x%x (%s%s), %serror: %s\n", + elements[i].dword2, elements[i].dword1, + elements[i].dword0, + is_pf ? "pf" : "vf", + GET_FIELD(elements[i].dword0, + IGU_FIFO_ELEMENT_DWORD0_FID), + s_igu_fifo_source_strs[source], + is_wr_cmd ? "wr" : "rd", cmd_addr, + (!is_pf && addr_data->vf_desc) + ? 
addr_data->vf_desc : addr_data->desc, + parsed_addr_data, parsed_wr_data, + s_igu_fifo_error_strs[err_type]); + } + + results_offset += sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "fifo contained %d elements", num_elements); + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + return DBG_STATUS_OK; +} + +enum dbg_status qed_get_igu_fifo_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_igu_fifo_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + NULL, results_buf_size); +} + +enum dbg_status qed_print_igu_fifo_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_igu_fifo_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + results_buf, &parsed_buf_size); +} + +static enum dbg_status +qed_parse_protection_override_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes) +{ + u32 results_offset = 0, param_num_val, num_section_params, num_elements; + const char *section_name, *param_name, *param_str_val; + struct protection_override_element *elements; + u8 i; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read protection_override_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, &num_section_params); + if (strcmp(section_name, "protection_override_data")) + return DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + &param_name, &param_str_val, &param_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA; + if (param_num_val % PROTECTION_OVERRIDE_ELEMENT_DWORDS != 0) + return DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA; + num_elements = param_num_val / PROTECTION_OVERRIDE_ELEMENT_DWORDS; + elements = (struct protection_override_element *)dump_buf; + + /* Decode elements */ + for (i = 0; i < num_elements; i++) { + u32 address = GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_ADDRESS) * + PROTECTION_OVERRIDE_ELEMENT_ADDR_FACTOR; + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "window %2d, address: 0x%07x, size: %7lld regs, read: %lld, write: %lld, read protection: %-12s, write protection: %-12s\n", + i, address, + GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_WINDOW_SIZE), + GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_READ), + GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_WRITE), + s_protection_strs[GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_READ_PROTECTION)], + s_protection_strs[GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_WRITE_PROTECTION)]); + } + + results_offset += sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "protection override contained %d elements", + num_elements); + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + return DBG_STATUS_OK; +} + +enum dbg_status +qed_get_protection_override_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_protection_override_dump(p_hwfn, + dump_buf, + 
num_dumped_dwords, + NULL, results_buf_size); +} + +enum dbg_status qed_print_protection_override_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_protection_override_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + results_buf, + &parsed_buf_size); +} + +/* Parses a FW Asserts dump buffer. + * If result_buf is not NULL, the FW Asserts results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. + */ +static enum dbg_status qed_parse_fw_asserts_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes) +{ + u32 results_offset = 0, num_section_params, param_num_val, i; + const char *param_name, *param_str_val, *section_name; + bool last_section_found = false; + + *parsed_results_bytes = 0; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_FW_ASSERTS_PARSE_FAILED; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + while (!last_section_found) { + const char *storm_letter = NULL; + u32 storm_dump_size = 0; + + dump_buf += qed_read_section_hdr(dump_buf, + &section_name, + &num_section_params); + if (!strcmp(section_name, "last")) { + last_section_found = true; + continue; + } else if (strcmp(section_name, "fw_asserts")) { + return DBG_STATUS_FW_ASSERTS_PARSE_FAILED; + } + + /* Extract params */ + for (i = 0; i < num_section_params; i++) { + dump_buf += qed_read_param(dump_buf, + &param_name, + &param_str_val, + &param_num_val); + if (!strcmp(param_name, "storm")) + storm_letter = param_str_val; + else if (!strcmp(param_name, "size")) + storm_dump_size = param_num_val; + else + return DBG_STATUS_FW_ASSERTS_PARSE_FAILED; + } + + if (!storm_letter || !storm_dump_size) + return DBG_STATUS_FW_ASSERTS_PARSE_FAILED; + + /* Print data */ + results_offset += sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\n%sSTORM_ASSERT: size=%d\n", + storm_letter, storm_dump_size); + for (i = 0; i < storm_dump_size; i++, dump_buf++) + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "%08x\n", *dump_buf); + } + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + return DBG_STATUS_OK; +} + +enum dbg_status qed_get_fw_asserts_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_fw_asserts_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + NULL, results_buf_size); +} + +enum dbg_status qed_print_fw_asserts_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_fw_asserts_dump(p_hwfn, + dump_buf, + num_dumped_dwords, + results_buf, &parsed_buf_size); +} + +/* Wrapper for unifying the idle_chk and mcp_trace API */ +enum dbg_status qed_print_idle_chk_results_wrapper(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 num_errors, num_warnings; + + return qed_print_idle_chk_results(p_hwfn, dump_buf, num_dumped_dwords, + results_buf, &num_errors, + &num_warnings); +} + +/* Feature meta data lookup table */ +static struct { + char *name; + enum dbg_status (*get_size)(struct qed_hwfn *p_hwfn, + struct 
qed_ptt *p_ptt, u32 *size); + enum dbg_status (*perform_dump)(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, + u32 buf_size, u32 *dumped_dwords); + enum dbg_status (*print_results)(struct qed_hwfn *p_hwfn, + u32 *dump_buf, u32 num_dumped_dwords, + char *results_buf); + enum dbg_status (*results_buf_size)(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +} qed_features_lookup[] = { + { + "grc", qed_dbg_grc_get_dump_buf_size, + qed_dbg_grc_dump, NULL, NULL}, { + "idle_chk", + qed_dbg_idle_chk_get_dump_buf_size, + qed_dbg_idle_chk_dump, + qed_print_idle_chk_results_wrapper, + qed_get_idle_chk_results_buf_size}, { + "mcp_trace", + qed_dbg_mcp_trace_get_dump_buf_size, + qed_dbg_mcp_trace_dump, qed_print_mcp_trace_results, + qed_get_mcp_trace_results_buf_size}, { + "reg_fifo", + qed_dbg_reg_fifo_get_dump_buf_size, + qed_dbg_reg_fifo_dump, qed_print_reg_fifo_results, + qed_get_reg_fifo_results_buf_size}, { + "igu_fifo", + qed_dbg_igu_fifo_get_dump_buf_size, + qed_dbg_igu_fifo_dump, qed_print_igu_fifo_results, + qed_get_igu_fifo_results_buf_size}, { + "protection_override", + qed_dbg_protection_override_get_dump_buf_size, + qed_dbg_protection_override_dump, + qed_print_protection_override_results, + qed_get_protection_override_results_buf_size}, { + "fw_asserts", + qed_dbg_fw_asserts_get_dump_buf_size, + qed_dbg_fw_asserts_dump, + qed_print_fw_asserts_results, + qed_get_fw_asserts_results_buf_size},}; + +static void qed_dbg_print_feature(u8 *p_text_buf, u32 text_size) +{ + u32 i, precision = 80; + + if (!p_text_buf) + return; + + pr_notice("\n%.*s", precision, p_text_buf); + for (i = precision; i < text_size; i += precision) + pr_cont("%.*s", precision, p_text_buf + i); + pr_cont("\n"); +} + +#define QED_RESULTS_BUF_MIN_SIZE 16 +/* Generic function for decoding debug feature info */ +enum dbg_status format_feature(struct qed_hwfn *p_hwfn, + enum qed_dbg_features feature_idx) +{ + struct qed_dbg_feature *feature = + &p_hwfn->cdev->dbg_params.features[feature_idx]; + u32 text_size_bytes, null_char_pos, i; + enum dbg_status rc; + char *text_buf; + + /* Check if feature supports formatting capability */ + if (!qed_features_lookup[feature_idx].results_buf_size) + return DBG_STATUS_OK; + + /* Obtain size of formatted output */ + rc = qed_features_lookup[feature_idx]. + results_buf_size(p_hwfn, (u32 *)feature->dump_buf, + feature->dumped_dwords, &text_size_bytes); + if (rc != DBG_STATUS_OK) + return rc; + + /* Make sure that the allocated size is a multiple of dword (4 bytes) */ + null_char_pos = text_size_bytes - 1; + text_size_bytes = (text_size_bytes + 3) & ~0x3; + + if (text_size_bytes < QED_RESULTS_BUF_MIN_SIZE) { + DP_NOTICE(p_hwfn->cdev, + "formatted size of feature was too small %d. Aborting\n", + text_size_bytes); + return DBG_STATUS_INVALID_ARGS; + } + + /* Allocate temp text buf */ + text_buf = vzalloc(text_size_bytes); + if (!text_buf) + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + + /* Decode feature opcodes to string on temp buf */ + rc = qed_features_lookup[feature_idx]. + print_results(p_hwfn, (u32 *)feature->dump_buf, + feature->dumped_dwords, text_buf); + if (rc != DBG_STATUS_OK) { + vfree(text_buf); + return rc; + } + + /* Replace the original null character with a '\n' character. + * The bytes that were added as a result of the dword alignment are also + * padded with '\n' characters. 
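+ * For example, a formatted size of 61 bytes is rounded up to 64, and + * bytes 60-63 (the original null terminator plus the three alignment bytes) + * are all overwritten with '\n'.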
+ */ + for (i = null_char_pos; i < text_size_bytes; i++) + text_buf[i] = '\n'; + + /* Dump printable feature to log */ + if (p_hwfn->cdev->dbg_params.print_data) + qed_dbg_print_feature(text_buf, text_size_bytes); + + /* Free the old dump_buf and point the dump_buf to the newly allocated + * and formatted text buffer. + */ + vfree(feature->dump_buf); + feature->dump_buf = text_buf; + feature->buf_size = text_size_bytes; + feature->dumped_dwords = text_size_bytes / 4; + return rc; +} + +/* Generic function for performing the dump of a debug feature. */ +enum dbg_status qed_dbg_dump(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + enum qed_dbg_features feature_idx) +{ + struct qed_dbg_feature *feature = + &p_hwfn->cdev->dbg_params.features[feature_idx]; + u32 buf_size_dwords; + enum dbg_status rc; + + DP_NOTICE(p_hwfn->cdev, "Collecting a debug feature [\"%s\"]\n", + qed_features_lookup[feature_idx].name); + + /* Dump_buf might already be allocated and needs to be freed (this can + * happen if dump was called but the file was never read). + * We can't use the buffer as is since its size may have changed. + */ + if (feature->dump_buf) { + vfree(feature->dump_buf); + feature->dump_buf = NULL; + } + + /* Get buffer size from hsi, allocate accordingly, and perform the + * dump. + */ + rc = qed_features_lookup[feature_idx].get_size(p_hwfn, p_ptt, + &buf_size_dwords); + if (rc != DBG_STATUS_OK) + return rc; + feature->buf_size = buf_size_dwords * sizeof(u32); + feature->dump_buf = vmalloc(feature->buf_size); + if (!feature->dump_buf) + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + + rc = qed_features_lookup[feature_idx]. + perform_dump(p_hwfn, p_ptt, (u32 *)feature->dump_buf, + feature->buf_size / sizeof(u32), + &feature->dumped_dwords); + + /* If mcp is stuck we get DBG_STATUS_NVRAM_GET_IMAGE_FAILED error. + * In this case the buffer holds valid binary data, but we won't be + * able to parse it (since parsing relies on data in NVRAM which is only + * accessible when MFW is responsive). Skip the formatting but return + * success so that binary data is provided. 
+ */ + if (rc == DBG_STATUS_NVRAM_GET_IMAGE_FAILED) + return DBG_STATUS_OK; + + if (rc != DBG_STATUS_OK) + return rc; + + /* Format output */ + rc = format_feature(p_hwfn, feature_idx); + return rc; +} + +int qed_dbg_grc(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_GRC, num_dumped_bytes); +} + +int qed_dbg_grc_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_GRC); +} + +int qed_dbg_idle_chk(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_IDLE_CHK, + num_dumped_bytes); +} + +int qed_dbg_idle_chk_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_IDLE_CHK); +} + +int qed_dbg_reg_fifo(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_REG_FIFO, + num_dumped_bytes); +} + +int qed_dbg_reg_fifo_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_REG_FIFO); +} + +int qed_dbg_igu_fifo(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_IGU_FIFO, + num_dumped_bytes); +} + +int qed_dbg_igu_fifo_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_IGU_FIFO); +} + +int qed_dbg_protection_override(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_PROTECTION_OVERRIDE, + num_dumped_bytes); +} + +int qed_dbg_protection_override_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_PROTECTION_OVERRIDE); +} + +int qed_dbg_fw_asserts(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_FW_ASSERTS, + num_dumped_bytes); +} + +int qed_dbg_fw_asserts_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_FW_ASSERTS); +} + +int qed_dbg_mcp_trace(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_MCP_TRACE, + num_dumped_bytes); +} + +int qed_dbg_mcp_trace_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_MCP_TRACE); +} + +/* Defines the amount of bytes allocated for recording the length of debugfs + * feature buffer. + */ +#define REGDUMP_HEADER_SIZE sizeof(u32) +#define REGDUMP_HEADER_FEATURE_SHIFT 24 +#define REGDUMP_HEADER_ENGINE_SHIFT 31 +#define REGDUMP_HEADER_OMIT_ENGINE_SHIFT 30 +enum debug_print_features { + OLD_MODE = 0, + IDLE_CHK = 1, + GRC_DUMP = 2, + MCP_TRACE = 3, + REG_FIFO = 4, + PROTECTION_OVERRIDE = 5, + IGU_FIFO = 6, + PHY = 7, + FW_ASSERTS = 8, +}; + +static u32 qed_calc_regdump_header(enum debug_print_features feature, + int engine, u32 feature_size, u8 omit_engine) +{ + /* Insert the engine, feature and mode inside the header and combine it + * with feature size. 
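+ * The resulting layout is: feature size in bits 0-23, feature ID starting + * at bit 24, omit-engine flag at bit 30 and engine ID at bit 31. For + * example, IDLE_CHK (1) on engine 0 with omit_engine set and a feature + * size of 0x100 yields a header of 0x41000100.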
+ */ + return feature_size | (feature << REGDUMP_HEADER_FEATURE_SHIFT) | + (omit_engine << REGDUMP_HEADER_OMIT_ENGINE_SHIFT) | + (engine << REGDUMP_HEADER_ENGINE_SHIFT); +} + +int qed_dbg_all_data(struct qed_dev *cdev, void *buffer) +{ + u8 cur_engine, omit_engine = 0, org_engine; + u32 offset = 0, feature_size; + int rc; + + if (cdev->num_hwfns == 1) + omit_engine = 1; + + org_engine = qed_get_debug_engine(cdev); + for (cur_engine = 0; cur_engine < cdev->num_hwfns; cur_engine++) { + /* Collect idle_chks and grcDump for each hw function */ + DP_VERBOSE(cdev, QED_MSG_DEBUG, + "obtaining idle_chk and grcdump for current engine\n"); + qed_set_debug_engine(cdev, cur_engine); + + /* First idle_chk */ + rc = qed_dbg_idle_chk(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(IDLE_CHK, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_idle_chk failed. rc = %d\n", rc); + } + + /* Second idle_chk */ + rc = qed_dbg_idle_chk(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(IDLE_CHK, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_idle_chk failed. rc = %d\n", rc); + } + + /* reg_fifo dump */ + rc = qed_dbg_reg_fifo(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(REG_FIFO, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_reg_fifo failed. rc = %d\n", rc); + } + + /* igu_fifo dump */ + rc = qed_dbg_igu_fifo(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(IGU_FIFO, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_igu_fifo failed. rc = %d", rc); + } + + /* protection_override dump */ + rc = qed_dbg_protection_override(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, + &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(PROTECTION_OVERRIDE, + cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, + "qed_dbg_protection_override failed. rc = %d\n", + rc); + } + + /* fw_asserts dump */ + rc = qed_dbg_fw_asserts(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(FW_ASSERTS, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_fw_asserts failed. rc = %d\n", + rc); + } + + /* GRC dump - must be last because when mcp stuck it will + * clutter idle_chk, reg_fifo, ... + */ + rc = qed_dbg_grc(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(GRC_DUMP, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_grc failed. 
rc = %d", rc); + } + } + + /* mcp_trace */ + rc = qed_dbg_mcp_trace(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(MCP_TRACE, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_mcp_trace failed. rc = %d\n", rc); + } + + qed_set_debug_engine(cdev, org_engine); + + return 0; +} + +int qed_dbg_all_data_size(struct qed_dev *cdev) +{ + u8 cur_engine, org_engine; + u32 regs_len = 0; + + org_engine = qed_get_debug_engine(cdev); + for (cur_engine = 0; cur_engine < cdev->num_hwfns; cur_engine++) { + /* Engine specific */ + DP_VERBOSE(cdev, QED_MSG_DEBUG, + "calculating idle_chk and grcdump register length for current engine\n"); + qed_set_debug_engine(cdev, cur_engine); + regs_len += REGDUMP_HEADER_SIZE + qed_dbg_idle_chk_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_idle_chk_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_grc_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_reg_fifo_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_igu_fifo_size(cdev) + + REGDUMP_HEADER_SIZE + + qed_dbg_protection_override_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_fw_asserts_size(cdev); + } + + /* Engine common */ + regs_len += REGDUMP_HEADER_SIZE + qed_dbg_mcp_trace_size(cdev); + qed_set_debug_engine(cdev, org_engine); + + return regs_len; +} + +int qed_dbg_feature(struct qed_dev *cdev, void *buffer, + enum qed_dbg_features feature, u32 *num_dumped_bytes) +{ + struct qed_hwfn *p_hwfn = + &cdev->hwfns[cdev->dbg_params.engine_for_debug]; + struct qed_dbg_feature *qed_feature = + &cdev->dbg_params.features[feature]; + enum dbg_status dbg_rc; + struct qed_ptt *p_ptt; + int rc = 0; + + /* Acquire ptt */ + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EINVAL; + + /* Get dump */ + dbg_rc = qed_dbg_dump(p_hwfn, p_ptt, feature); + if (dbg_rc != DBG_STATUS_OK) { + DP_VERBOSE(cdev, QED_MSG_DEBUG, "%s\n", + qed_dbg_get_status_str(dbg_rc)); + *num_dumped_bytes = 0; + rc = -EINVAL; + goto out; + } + + DP_VERBOSE(cdev, QED_MSG_DEBUG, + "copying debugfs feature to external buffer\n"); + memcpy(buffer, qed_feature->dump_buf, qed_feature->buf_size); + *num_dumped_bytes = cdev->dbg_params.features[feature].dumped_dwords * + 4; + +out: + qed_ptt_release(p_hwfn, p_ptt); + return rc; +} + +int qed_dbg_feature_size(struct qed_dev *cdev, enum qed_dbg_features feature) +{ + struct qed_hwfn *p_hwfn = + &cdev->hwfns[cdev->dbg_params.engine_for_debug]; + struct qed_ptt *p_ptt = qed_ptt_acquire(p_hwfn); + struct qed_dbg_feature *qed_feature = + &cdev->dbg_params.features[feature]; + u32 buf_size_dwords; + enum dbg_status rc; + + if (!p_ptt) + return -EINVAL; + + rc = qed_features_lookup[feature].get_size(p_hwfn, p_ptt, + &buf_size_dwords); + if (rc != DBG_STATUS_OK) + buf_size_dwords = 0; + + qed_ptt_release(p_hwfn, p_ptt); + qed_feature->buf_size = buf_size_dwords * sizeof(u32); + return qed_feature->buf_size; +} + +u8 qed_get_debug_engine(struct qed_dev *cdev) +{ + return cdev->dbg_params.engine_for_debug; +} + +void qed_set_debug_engine(struct qed_dev *cdev, int engine_number) +{ + DP_VERBOSE(cdev, QED_MSG_DEBUG, "set debug engine to %d\n", + engine_number); + cdev->dbg_params.engine_for_debug = engine_number; +} + +void qed_dbg_pf_init(struct qed_dev *cdev) +{ + const u8 *dbg_values; + + /* Debug values are after init values. + * The offset is the first dword of the file. 
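+ * For example, if the first dword of the firmware file holds 0x3000, the + * debug arrays start at byte offset 0x3000 within the file.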
+ */ + dbg_values = cdev->firmware->data + *(u32 *)cdev->firmware->data; + qed_dbg_set_bin_ptr((u8 *)dbg_values); + qed_dbg_user_set_bin_ptr((u8 *)dbg_values); +} + +void qed_dbg_pf_exit(struct qed_dev *cdev) +{ + struct qed_dbg_feature *feature = NULL; + enum qed_dbg_features feature_idx; + + /* Debug features' buffers may be allocated if debug feature was used + * but dump wasn't called. + */ + for (feature_idx = 0; feature_idx < DBG_FEATURE_NUM; feature_idx++) { + feature = &cdev->dbg_params.features[feature_idx]; + if (feature->dump_buf) { + vfree(feature->dump_buf); + feature->dump_buf = NULL; + } + } +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.h b/drivers/net/ethernet/qlogic/qed/qed_debug.h new file mode 100644 index 000000000000..f872d7324814 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.h @@ -0,0 +1,54 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015 QLogic Corporation + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#ifndef _QED_DEBUGFS_H +#define _QED_DEBUGFS_H + +enum qed_dbg_features { + DBG_FEATURE_GRC, + DBG_FEATURE_IDLE_CHK, + DBG_FEATURE_MCP_TRACE, + DBG_FEATURE_REG_FIFO, + DBG_FEATURE_IGU_FIFO, + DBG_FEATURE_PROTECTION_OVERRIDE, + DBG_FEATURE_FW_ASSERTS, + DBG_FEATURE_NUM +}; + +int qed_dbg_grc(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes); +int qed_dbg_grc_size(struct qed_dev *cdev); +int qed_dbg_idle_chk(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_idle_chk_size(struct qed_dev *cdev); +int qed_dbg_reg_fifo(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_reg_fifo_size(struct qed_dev *cdev); +int qed_dbg_igu_fifo(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_igu_fifo_size(struct qed_dev *cdev); +int qed_dbg_protection_override(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_protection_override_size(struct qed_dev *cdev); +int qed_dbg_fw_asserts(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_fw_asserts_size(struct qed_dev *cdev); +int qed_dbg_mcp_trace(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_mcp_trace_size(struct qed_dev *cdev); +int qed_dbg_all_data(struct qed_dev *cdev, void *buffer); +int qed_dbg_all_data_size(struct qed_dev *cdev); +u8 qed_get_debug_engine(struct qed_dev *cdev); +void qed_set_debug_engine(struct qed_dev *cdev, int engine_number); +int qed_dbg_feature(struct qed_dev *cdev, void *buffer, + enum qed_dbg_features feature, u32 *num_dumped_bytes); +int qed_dbg_feature_size(struct qed_dev *cdev, enum qed_dbg_features feature); + +void qed_dbg_pf_init(struct qed_dev *cdev); +void qed_dbg_pf_exit(struct qed_dev *cdev); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index 855691f18412..2777d5bb4380 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -1728,13 +1728,6 @@ enum bin_dbg_buffer_type { MAX_BIN_DBG_BUFFER_TYPE }; -/* Chip IDs */ -enum chip_ids { - CHIP_RESERVED, - CHIP_BB_B0, - CHIP_RESERVED2, - MAX_CHIP_IDS -}; /* Attention bit mapping */ struct dbg_attn_bit_mapping { @@ -1813,6 +1806,371 @@ enum dbg_attn_type { MAX_DBG_ATTN_TYPE }; +/* condition header for registers dump */ +struct dbg_dump_cond_hdr { + struct dbg_mode_hdr mode; /* Mode header */ + u8 block_id; /* block ID */ + 
u8 data_size; /* size in dwords of the data following this header */ +}; + +/* memory data for registers dump */ +struct dbg_dump_mem { + __le32 dword0; +#define DBG_DUMP_MEM_ADDRESS_MASK 0xFFFFFF +#define DBG_DUMP_MEM_ADDRESS_SHIFT 0 +#define DBG_DUMP_MEM_MEM_GROUP_ID_MASK 0xFF +#define DBG_DUMP_MEM_MEM_GROUP_ID_SHIFT 24 + __le32 dword1; +#define DBG_DUMP_MEM_LENGTH_MASK 0xFFFFFF +#define DBG_DUMP_MEM_LENGTH_SHIFT 0 +#define DBG_DUMP_MEM_RESERVED_MASK 0xFF +#define DBG_DUMP_MEM_RESERVED_SHIFT 24 +}; + +/* register data for registers dump */ +struct dbg_dump_reg { + __le32 data; +#define DBG_DUMP_REG_ADDRESS_MASK 0xFFFFFF /* register address (in dwords) */ +#define DBG_DUMP_REG_ADDRESS_SHIFT 0 +#define DBG_DUMP_REG_LENGTH_MASK 0xFF /* register size (in dwords) */ +#define DBG_DUMP_REG_LENGTH_SHIFT 24 +}; + +/* split header for registers dump */ +struct dbg_dump_split_hdr { + __le32 hdr; +#define DBG_DUMP_SPLIT_HDR_DATA_SIZE_MASK 0xFFFFFF +#define DBG_DUMP_SPLIT_HDR_DATA_SIZE_SHIFT 0 +#define DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID_MASK 0xFF +#define DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID_SHIFT 24 +}; + +/* condition header for idle check */ +struct dbg_idle_chk_cond_hdr { + struct dbg_mode_hdr mode; /* Mode header */ + __le16 data_size; /* size in dwords of the data following this header */ +}; + +/* Idle Check condition register */ +struct dbg_idle_chk_cond_reg { + __le32 data; +#define DBG_IDLE_CHK_COND_REG_ADDRESS_MASK 0xFFFFFF +#define DBG_IDLE_CHK_COND_REG_ADDRESS_SHIFT 0 +#define DBG_IDLE_CHK_COND_REG_BLOCK_ID_MASK 0xFF +#define DBG_IDLE_CHK_COND_REG_BLOCK_ID_SHIFT 24 + __le16 num_entries; /* number of register entries to check */ + u8 entry_size; /* size of a register entry (in dwords) */ + u8 start_entry; /* index of the first entry to check */ +}; + +/* Idle Check info register */ +struct dbg_idle_chk_info_reg { + __le32 data; +#define DBG_IDLE_CHK_INFO_REG_ADDRESS_MASK 0xFFFFFF +#define DBG_IDLE_CHK_INFO_REG_ADDRESS_SHIFT 0 +#define DBG_IDLE_CHK_INFO_REG_BLOCK_ID_MASK 0xFF +#define DBG_IDLE_CHK_INFO_REG_BLOCK_ID_SHIFT 24 + __le16 size; /* register size in dwords */ + struct dbg_mode_hdr mode; /* Mode header */ +}; + +/* Idle Check register */ +union dbg_idle_chk_reg { + struct dbg_idle_chk_cond_reg cond_reg; /* condition register */ + struct dbg_idle_chk_info_reg info_reg; /* info register */ +}; + +/* Idle Check result header */ +struct dbg_idle_chk_result_hdr { + __le16 rule_id; /* Failing rule index */ + __le16 mem_entry_id; /* Failing memory entry index */ + u8 num_dumped_cond_regs; /* number of dumped condition registers */ + u8 num_dumped_info_regs; /* number of dumped info registers */ + u8 severity; /* from dbg_idle_chk_severity_types enum */ + u8 reserved; +}; + +/* Idle Check result register header */ +struct dbg_idle_chk_result_reg_hdr { + u8 data; +#define DBG_IDLE_CHK_RESULT_REG_HDR_IS_MEM_MASK 0x1 +#define DBG_IDLE_CHK_RESULT_REG_HDR_IS_MEM_SHIFT 0 +#define DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID_MASK 0x7F +#define DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID_SHIFT 1 + u8 start_entry; /* index of the first checked entry */ + __le16 size; /* register size in dwords */ +}; + +/* Idle Check rule */ +struct dbg_idle_chk_rule { + __le16 rule_id; /* Idle Check rule ID */ + u8 severity; /* value from dbg_idle_chk_severity_types enum */ + u8 cond_id; /* Condition ID */ + u8 num_cond_regs; /* number of condition registers */ + u8 num_info_regs; /* number of info registers */ + u8 num_imms; /* number of immediates in the condition */ + u8 reserved1; + __le16 reg_offset; /* offset of this rule's 
registers in the idle check + * register array (in dbg_idle_chk_reg units). + */ + __le16 imm_offset; /* offset of this rule's immediate values in the + * immediate values array (in dwords). + */ +}; + +/* Idle Check rule parsing data */ +struct dbg_idle_chk_rule_parsing_data { + __le32 data; +#define DBG_IDLE_CHK_RULE_PARSING_DATA_HAS_FW_MSG_MASK 0x1 +#define DBG_IDLE_CHK_RULE_PARSING_DATA_HAS_FW_MSG_SHIFT 0 +#define DBG_IDLE_CHK_RULE_PARSING_DATA_STR_OFFSET_MASK 0x7FFFFFFF +#define DBG_IDLE_CHK_RULE_PARSING_DATA_STR_OFFSET_SHIFT 1 +}; + +/* idle check severity types */ +enum dbg_idle_chk_severity_types { + /* idle check failure should cause an error */ + IDLE_CHK_SEVERITY_ERROR, + /* idle check failure should cause an error only if there's no traffic */ + IDLE_CHK_SEVERITY_ERROR_NO_TRAFFIC, + /* idle check failure should cause a warning */ + IDLE_CHK_SEVERITY_WARNING, + MAX_DBG_IDLE_CHK_SEVERITY_TYPES +}; + +/* Debug Bus block data */ +struct dbg_bus_block_data { + u8 enabled; /* Indicates if the block is enabled for recording (0/1) */ + u8 hw_id; /* HW ID associated with the block */ + u8 line_num; /* Debug line number to select */ + u8 right_shift; /* Number of units to right-shift the debug data (0-3) */ + u8 cycle_en; /* 4-bit value: bit i set -> unit i is enabled. */ + u8 force_valid; /* 4-bit value: bit i set -> unit i is forced valid. */ + u8 force_frame; /* 4-bit value: bit i set -> unit i frame bit is forced. + */ + u8 reserved; +}; + +/* Debug Bus Clients */ +enum dbg_bus_clients { + DBG_BUS_CLIENT_RBCN, + DBG_BUS_CLIENT_RBCP, + DBG_BUS_CLIENT_RBCR, + DBG_BUS_CLIENT_RBCT, + DBG_BUS_CLIENT_RBCU, + DBG_BUS_CLIENT_RBCF, + DBG_BUS_CLIENT_RBCX, + DBG_BUS_CLIENT_RBCS, + DBG_BUS_CLIENT_RBCH, + DBG_BUS_CLIENT_RBCZ, + DBG_BUS_CLIENT_OTHER_ENGINE, + DBG_BUS_CLIENT_TIMESTAMP, + DBG_BUS_CLIENT_CPU, + DBG_BUS_CLIENT_RBCY, + DBG_BUS_CLIENT_RBCQ, + DBG_BUS_CLIENT_RBCM, + DBG_BUS_CLIENT_RBCB, + DBG_BUS_CLIENT_RBCW, + DBG_BUS_CLIENT_RBCV, + MAX_DBG_BUS_CLIENTS +}; + +/* Debug Bus memory address */ +struct dbg_bus_mem_addr { + __le32 lo; + __le32 hi; +}; + +/* Debug Bus PCI buffer data */ +struct dbg_bus_pci_buf_data { + struct dbg_bus_mem_addr phys_addr; /* PCI buffer physical address */ + struct dbg_bus_mem_addr virt_addr; /* PCI buffer virtual address */ + __le32 size; /* PCI buffer size in bytes */ +}; + +/* Debug Bus Storm EID range filter params */ +struct dbg_bus_storm_eid_range_params { + u8 min; /* Minimal event ID to filter on */ + u8 max; /* Maximal event ID to filter on */ +}; + +/* Debug Bus Storm EID mask filter params */ +struct dbg_bus_storm_eid_mask_params { + u8 val; /* Event ID value */ + u8 mask; /* Event ID mask. 1s in the mask = don't care bits. 
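+ * For example, val = 0x10 with mask = 0x0F matches event IDs 0x10-0x1F.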
*/ +}; + +/* Debug Bus Storm EID filter params */ +union dbg_bus_storm_eid_params { + struct dbg_bus_storm_eid_range_params range; + struct dbg_bus_storm_eid_mask_params mask; +}; + +/* Debug Bus Storm data */ +struct dbg_bus_storm_data { + u8 fast_enabled; + u8 fast_mode; + u8 slow_enabled; + u8 slow_mode; + u8 hw_id; + u8 eid_filter_en; + u8 eid_range_not_mask; + u8 cid_filter_en; + union dbg_bus_storm_eid_params eid_filter_params; + __le16 reserved; + __le32 cid; +}; + +/* Debug Bus data */ +struct dbg_bus_data { + __le32 app_version; /* The tools version number of the application */ + u8 state; /* The current debug bus state */ + u8 hw_dwords; /* HW dwords per cycle */ + u8 next_hw_id; /* Next HW ID to be associated with an input */ + u8 num_enabled_blocks; /* Number of blocks enabled for recording */ + u8 num_enabled_storms; /* Number of Storms enabled for recording */ + u8 target; /* Output target */ + u8 next_trigger_state; /* ID of next trigger state to be added */ + u8 next_constraint_id; /* ID of next filter/trigger constraint to be + * added. + */ + u8 one_shot_en; /* Indicates if one-shot mode is enabled (0/1) */ + u8 grc_input_en; /* Indicates if GRC recording is enabled (0/1) */ + u8 timestamp_input_en; /* Indicates if timestamp recording is enabled + * (0/1). + */ + u8 filter_en; /* Indicates if the recording filter is enabled (0/1) */ + u8 trigger_en; /* Indicates if the recording trigger is enabled (0/1) */ + u8 adding_filter; /* If true, the next added constraint belongs to the + * filter. Otherwise, it belongs to the last added + * trigger state. Valid only if either filter or + * triggers are enabled. + */ + u8 filter_pre_trigger; /* Indicates if the recording filter should be + * applied before the trigger. Valid only if both + * filter and trigger are enabled (0/1). + */ + u8 filter_post_trigger; /* Indicates if the recording filter should be + * applied after the trigger. Valid only if both + * filter and trigger are enabled (0/1). + */ + u8 unify_inputs; /* If true, all inputs are associated with HW ID 0. + * Otherwise, each input is assigned a different HW ID + * (0/1). + */ + u8 rcv_from_other_engine; /* Indicates if the other engine sends its NW + * recording to this engine (0/1). + */ + struct dbg_bus_pci_buf_data pci_buf; /* Debug Bus PCI buffer data. Valid + * only when the target is + * DBG_BUS_TARGET_ID_PCI. + */ + __le16 reserved; + struct dbg_bus_block_data blocks[80];/* Debug Bus data for each block */ + struct dbg_bus_storm_data storms[6]; /* Debug Bus data for each Storm */ +}; + +/* Debug bus frame modes */ +enum dbg_bus_frame_modes { + DBG_BUS_FRAME_MODE_0HW_4ST = 0, /* 0 HW dwords, 4 Storm dwords */ + DBG_BUS_FRAME_MODE_4HW_0ST = 3, /* 4 HW dwords, 0 Storm dwords */ + DBG_BUS_FRAME_MODE_8HW_0ST = 4, /* 8 HW dwords, 0 Storm dwords */ + MAX_DBG_BUS_FRAME_MODES +}; + +/* Debug bus states */ +enum dbg_bus_states { + DBG_BUS_STATE_IDLE, /* debug bus idle state (not recording) */ + DBG_BUS_STATE_READY, /* debug bus is ready for configuration and + * recording. 
+ */ + DBG_BUS_STATE_RECORDING, /* debug bus is currently recording */ + DBG_BUS_STATE_STOPPED, /* debug bus recording has stopped */ + MAX_DBG_BUS_STATES +}; + +/* Debug bus target IDs */ +enum dbg_bus_targets { + /* records debug bus to DBG block internal buffer */ + DBG_BUS_TARGET_ID_INT_BUF, + /* records debug bus to the NW */ + DBG_BUS_TARGET_ID_NIG, + /* records debug bus to a PCI buffer */ + DBG_BUS_TARGET_ID_PCI, + MAX_DBG_BUS_TARGETS +}; + +/* GRC Dump data */ +struct dbg_grc_data { + __le32 param_val[40]; /* Value of each GRC parameter. Array size must + * match the enum dbg_grc_params. + */ + u8 param_set_by_user[40]; /* Indicates for each GRC parameter if it was + * set by the user (0/1). Array size must + * match the enum dbg_grc_params. + */ +}; + +/* Debug GRC params */ +enum dbg_grc_params { + DBG_GRC_PARAM_DUMP_TSTORM, /* dump Tstorm memories (0/1) */ + DBG_GRC_PARAM_DUMP_MSTORM, /* dump Mstorm memories (0/1) */ + DBG_GRC_PARAM_DUMP_USTORM, /* dump Ustorm memories (0/1) */ + DBG_GRC_PARAM_DUMP_XSTORM, /* dump Xstorm memories (0/1) */ + DBG_GRC_PARAM_DUMP_YSTORM, /* dump Ystorm memories (0/1) */ + DBG_GRC_PARAM_DUMP_PSTORM, /* dump Pstorm memories (0/1) */ + DBG_GRC_PARAM_DUMP_REGS, /* dump non-memory registers (0/1) */ + DBG_GRC_PARAM_DUMP_RAM, /* dump Storm internal RAMs (0/1) */ + DBG_GRC_PARAM_DUMP_PBUF, /* dump Storm passive buffer (0/1) */ + DBG_GRC_PARAM_DUMP_IOR, /* dump Storm IORs (0/1) */ + DBG_GRC_PARAM_DUMP_VFC, /* dump VFC memories (0/1) */ + DBG_GRC_PARAM_DUMP_CM_CTX, /* dump CM contexts (0/1) */ + DBG_GRC_PARAM_DUMP_PXP, /* dump PXP memories (0/1) */ + DBG_GRC_PARAM_DUMP_RSS, /* dump RSS memories (0/1) */ + DBG_GRC_PARAM_DUMP_CAU, /* dump CAU memories (0/1) */ + DBG_GRC_PARAM_DUMP_QM, /* dump QM memories (0/1) */ + DBG_GRC_PARAM_DUMP_MCP, /* dump MCP memories (0/1) */ + DBG_GRC_PARAM_RESERVED, /* reserved */ + DBG_GRC_PARAM_DUMP_CFC, /* dump CFC memories (0/1) */ + DBG_GRC_PARAM_DUMP_IGU, /* dump IGU memories (0/1) */ + DBG_GRC_PARAM_DUMP_BRB, /* dump BRB memories (0/1) */ + DBG_GRC_PARAM_DUMP_BTB, /* dump BTB memories (0/1) */ + DBG_GRC_PARAM_DUMP_BMB, /* dump BMB memories (0/1) */ + DBG_GRC_PARAM_DUMP_NIG, /* dump NIG memories (0/1) */ + DBG_GRC_PARAM_DUMP_MULD, /* dump MULD memories (0/1) */ + DBG_GRC_PARAM_DUMP_PRS, /* dump PRS memories (0/1) */ + DBG_GRC_PARAM_DUMP_DMAE, /* dump DMAE memories (0/1) */ + DBG_GRC_PARAM_DUMP_TM, /* dump TM (timers) memories (0/1) */ + DBG_GRC_PARAM_DUMP_SDM, /* dump SDM memories (0/1) */ + DBG_GRC_PARAM_DUMP_DIF, /* dump DIF memories (0/1) */ + DBG_GRC_PARAM_DUMP_STATIC, /* dump static debug data (0/1) */ + DBG_GRC_PARAM_UNSTALL, /* un-stall Storms after dump (0/1) */ + DBG_GRC_PARAM_NUM_LCIDS, /* number of LCIDs (0..320) */ + DBG_GRC_PARAM_NUM_LTIDS, /* number of LTIDs (0..320) */ + /* preset: exclude all memories from dump (1 only) */ + DBG_GRC_PARAM_EXCLUDE_ALL, + /* preset: include memories for crash dump (1 only) */ + DBG_GRC_PARAM_CRASH, + /* perform dump only if MFW is responding (0/1) */ + DBG_GRC_PARAM_PARITY_SAFE, + DBG_GRC_PARAM_DUMP_CM, /* dump CM memories (0/1) */ + DBG_GRC_PARAM_DUMP_PHY, /* dump PHY memories (0/1) */ + MAX_DBG_GRC_PARAMS +}; + +/* Debug reset registers */ +enum dbg_reset_regs { + DBG_RESET_REG_MISCS_PL_UA, + DBG_RESET_REG_MISCS_PL_HV, + DBG_RESET_REG_MISCS_PL_HV_2, + DBG_RESET_REG_MISC_PL_UA, + DBG_RESET_REG_MISC_PL_HV, + DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, + DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, + DBG_RESET_REG_MISC_PL_PDA_VAUX, + MAX_DBG_RESET_REGS +}; + /* Debug status codes */ enum dbg_status 
{ DBG_STATUS_OK, @@ -1869,6 +2227,41 @@ enum dbg_status { MAX_DBG_STATUS }; +/* Debug Storms IDs */ +enum dbg_storms { + DBG_TSTORM_ID, + DBG_MSTORM_ID, + DBG_USTORM_ID, + DBG_XSTORM_ID, + DBG_YSTORM_ID, + DBG_PSTORM_ID, + MAX_DBG_STORMS +}; + +/* Idle Check data */ +struct idle_chk_data { + __le32 buf_size; /* Idle check buffer size in dwords */ + u8 buf_size_set; /* Indicates if the idle check buffer size was set + * (0/1). + */ + u8 reserved1; + __le16 reserved2; +}; + +/* Debug Tools data (per HW function) */ +struct dbg_tools_data { + struct dbg_grc_data grc; /* GRC Dump data */ + struct dbg_bus_data bus; /* Debug Bus data */ + struct idle_chk_data idle_chk; /* Idle Check data */ + u8 mode_enable[40]; /* Indicates if a mode is enabled (0/1) */ + u8 block_in_reset[80]; /* Indicates if a block is in reset state (0/1). + */ + u8 chip_id; /* Chip ID (from enum chip_ids) */ + u8 platform_id; /* Platform ID (from enum platform_ids) */ + u8 initialized; /* Indicates if the data was initialized */ + u8 reserved; +}; + /********************************/ /* HSI Init Functions constants */ /********************************/ @@ -1948,15 +2341,50 @@ struct init_qm_vport_params { /* Max size in dwords of a zipped array */ #define MAX_ZIPPED_SIZE 8192 +struct fw_asserts_ram_section { + __le16 section_ram_line_offset; + __le16 section_ram_line_size; + u8 list_dword_offset; + u8 list_element_dword_size; + u8 list_num_elements; + u8 list_next_index_dword_offset; +}; + +struct fw_ver_num { + u8 major; /* Firmware major version number */ + u8 minor; /* Firmware minor version number */ + u8 rev; /* Firmware revision version number */ + u8 eng; /* Firmware engineering version number (for bootleg versions) */ +}; + +struct fw_ver_info { + __le16 tools_ver; /* Tools version number */ + u8 image_id; /* FW image ID (e.g. main) */ + u8 reserved1; + struct fw_ver_num num; /* FW version number */ + __le32 timestamp; /* FW Timestamp in unix time (sec. since 1970) */ + __le32 reserved2; +}; + +struct fw_info { + struct fw_ver_info ver; + struct fw_asserts_ram_section fw_asserts_section; +}; + +struct fw_info_location { + __le32 grc_addr; + __le32 size; +}; + enum init_modes { MODE_RESERVED, MODE_BB_B0, - MODE_RESERVED2, + MODE_K2, MODE_ASIC, + MODE_RESERVED2, MODE_RESERVED3, MODE_RESERVED4, MODE_RESERVED5, - MODE_RESERVED6, MODE_SF, MODE_MF_SD, MODE_MF_SI, @@ -1965,7 +2393,7 @@ enum init_modes { MODE_PORTS_PER_ENG_4, MODE_100G, MODE_40G, - MODE_RESERVED7, + MODE_RESERVED6, MAX_INIT_MODES }; @@ -2223,8 +2651,276 @@ struct iro { __le16 size; }; +/***************************** Public Functions *******************************/ +/** + * @brief qed_dbg_set_bin_ptr - Sets a pointer to the binary data with debug + * arrays. + * + * @param bin_ptr - a pointer to the binary data with debug arrays. + */ +enum dbg_status qed_dbg_set_bin_ptr(const u8 * const bin_ptr); +/** + * @brief qed_dbg_grc_get_dump_buf_size - Returns the required buffer size for + * GRC Dump. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for the GRC Dump + * data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_grc_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_grc_dump - Dumps GRC data into the specified buffer. 
+ * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the collected GRC data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified dump buffer is too small + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_grc_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_idle_chk_get_dump_buf_size - Returns the required buffer size + * for idle check results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for the idle check + * data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_idle_chk_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_idle_chk_dump - Performs idle check and writes the results + * into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the idle check data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_idle_chk_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_mcp_trace_get_dump_buf_size - Returns the required buffer size + * for mcp trace results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for mcp trace data. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the trace data in MCP scratchpad contain an invalid signature + * - the bundle ID in NVRAM is invalid + * - the trace meta data cannot be found (in NVRAM or image file) + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_mcp_trace_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_mcp_trace_dump - Performs mcp trace and writes the results + * into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the mcp trace data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * - the trace data in MCP scratchpad contain an invalid signature + * - the bundle ID in NVRAM is invalid + * - the trace meta data cannot be found (in NVRAM or image file) + * - the trace meta data cannot be read (from NVRAM or image file) + * Otherwise, returns ok. 
+ */ +enum dbg_status qed_dbg_mcp_trace_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_reg_fifo_get_dump_buf_size - Returns the required buffer size + * for grc trace fifo results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for reg fifo data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_reg_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_reg_fifo_dump - Reads the reg fifo and writes the results into + * the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the reg fifo data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * - DMAE transaction failed + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_reg_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_igu_fifo_get_dump_buf_size - Returns the required buffer size + * for the IGU fifo results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for the IGU fifo + * data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_igu_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_igu_fifo_dump - Reads the IGU fifo and writes the results into + * the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the IGU fifo data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * - DMAE transaction failed + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_igu_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_protection_override_get_dump_buf_size - Returns the required + * buffer size for protection override window results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for protection + * override data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status +qed_dbg_protection_override_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_protection_override_dump - Reads protection override window + * entries and writes the results into the specified buffer. 
+ * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the protection override data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * - DMAE transaction failed + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_protection_override_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_fw_asserts_get_dump_buf_size - Returns the required buffer + * size for FW Asserts results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for FW Asserts data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_fw_asserts_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_fw_asserts_dump - Reads the FW Asserts and writes the results + * into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the FW Asserts data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_fw_asserts_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); /** - * @brief qed_dbg_print_attn - Prints attention registers values in the specified results struct. + * @brief qed_dbg_print_attn - Prints attention registers values in the + * specified results struct. * * @param p_hwfn * @param results - Pointer to the attention read results @@ -2236,8 +2932,212 @@ struct iro { enum dbg_status qed_dbg_print_attn(struct qed_hwfn *p_hwfn, struct dbg_attn_block_result *results); +/******************************** Constants **********************************/ + #define MAX_NAME_LEN 16 +/***************************** Public Functions *******************************/ +/** + * @brief qed_dbg_user_set_bin_ptr - Sets a pointer to the binary data with + * debug arrays. + * + * @param bin_ptr - a pointer to the binary data with debug arrays. + */ +enum dbg_status qed_dbg_user_set_bin_ptr(const u8 * const bin_ptr); +/** + * @brief qed_dbg_get_status_str - Returns a string for the specified status. + * + * @param status - a debug status code. + * + * @return a string for the specified status + */ +const char *qed_dbg_get_status_str(enum dbg_status status); +/** + * @brief qed_get_idle_chk_results_buf_size - Returns the required buffer size + * for idle check results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - idle check dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. 
+ */ +enum dbg_status qed_get_idle_chk_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_idle_chk_results - Prints idle check results + * + * @param p_hwfn - HW device data + * @param dump_buf - idle check dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the idle check results. + * @param num_errors - OUT: number of errors found in idle check. + * @param num_warnings - OUT: number of warnings found in idle check. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_idle_chk_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *num_errors, + u32 *num_warnings); +/** + * @brief qed_get_mcp_trace_results_buf_size - Returns the required buffer size + * for MCP Trace results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - MCP Trace dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_mcp_trace_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_mcp_trace_results - Prints MCP Trace results + * + * @param p_hwfn - HW device data + * @param dump_buf - mcp trace dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the mcp trace results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); +/** + * @brief qed_get_reg_fifo_results_buf_size - Returns the required buffer size + * for reg_fifo results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - reg fifo dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_reg_fifo_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_reg_fifo_results - Prints reg fifo results + * + * @param p_hwfn - HW device data + * @param dump_buf - reg fifo dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the reg fifo results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_reg_fifo_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); +/** + * @brief qed_get_igu_fifo_results_buf_size - Returns the required buffer size + * for igu_fifo results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - IGU fifo dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. 
+ */ +enum dbg_status qed_get_igu_fifo_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_igu_fifo_results - Prints IGU fifo results + * + * @param p_hwfn - HW device data + * @param dump_buf - IGU fifo dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the IGU fifo results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_igu_fifo_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); +/** + * @brief qed_get_protection_override_results_buf_size - Returns the required + * buffer size for protection override results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - protection override dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status +qed_get_protection_override_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_protection_override_results - Prints protection override + * results. + * + * @param p_hwfn - HW device data + * @param dump_buf - protection override dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the protection override results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_protection_override_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); +/** + * @brief qed_get_fw_asserts_results_buf_size - Returns the required buffer size + * for FW Asserts results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - FW Asserts dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_fw_asserts_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_fw_asserts_results - Prints FW Asserts results + * + * @param p_hwfn - HW device data + * @param dump_buf - FW Asserts dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the FW Asserts results. + * + * @return error if the parsing fails, ok otherwise.
+ */ +enum dbg_status qed_print_fw_asserts_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); /* Win 2 */ #define GTT_BAR0_MAP_REG_IGU_CMD 0x00f000UL @@ -7039,6 +7939,35 @@ struct ystorm_iscsi_conn_ag_ctx { __le32 reg2; __le32 reg3; }; + +#define MFW_TRACE_SIGNATURE 0x25071946 + +/* The trace in the buffer */ +#define MFW_TRACE_EVENTID_MASK 0x00ffff +#define MFW_TRACE_PRM_SIZE_MASK 0x0f0000 +#define MFW_TRACE_PRM_SIZE_SHIFT 16 +#define MFW_TRACE_ENTRY_SIZE 3 + +struct mcp_trace { + u32 signature; /* Help to identify that the trace is valid */ + u32 size; /* the size of the trace buffer in bytes */ + u32 curr_level; /* 2 - all will be written to the buffer + * 1 - debug trace will not be written + * 0 - just errors will be written to the buffer + */ + u32 modules_mask[2]; /* a bit per module, 1 means write it, 0 means + * mask it. + */ + + /* Warning: the following pointers are assumed to be 32bits as they are + * used only in the MFW. + */ + u32 trace_prod; /* The next trace will be written to this offset */ + u32 trace_oldest; /* The oldest valid trace starts at this offset + * (usually very close after the current producer). + */ +}; + #define VF_MAX_STATIC 192 #define MCP_GLOB_PATH_MAX 2 @@ -7046,6 +7975,7 @@ struct ystorm_iscsi_conn_ag_ctx { #define MCP_GLOB_PORT_MAX 4 #define MCP_GLOB_FUNC_MAX 16 +typedef u32 offsize_t; /* In DWORDS !!! */ /* Offset from the beginning of the MCP scratchpad */ #define OFFSIZE_OFFSET_SHIFT 0 #define OFFSIZE_OFFSET_MASK 0x0000ffff @@ -7636,6 +8566,8 @@ struct public_drv_mb { #define DRV_MSG_CODE_NIG_DRAIN 0x30000000 #define DRV_MSG_CODE_VF_DISABLED_DONE 0xc0000000 #define DRV_MSG_CODE_CFG_VF_MSIX 0xc0010000 +#define DRV_MSG_CODE_NVM_GET_FILE_ATT 0x00030000 +#define DRV_MSG_CODE_NVM_READ_NVRAM 0x00050000 #define DRV_MSG_CODE_MCP_RESET 0x00090000 #define DRV_MSG_CODE_SET_VERSION 0x000f0000 #define DRV_MSG_CODE_MCP_HALT 0x00100000 @@ -7657,6 +8589,9 @@ struct public_drv_mb { #define DRV_MB_PARAM_UNLOAD_WOL_MCP 0x00000001 #define DRV_MB_PARAM_DCBX_NOTIFY_MASK 0x000000FF #define DRV_MB_PARAM_DCBX_NOTIFY_SHIFT 3 + +#define DRV_MB_PARAM_NVM_LEN_SHIFT 24 + #define DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_SHIFT 0 #define DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_MASK 0x000000FF #define DRV_MB_PARAM_CFG_VF_MSIX_SB_NUM_SHIFT 8 @@ -7694,6 +8629,8 @@ struct public_drv_mb { #define FW_MSG_CODE_DRV_UNLOAD_FUNCTION 0x20130000 #define FW_MSG_CODE_DRV_UNLOAD_DONE 0x21100000 #define FW_MSG_CODE_DRV_CFG_VF_MSIX_DONE 0xb0010000 + +#define FW_MSG_CODE_NVM_OK 0x00010000 #define FW_MSG_CODE_OK 0x00160000 #define FW_MSG_SEQ_NUMBER_MASK 0x0000ffff @@ -7930,4 +8867,101 @@ struct nvm_cfg1 { struct nvm_cfg1_port port[MCP_GLOB_PORT_MAX]; struct nvm_cfg1_func func[MCP_GLOB_FUNC_MAX]; }; + +enum spad_sections { + SPAD_SECTION_TRACE, + SPAD_SECTION_NVM_CFG, + SPAD_SECTION_PUBLIC, + SPAD_SECTION_PRIVATE, + SPAD_SECTION_MAX +}; + +#define MCP_TRACE_SIZE 2048 /* 2kb */ + +/* This section is located at a fixed location in the beginning of the + * scratchpad, to ensure that the MCP trace is not run over during MFW upgrade. + * All the rest of data has a floating location which differs from version to + * version, and is pointed by the mcp_meta_data below. + * Moreover, the spad_layout section is part of the MFW firmware, and is loaded + * with it from nvram in order to clear this portion. 
+ */ +struct static_init { + u32 num_sections; + offsize_t sections[SPAD_SECTION_MAX]; +#define SECTION(_sec_) (*((offsize_t *)(STRUCT_OFFSET(sections[_sec_])))) + + struct mcp_trace trace; +#define MCP_TRACE_P ((struct mcp_trace *)(STRUCT_OFFSET(trace))) + u8 trace_buffer[MCP_TRACE_SIZE]; +#define MCP_TRACE_BUF ((u8 *)(STRUCT_OFFSET(trace_buffer))) + /* running_mfw has the same definition as in nvm_map.h. + * This bit indicate both the running dir, and the running bundle. + * It is set once when the LIM is loaded. + */ + u32 running_mfw; +#define RUNNING_MFW (*((u32 *)(STRUCT_OFFSET(running_mfw)))) + u32 build_time; +#define MFW_BUILD_TIME (*((u32 *)(STRUCT_OFFSET(build_time)))) + u32 reset_type; +#define RESET_TYPE (*((u32 *)(STRUCT_OFFSET(reset_type)))) + u32 mfw_secure_mode; +#define MFW_SECURE_MODE (*((u32 *)(STRUCT_OFFSET(mfw_secure_mode)))) + u16 pme_status_pf_bitmap; +#define PME_STATUS_PF_BITMAP (*((u16 *)(STRUCT_OFFSET(pme_status_pf_bitmap)))) + u16 pme_enable_pf_bitmap; +#define PME_ENABLE_PF_BITMAP (*((u16 *)(STRUCT_OFFSET(pme_enable_pf_bitmap)))) + u32 mim_nvm_addr; + u32 mim_start_addr; + u32 ah_pcie_link_params; +#define AH_PCIE_LINK_PARAMS_LINK_SPEED_MASK (0x000000ff) +#define AH_PCIE_LINK_PARAMS_LINK_SPEED_SHIFT (0) +#define AH_PCIE_LINK_PARAMS_LINK_WIDTH_MASK (0x0000ff00) +#define AH_PCIE_LINK_PARAMS_LINK_WIDTH_SHIFT (8) +#define AH_PCIE_LINK_PARAMS_ASPM_MODE_MASK (0x00ff0000) +#define AH_PCIE_LINK_PARAMS_ASPM_MODE_SHIFT (16) +#define AH_PCIE_LINK_PARAMS_ASPM_CAP_MASK (0xff000000) +#define AH_PCIE_LINK_PARAMS_ASPM_CAP_SHIFT (24) +#define AH_PCIE_LINK_PARAMS (*((u32 *)(STRUCT_OFFSET(ah_pcie_link_params)))) + + u32 rsrv_persist[5]; /* Persist reserved for MFW upgrades */ +}; + +enum nvm_image_type { + NVM_TYPE_TIM1 = 0x01, + NVM_TYPE_TIM2 = 0x02, + NVM_TYPE_MIM1 = 0x03, + NVM_TYPE_MIM2 = 0x04, + NVM_TYPE_MBA = 0x05, + NVM_TYPE_MODULES_PN = 0x06, + NVM_TYPE_VPD = 0x07, + NVM_TYPE_MFW_TRACE1 = 0x08, + NVM_TYPE_MFW_TRACE2 = 0x09, + NVM_TYPE_NVM_CFG1 = 0x0a, + NVM_TYPE_L2B = 0x0b, + NVM_TYPE_DIR1 = 0x0c, + NVM_TYPE_EAGLE_FW1 = 0x0d, + NVM_TYPE_FALCON_FW1 = 0x0e, + NVM_TYPE_PCIE_FW1 = 0x0f, + NVM_TYPE_HW_SET = 0x10, + NVM_TYPE_LIM = 0x11, + NVM_TYPE_AVS_FW1 = 0x12, + NVM_TYPE_DIR2 = 0x13, + NVM_TYPE_CCM = 0x14, + NVM_TYPE_EAGLE_FW2 = 0x15, + NVM_TYPE_FALCON_FW2 = 0x16, + NVM_TYPE_PCIE_FW2 = 0x17, + NVM_TYPE_AVS_FW2 = 0x18, + NVM_TYPE_INIT_HW = 0x19, + NVM_TYPE_DEFAULT_CFG = 0x1a, + NVM_TYPE_MDUMP = 0x1b, + NVM_TYPE_META = 0x1c, + NVM_TYPE_ISCSI_CFG = 0x1d, + NVM_TYPE_FCOE_CFG = 0x1f, + NVM_TYPE_ETH_PHY_FW1 = 0x20, + NVM_TYPE_ETH_PHY_FW2 = 0x21, + NVM_TYPE_MAX, +}; + +#define DIR_ID_1 (0) + #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index d22e3f8816ae..250efd1d270d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -588,6 +588,8 @@ static int qed_nic_stop(struct qed_dev *cdev) } } + qed_dbg_pf_exit(cdev); + return rc; } @@ -846,6 +848,8 @@ static int qed_slowpath_start(struct qed_dev *cdev, /* First Dword used to diffrentiate between various sources */ data = cdev->firmware->data + sizeof(u32); + + qed_dbg_pf_init(cdev); } memset(&tunn_info, 0, sizeof(tunn_info)); diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h index b44f09b18eb1..759cb04e02b0 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h +++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h @@ -528,9 +528,903 @@ #define 
QM_REG_WFQPFWEIGHT 0x2f4e80UL #define QM_REG_WFQVPWEIGHT 0x2fa000UL +#define PGLCS_REG_DBG_SELECT \ + 0x001d14UL +#define PGLCS_REG_DBG_DWORD_ENABLE \ + 0x001d18UL +#define PGLCS_REG_DBG_SHIFT \ + 0x001d1cUL +#define PGLCS_REG_DBG_FORCE_VALID \ + 0x001d20UL +#define PGLCS_REG_DBG_FORCE_FRAME \ + 0x001d24UL +#define MISC_REG_RESET_PL_PDA_VMAIN_1 \ + 0x008070UL +#define MISC_REG_RESET_PL_PDA_VMAIN_2 \ + 0x008080UL +#define MISC_REG_RESET_PL_PDA_VAUX \ + 0x008090UL +#define MISCS_REG_RESET_PL_UA \ + 0x009050UL +#define MISCS_REG_RESET_PL_HV \ + 0x009060UL +#define MISCS_REG_RESET_PL_HV_2 \ + 0x009150UL +#define DMAE_REG_DBG_SELECT \ + 0x00c510UL +#define DMAE_REG_DBG_DWORD_ENABLE \ + 0x00c514UL +#define DMAE_REG_DBG_SHIFT \ + 0x00c518UL +#define DMAE_REG_DBG_FORCE_VALID \ + 0x00c51cUL +#define DMAE_REG_DBG_FORCE_FRAME \ + 0x00c520UL +#define NCSI_REG_DBG_SELECT \ + 0x040474UL +#define NCSI_REG_DBG_DWORD_ENABLE \ + 0x040478UL +#define NCSI_REG_DBG_SHIFT \ + 0x04047cUL +#define NCSI_REG_DBG_FORCE_VALID \ + 0x040480UL +#define NCSI_REG_DBG_FORCE_FRAME \ + 0x040484UL +#define GRC_REG_DBG_SELECT \ + 0x0500a4UL +#define GRC_REG_DBG_DWORD_ENABLE \ + 0x0500a8UL +#define GRC_REG_DBG_SHIFT \ + 0x0500acUL +#define GRC_REG_DBG_FORCE_VALID \ + 0x0500b0UL +#define GRC_REG_DBG_FORCE_FRAME \ + 0x0500b4UL +#define UMAC_REG_DBG_SELECT \ + 0x051094UL +#define UMAC_REG_DBG_DWORD_ENABLE \ + 0x051098UL +#define UMAC_REG_DBG_SHIFT \ + 0x05109cUL +#define UMAC_REG_DBG_FORCE_VALID \ + 0x0510a0UL +#define UMAC_REG_DBG_FORCE_FRAME \ + 0x0510a4UL +#define MCP2_REG_DBG_SELECT \ + 0x052400UL +#define MCP2_REG_DBG_DWORD_ENABLE \ + 0x052404UL +#define MCP2_REG_DBG_SHIFT \ + 0x052408UL +#define MCP2_REG_DBG_FORCE_VALID \ + 0x052440UL +#define MCP2_REG_DBG_FORCE_FRAME \ + 0x052444UL +#define PCIE_REG_DBG_SELECT \ + 0x0547e8UL +#define PCIE_REG_DBG_DWORD_ENABLE \ + 0x0547ecUL +#define PCIE_REG_DBG_SHIFT \ + 0x0547f0UL +#define PCIE_REG_DBG_FORCE_VALID \ + 0x0547f4UL +#define PCIE_REG_DBG_FORCE_FRAME \ + 0x0547f8UL +#define DORQ_REG_DBG_SELECT \ + 0x100ad0UL +#define DORQ_REG_DBG_DWORD_ENABLE \ + 0x100ad4UL +#define DORQ_REG_DBG_SHIFT \ + 0x100ad8UL +#define DORQ_REG_DBG_FORCE_VALID \ + 0x100adcUL +#define DORQ_REG_DBG_FORCE_FRAME \ + 0x100ae0UL +#define IGU_REG_DBG_SELECT \ + 0x181578UL +#define IGU_REG_DBG_DWORD_ENABLE \ + 0x18157cUL +#define IGU_REG_DBG_SHIFT \ + 0x181580UL +#define IGU_REG_DBG_FORCE_VALID \ + 0x181584UL +#define IGU_REG_DBG_FORCE_FRAME \ + 0x181588UL +#define CAU_REG_DBG_SELECT \ + 0x1c0ea8UL +#define CAU_REG_DBG_DWORD_ENABLE \ + 0x1c0eacUL +#define CAU_REG_DBG_SHIFT \ + 0x1c0eb0UL +#define CAU_REG_DBG_FORCE_VALID \ + 0x1c0eb4UL +#define CAU_REG_DBG_FORCE_FRAME \ + 0x1c0eb8UL +#define PRS_REG_DBG_SELECT \ + 0x1f0b6cUL +#define PRS_REG_DBG_DWORD_ENABLE \ + 0x1f0b70UL +#define PRS_REG_DBG_SHIFT \ + 0x1f0b74UL +#define PRS_REG_DBG_FORCE_VALID \ + 0x1f0ba0UL +#define PRS_REG_DBG_FORCE_FRAME \ + 0x1f0ba4UL +#define CNIG_REG_DBG_SELECT_K2 \ + 0x218254UL +#define CNIG_REG_DBG_DWORD_ENABLE_K2 \ + 0x218258UL +#define CNIG_REG_DBG_SHIFT_K2 \ + 0x21825cUL +#define CNIG_REG_DBG_FORCE_VALID_K2 \ + 0x218260UL +#define CNIG_REG_DBG_FORCE_FRAME_K2 \ + 0x218264UL +#define PRM_REG_DBG_SELECT \ + 0x2306a8UL +#define PRM_REG_DBG_DWORD_ENABLE \ + 0x2306acUL +#define PRM_REG_DBG_SHIFT \ + 0x2306b0UL +#define PRM_REG_DBG_FORCE_VALID \ + 0x2306b4UL +#define PRM_REG_DBG_FORCE_FRAME \ + 0x2306b8UL +#define SRC_REG_DBG_SELECT \ + 0x238700UL +#define SRC_REG_DBG_DWORD_ENABLE \ + 0x238704UL +#define SRC_REG_DBG_SHIFT \ + 0x238708UL 
+#define SRC_REG_DBG_FORCE_VALID \ + 0x23870cUL +#define SRC_REG_DBG_FORCE_FRAME \ + 0x238710UL +#define RSS_REG_DBG_SELECT \ + 0x238c4cUL +#define RSS_REG_DBG_DWORD_ENABLE \ + 0x238c50UL +#define RSS_REG_DBG_SHIFT \ + 0x238c54UL +#define RSS_REG_DBG_FORCE_VALID \ + 0x238c58UL +#define RSS_REG_DBG_FORCE_FRAME \ + 0x238c5cUL +#define RPB_REG_DBG_SELECT \ + 0x23c728UL +#define RPB_REG_DBG_DWORD_ENABLE \ + 0x23c72cUL +#define RPB_REG_DBG_SHIFT \ + 0x23c730UL +#define RPB_REG_DBG_FORCE_VALID \ + 0x23c734UL +#define RPB_REG_DBG_FORCE_FRAME \ + 0x23c738UL +#define PSWRQ2_REG_DBG_SELECT \ + 0x240100UL +#define PSWRQ2_REG_DBG_DWORD_ENABLE \ + 0x240104UL +#define PSWRQ2_REG_DBG_SHIFT \ + 0x240108UL +#define PSWRQ2_REG_DBG_FORCE_VALID \ + 0x24010cUL +#define PSWRQ2_REG_DBG_FORCE_FRAME \ + 0x240110UL +#define PSWRQ_REG_DBG_SELECT \ + 0x280020UL +#define PSWRQ_REG_DBG_DWORD_ENABLE \ + 0x280024UL +#define PSWRQ_REG_DBG_SHIFT \ + 0x280028UL +#define PSWRQ_REG_DBG_FORCE_VALID \ + 0x28002cUL +#define PSWRQ_REG_DBG_FORCE_FRAME \ + 0x280030UL +#define PSWWR_REG_DBG_SELECT \ + 0x29a084UL +#define PSWWR_REG_DBG_DWORD_ENABLE \ + 0x29a088UL +#define PSWWR_REG_DBG_SHIFT \ + 0x29a08cUL +#define PSWWR_REG_DBG_FORCE_VALID \ + 0x29a090UL +#define PSWWR_REG_DBG_FORCE_FRAME \ + 0x29a094UL +#define PSWRD_REG_DBG_SELECT \ + 0x29c040UL +#define PSWRD_REG_DBG_DWORD_ENABLE \ + 0x29c044UL +#define PSWRD_REG_DBG_SHIFT \ + 0x29c048UL +#define PSWRD_REG_DBG_FORCE_VALID \ + 0x29c04cUL +#define PSWRD_REG_DBG_FORCE_FRAME \ + 0x29c050UL +#define PSWRD2_REG_DBG_SELECT \ + 0x29d400UL +#define PSWRD2_REG_DBG_DWORD_ENABLE \ + 0x29d404UL +#define PSWRD2_REG_DBG_SHIFT \ + 0x29d408UL +#define PSWRD2_REG_DBG_FORCE_VALID \ + 0x29d40cUL +#define PSWRD2_REG_DBG_FORCE_FRAME \ + 0x29d410UL +#define PSWHST2_REG_DBG_SELECT \ + 0x29e058UL +#define PSWHST2_REG_DBG_DWORD_ENABLE \ + 0x29e05cUL +#define PSWHST2_REG_DBG_SHIFT \ + 0x29e060UL +#define PSWHST2_REG_DBG_FORCE_VALID \ + 0x29e064UL +#define PSWHST2_REG_DBG_FORCE_FRAME \ + 0x29e068UL +#define PSWHST_REG_DBG_SELECT \ + 0x2a0100UL +#define PSWHST_REG_DBG_DWORD_ENABLE \ + 0x2a0104UL +#define PSWHST_REG_DBG_SHIFT \ + 0x2a0108UL +#define PSWHST_REG_DBG_FORCE_VALID \ + 0x2a010cUL +#define PSWHST_REG_DBG_FORCE_FRAME \ + 0x2a0110UL +#define PGLUE_B_REG_DBG_SELECT \ + 0x2a8400UL +#define PGLUE_B_REG_DBG_DWORD_ENABLE \ + 0x2a8404UL +#define PGLUE_B_REG_DBG_SHIFT \ + 0x2a8408UL +#define PGLUE_B_REG_DBG_FORCE_VALID \ + 0x2a840cUL +#define PGLUE_B_REG_DBG_FORCE_FRAME \ + 0x2a8410UL +#define TM_REG_DBG_SELECT \ + 0x2c07a8UL +#define TM_REG_DBG_DWORD_ENABLE \ + 0x2c07acUL +#define TM_REG_DBG_SHIFT \ + 0x2c07b0UL +#define TM_REG_DBG_FORCE_VALID \ + 0x2c07b4UL +#define TM_REG_DBG_FORCE_FRAME \ + 0x2c07b8UL +#define TCFC_REG_DBG_SELECT \ + 0x2d0500UL +#define TCFC_REG_DBG_DWORD_ENABLE \ + 0x2d0504UL +#define TCFC_REG_DBG_SHIFT \ + 0x2d0508UL +#define TCFC_REG_DBG_FORCE_VALID \ + 0x2d050cUL +#define TCFC_REG_DBG_FORCE_FRAME \ + 0x2d0510UL +#define CCFC_REG_DBG_SELECT \ + 0x2e0500UL +#define CCFC_REG_DBG_DWORD_ENABLE \ + 0x2e0504UL +#define CCFC_REG_DBG_SHIFT \ + 0x2e0508UL +#define CCFC_REG_DBG_FORCE_VALID \ + 0x2e050cUL +#define CCFC_REG_DBG_FORCE_FRAME \ + 0x2e0510UL +#define QM_REG_DBG_SELECT \ + 0x2f2e74UL +#define QM_REG_DBG_DWORD_ENABLE \ + 0x2f2e78UL +#define QM_REG_DBG_SHIFT \ + 0x2f2e7cUL +#define QM_REG_DBG_FORCE_VALID \ + 0x2f2e80UL +#define QM_REG_DBG_FORCE_FRAME \ + 0x2f2e84UL +#define RDIF_REG_DBG_SELECT \ + 0x300500UL +#define RDIF_REG_DBG_DWORD_ENABLE \ + 0x300504UL +#define RDIF_REG_DBG_SHIFT \ 
+ 0x300508UL +#define RDIF_REG_DBG_FORCE_VALID \ + 0x30050cUL +#define RDIF_REG_DBG_FORCE_FRAME \ + 0x300510UL +#define TDIF_REG_DBG_SELECT \ + 0x310500UL +#define TDIF_REG_DBG_DWORD_ENABLE \ + 0x310504UL +#define TDIF_REG_DBG_SHIFT \ + 0x310508UL +#define TDIF_REG_DBG_FORCE_VALID \ + 0x31050cUL +#define TDIF_REG_DBG_FORCE_FRAME \ + 0x310510UL +#define BRB_REG_DBG_SELECT \ + 0x340ed0UL +#define BRB_REG_DBG_DWORD_ENABLE \ + 0x340ed4UL +#define BRB_REG_DBG_SHIFT \ + 0x340ed8UL +#define BRB_REG_DBG_FORCE_VALID \ + 0x340edcUL +#define BRB_REG_DBG_FORCE_FRAME \ + 0x340ee0UL +#define XYLD_REG_DBG_SELECT \ + 0x4c1600UL +#define XYLD_REG_DBG_DWORD_ENABLE \ + 0x4c1604UL +#define XYLD_REG_DBG_SHIFT \ + 0x4c1608UL +#define XYLD_REG_DBG_FORCE_VALID \ + 0x4c160cUL +#define XYLD_REG_DBG_FORCE_FRAME \ + 0x4c1610UL +#define YULD_REG_DBG_SELECT \ + 0x4c9600UL +#define YULD_REG_DBG_DWORD_ENABLE \ + 0x4c9604UL +#define YULD_REG_DBG_SHIFT \ + 0x4c9608UL +#define YULD_REG_DBG_FORCE_VALID \ + 0x4c960cUL +#define YULD_REG_DBG_FORCE_FRAME \ + 0x4c9610UL +#define TMLD_REG_DBG_SELECT \ + 0x4d1600UL +#define TMLD_REG_DBG_DWORD_ENABLE \ + 0x4d1604UL +#define TMLD_REG_DBG_SHIFT \ + 0x4d1608UL +#define TMLD_REG_DBG_FORCE_VALID \ + 0x4d160cUL +#define TMLD_REG_DBG_FORCE_FRAME \ + 0x4d1610UL +#define MULD_REG_DBG_SELECT \ + 0x4e1600UL +#define MULD_REG_DBG_DWORD_ENABLE \ + 0x4e1604UL +#define MULD_REG_DBG_SHIFT \ + 0x4e1608UL +#define MULD_REG_DBG_FORCE_VALID \ + 0x4e160cUL +#define MULD_REG_DBG_FORCE_FRAME \ + 0x4e1610UL +#define NIG_REG_DBG_SELECT \ + 0x502140UL +#define NIG_REG_DBG_DWORD_ENABLE \ + 0x502144UL +#define NIG_REG_DBG_SHIFT \ + 0x502148UL +#define NIG_REG_DBG_FORCE_VALID \ + 0x50214cUL +#define NIG_REG_DBG_FORCE_FRAME \ + 0x502150UL +#define BMB_REG_DBG_SELECT \ + 0x540a7cUL +#define BMB_REG_DBG_DWORD_ENABLE \ + 0x540a80UL +#define BMB_REG_DBG_SHIFT \ + 0x540a84UL +#define BMB_REG_DBG_FORCE_VALID \ + 0x540a88UL +#define BMB_REG_DBG_FORCE_FRAME \ + 0x540a8cUL +#define PTU_REG_DBG_SELECT \ + 0x560100UL +#define PTU_REG_DBG_DWORD_ENABLE \ + 0x560104UL +#define PTU_REG_DBG_SHIFT \ + 0x560108UL +#define PTU_REG_DBG_FORCE_VALID \ + 0x56010cUL +#define PTU_REG_DBG_FORCE_FRAME \ + 0x560110UL +#define CDU_REG_DBG_SELECT \ + 0x580704UL +#define CDU_REG_DBG_DWORD_ENABLE \ + 0x580708UL +#define CDU_REG_DBG_SHIFT \ + 0x58070cUL +#define CDU_REG_DBG_FORCE_VALID \ + 0x580710UL +#define CDU_REG_DBG_FORCE_FRAME \ + 0x580714UL +#define WOL_REG_DBG_SELECT \ + 0x600140UL +#define WOL_REG_DBG_DWORD_ENABLE \ + 0x600144UL +#define WOL_REG_DBG_SHIFT \ + 0x600148UL +#define WOL_REG_DBG_FORCE_VALID \ + 0x60014cUL +#define WOL_REG_DBG_FORCE_FRAME \ + 0x600150UL +#define BMBN_REG_DBG_SELECT \ + 0x610140UL +#define BMBN_REG_DBG_DWORD_ENABLE \ + 0x610144UL +#define BMBN_REG_DBG_SHIFT \ + 0x610148UL +#define BMBN_REG_DBG_FORCE_VALID \ + 0x61014cUL +#define BMBN_REG_DBG_FORCE_FRAME \ + 0x610150UL +#define NWM_REG_DBG_SELECT \ + 0x8000ecUL +#define NWM_REG_DBG_DWORD_ENABLE \ + 0x8000f0UL +#define NWM_REG_DBG_SHIFT \ + 0x8000f4UL +#define NWM_REG_DBG_FORCE_VALID \ + 0x8000f8UL +#define NWM_REG_DBG_FORCE_FRAME \ + 0x8000fcUL +#define PBF_REG_DBG_SELECT \ + 0xd80060UL +#define PBF_REG_DBG_DWORD_ENABLE \ + 0xd80064UL +#define PBF_REG_DBG_SHIFT \ + 0xd80068UL +#define PBF_REG_DBG_FORCE_VALID \ + 0xd8006cUL +#define PBF_REG_DBG_FORCE_FRAME \ + 0xd80070UL +#define PBF_PB1_REG_DBG_SELECT \ + 0xda0728UL +#define PBF_PB1_REG_DBG_DWORD_ENABLE \ + 0xda072cUL +#define PBF_PB1_REG_DBG_SHIFT \ + 0xda0730UL +#define PBF_PB1_REG_DBG_FORCE_VALID \ + 
0xda0734UL +#define PBF_PB1_REG_DBG_FORCE_FRAME \ + 0xda0738UL +#define PBF_PB2_REG_DBG_SELECT \ + 0xda4728UL +#define PBF_PB2_REG_DBG_DWORD_ENABLE \ + 0xda472cUL +#define PBF_PB2_REG_DBG_SHIFT \ + 0xda4730UL +#define PBF_PB2_REG_DBG_FORCE_VALID \ + 0xda4734UL +#define PBF_PB2_REG_DBG_FORCE_FRAME \ + 0xda4738UL +#define BTB_REG_DBG_SELECT \ + 0xdb08c8UL +#define BTB_REG_DBG_DWORD_ENABLE \ + 0xdb08ccUL +#define BTB_REG_DBG_SHIFT \ + 0xdb08d0UL +#define BTB_REG_DBG_FORCE_VALID \ + 0xdb08d4UL +#define BTB_REG_DBG_FORCE_FRAME \ + 0xdb08d8UL +#define XSDM_REG_DBG_SELECT \ + 0xf80e28UL +#define XSDM_REG_DBG_DWORD_ENABLE \ + 0xf80e2cUL +#define XSDM_REG_DBG_SHIFT \ + 0xf80e30UL +#define XSDM_REG_DBG_FORCE_VALID \ + 0xf80e34UL +#define XSDM_REG_DBG_FORCE_FRAME \ + 0xf80e38UL +#define YSDM_REG_DBG_SELECT \ + 0xf90e28UL +#define YSDM_REG_DBG_DWORD_ENABLE \ + 0xf90e2cUL +#define YSDM_REG_DBG_SHIFT \ + 0xf90e30UL +#define YSDM_REG_DBG_FORCE_VALID \ + 0xf90e34UL +#define YSDM_REG_DBG_FORCE_FRAME \ + 0xf90e38UL +#define PSDM_REG_DBG_SELECT \ + 0xfa0e28UL +#define PSDM_REG_DBG_DWORD_ENABLE \ + 0xfa0e2cUL +#define PSDM_REG_DBG_SHIFT \ + 0xfa0e30UL +#define PSDM_REG_DBG_FORCE_VALID \ + 0xfa0e34UL +#define PSDM_REG_DBG_FORCE_FRAME \ + 0xfa0e38UL +#define TSDM_REG_DBG_SELECT \ + 0xfb0e28UL +#define TSDM_REG_DBG_DWORD_ENABLE \ + 0xfb0e2cUL +#define TSDM_REG_DBG_SHIFT \ + 0xfb0e30UL +#define TSDM_REG_DBG_FORCE_VALID \ + 0xfb0e34UL +#define TSDM_REG_DBG_FORCE_FRAME \ + 0xfb0e38UL +#define MSDM_REG_DBG_SELECT \ + 0xfc0e28UL +#define MSDM_REG_DBG_DWORD_ENABLE \ + 0xfc0e2cUL +#define MSDM_REG_DBG_SHIFT \ + 0xfc0e30UL +#define MSDM_REG_DBG_FORCE_VALID \ + 0xfc0e34UL +#define MSDM_REG_DBG_FORCE_FRAME \ + 0xfc0e38UL +#define USDM_REG_DBG_SELECT \ + 0xfd0e28UL +#define USDM_REG_DBG_DWORD_ENABLE \ + 0xfd0e2cUL +#define USDM_REG_DBG_SHIFT \ + 0xfd0e30UL +#define USDM_REG_DBG_FORCE_VALID \ + 0xfd0e34UL +#define USDM_REG_DBG_FORCE_FRAME \ + 0xfd0e38UL +#define XCM_REG_DBG_SELECT \ + 0x1000040UL +#define XCM_REG_DBG_DWORD_ENABLE \ + 0x1000044UL +#define XCM_REG_DBG_SHIFT \ + 0x1000048UL +#define XCM_REG_DBG_FORCE_VALID \ + 0x100004cUL +#define XCM_REG_DBG_FORCE_FRAME \ + 0x1000050UL +#define YCM_REG_DBG_SELECT \ + 0x1080040UL +#define YCM_REG_DBG_DWORD_ENABLE \ + 0x1080044UL +#define YCM_REG_DBG_SHIFT \ + 0x1080048UL +#define YCM_REG_DBG_FORCE_VALID \ + 0x108004cUL +#define YCM_REG_DBG_FORCE_FRAME \ + 0x1080050UL +#define PCM_REG_DBG_SELECT \ + 0x1100040UL +#define PCM_REG_DBG_DWORD_ENABLE \ + 0x1100044UL +#define PCM_REG_DBG_SHIFT \ + 0x1100048UL +#define PCM_REG_DBG_FORCE_VALID \ + 0x110004cUL +#define PCM_REG_DBG_FORCE_FRAME \ + 0x1100050UL +#define TCM_REG_DBG_SELECT \ + 0x1180040UL +#define TCM_REG_DBG_DWORD_ENABLE \ + 0x1180044UL +#define TCM_REG_DBG_SHIFT \ + 0x1180048UL +#define TCM_REG_DBG_FORCE_VALID \ + 0x118004cUL +#define TCM_REG_DBG_FORCE_FRAME \ + 0x1180050UL +#define MCM_REG_DBG_SELECT \ + 0x1200040UL +#define MCM_REG_DBG_DWORD_ENABLE \ + 0x1200044UL +#define MCM_REG_DBG_SHIFT \ + 0x1200048UL +#define MCM_REG_DBG_FORCE_VALID \ + 0x120004cUL +#define MCM_REG_DBG_FORCE_FRAME \ + 0x1200050UL +#define UCM_REG_DBG_SELECT \ + 0x1280050UL +#define UCM_REG_DBG_DWORD_ENABLE \ + 0x1280054UL +#define UCM_REG_DBG_SHIFT \ + 0x1280058UL +#define UCM_REG_DBG_FORCE_VALID \ + 0x128005cUL +#define UCM_REG_DBG_FORCE_FRAME \ + 0x1280060UL +#define XSEM_REG_DBG_SELECT \ + 0x1401528UL +#define XSEM_REG_DBG_DWORD_ENABLE \ + 0x140152cUL +#define XSEM_REG_DBG_SHIFT \ + 0x1401530UL +#define XSEM_REG_DBG_FORCE_VALID \ + 0x1401534UL 
+#define XSEM_REG_DBG_FORCE_FRAME \ + 0x1401538UL +#define YSEM_REG_DBG_SELECT \ + 0x1501528UL +#define YSEM_REG_DBG_DWORD_ENABLE \ + 0x150152cUL +#define YSEM_REG_DBG_SHIFT \ + 0x1501530UL +#define YSEM_REG_DBG_FORCE_VALID \ + 0x1501534UL +#define YSEM_REG_DBG_FORCE_FRAME \ + 0x1501538UL +#define PSEM_REG_DBG_SELECT \ + 0x1601528UL +#define PSEM_REG_DBG_DWORD_ENABLE \ + 0x160152cUL +#define PSEM_REG_DBG_SHIFT \ + 0x1601530UL +#define PSEM_REG_DBG_FORCE_VALID \ + 0x1601534UL +#define PSEM_REG_DBG_FORCE_FRAME \ + 0x1601538UL +#define TSEM_REG_DBG_SELECT \ + 0x1701528UL +#define TSEM_REG_DBG_DWORD_ENABLE \ + 0x170152cUL +#define TSEM_REG_DBG_SHIFT \ + 0x1701530UL +#define TSEM_REG_DBG_FORCE_VALID \ + 0x1701534UL +#define TSEM_REG_DBG_FORCE_FRAME \ + 0x1701538UL +#define MSEM_REG_DBG_SELECT \ + 0x1801528UL +#define MSEM_REG_DBG_DWORD_ENABLE \ + 0x180152cUL +#define MSEM_REG_DBG_SHIFT \ + 0x1801530UL +#define MSEM_REG_DBG_FORCE_VALID \ + 0x1801534UL +#define MSEM_REG_DBG_FORCE_FRAME \ + 0x1801538UL +#define USEM_REG_DBG_SELECT \ + 0x1901528UL +#define USEM_REG_DBG_DWORD_ENABLE \ + 0x190152cUL +#define USEM_REG_DBG_SHIFT \ + 0x1901530UL +#define USEM_REG_DBG_FORCE_VALID \ + 0x1901534UL +#define USEM_REG_DBG_FORCE_FRAME \ + 0x1901538UL +#define PCIE_REG_DBG_COMMON_SELECT \ + 0x054398UL +#define PCIE_REG_DBG_COMMON_DWORD_ENABLE \ + 0x05439cUL +#define PCIE_REG_DBG_COMMON_SHIFT \ + 0x0543a0UL +#define PCIE_REG_DBG_COMMON_FORCE_VALID \ + 0x0543a4UL +#define PCIE_REG_DBG_COMMON_FORCE_FRAME \ + 0x0543a8UL +#define MISC_REG_RESET_PL_UA \ + 0x008050UL +#define MISC_REG_RESET_PL_HV \ + 0x008060UL +#define XCM_REG_CTX_RBC_ACCS \ + 0x1001800UL +#define XCM_REG_AGG_CON_CTX \ + 0x1001804UL +#define XCM_REG_SM_CON_CTX \ + 0x1001808UL +#define YCM_REG_CTX_RBC_ACCS \ + 0x1081800UL +#define YCM_REG_AGG_CON_CTX \ + 0x1081804UL +#define YCM_REG_AGG_TASK_CTX \ + 0x1081808UL +#define YCM_REG_SM_CON_CTX \ + 0x108180cUL +#define YCM_REG_SM_TASK_CTX \ + 0x1081810UL +#define PCM_REG_CTX_RBC_ACCS \ + 0x1101440UL +#define PCM_REG_SM_CON_CTX \ + 0x1101444UL +#define TCM_REG_CTX_RBC_ACCS \ + 0x11814c0UL +#define TCM_REG_AGG_CON_CTX \ + 0x11814c4UL +#define TCM_REG_AGG_TASK_CTX \ + 0x11814c8UL +#define TCM_REG_SM_CON_CTX \ + 0x11814ccUL +#define TCM_REG_SM_TASK_CTX \ + 0x11814d0UL +#define MCM_REG_CTX_RBC_ACCS \ + 0x1201800UL +#define MCM_REG_AGG_CON_CTX \ + 0x1201804UL +#define MCM_REG_AGG_TASK_CTX \ + 0x1201808UL +#define MCM_REG_SM_CON_CTX \ + 0x120180cUL +#define MCM_REG_SM_TASK_CTX \ + 0x1201810UL +#define UCM_REG_CTX_RBC_ACCS \ + 0x1281700UL +#define UCM_REG_AGG_CON_CTX \ + 0x1281704UL +#define UCM_REG_AGG_TASK_CTX \ + 0x1281708UL +#define UCM_REG_SM_CON_CTX \ + 0x128170cUL +#define UCM_REG_SM_TASK_CTX \ + 0x1281710UL +#define XSEM_REG_SLOW_DBG_EMPTY \ + 0x1401140UL +#define XSEM_REG_SYNC_DBG_EMPTY \ + 0x1401160UL +#define XSEM_REG_SLOW_DBG_ACTIVE \ + 0x1401400UL +#define XSEM_REG_SLOW_DBG_MODE \ + 0x1401404UL +#define XSEM_REG_DBG_FRAME_MODE \ + 0x1401408UL +#define XSEM_REG_DBG_MODE1_CFG \ + 0x1401420UL +#define XSEM_REG_FAST_MEMORY \ + 0x1440000UL +#define YSEM_REG_SYNC_DBG_EMPTY \ + 0x1501160UL +#define YSEM_REG_SLOW_DBG_ACTIVE \ + 0x1501400UL +#define YSEM_REG_SLOW_DBG_MODE \ + 0x1501404UL +#define YSEM_REG_DBG_FRAME_MODE \ + 0x1501408UL +#define YSEM_REG_DBG_MODE1_CFG \ + 0x1501420UL +#define YSEM_REG_FAST_MEMORY \ + 0x1540000UL +#define PSEM_REG_SLOW_DBG_EMPTY \ + 0x1601140UL +#define PSEM_REG_SYNC_DBG_EMPTY \ + 0x1601160UL +#define PSEM_REG_SLOW_DBG_ACTIVE \ + 0x1601400UL +#define PSEM_REG_SLOW_DBG_MODE \ + 
0x1601404UL +#define PSEM_REG_DBG_FRAME_MODE \ + 0x1601408UL +#define PSEM_REG_DBG_MODE1_CFG \ + 0x1601420UL +#define PSEM_REG_FAST_MEMORY \ + 0x1640000UL +#define TSEM_REG_SLOW_DBG_EMPTY \ + 0x1701140UL +#define TSEM_REG_SYNC_DBG_EMPTY \ + 0x1701160UL +#define TSEM_REG_SLOW_DBG_ACTIVE \ + 0x1701400UL +#define TSEM_REG_SLOW_DBG_MODE \ + 0x1701404UL +#define TSEM_REG_DBG_FRAME_MODE \ + 0x1701408UL +#define TSEM_REG_DBG_MODE1_CFG \ + 0x1701420UL +#define TSEM_REG_FAST_MEMORY \ + 0x1740000UL +#define MSEM_REG_SLOW_DBG_EMPTY \ + 0x1801140UL +#define MSEM_REG_SYNC_DBG_EMPTY \ + 0x1801160UL +#define MSEM_REG_SLOW_DBG_ACTIVE \ + 0x1801400UL +#define MSEM_REG_SLOW_DBG_MODE \ + 0x1801404UL +#define MSEM_REG_DBG_FRAME_MODE \ + 0x1801408UL +#define MSEM_REG_DBG_MODE1_CFG \ + 0x1801420UL +#define MSEM_REG_FAST_MEMORY \ + 0x1840000UL +#define USEM_REG_SLOW_DBG_EMPTY \ + 0x1901140UL +#define USEM_REG_SYNC_DBG_EMPTY \ + 0x1901160UL +#define USEM_REG_SLOW_DBG_ACTIVE \ + 0x1901400UL +#define USEM_REG_SLOW_DBG_MODE \ + 0x1901404UL +#define USEM_REG_DBG_FRAME_MODE \ + 0x1901408UL +#define USEM_REG_DBG_MODE1_CFG \ + 0x1901420UL +#define USEM_REG_FAST_MEMORY \ + 0x1940000UL +#define SEM_FAST_REG_INT_RAM \ + 0x020000UL +#define SEM_FAST_REG_INT_RAM_SIZE \ + 20480 +#define GRC_REG_TRACE_FIFO_VALID_DATA \ + 0x050064UL +#define GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW \ + 0x05040cUL +#define GRC_REG_PROTECTION_OVERRIDE_WINDOW \ + 0x050500UL +#define IGU_REG_ERROR_HANDLING_MEMORY \ + 0x181520UL #define MCP_REG_CPU_MODE \ 0xe05000UL #define MCP_REG_CPU_MODE_SOFT_HALT \ (0x1 << 10) +#define BRB_REG_BIG_RAM_ADDRESS \ + 0x340800UL +#define BRB_REG_BIG_RAM_DATA \ + 0x341500UL +#define SEM_FAST_REG_STALL_0 \ + 0x000488UL +#define SEM_FAST_REG_STALLED \ + 0x000494UL +#define BTB_REG_BIG_RAM_ADDRESS \ + 0xdb0800UL +#define BTB_REG_BIG_RAM_DATA \ + 0xdb0c00UL +#define BMB_REG_BIG_RAM_ADDRESS \ + 0x540800UL +#define BMB_REG_BIG_RAM_DATA \ + 0x540f00UL +#define SEM_FAST_REG_STORM_REG_FILE \ + 0x008000UL +#define RSS_REG_RSS_RAM_ADDR \ + 0x238c30UL +#define MISCS_REG_BLOCK_256B_EN \ + 0x009074UL +#define MCP_REG_SCRATCH_SIZE \ + 57344 +#define MCP_REG_CPU_REG_FILE \ + 0xe05200UL +#define MCP_REG_CPU_REG_FILE_SIZE \ + 32 +#define DBG_REG_DEBUG_TARGET \ + 0x01005cUL +#define DBG_REG_FULL_MODE \ + 0x010060UL +#define DBG_REG_CALENDAR_OUT_DATA \ + 0x010480UL +#define GRC_REG_TRACE_FIFO \ + 0x050068UL +#define IGU_REG_ERROR_HANDLING_DATA_VALID \ + 0x181530UL +#define DBG_REG_DBG_BLOCK_ON \ + 0x010454UL +#define DBG_REG_FRAMING_MODE \ + 0x010058UL +#define SEM_FAST_REG_VFC_DATA_WR \ + 0x000b40UL +#define SEM_FAST_REG_VFC_ADDR \ + 0x000b44UL +#define SEM_FAST_REG_VFC_DATA_RD \ + 0x000b48UL +#define RSS_REG_RSS_RAM_DATA \ + 0x238c20UL +#define MISC_REG_BLOCK_256B_EN \ + 0x008c14UL +#define NWS_REG_NWS_CMU \ + 0x720000UL +#define PHY_NW_IP_REG_PHY0_TOP_TBUS_ADDR_7_0 \ + 0x000680UL +#define PHY_NW_IP_REG_PHY0_TOP_TBUS_ADDR_15_8 \ + 0x000684UL +#define PHY_NW_IP_REG_PHY0_TOP_TBUS_DATA_7_0 \ + 0x0006c0UL +#define PHY_NW_IP_REG_PHY0_TOP_TBUS_DATA_11_8 \ + 0x0006c4UL +#define MS_REG_MS_CMU \ + 0x6a4000UL +#define PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X130 \ + 0x000208UL +#define PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X132 \ + 0x000210UL +#define PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X131 \ + 0x00020cUL +#define PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X133 \ + 0x000214UL +#define PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X130 \ + 0x000208UL +#define PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X131 \ + 0x00020cUL +#define PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X132 \ + 0x000210UL +#define 
PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X133 \ + 0x000214UL +#define PHY_PCIE_REG_PHY0 \ + 0x620000UL +#define PHY_PCIE_REG_PHY1 \ + 0x624000UL #endif diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 70b30e4d3cc4..19027635df0d 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -143,6 +143,9 @@ #define GTT_BYTE_SIZE_BITS (GTT_DWORD_SIZE_BITS + 2) #define GTT_DWORD_SIZE BIT(GTT_DWORD_SIZE_BITS) +/* Tools Version */ +#define TOOLS_VERSION 10 + /*****************/ /* CDU CONSTANTS */ /*****************/ -- cgit v1.2.3 From e0971c832af4cd906ab931c9f6e9e1791a62fc98 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 7 Sep 2016 16:36:25 +0300 Subject: qed*: Add support for the ethtool get_regs operation Signed-off-by: Tomer Tayar Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_main.c | 2 ++ drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 24 ++++++++++++++++++++++++ include/linux/qed/qed_if.h | 4 ++++ 3 files changed, 30 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 250efd1d270d..b730a632c383 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1398,6 +1398,8 @@ const struct qed_common_ops qed_common_ops_pass = { .get_link = &qed_get_current_link, .drain = &qed_drain, .update_msglvl = &qed_init_dp, + .dbg_all_data = &qed_dbg_all_data, + .dbg_all_data_size = &qed_dbg_all_data_size, .chain_alloc = &qed_chain_alloc, .chain_free = &qed_chain_free, .get_coalesce = &qed_get_coalesce, diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index 14d5328e6ac9..25a9b293ee8f 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -695,6 +695,28 @@ static int qede_set_pauseparam(struct net_device *dev, return 0; } +static void qede_get_regs(struct net_device *ndev, + struct ethtool_regs *regs, void *buffer) +{ + struct qede_dev *edev = netdev_priv(ndev); + + regs->version = 0; + memset(buffer, 0, regs->len); + + if (edev->ops && edev->ops->common) + edev->ops->common->dbg_all_data(edev->cdev, buffer); +} + +static int qede_get_regs_len(struct net_device *ndev) +{ + struct qede_dev *edev = netdev_priv(ndev); + + if (edev->ops && edev->ops->common) + return edev->ops->common->dbg_all_data_size(edev->cdev); + else + return -EINVAL; +} + static void qede_update_mtu(struct qede_dev *edev, union qede_reload_args *args) { edev->ndev->mtu = args->mtu; @@ -1395,6 +1417,8 @@ static const struct ethtool_ops qede_ethtool_ops = { .get_link_ksettings = qede_get_link_ksettings, .set_link_ksettings = qede_set_link_ksettings, .get_drvinfo = qede_get_drvinfo, + .get_regs_len = qede_get_regs_len, + .get_regs = qede_get_regs, .get_msglevel = qede_get_msglevel, .set_msglevel = qede_set_msglevel, .nway_reset = qede_nway_reset, diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index d8dc5c2243d5..e4546abcea08 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -455,6 +455,10 @@ struct qed_common_ops { void (*simd_handler_clean)(struct qed_dev *cdev, int index); + int (*dbg_all_data) (struct qed_dev *cdev, void *buffer); + + int (*dbg_all_data_size) (struct qed_dev *cdev); + /** * @brief can_link_change - can the instance change the link or not * -- cgit v1.2.3 From 18f1387c7d7c6827b3ed6adf6ae20f65a58dc7b0 Mon Sep 
17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:11 +0100 Subject: rxrpc: Update protocol definitions slightly Update the protocol definitions in include/rxrpc/packet.h slightly: (1) Get rid of RXRPC_PROCESS_MAXCALLS as it's redundant (same as RXRPC_MAXCALLS). (2) In struct rxrpc_jumbo_header, put _rsvd in a union with a field called cksum to match struct rxrpc_wire_header. (3) Provide RXRPC_JUMBO_SUBPKTLEN which is the total of the amount of data in a non-terminal subpacket plus the following secondary header for the next packet included in the jumbo packet. Signed-off-by: David Howells --- include/rxrpc/packet.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index 3c6128e1fdbe..b0ae5c1a6ce6 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -34,8 +34,6 @@ struct rxrpc_wire_header { #define RXRPC_CID_INC (1 << RXRPC_CIDSHIFT) /* connection ID increment */ __be32 callNumber; /* call ID (0 for connection-level packets) */ -#define RXRPC_PROCESS_MAXCALLS (1<<2) /* maximum number of active calls per conn (power of 2) */ - __be32 seq; /* sequence number of pkt in call stream */ __be32 serial; /* serial number of pkt sent to network */ @@ -93,10 +91,14 @@ struct rxrpc_wire_header { struct rxrpc_jumbo_header { uint8_t flags; /* packet flags (as per rxrpc_header) */ uint8_t pad; - __be16 _rsvd; /* reserved (used by kerberos security as cksum) */ + union { + __be16 _rsvd; /* reserved */ + __be16 cksum; /* kerberos security checksum */ + }; }; #define RXRPC_JUMBO_DATALEN 1412 /* non-terminal jumbo packet data length */ +#define RXRPC_JUMBO_SUBPKTLEN (RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header)) /*****************************************************************************/ /* -- cgit v1.2.3 From 2ab27215ea27475a0b279732ba8a934bfab57ef0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:12 +0100 Subject: rxrpc: Remove skb_count from struct rxrpc_call Remove the sk_buff count from the rxrpc_call struct as it's less useful once we stop queueing sk_buffs. 
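With the counter gone, every call-reference helper reduces to the same three-step shape. A condensed sketch, lifted from the rxrpc_get_call() hunk in the diff below (not standalone code; struct rxrpc_call and the tracepoint are defined elsewhere in rxrpc):

	void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
	{
		const void *here = __builtin_return_address(0);
		int n = atomic_inc_return(&call->usage);

		/* The skb count argument is simply dropped from the
		 * tracepoint; only the reference count is reported now. */
		trace_rxrpc_call(call, op, n, here, NULL);
	}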
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 10 +++------- net/rxrpc/ar-internal.h | 1 - net/rxrpc/call_object.c | 34 ++++++++++++---------------------- 3 files changed, 15 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 85ee035774ae..6b06cf050bc0 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -18,16 +18,14 @@ TRACE_EVENT(rxrpc_call, TP_PROTO(struct rxrpc_call *call, enum rxrpc_call_trace op, - int usage, int nskb, - const void *where, const void *aux), + int usage, const void *where, const void *aux), - TP_ARGS(call, op, usage, nskb, where, aux), + TP_ARGS(call, op, usage, where, aux), TP_STRUCT__entry( __field(struct rxrpc_call *, call ) __field(int, op ) __field(int, usage ) - __field(int, nskb ) __field(const void *, where ) __field(const void *, aux ) ), @@ -36,16 +34,14 @@ TRACE_EVENT(rxrpc_call, __entry->call = call; __entry->op = op; __entry->usage = usage; - __entry->nskb = nskb; __entry->where = where; __entry->aux = aux; ), - TP_printk("c=%p %s u=%d s=%d p=%pSR a=%p", + TP_printk("c=%p %s u=%d sp=%pSR a=%p", __entry->call, rxrpc_call_traces[__entry->op], __entry->usage, - __entry->nskb, __entry->where, __entry->aux) ); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index fd438dc93ee9..027791261768 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -467,7 +467,6 @@ struct rxrpc_call { enum rxrpc_call_state state; /* current state of call */ enum rxrpc_call_completion completion; /* Call completion condition */ atomic_t usage; - atomic_t skb_count; /* Outstanding packets on this call */ atomic_t sequence; /* Tx data packet sequence counter */ u16 service_id; /* service ID */ u8 security_ix; /* Security type */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 9efd9b0b0bdf..f843397e03b6 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -232,9 +232,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } - trace_rxrpc_call(call, rxrpc_call_new_client, - atomic_read(&call->usage), 0, - here, (const void *)user_call_ID); + trace_rxrpc_call(call, 0, atomic_read(&call->usage), here, + (const void *)user_call_ID); /* Publish the call, even though it is incompletely set up as yet */ call->user_call_ID = user_call_ID; @@ -325,7 +324,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, return ERR_PTR(-EBUSY); trace_rxrpc_call(candidate, rxrpc_call_new_service, - atomic_read(&candidate->usage), 0, here, NULL); + atomic_read(&candidate->usage), here, NULL); chan = sp->hdr.cid & RXRPC_CHANNELMASK; candidate->conn = conn; @@ -446,11 +445,10 @@ bool rxrpc_queue_call(struct rxrpc_call *call) { const void *here = __builtin_return_address(0); int n = __atomic_add_unless(&call->usage, 1, 0); - int m = atomic_read(&call->skb_count); if (n == 0) return false; if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued, n + 1, m, here, NULL); + trace_rxrpc_call(call, rxrpc_call_queued, n + 1, here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -463,10 +461,9 @@ bool __rxrpc_queue_call(struct rxrpc_call *call) { const void *here = __builtin_return_address(0); int n = atomic_read(&call->usage); - int m = atomic_read(&call->skb_count); ASSERTCMP(n, >=, 1); if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued_ref, n, m, here, NULL); + trace_rxrpc_call(call, rxrpc_call_queued_ref, 
n, here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -480,9 +477,8 @@ void rxrpc_see_call(struct rxrpc_call *call) const void *here = __builtin_return_address(0); if (call) { int n = atomic_read(&call->usage); - int m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, rxrpc_call_seen, n, m, here, NULL); + trace_rxrpc_call(call, rxrpc_call_seen, n, here, NULL); } } @@ -493,9 +489,8 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); - int m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, op, n, m, here, NULL); + trace_rxrpc_call(call, op, n, here, NULL); } /* @@ -505,9 +500,8 @@ void rxrpc_get_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); - int m = atomic_inc_return(&call->skb_count); - trace_rxrpc_call(call, rxrpc_call_got_skb, n, m, here, skb); + trace_rxrpc_call(call, rxrpc_call_got_skb, n, here, skb); } /* @@ -642,17 +636,15 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { const void *here = __builtin_return_address(0); - int n, m; + int n; ASSERT(call != NULL); n = atomic_dec_return(&call->usage); - m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, op, n, m, here, NULL); + trace_rxrpc_call(call, op, n, here, NULL); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); - WARN_ON(m != 0); rxrpc_cleanup_call(call); } } @@ -663,15 +655,13 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) void rxrpc_put_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) { const void *here = __builtin_return_address(0); - int n, m; + int n; n = atomic_dec_return(&call->usage); - m = atomic_dec_return(&call->skb_count); - trace_rxrpc_call(call, rxrpc_call_put_skb, n, m, here, skb); + trace_rxrpc_call(call, rxrpc_call_put_skb, n, here, skb); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); - WARN_ON(m != 0); rxrpc_cleanup_call(call); } } -- cgit v1.2.3 From 49e19ec7d3499f79d2b3a45bb28418e89512fd7a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:12 +0100 Subject: rxrpc: Add tracepoints to record received packets and end of data_ready Add two tracepoints: (1) Record the RxRPC protocol header of packets retrieved from the UDP socket by the data_ready handler. (2) Record the outcome of the data_ready handler. 
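Taken together, the two tracepoints bracket the data_ready handler: the first fires as soon as the wire header has been extracted from a received UDP packet, the second fires on every exit path with the handler's disposition. As a condensed sketch of that shape (the function name here is illustrative; the real wiring, including the cant_route_call paths, is in the diff below):

static void rxrpc_data_ready_sketch(struct rxrpc_local *local,
				    struct sk_buff *skb)
{
	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);

	if (rxrpc_extract_header(sp, skb) < 0)
		goto bad_message;
	trace_rxrpc_rx_packet(sp);		/* (1) header of each packet */

	/* ... route the packet to a connection/call and queue it ... */

	trace_rxrpc_rx_done(0, 0);		/* (2) normal completion */
	return;

bad_message:
	skb->priority = RX_PROTOCOL_ERROR;
	trace_rxrpc_rx_done(skb->mark, skb->priority); /* (2) outcome on reject */
	rxrpc_reject_packet(local, skb);
}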
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 38 ++++++++++++++++++++++++++++++++++++++ net/rxrpc/input.c | 8 ++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 6b06cf050bc0..ea3b10ed91a8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -80,6 +80,44 @@ TRACE_EVENT(rxrpc_skb, __entry->where) ); +TRACE_EVENT(rxrpc_rx_packet, + TP_PROTO(struct rxrpc_skb_priv *sp), + + TP_ARGS(sp), + + TP_STRUCT__entry( + __field_struct(struct rxrpc_host_header, hdr ) + ), + + TP_fast_assign( + memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); + ), + + TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x", + __entry->hdr.epoch, __entry->hdr.cid, + __entry->hdr.callNumber, __entry->hdr.serviceId, + __entry->hdr.serial, __entry->hdr.seq, + __entry->hdr.type, __entry->hdr.flags) + ); + +TRACE_EVENT(rxrpc_rx_done, + TP_PROTO(int result, int abort_code), + + TP_ARGS(result, abort_code), + + TP_STRUCT__entry( + __field(int, result ) + __field(int, abort_code ) + ), + + TP_fast_assign( + __entry->result = result; + __entry->abort_code = abort_code; + ), + + TP_printk("r=%d a=%d", __entry->result, __entry->abort_code) + ); + TRACE_EVENT(rxrpc_abort, TP_PROTO(const char *why, u32 cid, u32 call_id, rxrpc_seq_t seq, int abort_code, int error), diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 8e624109750a..6c4b7df05e95 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -683,6 +683,7 @@ void rxrpc_data_ready(struct sock *sk) /* dig out the RxRPC connection details */ if (rxrpc_extract_header(sp, skb) < 0) goto bad_message; + trace_rxrpc_rx_packet(sp); _net("Rx RxRPC %s ep=%x call=%x:%x", sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", @@ -767,6 +768,7 @@ discard_unlock: out_unlock: rcu_read_unlock(); out: + trace_rxrpc_rx_done(0, 0); return; cant_route_call: @@ -780,7 +782,7 @@ cant_route_call: skb_queue_tail(&local->accept_queue, skb); rxrpc_queue_work(&local->processor); _leave(" [incoming]"); - return; + goto out; } skb->priority = RX_INVALID_OPERATION; } else { @@ -789,7 +791,7 @@ cant_route_call: if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { _debug("reject type %d",sp->hdr.type); - rxrpc_reject_packet(local, skb); + goto reject_packet; } else { rxrpc_free_skb(skb); } @@ -798,6 +800,8 @@ cant_route_call: bad_message: skb->priority = RX_PROTOCOL_ERROR; +reject_packet: + trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_reject_packet(local, skb); _leave(" [badmsg]"); } -- cgit v1.2.3 From 00e907127e6f86d0f9b122d9b4347a8aa09a8b61 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:12 +0100 Subject: rxrpc: Preallocate peers, conns and calls for incoming service requests Make it possible for the data_ready handler called from the UDP transport socket to completely instantiate an rxrpc_call structure and make it immediately live by preallocating all the memory it might need. The idea is to cut out the background thread usage as much as possible. [Note that the preallocated structs are not actually used in this patch - that will be done in a future patch.] If insufficient resources are available in the preallocation buffers, it will be possible to discard the DATA packet in the data_ready handler or schedule a BUSY packet without the need to schedule an attempt at allocation in a background thread. 
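The preallocation buffers that make this possible are managed as single-producer/single-consumer rings: process context charges objects in at the head, the data_ready handler takes them out at the tail, and the ring itself needs no lock between the two. The following is a rough userspace analogue of that discipline (names and C11 atomics are illustrative only; the kernel code below uses CIRC_CNT(), READ_ONCE() and smp_load_acquire()/smp_store_release(), and stores its indices pre-masked, making the usable depth one less than the ring size):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define RING_SIZE 32	/* a power of two, like RXRPC_BACKLOG_MAX */

struct prealloc_ring {
	void *slot[RING_SIZE];
	_Atomic unsigned int head;	/* advanced only by the charger */
	_Atomic unsigned int tail;	/* advanced only by the consumer */
};

/* Charger (process context): stash one preallocated object, making the
 * slot contents visible before the new head is.
 */
static bool ring_charge(struct prealloc_ring *r, void *obj)
{
	unsigned int head = atomic_load_explicit(&r->head, memory_order_relaxed);
	unsigned int tail = atomic_load_explicit(&r->tail, memory_order_acquire);

	if (head - tail >= RING_SIZE)
		return false;			/* ring is full */
	r->slot[head & (RING_SIZE - 1)] = obj;
	atomic_store_explicit(&r->head, head + 1, memory_order_release);
	return true;
}

/* Consumer (the data_ready handler): take an object without allocating.
 * An empty ring means the incoming call has to be rejected as busy or
 * ignored rather than built from scratch.
 */
static void *ring_take(struct prealloc_ring *r)
{
	unsigned int tail = atomic_load_explicit(&r->tail, memory_order_relaxed);
	unsigned int head = atomic_load_explicit(&r->head, memory_order_acquire);
	void *obj;

	if (head == tail)
		return NULL;			/* nothing preallocated */
	obj = r->slot[tail & (RING_SIZE - 1)];
	atomic_store_explicit(&r->tail, tail + 1, memory_order_release);
	return obj;
}

The acquire/release pairing is what lets the BH-context consumer observe fully initialised objects without taking a lock on the ring.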
To this end: (1) Preallocate rxrpc_peer, rxrpc_connection and rxrpc_call structs to a maximum number each of the listen backlog size. The backlog size is limited to a maximum of 32. Only this many of each can be in the preallocation buffer. (2) For userspace sockets, the preallocation is charged initially by listen() and will be recharged by accepting or rejecting pending new incoming calls. (3) For kernel services, {,re,dis}charging of the preallocation buffers is handled manually. Two notifier callbacks have to be provided before kernel_listen() is invoked: (a) An indication that a new call has been instantiated. This can be used to trigger background recharging. (b) An indication that a call is being discarded. This is used when the socket is being released. A function, rxrpc_kernel_charge_accept(), is called by the kernel service to preallocate a single call. It should be passed the user ID to be used for that call and a callback to associate the rxrpc call with the kernel service's side of the ID. (4) Discard the preallocation when the socket is closed. (5) Temporarily bump the refcount on the call allocated in rxrpc_incoming_call() so that rxrpc_release_call() can ditch the preallocation ref on service calls unconditionally. This will no longer be necessary once the preallocation is used. Note that this does not yet control the number of active service calls on a client - that will come in a later patch. A future development would be to provide a setsockopt() call that allows a userspace server to manually charge the preallocation buffer. This would allow user call IDs to be provided in advance and the awkward manual accept stage to be bypassed. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 71 ++++++++++++++- include/net/af_rxrpc.h | 10 ++- net/rxrpc/af_rxrpc.c | 16 +++- net/rxrpc/ar-internal.h | 32 ++++++- net/rxrpc/call_accept.c | 229 +++++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/call_object.c | 12 ++- net/rxrpc/conn_object.c | 2 + net/rxrpc/conn_service.c | 24 +++++ net/rxrpc/input.c | 2 +- net/rxrpc/proc.c | 8 +- 10 files changed, 391 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 53750dece80e..720ef05a24fe 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -18,6 +18,7 @@ struct socket *afs_socket; /* my RxRPC socket */ static struct workqueue_struct *afs_async_calls; +static struct afs_call *afs_spare_incoming_call; static atomic_t afs_outstanding_calls; static void afs_free_call(struct afs_call *); @@ -26,7 +27,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static int afs_dont_wait_for_call_to_complete(struct afs_call *); static void afs_process_async_call(struct work_struct *); -static void afs_rx_new_call(struct sock *); +static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); +static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); static int afs_deliver_cm_op_id(struct afs_call *); /* synchronous call management */ @@ -54,8 +56,10 @@ static const struct afs_call_type afs_RXCMxxxx = { }; static void afs_collect_incoming_call(struct work_struct *); +static void afs_charge_preallocation(struct work_struct *); static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); +static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); static int afs_wait_atomic_t(atomic_t *p) { @@ -100,13 +104,15 @@ int afs_open_socket(void) if
(ret < 0) goto error_2; - rxrpc_kernel_new_call_notification(socket, afs_rx_new_call); + rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, + afs_rx_discard_new_call); ret = kernel_listen(socket, INT_MAX); if (ret < 0) goto error_2; afs_socket = socket; + afs_charge_preallocation(NULL); _leave(" = 0"); return 0; @@ -126,6 +132,12 @@ void afs_close_socket(void) { _enter(""); + if (afs_spare_incoming_call) { + atomic_inc(&afs_outstanding_calls); + afs_free_call(afs_spare_incoming_call); + afs_spare_incoming_call = NULL; + } + _debug("outstanding %u", atomic_read(&afs_outstanding_calls)); wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t, TASK_UNINTERRUPTIBLE); @@ -635,12 +647,65 @@ static void afs_collect_incoming_call(struct work_struct *work) afs_free_call(call); } +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) +{ + struct afs_call *call = (struct afs_call *)user_call_ID; + + call->rxcall = rxcall; +} + +/* + * Charge the incoming call preallocation. + */ +static void afs_charge_preallocation(struct work_struct *work) +{ + struct afs_call *call = afs_spare_incoming_call; + + for (;;) { + if (!call) { + call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); + if (!call) + break; + + INIT_WORK(&call->async_work, afs_process_async_call); + call->wait_mode = &afs_async_incoming_call; + call->type = &afs_RXCMxxxx; + init_waitqueue_head(&call->waitq); + call->state = AFS_CALL_AWAIT_OP_ID; + } + + if (rxrpc_kernel_charge_accept(afs_socket, + afs_wake_up_async_call, + afs_rx_attach, + (unsigned long)call, + GFP_KERNEL) < 0) + break; + call = NULL; + } + afs_spare_incoming_call = call; +} + +/* + * Discard a preallocated call when a socket is shut down. + */ +static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + struct afs_call *call = (struct afs_call *)user_call_ID; + + atomic_inc(&afs_outstanding_calls); + call->rxcall = NULL; + afs_free_call(call); +} + /* * Notification of an incoming call. 
*/ -static void afs_rx_new_call(struct sock *sk) +static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long user_call_ID) { queue_work(afs_wq, &afs_collect_incoming_call_work); + queue_work(afs_wq, &afs_charge_preallocation_work); } /* diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 08ed8729126c..9cf551be916b 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -21,10 +21,14 @@ struct rxrpc_call; typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_notify_new_call_t)(struct sock *); +typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *, + unsigned long); +typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long); +typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long); void rxrpc_kernel_new_call_notification(struct socket *, - rxrpc_notify_new_call_t); + rxrpc_notify_new_call_t, + rxrpc_discard_new_call_t); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, struct sockaddr_rxrpc *, struct key *, @@ -43,5 +47,7 @@ struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long, int rxrpc_kernel_reject_call(struct socket *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); +int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, + rxrpc_user_attach_call_t, unsigned long, gfp_t); #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index f13cca1e973e..1e8cf3ded81f 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -193,7 +193,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); - unsigned int max; + unsigned int max, old; int ret; _enter("%p,%d", rx, backlog); @@ -212,9 +212,13 @@ static int rxrpc_listen(struct socket *sock, int backlog) backlog = max; else if (backlog < 0 || backlog > max) break; + old = sk->sk_max_ack_backlog; sk->sk_max_ack_backlog = backlog; - rx->sk.sk_state = RXRPC_SERVER_LISTENING; - ret = 0; + ret = rxrpc_service_prealloc(rx, GFP_KERNEL); + if (ret == 0) + rx->sk.sk_state = RXRPC_SERVER_LISTENING; + else + sk->sk_max_ack_backlog = old; break; default: ret = -EBUSY; @@ -303,16 +307,19 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * rxrpc_kernel_new_call_notification - Get notifications of new calls * @sock: The socket to intercept received messages on * @notify_new_call: Function to be called when new calls appear + * @discard_new_call: Function to discard preallocated calls * * Allow a kernel service to be given notifications about new calls. 
*/ void rxrpc_kernel_new_call_notification( struct socket *sock, - rxrpc_notify_new_call_t notify_new_call) + rxrpc_notify_new_call_t notify_new_call, + rxrpc_discard_new_call_t discard_new_call) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); rx->notify_new_call = notify_new_call; + rx->discard_new_call = discard_new_call; } EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); @@ -622,6 +629,7 @@ static int rxrpc_release_sock(struct sock *sk) } /* try to flush out this socket */ + rxrpc_discard_prealloc(rx); rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 027791261768..45e1c269f90e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -63,6 +63,27 @@ enum { RXRPC_CLOSE, /* socket is being closed */ }; +/* + * Service backlog preallocation. + * + * This contains circular buffers of preallocated peers, connections and calls + * for incoming service calls and their head and tail pointers. This allows + * calls to be set up in the data_ready handler, thereby avoiding the need to + * shuffle packets around so much. + */ +struct rxrpc_backlog { + unsigned short peer_backlog_head; + unsigned short peer_backlog_tail; + unsigned short conn_backlog_head; + unsigned short conn_backlog_tail; + unsigned short call_backlog_head; + unsigned short call_backlog_tail; +#define RXRPC_BACKLOG_MAX 32 + struct rxrpc_peer *peer_backlog[RXRPC_BACKLOG_MAX]; + struct rxrpc_connection *conn_backlog[RXRPC_BACKLOG_MAX]; + struct rxrpc_call *call_backlog[RXRPC_BACKLOG_MAX]; +}; + /* * RxRPC socket definition */ @@ -70,13 +91,15 @@ struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ + rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ struct rxrpc_local *local; /* local endpoint */ struct hlist_node listen_link; /* link in the local endpoint's listen list */ struct list_head secureq; /* calls awaiting connection security clearance */ struct list_head acceptq; /* calls awaiting acceptance */ + struct rxrpc_backlog *backlog; /* Preallocation for services */ struct key *key; /* security for this socket */ struct key *securities; /* list of server security descriptors */ - struct rb_root calls; /* outstanding calls on this socket */ + struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ rwlock_t call_lock; /* lock for calls */ @@ -290,6 +313,7 @@ enum rxrpc_conn_cache_state { enum rxrpc_conn_proto_state { RXRPC_CONN_UNUSED, /* Connection not yet attempted */ RXRPC_CONN_CLIENT, /* Client connection */ + RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */ RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */ RXRPC_CONN_SERVICE, /* Service secured connection */ @@ -408,6 +432,7 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ + RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ @@ -534,6 +559,8 @@ extern struct 
workqueue_struct *rxrpc_workqueue; /* * call_accept.c */ +int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); +void rxrpc_discard_prealloc(struct rxrpc_sock *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long, rxrpc_notify_rx_t); @@ -557,6 +584,7 @@ extern struct list_head rxrpc_calls; extern rwlock_t rxrpc_call_lock; struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); +struct rxrpc_call *rxrpc_alloc_call(gfp_t); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, @@ -573,6 +601,7 @@ void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_get_call_for_skb(struct rxrpc_call *, struct sk_buff *); void rxrpc_put_call_for_skb(struct rxrpc_call *, struct sk_buff *); +void rxrpc_cleanup_call(struct rxrpc_call *); void __exit rxrpc_destroy_all_calls(void); static inline bool rxrpc_is_service_call(const struct rxrpc_call *call) @@ -757,6 +786,7 @@ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, struct sockaddr_rxrpc *, struct sk_buff *); +struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 4c71efcf82ed..cc7194e05a15 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -20,11 +20,209 @@ #include #include #include +#include #include #include #include #include "ar-internal.h" +/* + * Preallocate a single service call, connection and peer and, if possible, + * give them a user ID and attach the user's side of the ID to them. + */ +static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, + struct rxrpc_backlog *b, + rxrpc_notify_rx_t notify_rx, + rxrpc_user_attach_call_t user_attach_call, + unsigned long user_call_ID, gfp_t gfp) +{ + const void *here = __builtin_return_address(0); + struct rxrpc_call *call; + int max, tmp; + unsigned int size = RXRPC_BACKLOG_MAX; + unsigned int head, tail, call_head, call_tail; + + max = rx->sk.sk_max_ack_backlog; + tmp = rx->sk.sk_ack_backlog; + if (tmp >= max) { + _leave(" = -ENOBUFS [full %u]", max); + return -ENOBUFS; + } + max -= tmp; + + /* We don't need more conns and peers than we have calls, but on the + * other hand, we shouldn't ever use more peers than conns or conns + * than calls. 
+ */ + call_head = b->call_backlog_head; + call_tail = READ_ONCE(b->call_backlog_tail); + tmp = CIRC_CNT(call_head, call_tail, size); + if (tmp >= max) { + _leave(" = -ENOBUFS [enough %u]", tmp); + return -ENOBUFS; + } + max = tmp + 1; + + head = b->peer_backlog_head; + tail = READ_ONCE(b->peer_backlog_tail); + if (CIRC_CNT(head, tail, size) < max) { + struct rxrpc_peer *peer = rxrpc_alloc_peer(rx->local, gfp); + if (!peer) + return -ENOMEM; + b->peer_backlog[head] = peer; + smp_store_release(&b->peer_backlog_head, + (head + 1) & (size - 1)); + } + + head = b->conn_backlog_head; + tail = READ_ONCE(b->conn_backlog_tail); + if (CIRC_CNT(head, tail, size) < max) { + struct rxrpc_connection *conn; + + conn = rxrpc_prealloc_service_connection(gfp); + if (!conn) + return -ENOMEM; + b->conn_backlog[head] = conn; + smp_store_release(&b->conn_backlog_head, + (head + 1) & (size - 1)); + } + + /* Now it gets complicated, because calls get registered with the + * socket here, particularly if a user ID is preassigned by the user. + */ + call = rxrpc_alloc_call(gfp); + if (!call) + return -ENOMEM; + call->flags |= (1 << RXRPC_CALL_IS_SERVICE); + call->state = RXRPC_CALL_SERVER_PREALLOC; + + trace_rxrpc_call(call, rxrpc_call_new_service, + atomic_read(&call->usage), + here, (const void *)user_call_ID); + + write_lock(&rx->call_lock); + if (user_attach_call) { + struct rxrpc_call *xcall; + struct rb_node *parent, **pp; + + /* Check the user ID isn't already in use */ + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + xcall = rb_entry(parent, struct rxrpc_call, sock_node); + if (user_call_ID < xcall->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > xcall->user_call_ID) + pp = &(*pp)->rb_right; + else + goto id_in_use; + } + + call->user_call_ID = user_call_ID; + call->notify_rx = notify_rx; + rxrpc_get_call(call, rxrpc_call_got); + user_attach_call(call, user_call_ID); + rxrpc_get_call(call, rxrpc_call_got_userid); + rb_link_node(&call->sock_node, parent, pp); + rb_insert_color(&call->sock_node, &rx->calls); + set_bit(RXRPC_CALL_HAS_USERID, &call->flags); + } + + write_unlock(&rx->call_lock); + + write_lock(&rxrpc_call_lock); + list_add_tail(&call->link, &rxrpc_calls); + write_unlock(&rxrpc_call_lock); + + b->call_backlog[call_head] = call; + smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1)); + _leave(" = 0 [%d -> %lx]", call->debug_id, user_call_ID); + return 0; + +id_in_use: + write_unlock(&rx->call_lock); + rxrpc_cleanup_call(call); + _leave(" = -EBADSLT"); + return -EBADSLT; +} + +/* + * Preallocate sufficient service connections, calls and peers to cover the + * entire backlog of a socket. When a new call comes in, if we don't have + * sufficient of each available, the call gets rejected as busy or ignored. + * + * The backlog is replenished when a connection is accepted or rejected. + */ +int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp) +{ + struct rxrpc_backlog *b = rx->backlog; + + if (!b) { + b = kzalloc(sizeof(struct rxrpc_backlog), gfp); + if (!b) + return -ENOMEM; + rx->backlog = b; + } + + if (rx->discard_new_call) + return 0; + + while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp) == 0) + ; + + return 0; +} + +/* + * Discard the preallocation on a service. 
+ */ +void rxrpc_discard_prealloc(struct rxrpc_sock *rx) +{ + struct rxrpc_backlog *b = rx->backlog; + unsigned int size = RXRPC_BACKLOG_MAX, head, tail; + + if (!b) + return; + rx->backlog = NULL; + + head = b->peer_backlog_head; + tail = b->peer_backlog_tail; + while (CIRC_CNT(head, tail, size) > 0) { + struct rxrpc_peer *peer = b->peer_backlog[tail]; + kfree(peer); + tail = (tail + 1) & (size - 1); + } + + head = b->conn_backlog_head; + tail = b->conn_backlog_tail; + while (CIRC_CNT(head, tail, size) > 0) { + struct rxrpc_connection *conn = b->conn_backlog[tail]; + write_lock(&rxrpc_connection_lock); + list_del(&conn->link); + list_del(&conn->proc_link); + write_unlock(&rxrpc_connection_lock); + kfree(conn); + tail = (tail + 1) & (size - 1); + } + + head = b->call_backlog_head; + tail = b->call_backlog_tail; + while (CIRC_CNT(head, tail, size) > 0) { + struct rxrpc_call *call = b->call_backlog[tail]; + if (rx->discard_new_call) { + _debug("discard %lx", call->user_call_ID); + rx->discard_new_call(call, call->user_call_ID); + } + rxrpc_call_completed(call); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + tail = (tail + 1) & (size - 1); + } + + kfree(b); +} + /* * generate a connection-level abort */ @@ -450,3 +648,34 @@ int rxrpc_kernel_reject_call(struct socket *sock) return ret; } EXPORT_SYMBOL(rxrpc_kernel_reject_call); + +/* + * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls + * @sock: The socket on which to preallocate + * @notify_rx: Event notification function for the call + * @user_attach_call: Func to attach call to user_call_ID + * @user_call_ID: The tag to attach to the preallocated call + * @gfp: The allocation conditions. + * + * Charge up the socket with preallocated calls, each with a user ID. A + * function should be provided to effect the attachment from the user's side. + * The user is given a ref to hold on the call. + * + * Note that the call may become connected before this function returns. 
+ */ +int rxrpc_kernel_charge_accept(struct socket *sock, + rxrpc_notify_rx_t notify_rx, + rxrpc_user_attach_call_t user_attach_call, + unsigned long user_call_ID, gfp_t gfp) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct rxrpc_backlog *b = rx->backlog; + + if (sock->sk->sk_state == RXRPC_CLOSE) + return -ESHUTDOWN; + + return rxrpc_service_prealloc_one(rx, b, notify_rx, + user_attach_call, user_call_ID, + gfp); +} +EXPORT_SYMBOL(rxrpc_kernel_charge_accept); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f843397e03b6..d233adc9b5e5 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -31,6 +31,7 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", + [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", @@ -71,7 +72,6 @@ DEFINE_RWLOCK(rxrpc_call_lock); static void rxrpc_call_life_expired(unsigned long _call); static void rxrpc_ack_time_expired(unsigned long _call); static void rxrpc_resend_time_expired(unsigned long _call); -static void rxrpc_cleanup_call(struct rxrpc_call *call); /* * find an extant server call @@ -113,7 +113,7 @@ found_extant_call: /* * allocate a new call */ -static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) +struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) { struct rxrpc_call *call; @@ -392,6 +392,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, if (call_id <= conn->channels[chan].call_counter) goto old_call; /* TODO: Just drop packet */ + /* Temporary: Mirror the backlog prealloc ref (TODO: use prealloc) */ + rxrpc_get_call(candidate, rxrpc_call_got); + /* make the call available */ _debug("new call"); call = candidate; @@ -596,6 +599,9 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) del_timer_sync(&call->ack_timer); del_timer_sync(&call->lifetimer); + /* We have to release the prealloc backlog ref */ + if (rxrpc_is_service_call(call)) + rxrpc_put_call(call, rxrpc_call_put); _leave(""); } @@ -682,7 +688,7 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) /* * clean up a call */ -static void rxrpc_cleanup_call(struct rxrpc_call *call) +void rxrpc_cleanup_call(struct rxrpc_call *call) { _net("DESTROY CALL %d", call->debug_id); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 9c6685b97e70..8da82e3aa00e 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -286,6 +286,8 @@ static void rxrpc_connection_reaper(struct work_struct *work) ASSERTCMP(atomic_read(&conn->usage), >, 0); if (likely(atomic_read(&conn->usage) > 1)) continue; + if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) + continue; idle_timestamp = READ_ONCE(conn->idle_timestamp); _debug("reap CONN %d { u=%d,t=%ld }", diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 316a92107fee..189338a60457 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -118,6 +118,30 @@ replace_old_connection: goto conn_published; } +/* + * Preallocate a service connection. The connection is placed on the proc and + * reap lists so that we don't have to get the lock from BH context. 
+ */ +struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) +{ + struct rxrpc_connection *conn = rxrpc_alloc_connection(gfp); + + if (conn) { + /* We maintain an extra ref on the connection whilst it is on + * the rxrpc_connections list. + */ + conn->state = RXRPC_CONN_SERVICE_PREALLOC; + atomic_set(&conn->usage, 2); + + write_lock(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); + write_unlock(&rxrpc_connection_lock); + } + + return conn; +} + /* * get a record of an incoming connection */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 6c4b7df05e95..5906579060cd 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -102,7 +102,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, rx->notify_new_call) { spin_unlock_bh(&sk->sk_receive_queue.lock); skb_queue_tail(&call->knlrecv_queue, skb); - rx->notify_new_call(&rx->sk); + rx->notify_new_call(&rx->sk, NULL, 0); } else if (call->notify_rx) { spin_unlock_bh(&sk->sk_receive_queue.lock); skb_queue_tail(&call->knlrecv_queue, skb); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index dfad23821a62..d529d1b4021c 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -17,6 +17,7 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { [RXRPC_CONN_UNUSED] = "Unused ", [RXRPC_CONN_CLIENT] = "Client ", + [RXRPC_CONN_SERVICE_PREALLOC] = "SvPrealc", [RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ", [RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ", [RXRPC_CONN_SERVICE] = "SvSecure", @@ -156,6 +157,11 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) } conn = list_entry(v, struct rxrpc_connection, proc_link); + if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) { + strcpy(lbuff, "no_local"); + strcpy(rbuff, "no_connection"); + goto print; + } sprintf(lbuff, "%pI4:%u", &conn->params.local->srx.transport.sin.sin_addr, @@ -164,7 +170,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) sprintf(rbuff, "%pI4:%u", &conn->params.peer->srx.transport.sin.sin_addr, ntohs(conn->params.peer->srx.transport.sin.sin_port)); - +print: seq_printf(seq, "UDP %-22.22s %-22.22s %4x %08x %s %3u" " %s %08x %08x %08x\n", -- cgit v1.2.3 From 248f219cb8bcbfbd7f132752d44afa2df7c241d1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:12 +0100 Subject: rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. 
They can then have packets queued upon them and ACKs generated. (If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded.) (6) sk_buffs no longer take a ref on their parent call. To make this work, the following changes are made: (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer. (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission. Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place. (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top). The hard_ack number is the sequence number before the base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed. Four macros, before(), before_eq(), after() and after_eq(), are added to compare sequence numbers within the window. They use signed 32-bit subtraction (for instance, before(seq1, seq2) is (s32)(seq1 - seq2) < 0), which allows the top of the window to wrap when the hard-ack sequence number gets close to the limit. Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are also added to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase. (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around. (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it. However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed. (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue. Additional changes: (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan). (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though. 
(3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 51 +- include/net/af_rxrpc.h | 3 - include/rxrpc/packet.h | 7 + net/rxrpc/af_rxrpc.c | 57 +- net/rxrpc/ar-internal.h | 177 +++--- net/rxrpc/call_accept.c | 472 +++++++--------- net/rxrpc/call_event.c | 1357 +++++++--------------------------------------- net/rxrpc/call_object.c | 535 +++++------------- net/rxrpc/conn_event.c | 137 +---- net/rxrpc/conn_object.c | 6 +- net/rxrpc/conn_service.c | 101 +--- net/rxrpc/input.c | 1044 ++++++++++++++++++----------------- net/rxrpc/insecure.c | 13 +- net/rxrpc/local_event.c | 2 +- net/rxrpc/local_object.c | 7 - net/rxrpc/misc.c | 2 +- net/rxrpc/output.c | 125 ++++- net/rxrpc/peer_event.c | 17 +- net/rxrpc/peer_object.c | 82 ++- net/rxrpc/recvmsg.c | 764 ++++++++++++++------------ net/rxrpc/rxkad.c | 108 +++- net/rxrpc/security.c | 10 +- net/rxrpc/sendmsg.c | 126 ++--- net/rxrpc/skbuff.c | 127 ----- 24 files changed, 1993 insertions(+), 3337 deletions(-) (limited to 'include') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 720ef05a24fe..59bdaa7527b6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -55,10 +55,8 @@ static const struct afs_call_type afs_RXCMxxxx = { .abort_to_error = afs_abort_to_error, }; -static void afs_collect_incoming_call(struct work_struct *); static void afs_charge_preallocation(struct work_struct *); -static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); static int afs_wait_atomic_t(atomic_t *p) @@ -143,6 +141,8 @@ void afs_close_socket(void) TASK_UNINTERRUPTIBLE); _debug("no outstanding calls"); + flush_workqueue(afs_async_calls); + kernel_sock_shutdown(afs_socket, SHUT_RDWR); flush_workqueue(afs_async_calls); sock_release(afs_socket); @@ -602,51 +602,6 @@ static void afs_process_async_call(struct work_struct *work) _leave(""); } -/* - * accept the backlog of incoming calls - */ -static void afs_collect_incoming_call(struct work_struct *work) -{ - struct rxrpc_call *rxcall; - struct afs_call *call = NULL; - - _enter(""); - - do { - if (!call) { - call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); - if (!call) { - rxrpc_kernel_reject_call(afs_socket); - return; - } - - INIT_WORK(&call->async_work, afs_process_async_call); - call->wait_mode = &afs_async_incoming_call; - call->type = &afs_RXCMxxxx; - init_waitqueue_head(&call->waitq); - call->state = AFS_CALL_AWAIT_OP_ID; - - _debug("CALL %p{%s} [%d]", - call, call->type->name, - atomic_read(&afs_outstanding_calls)); - atomic_inc(&afs_outstanding_calls); - } - - rxcall = rxrpc_kernel_accept_call(afs_socket, - (unsigned long)call, - afs_wake_up_async_call); - if (!IS_ERR(rxcall)) { - call->rxcall = rxcall; - call->need_attention = true; - queue_work(afs_async_calls, &call->async_work); - call = NULL; - } - } while (!call); - - if (call) - afs_free_call(call); -} - static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) { struct afs_call 
*call = (struct afs_call *)user_call_ID; @@ -704,7 +659,7 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long user_call_ID) { - queue_work(afs_wq, &afs_collect_incoming_call_work); + atomic_inc(&afs_outstanding_calls); queue_work(afs_wq, &afs_charge_preallocation_work); } diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 9cf551be916b..1061a472a3e3 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -42,9 +42,6 @@ int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, void rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); -struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long, - rxrpc_notify_rx_t); -int rxrpc_kernel_reject_call(struct socket *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index b0ae5c1a6ce6..fd6eb3a60a8c 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -133,6 +133,13 @@ struct rxrpc_ackpacket { } __packed; +/* Some ACKs refer to specific packets and some are general and can be updated. */ +#define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED) | \ + (1 << RXRPC_ACK_PING_RESPONSE) | \ + (1 << RXRPC_ACK_DELAY) | \ + (1 << RXRPC_ACK_IDLE)) + + /* * ACK packets can have a further piece of information tagged on the end */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 1e8cf3ded81f..caa226dd436e 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -155,7 +155,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) } if (rx->srx.srx_service) { - write_lock_bh(&local->services_lock); + write_lock(&local->services_lock); hlist_for_each_entry(prx, &local->services, listen_link) { if (prx->srx.srx_service == rx->srx.srx_service) goto service_in_use; @@ -163,7 +163,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) rx->local = local; hlist_add_head_rcu(&rx->listen_link, &local->services); - write_unlock_bh(&local->services_lock); + write_unlock(&local->services_lock); rx->sk.sk_state = RXRPC_SERVER_BOUND; } else { @@ -176,7 +176,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) return 0; service_in_use: - write_unlock_bh(&local->services_lock); + write_unlock(&local->services_lock); rxrpc_put_local(local); ret = -EADDRINUSE; error_unlock: @@ -515,15 +515,16 @@ error: static unsigned int rxrpc_poll(struct file *file, struct socket *sock, poll_table *wait) { - unsigned int mask; struct sock *sk = sock->sk; + struct rxrpc_sock *rx = rxrpc_sk(sk); + unsigned int mask; sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; /* the socket is readable if there are any messages waiting on the Rx * queue */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!list_empty(&rx->recvmsg_q)) mask |= POLLIN | POLLRDNORM; /* the socket is writable if there is space to add new data to the @@ -575,8 +576,11 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rx->calls = RB_ROOT; INIT_HLIST_NODE(&rx->listen_link); - INIT_LIST_HEAD(&rx->secureq); - INIT_LIST_HEAD(&rx->acceptq); + spin_lock_init(&rx->incoming_lock); + INIT_LIST_HEAD(&rx->sock_calls); + INIT_LIST_HEAD(&rx->to_be_accepted); + 
INIT_LIST_HEAD(&rx->recvmsg_q); + rwlock_init(&rx->recvmsg_lock); rwlock_init(&rx->call_lock); memset(&rx->srx, 0, sizeof(rx->srx)); @@ -584,6 +588,39 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, return 0; } +/* + * Kill all the calls on a socket and shut it down. + */ +static int rxrpc_shutdown(struct socket *sock, int flags) +{ + struct sock *sk = sock->sk; + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret = 0; + + _enter("%p,%d", sk, flags); + + if (flags != SHUT_RDWR) + return -EOPNOTSUPP; + if (sk->sk_state == RXRPC_CLOSE) + return -ESHUTDOWN; + + lock_sock(sk); + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (sk->sk_state < RXRPC_CLOSE) { + sk->sk_state = RXRPC_CLOSE; + sk->sk_shutdown = SHUTDOWN_MASK; + } else { + ret = -ESHUTDOWN; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + + rxrpc_discard_prealloc(rx); + + release_sock(sk); + return ret; +} + /* * RxRPC socket destructor */ @@ -623,9 +660,9 @@ static int rxrpc_release_sock(struct sock *sk) ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1); if (!hlist_unhashed(&rx->listen_link)) { - write_lock_bh(&rx->local->services_lock); + write_lock(&rx->local->services_lock); hlist_del_rcu(&rx->listen_link); - write_unlock_bh(&rx->local->services_lock); + write_unlock(&rx->local->services_lock); } /* try to flush out this socket */ @@ -678,7 +715,7 @@ static const struct proto_ops rxrpc_rpc_ops = { .poll = rxrpc_poll, .ioctl = sock_no_ioctl, .listen = rxrpc_listen, - .shutdown = sock_no_shutdown, + .shutdown = rxrpc_shutdown, .setsockopt = rxrpc_setsockopt, .getsockopt = sock_no_getsockopt, .sendmsg = rxrpc_sendmsg, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 45e1c269f90e..b1cb79ec4e96 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -94,9 +94,12 @@ struct rxrpc_sock { rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ struct rxrpc_local *local; /* local endpoint */ struct hlist_node listen_link; /* link in the local endpoint's listen list */ - struct list_head secureq; /* calls awaiting connection security clearance */ - struct list_head acceptq; /* calls awaiting acceptance */ struct rxrpc_backlog *backlog; /* Preallocation for services */ + spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ + struct list_head sock_calls; /* List of calls owned by this socket */ + struct list_head to_be_accepted; /* calls awaiting acceptance */ + struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */ + rwlock_t recvmsg_lock; /* Lock for recvmsg_q */ struct key *key; /* security for this socket */ struct key *securities; /* list of server security descriptors */ struct rb_root calls; /* User ID -> call mapping */ @@ -138,13 +141,16 @@ struct rxrpc_host_header { * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { - struct rxrpc_call *call; /* call with which associated */ - unsigned long resend_at; /* time in jiffies at which to resend */ + union { + unsigned long resend_at; /* time in jiffies at which to resend */ + struct { + u8 nr_jumbo; /* Number of jumbo subpackets */ + }; + }; union { unsigned int offset; /* offset into buffer of next read */ int remain; /* amount of space remaining for next write */ u32 error; /* network error code */ - bool need_resend; /* T if needs resending */ }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ @@ -179,7 +185,11 @@ struct rxrpc_security { /* verify the security on a received packet */ int (*verify_packet)(struct rxrpc_call *, struct 
sk_buff *, - rxrpc_seq_t, u16); + unsigned int, unsigned int, rxrpc_seq_t, u16); + + /* Locate the data in a received packet that has been verified. */ + void (*locate_data)(struct rxrpc_call *, struct sk_buff *, + unsigned int *, unsigned int *); /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); @@ -211,7 +221,6 @@ struct rxrpc_local { struct work_struct processor; struct hlist_head services; /* services listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ - struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ struct rb_root client_conns; /* Client connections by socket params */ @@ -388,38 +397,21 @@ struct rxrpc_connection { */ enum rxrpc_call_flag { RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */ - RXRPC_CALL_TERMINAL_MSG, /* call has given the socket its final message */ - RXRPC_CALL_RCVD_LAST, /* all packets received */ - RXRPC_CALL_RUN_RTIMER, /* Tx resend timer started */ - RXRPC_CALL_TX_SOFT_ACK, /* sent some soft ACKs */ - RXRPC_CALL_INIT_ACCEPT, /* acceptance was initiated */ RXRPC_CALL_HAS_USERID, /* has a user ID attached */ - RXRPC_CALL_EXPECT_OOS, /* expect out of sequence packets */ RXRPC_CALL_IS_SERVICE, /* Call is service call */ RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ - RXRPC_CALL_RX_NO_MORE, /* Don't indicate MSG_MORE from recvmsg() */ + RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ + RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ }; /* * Events that can be raised on a call. */ enum rxrpc_call_event { - RXRPC_CALL_EV_RCVD_ACKALL, /* ACKALL or reply received */ - RXRPC_CALL_EV_RCVD_BUSY, /* busy packet received */ - RXRPC_CALL_EV_RCVD_ABORT, /* abort packet received */ - RXRPC_CALL_EV_RCVD_ERROR, /* network error received */ - RXRPC_CALL_EV_ACK_FINAL, /* need to generate final ACK (and release call) */ RXRPC_CALL_EV_ACK, /* need to generate ACK */ - RXRPC_CALL_EV_REJECT_BUSY, /* need to generate busy message */ RXRPC_CALL_EV_ABORT, /* need to generate abort */ - RXRPC_CALL_EV_CONN_ABORT, /* local connection abort generated */ - RXRPC_CALL_EV_RESEND_TIMER, /* Tx resend timer expired */ + RXRPC_CALL_EV_TIMER, /* Timer expired */ RXRPC_CALL_EV_RESEND, /* Tx resend required */ - RXRPC_CALL_EV_DRAIN_RX_OOS, /* drain the Rx out of sequence queue */ - RXRPC_CALL_EV_LIFE_TIMER, /* call's lifetimer ran out */ - RXRPC_CALL_EV_ACCEPTED, /* incoming call accepted by userspace app */ - RXRPC_CALL_EV_SECURED, /* incoming call's connection is now secure */ - RXRPC_CALL_EV_POST_ACCEPT, /* need to post an "accept?" 
message to the app */ }; /* @@ -431,7 +423,6 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ - RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ @@ -448,7 +439,6 @@ enum rxrpc_call_state { */ enum rxrpc_call_completion { RXRPC_CALL_SUCCEEDED, /* - Normal termination */ - RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */ RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ @@ -465,24 +455,23 @@ struct rxrpc_call { struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ - struct timer_list lifetimer; /* lifetime remaining on call */ - struct timer_list ack_timer; /* ACK generation timer */ - struct timer_list resend_timer; /* Tx resend timer */ - struct work_struct processor; /* packet processor and ACK generator */ + unsigned long ack_at; /* When deferred ACK needs to happen */ + unsigned long resend_at; /* When next resend needs to happen */ + unsigned long expire_at; /* When the call times out */ + struct timer_list timer; /* Combined event timer */ + struct work_struct processor; /* Event processor */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ struct list_head chan_wait_link; /* Link in conn->waiting_calls */ struct hlist_node error_link; /* link in error distribution list */ - struct list_head accept_link; /* calls awaiting acceptance */ - struct rb_node sock_node; /* node in socket call tree */ - struct sk_buff_head rx_queue; /* received packets */ - struct sk_buff_head rx_oos_queue; /* packets received out of sequence */ - struct sk_buff_head knlrecv_queue; /* Queue for kernel_recv [TODO: replace this] */ + struct list_head accept_link; /* Link in rx->acceptq */ + struct list_head recvmsg_link; /* Link in rx->recvmsg_q */ + struct list_head sock_link; /* Link in rx->sock_calls */ + struct rb_node sock_node; /* Node in rx->calls */ struct sk_buff *tx_pending; /* Tx socket buffer being filled */ wait_queue_head_t waitq; /* Wait queue for channel or Tx */ __be32 crypto_buf[2]; /* Temporary packet crypto buffer */ unsigned long user_call_ID; /* user-defined call ID */ - unsigned long creation_jif; /* time of call creation */ unsigned long flags; unsigned long events; spinlock_t lock; @@ -492,40 +481,55 @@ struct rxrpc_call { enum rxrpc_call_state state; /* current state of call */ enum rxrpc_call_completion completion; /* Call completion condition */ atomic_t usage; - atomic_t sequence; /* Tx data packet sequence counter */ u16 service_id; /* service ID */ u8 security_ix; /* Security type */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ int debug_id; /* debug ID for printks */ - /* transmission-phase ACK management */ - u8 acks_head; /* offset into window of first entry */ - u8 acks_tail; /* offset into window of last entry */ - u8 acks_winsz; /* size of un-ACK'd window */ - u8 acks_unacked; /* lowest unacked 
packet in last ACK received */ - int acks_latest; /* serial number of latest ACK received */ - rxrpc_seq_t acks_hard; /* highest definitively ACK'd msg seq */ - unsigned long *acks_window; /* sent packet window - * - elements are pointers with LSB set if ACK'd + /* Rx/Tx circular buffer, depending on phase. + * + * In the Rx phase, packets are annotated with 0 or the number of the + * segment of a jumbo packet each buffer refers to. There can be up to + * 47 segments in a maximum-size UDP packet. + * + * In the Tx phase, packets are annotated with which buffers have been + * acked. + */ +#define RXRPC_RXTX_BUFF_SIZE 64 +#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1) + struct sk_buff **rxtx_buffer; + u8 *rxtx_annotations; +#define RXRPC_TX_ANNO_ACK 0 +#define RXRPC_TX_ANNO_UNACK 1 +#define RXRPC_TX_ANNO_NAK 2 +#define RXRPC_TX_ANNO_RETRANS 3 +#define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */ +#define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */ +#define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */ + rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but + * not hard-ACK'd packet follows this. + */ + rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not + * consumed packet follows this. */ + rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */ + rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ + u8 rx_winsize; /* Size of Rx window */ + u8 tx_winsize; /* Maximum size of Tx window */ + u8 nr_jumbo_dup; /* Number of jumbo duplicates */ /* receive-phase ACK management */ - rxrpc_seq_t rx_data_expect; /* next data seq ID expected to be received */ - rxrpc_seq_t rx_data_post; /* next data seq ID expected to be posted */ - rxrpc_seq_t rx_data_recv; /* last data seq ID encountered by recvmsg */ - rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */ - rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */ - rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */ - rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ u8 ackr_reason; /* reason to ACK */ u16 ackr_skew; /* skew on packet being ACK'd */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ - atomic_t ackr_not_idle; /* number of packets in Rx queue */ + rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ + unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ + unsigned short rx_pkt_len; /* Current recvmsg packet len */ - /* received packet records, 1 bit per record */ -#define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG) - unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; + /* transmission-phase ACK management */ + rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ }; enum rxrpc_call_trace { @@ -535,10 +539,8 @@ enum rxrpc_call_trace { rxrpc_call_queued_ref, rxrpc_call_seen, rxrpc_call_got, - rxrpc_call_got_skb, rxrpc_call_got_userid, rxrpc_call_put, - rxrpc_call_put_skb, rxrpc_call_put_userid, rxrpc_call_put_noqueue, rxrpc_call__nr_trace @@ -561,6 +563,9 @@ extern struct workqueue_struct *rxrpc_workqueue; */ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); +struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *, + struct rxrpc_connection *, + struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); struct rxrpc_call 
*rxrpc_accept_call(struct rxrpc_sock *, unsigned long, rxrpc_notify_rx_t); @@ -569,8 +574,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool); -void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool); +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); void rxrpc_process_call(struct work_struct *); /* @@ -589,9 +593,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, unsigned long, gfp_t); -struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, - struct rxrpc_connection *, - struct sk_buff *); +void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, + struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); bool __rxrpc_queue_call(struct rxrpc_call *); @@ -599,8 +602,6 @@ bool rxrpc_queue_call(struct rxrpc_call *); void rxrpc_see_call(struct rxrpc_call *); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); -void rxrpc_get_call_for_skb(struct rxrpc_call *, struct sk_buff *); -void rxrpc_put_call_for_skb(struct rxrpc_call *, struct sk_buff *); void rxrpc_cleanup_call(struct rxrpc_call *); void __exit rxrpc_destroy_all_calls(void); @@ -672,13 +673,8 @@ static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, { trace_rxrpc_abort(why, call->cid, call->call_id, seq, abort_code, error); - if (__rxrpc_set_call_completion(call, - RXRPC_CALL_LOCALLY_ABORTED, - abort_code, error)) { - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - return true; - } - return false; + return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, + abort_code, error); } static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, @@ -713,8 +709,6 @@ void __exit rxrpc_destroy_all_client_connections(void); * conn_event.c */ void rxrpc_process_connection(struct work_struct *); -void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *); -void rxrpc_reject_packets(struct rxrpc_local *); /* * conn_object.c @@ -783,18 +777,14 @@ static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) */ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); -struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, - struct sockaddr_rxrpc *, - struct sk_buff *); struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t); +void rxrpc_new_incoming_connection(struct rxrpc_connection *, struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c */ void rxrpc_data_ready(struct sock *); -int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool); -void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); /* * insecure.c @@ -868,6 +858,7 @@ extern const char *rxrpc_acks(u8 reason); */ int rxrpc_send_call_packet(struct rxrpc_call *, u8); int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *); +void rxrpc_reject_packets(struct rxrpc_local *); /* * peer_event.c @@ -883,6 +874,8 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, struct sockaddr_rxrpc *, gfp_t); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); +struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *, + struct rxrpc_peer *); 
static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) { @@ -912,6 +905,7 @@ extern const struct file_operations rxrpc_connection_seq_fops; /* * recvmsg.c */ +void rxrpc_notify_socket(struct rxrpc_call *); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* @@ -961,6 +955,23 @@ static inline void rxrpc_sysctl_exit(void) {} */ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); +static inline bool before(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) < 0; +} +static inline bool before_eq(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) <= 0; +} +static inline bool after(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) > 0; +} +static inline bool after_eq(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) >= 0; +} + /* * debug tracing */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index cc7194e05a15..b8acec0d596e 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -129,6 +129,8 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, set_bit(RXRPC_CALL_HAS_USERID, &call->flags); } + list_add(&call->sock_link, &rx->sock_calls); + write_unlock(&rx->call_lock); write_lock(&rxrpc_call_lock); @@ -186,6 +188,12 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) return; rx->backlog = NULL; + /* Make sure that there aren't any incoming calls in progress before we + * clear the preallocation buffers. + */ + spin_lock_bh(&rx->incoming_lock); + spin_unlock_bh(&rx->incoming_lock); + head = b->peer_backlog_head; tail = b->peer_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { @@ -224,251 +232,179 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) } /* - * generate a connection-level abort + * Allocate a new incoming call from the prealloc pool, along with a connection + * and a peer as necessary. */ -static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, - struct rxrpc_wire_header *whdr) +static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, + struct rxrpc_local *local, + struct rxrpc_connection *conn, + struct sk_buff *skb) { - struct msghdr msg; - struct kvec iov[1]; - size_t len; - int ret; - - _enter("%d,,", local->debug_id); - - whdr->type = RXRPC_PACKET_TYPE_BUSY; - whdr->serial = htonl(1); - - msg.msg_name = &srx->transport.sin; - msg.msg_namelen = sizeof(srx->transport.sin); - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - iov[0].iov_base = whdr; - iov[0].iov_len = sizeof(*whdr); - - len = iov[0].iov_len; - - _proto("Tx BUSY %%1"); + struct rxrpc_backlog *b = rx->backlog; + struct rxrpc_peer *peer, *xpeer; + struct rxrpc_call *call; + unsigned short call_head, conn_head, peer_head; + unsigned short call_tail, conn_tail, peer_tail; + unsigned short call_count, conn_count; + + /* #calls >= #conns >= #peers must hold true. */ + call_head = smp_load_acquire(&b->call_backlog_head); + call_tail = b->call_backlog_tail; + call_count = CIRC_CNT(call_head, call_tail, RXRPC_BACKLOG_MAX); + conn_head = smp_load_acquire(&b->conn_backlog_head); + conn_tail = b->conn_backlog_tail; + conn_count = CIRC_CNT(conn_head, conn_tail, RXRPC_BACKLOG_MAX); + ASSERTCMP(conn_count, >=, call_count); + peer_head = smp_load_acquire(&b->peer_backlog_head); + peer_tail = b->peer_backlog_tail; + ASSERTCMP(CIRC_CNT(peer_head, peer_tail, RXRPC_BACKLOG_MAX), >=, + conn_count); + + if (call_count == 0) + return NULL; + + if (!conn) { + /* No connection. We're going to need a peer to start off + * with. 
If one doesn't yet exist, use a spare from the + * preallocation set. We dump the address into the spare in + * anticipation - and to save on stack space. + */ + xpeer = b->peer_backlog[peer_tail]; + if (rxrpc_extract_addr_from_skb(&xpeer->srx, skb) < 0) + return NULL; + + peer = rxrpc_lookup_incoming_peer(local, xpeer); + if (peer == xpeer) { + b->peer_backlog[peer_tail] = NULL; + smp_store_release(&b->peer_backlog_tail, + (peer_tail + 1) & + (RXRPC_BACKLOG_MAX - 1)); + } - ret = kernel_sendmsg(local->socket, &msg, iov, 1, len); - if (ret < 0) { - _leave(" = -EAGAIN [sendmsg failed: %d]", ret); - return -EAGAIN; + /* Now allocate and set up the connection */ + conn = b->conn_backlog[conn_tail]; + b->conn_backlog[conn_tail] = NULL; + smp_store_release(&b->conn_backlog_tail, + (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); + rxrpc_get_local(local); + conn->params.local = local; + conn->params.peer = peer; + rxrpc_new_incoming_connection(conn, skb); + } else { + rxrpc_get_connection(conn); } - _leave(" = 0"); - return 0; + /* And now we can allocate and set up a new call */ + call = b->call_backlog[call_tail]; + b->call_backlog[call_tail] = NULL; + smp_store_release(&b->call_backlog_tail, + (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); + + call->conn = conn; + call->peer = rxrpc_get_peer(conn->params.peer); + return call; } /* - * accept an incoming call that needs peer, transport and/or connection setting - * up + * Set up a new incoming call. Called in BH context with the RCU read lock + * held. + * + * If this is for a kernel service, when we allocate the call, it will have + * three refs on it: (1) the kernel service, (2) the user_call_ID tree, (3) the + * retainer ref obtained from the backlog buffer. Prealloc calls for userspace + * services only have the ref from the backlog buffer. We want to pass this + * ref to non-BH context to dispose of. + * + * If we want to report an error, we mark the skb with the packet type and + * abort code and return NULL. 
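+ *
+ * Concretely, as the body below does it: rejection sets skb->mark to
+ * RXRPC_SKB_MARK_LOCAL_ABORT (with the abort code in skb->priority) or
+ * to RXRPC_SKB_MARK_BUSY when no preallocated call is available.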
*/ -static int rxrpc_accept_incoming_call(struct rxrpc_local *local, - struct rxrpc_sock *rx, - struct sk_buff *skb, - struct sockaddr_rxrpc *srx) +struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_connection *conn, + struct sk_buff *skb) { - struct rxrpc_connection *conn; - struct rxrpc_skb_priv *sp, *nsp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_sock *rx; struct rxrpc_call *call; - struct sk_buff *notification; - int ret; _enter(""); - sp = rxrpc_skb(skb); - - /* get a notification message to send to the server app */ - notification = alloc_skb(0, GFP_NOFS); - if (!notification) { - _debug("no memory"); - ret = -ENOMEM; - goto error_nofree; - } - rxrpc_new_skb(notification); - notification->mark = RXRPC_SKB_MARK_NEW_CALL; - - conn = rxrpc_incoming_connection(local, srx, skb); - if (IS_ERR(conn)) { - _debug("no conn"); - ret = PTR_ERR(conn); - goto error; - } - - call = rxrpc_incoming_call(rx, conn, skb); - rxrpc_put_connection(conn); - if (IS_ERR(call)) { - _debug("no call"); - ret = PTR_ERR(call); - goto error; + /* Get the socket providing the service */ + hlist_for_each_entry_rcu_bh(rx, &local->services, listen_link) { + if (rx->srx.srx_service == sp->hdr.serviceId) + goto found_service; } - /* attach the call to the socket */ - read_lock_bh(&local->services_lock); - if (rx->sk.sk_state == RXRPC_CLOSE) - goto invalid_service; - - write_lock(&rx->call_lock); - if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { - rxrpc_get_call(call, rxrpc_call_got); - - spin_lock(&call->conn->state_lock); - if (sp->hdr.securityIndex > 0 && - call->conn->state == RXRPC_CONN_SERVICE_UNSECURED) { - _debug("await conn sec"); - list_add_tail(&call->accept_link, &rx->secureq); - call->conn->state = RXRPC_CONN_SERVICE_CHALLENGING; - set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); - rxrpc_queue_conn(call->conn); - } else { - _debug("conn ready"); - call->state = RXRPC_CALL_SERVER_ACCEPTING; - list_add_tail(&call->accept_link, &rx->acceptq); - rxrpc_get_call_for_skb(call, notification); - nsp = rxrpc_skb(notification); - nsp->call = call; - - ASSERTCMP(atomic_read(&call->usage), >=, 3); - - _debug("notify"); - spin_lock(&call->lock); - ret = rxrpc_queue_rcv_skb(call, notification, true, - false); - spin_unlock(&call->lock); - notification = NULL; - BUG_ON(ret < 0); - } - spin_unlock(&call->conn->state_lock); + trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EOPNOTSUPP); + skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->priority = RX_INVALID_OPERATION; + _leave(" = NULL [service]"); + return NULL; - _debug("queued"); +found_service: + spin_lock(&rx->incoming_lock); + if (rx->sk.sk_state == RXRPC_CLOSE) { + trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber, + sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); + skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->priority = RX_INVALID_OPERATION; + _leave(" = NULL [close]"); + call = NULL; + goto out; } - write_unlock(&rx->call_lock); - _debug("process"); - rxrpc_fast_process_packet(call, skb); - - _debug("done"); - read_unlock_bh(&local->services_lock); - rxrpc_free_skb(notification); - rxrpc_put_call(call, rxrpc_call_put); - _leave(" = 0"); - return 0; - -invalid_service: - _debug("invalid"); - read_unlock_bh(&local->services_lock); - - rxrpc_release_call(rx, call); - rxrpc_put_call(call, rxrpc_call_put); - ret = -ECONNREFUSED; -error: - rxrpc_free_skb(notification); -error_nofree: - _leave(" = %d", ret); - return ret; -} + call = 
rxrpc_alloc_incoming_call(rx, local, conn, skb); + if (!call) { + skb->mark = RXRPC_SKB_MARK_BUSY; + _leave(" = NULL [busy]"); + call = NULL; + goto out; + } -/* - * accept incoming calls that need peer, transport and/or connection setting up - * - the packets we get are all incoming client DATA packets that have seq == 1 - */ -void rxrpc_accept_incoming_calls(struct rxrpc_local *local) -{ - struct rxrpc_skb_priv *sp; - struct sockaddr_rxrpc srx; - struct rxrpc_sock *rx; - struct rxrpc_wire_header whdr; - struct sk_buff *skb; - int ret; + /* Make the call live. */ + rxrpc_incoming_call(rx, call, skb); + conn = call->conn; - _enter("%d", local->debug_id); + if (rx->notify_new_call) + rx->notify_new_call(&rx->sk, call, call->user_call_ID); - skb = skb_dequeue(&local->accept_queue); - if (!skb) { - _leave("\n"); - return; - } + spin_lock(&conn->state_lock); + switch (conn->state) { + case RXRPC_CONN_SERVICE_UNSECURED: + conn->state = RXRPC_CONN_SERVICE_CHALLENGING; + set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); + rxrpc_queue_conn(call->conn); + break; - _net("incoming call skb %p", skb); - - rxrpc_see_skb(skb); - sp = rxrpc_skb(skb); - - /* Set up a response packet header in case we need it */ - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.seq = htonl(sp->hdr.seq); - whdr.serial = 0; - whdr.flags = 0; - whdr.type = 0; - whdr.userStatus = 0; - whdr.securityIndex = sp->hdr.securityIndex; - whdr._rsvd = 0; - whdr.serviceId = htons(sp->hdr.serviceId); - - if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) - goto drop; - - /* get the socket providing the service */ - read_lock_bh(&local->services_lock); - hlist_for_each_entry(rx, &local->services, listen_link) { - if (rx->srx.srx_service == sp->hdr.serviceId && - rx->sk.sk_state != RXRPC_CLOSE) - goto found_service; - } - read_unlock_bh(&local->services_lock); - goto invalid_service; + case RXRPC_CONN_SERVICE: + write_lock(&call->state_lock); + if (rx->discard_new_call) + call->state = RXRPC_CALL_SERVER_RECV_REQUEST; + else + call->state = RXRPC_CALL_SERVER_ACCEPTING; + write_unlock(&call->state_lock); + break; -found_service: - _debug("found service %hd", rx->srx.srx_service); - if (sk_acceptq_is_full(&rx->sk)) - goto backlog_full; - sk_acceptq_added(&rx->sk); - read_unlock_bh(&local->services_lock); - - ret = rxrpc_accept_incoming_call(local, rx, skb, &srx); - if (ret < 0) - sk_acceptq_removed(&rx->sk); - switch (ret) { - case -ECONNRESET: /* old calls are ignored */ - case -ECONNABORTED: /* aborted calls are reaborted or ignored */ - case 0: - return; - case -ECONNREFUSED: - goto invalid_service; - case -EBUSY: - goto busy; - case -EKEYREJECTED: - goto security_mismatch; + case RXRPC_CONN_REMOTELY_ABORTED: + rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, + conn->remote_abort, ECONNABORTED); + break; + case RXRPC_CONN_LOCALLY_ABORTED: + rxrpc_abort_call("CON", call, sp->hdr.seq, + conn->local_abort, ECONNABORTED); + break; default: BUG(); } + spin_unlock(&conn->state_lock); -backlog_full: - read_unlock_bh(&local->services_lock); -busy: - rxrpc_busy(local, &srx, &whdr); - rxrpc_free_skb(skb); - return; - -drop: - rxrpc_free_skb(skb); - return; + if (call->state == RXRPC_CALL_SERVER_ACCEPTING) + rxrpc_notify_socket(call); -invalid_service: - skb->priority = RX_INVALID_OPERATION; - rxrpc_reject_packet(local, skb); - return; - - /* can't change connection security type mid-flow */ -security_mismatch: - skb->priority = RX_PROTOCOL_ERROR; - 
rxrpc_reject_packet(local, skb); - return; + _leave(" = %p{%d}", call, call->debug_id); +out: + spin_unlock(&rx->incoming_lock); + return call; } /* @@ -490,11 +426,10 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, write_lock(&rx->call_lock); ret = -ENODATA; - if (list_empty(&rx->acceptq)) + if (list_empty(&rx->to_be_accepted)) goto out; /* check the user ID isn't already in use */ - ret = -EBADSLT; pp = &rx->calls.rb_node; parent = NULL; while (*pp) { @@ -506,11 +441,14 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, else if (user_call_ID > call->user_call_ID) pp = &(*pp)->rb_right; else - goto out; + goto id_in_use; } - /* dequeue the first call and check it's still valid */ - call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + /* Dequeue the first call and check it's still valid. We gain + * responsibility for the queue's reference. + */ + call = list_entry(rx->to_be_accepted.next, + struct rxrpc_call, accept_link); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); rxrpc_see_call(call); @@ -528,31 +466,35 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, } /* formalise the acceptance */ - rxrpc_get_call(call, rxrpc_call_got_userid); + rxrpc_get_call(call, rxrpc_call_got); call->notify_rx = notify_rx; call->user_call_ID = user_call_ID; + rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) BUG(); - if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) - BUG(); write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); - rxrpc_queue_call(call); + rxrpc_notify_socket(call); + rxrpc_service_prealloc(rx, GFP_KERNEL); _leave(" = %p{%d}", call, call->debug_id); return call; out_release: + _debug("release %p", call); write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); - _debug("release %p", call); rxrpc_release_call(rx, call); - _leave(" = %d", ret); - return ERR_PTR(ret); -out: + rxrpc_put_call(call, rxrpc_call_put); + goto out; + +id_in_use: + ret = -EBADSLT; write_unlock(&rx->call_lock); +out: + rxrpc_service_prealloc(rx, GFP_KERNEL); _leave(" = %d", ret); return ERR_PTR(ret); } @@ -564,6 +506,7 @@ out: int rxrpc_reject_call(struct rxrpc_sock *rx) { struct rxrpc_call *call; + bool abort = false; int ret; _enter(""); @@ -572,15 +515,16 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) write_lock(&rx->call_lock); - ret = -ENODATA; - if (list_empty(&rx->acceptq)) { + if (list_empty(&rx->to_be_accepted)) { write_unlock(&rx->call_lock); - _leave(" = -ENODATA"); return -ENODATA; } - /* dequeue the first call and check it's still valid */ - call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + /* Dequeue the first call and check it's still valid. We gain + * responsibility for the queue's reference. 
+ */ + call = list_entry(rx->to_be_accepted.next, + struct rxrpc_call, accept_link); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); rxrpc_see_call(call); @@ -588,66 +532,28 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) write_lock_bh(&call->state_lock); switch (call->state) { case RXRPC_CALL_SERVER_ACCEPTING: - __rxrpc_set_call_completion(call, RXRPC_CALL_SERVER_BUSY, - 0, ECONNABORTED); - if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) - rxrpc_queue_call(call); - ret = 0; - break; + __rxrpc_abort_call("REJ", call, 1, RX_USER_ABORT, ECONNABORTED); + abort = true; + /* fall through */ case RXRPC_CALL_COMPLETE: ret = call->error; - break; + goto out_discard; default: BUG(); } +out_discard: write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); - rxrpc_release_call(rx, call); - _leave(" = %d", ret); - return ret; -} - -/** - * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call - * @sock: The socket on which the impending call is waiting - * @user_call_ID: The tag to attach to the call - * @notify_rx: Where to send notifications instead of socket queue - * - * Allow a kernel service to accept an incoming call, assuming the incoming - * call is still valid. The caller should immediately trigger their own - * notification as there must be data waiting. - */ -struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock, - unsigned long user_call_ID, - rxrpc_notify_rx_t notify_rx) -{ - struct rxrpc_call *call; - - _enter(",%lx", user_call_ID); - call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID, notify_rx); - _leave(" = %p", call); - return call; -} -EXPORT_SYMBOL(rxrpc_kernel_accept_call); - -/** - * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call - * @sock: The socket on which the impending call is waiting - * - * Allow a kernel service to reject an incoming call with a BUSY message, - * assuming the incoming call is still valid. 
- */ -int rxrpc_kernel_reject_call(struct socket *sock) -{ - int ret; - - _enter(""); - ret = rxrpc_reject_call(rxrpc_sk(sock->sk)); + if (abort) { + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + } + rxrpc_service_prealloc(rx, GFP_KERNEL); _leave(" = %d", ret); return ret; } -EXPORT_SYMBOL(rxrpc_kernel_reject_call); /* * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index af88ad7d2cf9..2b976e789562 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -22,1257 +22,286 @@ #include "ar-internal.h" /* - * propose an ACK be sent + * Set the timer */ -void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u16 skew, u32 serial, bool immediate) +static void rxrpc_set_timer(struct rxrpc_call *call) { - unsigned long expiry; - s8 prior = rxrpc_ack_priority[ack_reason]; - - ASSERTCMP(prior, >, 0); - - _enter("{%d},%s,%%%x,%u", - call->debug_id, rxrpc_acks(ack_reason), serial, immediate); + unsigned long t, now = jiffies; - if (prior < rxrpc_ack_priority[call->ackr_reason]) { - if (immediate) - goto cancel_timer; - return; - } - - /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial - * numbers */ - if (prior == rxrpc_ack_priority[call->ackr_reason]) { - if (prior <= 4) { - call->ackr_skew = skew; - call->ackr_serial = serial; - } - if (immediate) - goto cancel_timer; - return; - } - - call->ackr_reason = ack_reason; - call->ackr_serial = serial; - - switch (ack_reason) { - case RXRPC_ACK_DELAY: - _debug("run delay timer"); - expiry = rxrpc_soft_ack_delay; - goto run_timer; - - case RXRPC_ACK_IDLE: - if (!immediate) { - _debug("run defer timer"); - expiry = rxrpc_idle_ack_delay; - goto run_timer; - } - goto cancel_timer; + _enter("{%ld,%ld,%ld:%ld}", + call->ack_at - now, call->resend_at - now, call->expire_at - now, + call->timer.expires - now); + + read_lock_bh(&call->state_lock); - case RXRPC_ACK_REQUESTED: - expiry = rxrpc_requested_ack_delay; - if (!expiry) - goto cancel_timer; - if (!immediate || serial == 1) { - _debug("run defer timer"); - goto run_timer; + if (call->state < RXRPC_CALL_COMPLETE) { + t = call->ack_at; + if (time_before(call->resend_at, t)) + t = call->resend_at; + if (time_before(call->expire_at, t)) + t = call->expire_at; + if (!timer_pending(&call->timer) || + time_before(t, call->timer.expires)) { + _debug("set timer %ld", t - now); + mod_timer(&call->timer, t); } - - default: - _debug("immediate ACK"); - goto cancel_timer; } - -run_timer: - expiry += jiffies; - if (!timer_pending(&call->ack_timer) || - time_after(call->ack_timer.expires, expiry)) - mod_timer(&call->ack_timer, expiry); - return; - -cancel_timer: - _debug("cancel timer %%%u", serial); - try_to_del_timer_sync(&call->ack_timer); - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) - rxrpc_queue_call(call); read_unlock_bh(&call->state_lock); } /* - * propose an ACK be sent, locking the call structure + * propose an ACK be sent */ -void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u16 skew, u32 serial, bool immediate) +static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + u16 skew, u32 serial, bool immediate, + bool background) { + unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay; s8 prior = rxrpc_ack_priority[ack_reason]; - if (prior > rxrpc_ack_priority[call->ackr_reason]) { - 
spin_lock_bh(&call->lock); - __rxrpc_propose_ACK(call, ack_reason, skew, serial, immediate); - spin_unlock_bh(&call->lock); - } -} - -/* - * set the resend timer - */ -static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, - unsigned long resend_at) -{ - read_lock_bh(&call->state_lock); - if (call->state == RXRPC_CALL_COMPLETE) - resend = 0; - - if (resend & 1) { - _debug("SET RESEND"); - set_bit(RXRPC_CALL_EV_RESEND, &call->events); - } - - if (resend & 2) { - _debug("MODIFY RESEND TIMER"); - set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - mod_timer(&call->resend_timer, resend_at); - } else { - _debug("KILL RESEND TIMER"); - del_timer_sync(&call->resend_timer); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - } - read_unlock_bh(&call->state_lock); -} - -/* - * resend packets - */ -static void rxrpc_resend(struct rxrpc_call *call) -{ - struct rxrpc_wire_header *whdr; - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - bool stop; - int loop; - u8 resend; - - _enter("{%d,%d,%d,%d},", - call->acks_hard, call->acks_unacked, - atomic_read(&call->sequence), - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); - - stop = false; - resend = 0; - resend_at = 0; - - for (loop = call->acks_tail; - loop != call->acks_head || stop; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - if (*p_txb & 1) - continue; - - txb = (struct sk_buff *) *p_txb; - sp = rxrpc_skb(txb); - - if (sp->need_resend) { - sp->need_resend = false; - - /* each Tx packet has a new serial number */ - sp->hdr.serial = atomic_inc_return(&call->conn->serial); - - whdr = (struct rxrpc_wire_header *)txb->head; - whdr->serial = htonl(sp->hdr.serial); - - _proto("Tx DATA %%%u { #%d }", - sp->hdr.serial, sp->hdr.seq); - if (rxrpc_send_data_packet(call->conn, txb) < 0) { - stop = true; - sp->resend_at = jiffies + 3; - } else { - if (rxrpc_is_client_call(call)) - rxrpc_expose_client_call(call); - sp->resend_at = - jiffies + rxrpc_resend_timeout; - } - } - - if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; - } - } - - rxrpc_set_resend(call, resend, resend_at); - _leave(""); -} - -/* - * handle resend timer expiry - */ -static void rxrpc_resend_timer(struct rxrpc_call *call) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - int loop; - u8 resend; - - _enter("%d,%d,%d", - call->acks_tail, call->acks_unacked, call->acks_head); - - if (call->state == RXRPC_CALL_COMPLETE) - return; - - resend = 0; - resend_at = 0; - - for (loop = call->acks_unacked; - loop != call->acks_head; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); - - ASSERT(!(*p_txb & 1)); + _enter("{%d},%s,%%%x,%u", + call->debug_id, rxrpc_acks(ack_reason), serial, immediate); - if (sp->need_resend) { - ; - } else if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; + /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial + * numbers, but we 
don't alter the timeout. + */ + _debug("prior %u %u vs %u %u", + ack_reason, prior, + call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]); + if (ack_reason == call->ackr_reason) { + if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) { + call->ackr_serial = serial; + call->ackr_skew = skew; } + if (!immediate) + return; + } else if (prior > rxrpc_ack_priority[call->ackr_reason]) { + call->ackr_reason = ack_reason; + call->ackr_serial = serial; + call->ackr_skew = skew; } - rxrpc_set_resend(call, resend, resend_at); - _leave(""); -} - -/* - * process soft ACKs of our transmitted packets - * - these indicate packets the peer has or has not received, but hasn't yet - * given to the consumer, and so can still be discarded and re-requested - */ -static int rxrpc_process_soft_ACKs(struct rxrpc_call *call, - struct rxrpc_ackpacket *ack, - struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - int loop; - u8 sacks[RXRPC_MAXACKS], resend; - - _enter("{%d,%d},{%d},", - call->acks_hard, - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz), - ack->nAcks); + switch (ack_reason) { + case RXRPC_ACK_REQUESTED: + if (rxrpc_requested_ack_delay < expiry) + expiry = rxrpc_requested_ack_delay; + if (serial == 1) + immediate = false; + break; - if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0) - goto protocol_error; + case RXRPC_ACK_DELAY: + if (rxrpc_soft_ack_delay < expiry) + expiry = rxrpc_soft_ack_delay; + break; - resend = 0; - resend_at = 0; - for (loop = 0; loop < ack->nAcks; loop++) { - p_txb = call->acks_window; - p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1); - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); + case RXRPC_ACK_IDLE: + if (rxrpc_soft_ack_delay < expiry) + expiry = rxrpc_idle_ack_delay; + break; - switch (sacks[loop]) { - case RXRPC_ACK_TYPE_ACK: - sp->need_resend = false; - *p_txb |= 1; - break; - case RXRPC_ACK_TYPE_NACK: - sp->need_resend = true; - *p_txb &= ~1; - resend = 1; - break; - default: - _debug("Unsupported ACK type %d", sacks[loop]); - goto protocol_error; - } + default: + immediate = true; + break; } - smp_mb(); - call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1); - - /* anything not explicitly ACK'd is implicitly NACK'd, but may just not - * have been received or processed yet by the far end */ - for (loop = call->acks_unacked; - loop != call->acks_head; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); - - if (*p_txb & 1) { - /* packet must have been discarded */ - sp->need_resend = true; - *p_txb &= ~1; - resend |= 1; - } else if (sp->need_resend) { - ; - } else if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; + now = jiffies; + if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { + _debug("already scheduled"); + } else if (immediate || expiry == 0) { + _debug("immediate ACK %lx", call->events); + if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events) && + background) + rxrpc_queue_call(call); + } else { + ack_at = now + expiry; + _debug("deferred ACK %ld < %ld", expiry, call->ack_at - now); + if (time_before(ack_at, call->ack_at)) { + call->ack_at = ack_at; + rxrpc_set_timer(call); } } - - 
rxrpc_set_resend(call, resend, resend_at); - _leave(" = 0"); - return 0; - -protocol_error: - _leave(" = -EPROTO"); - return -EPROTO; } /* - * discard hard-ACK'd packets from the Tx window - */ -static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) -{ - unsigned long _skb; - int tail = call->acks_tail, old_tail; - int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz); - - _enter("{%u,%u},%u", call->acks_hard, win, hard); - - ASSERTCMP(hard - call->acks_hard, <=, win); - - while (call->acks_hard < hard) { - smp_read_barrier_depends(); - _skb = call->acks_window[tail] & ~1; - rxrpc_free_skb((struct sk_buff *) _skb); - old_tail = tail; - tail = (tail + 1) & (call->acks_winsz - 1); - call->acks_tail = tail; - if (call->acks_unacked == old_tail) - call->acks_unacked = tail; - call->acks_hard++; - } - - wake_up(&call->waitq); -} - -/* - * clear the Tx window in the event of a failure + * propose an ACK be sent, locking the call structure */ -static void rxrpc_clear_tx_window(struct rxrpc_call *call) +void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + u16 skew, u32 serial, bool immediate, bool background) { - rxrpc_rotate_tx_window(call, atomic_read(&call->sequence)); + spin_lock_bh(&call->lock); + __rxrpc_propose_ACK(call, ack_reason, skew, serial, + immediate, background); + spin_unlock_bh(&call->lock); } /* - * drain the out of sequence received packet queue into the packet Rx queue + * Perform retransmission of NAK'd and unack'd packets. */ -static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) +static void rxrpc_resend(struct rxrpc_call *call) { + struct rxrpc_wire_header *whdr; struct rxrpc_skb_priv *sp; struct sk_buff *skb; - bool terminal; - int ret; + rxrpc_seq_t cursor, seq, top; + unsigned long resend_at, now; + int ix; + u8 annotation; - _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos); + _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); spin_lock_bh(&call->lock); - ret = -ECONNRESET; - if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) - goto socket_unavailable; + cursor = call->tx_hard_ack; + top = call->tx_top; + ASSERT(before_eq(cursor, top)); + if (cursor == top) + goto out_unlock; + + /* Scan the packet list without dropping the lock and decide which of + * the packets in the Tx buffer we're going to resend and what the new + * resend timeout will be. 
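+	 *
+	 * The annotation lifecycle this scan assumes: RXRPC_TX_ANNO_UNACK
+	 * marks a transmitted-but-unacknowledged packet and
+	 * RXRPC_TX_ANNO_NAK a soft-NAK'd one; either is switched to
+	 * RXRPC_TX_ANNO_RETRANS here (for UNACK, only once its resend_at
+	 * has passed); after a successful resend the slot returns to
+	 * RXRPC_TX_ANNO_UNACK until it is hard-ACK'd out of the ring.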
+ */ + now = jiffies; + resend_at = now + rxrpc_resend_timeout; + seq = cursor + 1; + do { + ix = seq & RXRPC_RXTX_BUFF_MASK; + annotation = call->rxtx_annotations[ix]; + if (annotation == RXRPC_TX_ANNO_ACK) + continue; - skb = skb_dequeue(&call->rx_oos_queue); - if (skb) { + skb = call->rxtx_buffer[ix]; rxrpc_see_skb(skb); sp = rxrpc_skb(skb); - _debug("drain OOS packet %d [%d]", - sp->hdr.seq, call->rx_first_oos); - - if (sp->hdr.seq != call->rx_first_oos) { - skb_queue_head(&call->rx_oos_queue, skb); - call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; - _debug("requeue %p {%u}", skb, call->rx_first_oos); - } else { - skb->mark = RXRPC_SKB_MARK_DATA; - terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && - !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); - ret = rxrpc_queue_rcv_skb(call, skb, true, terminal); - BUG_ON(ret < 0); - _debug("drain #%u", call->rx_data_post); - call->rx_data_post++; - - /* find out what the next packet is */ - skb = skb_peek(&call->rx_oos_queue); - rxrpc_see_skb(skb); - if (skb) - call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; - else - call->rx_first_oos = 0; - _debug("peek %p {%u}", skb, call->rx_first_oos); - } - } - - ret = 0; -socket_unavailable: - spin_unlock_bh(&call->lock); - _leave(" = %d", ret); - return ret; -} - -/* - * insert an out of sequence packet into the buffer - */ -static void rxrpc_insert_oos_packet(struct rxrpc_call *call, - struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp, *psp; - struct sk_buff *p; - u32 seq; - - sp = rxrpc_skb(skb); - seq = sp->hdr.seq; - _enter(",,{%u}", seq); - - skb->destructor = rxrpc_packet_destructor; - ASSERTCMP(sp->call, ==, NULL); - sp->call = call; - rxrpc_get_call_for_skb(call, skb); - - /* insert into the buffer in sequence order */ - spin_lock_bh(&call->lock); - - skb_queue_walk(&call->rx_oos_queue, p) { - psp = rxrpc_skb(p); - if (psp->hdr.seq > seq) { - _debug("insert oos #%u before #%u", seq, psp->hdr.seq); - skb_insert(p, skb, &call->rx_oos_queue); - goto inserted; - } - } - - _debug("append oos #%u", seq); - skb_queue_tail(&call->rx_oos_queue, skb); -inserted: - - /* we might now have a new front to the queue */ - if (call->rx_first_oos == 0 || seq < call->rx_first_oos) - call->rx_first_oos = seq; - - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - call->rx_data_post == call->rx_first_oos) { - _debug("drain rx oos now"); - set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); - } - read_unlock(&call->state_lock); - - spin_unlock_bh(&call->lock); - _leave(" [stored #%u]", call->rx_first_oos); -} - -/* - * clear the Tx window on final ACK reception - */ -static void rxrpc_zap_tx_window(struct rxrpc_call *call) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - unsigned long _skb, *acks_window; - u8 winsz = call->acks_winsz; - int tail; - - acks_window = call->acks_window; - call->acks_window = NULL; - - while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) { - tail = call->acks_tail; - smp_read_barrier_depends(); - _skb = acks_window[tail] & ~1; - smp_mb(); - call->acks_tail = (call->acks_tail + 1) & (winsz - 1); - - skb = (struct sk_buff *) _skb; - sp = rxrpc_skb(skb); - _debug("+++ clear Tx %u", sp->hdr.seq); - rxrpc_free_skb(skb); - } - - kfree(acks_window); -} - -/* - * process the extra information that may be appended to an ACK packet - */ -static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int latest, int nAcks) -{ - struct rxrpc_ackinfo ackinfo; - struct rxrpc_peer *peer; - unsigned int mtu; - - if (skb_copy_bits(skb, nAcks + 
3, &ackinfo, sizeof(ackinfo)) < 0) { - _leave(" [no ackinfo]"); - return; - } - - _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", - latest, - ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU), - ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max)); - - mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU)); - - peer = call->peer; - if (mtu < peer->maxdata) { - spin_lock_bh(&peer->lock); - peer->maxdata = mtu; - peer->mtu = mtu + peer->hdrsize; - spin_unlock_bh(&peer->lock); - _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); - } -} - -/* - * process packets in the reception queue - */ -static int rxrpc_process_rx_queue(struct rxrpc_call *call, - u32 *_abort_code) -{ - struct rxrpc_ackpacket ack; - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - bool post_ACK; - int latest; - u32 hard, tx; - - _enter(""); - -process_further: - skb = skb_dequeue(&call->rx_queue); - if (!skb) - return -EAGAIN; - - rxrpc_see_skb(skb); - _net("deferred skb %p", skb); - - sp = rxrpc_skb(skb); - - _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state); - - post_ACK = false; - - switch (sp->hdr.type) { - /* data packets that wind up here have been received out of - * order, need security processing or are jumbo packets */ - case RXRPC_PACKET_TYPE_DATA: - _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - - /* secured packets must be verified and possibly decrypted */ - if (call->conn->security->verify_packet(call, skb, - sp->hdr.seq, - sp->hdr.cksum) < 0) - goto protocol_error; - - rxrpc_insert_oos_packet(call, skb); - goto process_further; - - /* partial ACK to process */ - case RXRPC_PACKET_TYPE_ACK: - if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) { - _debug("extraction failure"); - goto protocol_error; - } - if (!skb_pull(skb, sizeof(ack))) - BUG(); - - latest = sp->hdr.serial; - hard = ntohl(ack.firstPacket); - tx = atomic_read(&call->sequence); - - _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - latest, - ntohs(ack.maxSkew), - hard, - ntohl(ack.previousPacket), - ntohl(ack.serial), - rxrpc_acks(ack.reason), - ack.nAcks); - - rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks); - - if (ack.reason == RXRPC_ACK_PING) { - _proto("Rx ACK %%%u PING Request", latest); - rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - skb->priority, sp->hdr.serial, true); - } - - /* discard any out-of-order or duplicate ACKs */ - if (latest - call->acks_latest <= 0) { - _debug("discard ACK %d <= %d", - latest, call->acks_latest); - goto discard; - } - call->acks_latest = latest; - - if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && - call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY && - call->state != RXRPC_CALL_SERVER_SEND_REPLY && - call->state != RXRPC_CALL_SERVER_AWAIT_ACK) - goto discard; - - _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state); - - if (hard > 0) { - if (hard - 1 > tx) { - _debug("hard-ACK'd packet %d not transmitted" - " (%d top)", - hard - 1, tx); - goto protocol_error; - } - - if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || - call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && - hard > tx) { - call->acks_hard = tx; - goto all_acked; + if (annotation == RXRPC_TX_ANNO_UNACK) { + if (time_after(sp->resend_at, now)) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + continue; } - - smp_rmb(); - rxrpc_rotate_tx_window(call, hard - 1); - } - - if (ack.nAcks > 0) { - if (hard - 1 + ack.nAcks > tx) { - _debug("soft-ACK'd packet %d+%d not" - " transmitted (%d top)", - hard - 1, ack.nAcks, tx); - goto protocol_error; - } - - 
if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0) - goto protocol_error; } - goto discard; - /* complete ACK to process */ - case RXRPC_PACKET_TYPE_ACKALL: - goto all_acked; - - /* abort and busy are handled elsewhere */ - case RXRPC_PACKET_TYPE_BUSY: - case RXRPC_PACKET_TYPE_ABORT: - BUG(); - - /* connection level events - also handled elsewhere */ - case RXRPC_PACKET_TYPE_CHALLENGE: - case RXRPC_PACKET_TYPE_RESPONSE: - case RXRPC_PACKET_TYPE_DEBUG: - BUG(); - } - - /* if we've had a hard ACK that covers all the packets we've sent, then - * that ends that phase of the operation */ -all_acked: - write_lock_bh(&call->state_lock); - _debug("ack all %d", call->state); - - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; - break; - case RXRPC_CALL_SERVER_AWAIT_ACK: - _debug("srv complete"); - __rxrpc_call_completed(call); - post_ACK = true; - break; - case RXRPC_CALL_CLIENT_SEND_REQUEST: - case RXRPC_CALL_SERVER_RECV_REQUEST: - goto protocol_error_unlock; /* can't occur yet */ - default: - write_unlock_bh(&call->state_lock); - goto discard; /* assume packet left over from earlier phase */ - } - - write_unlock_bh(&call->state_lock); - - /* if all the packets we sent are hard-ACK'd, then we can discard - * whatever we've got left */ - _debug("clear Tx %d", - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); - - del_timer_sync(&call->resend_timer); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - - if (call->acks_window) - rxrpc_zap_tx_window(call); + /* Okay, we need to retransmit a packet. */ + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; + seq++; + } while (before_eq(seq, top)); + + call->resend_at = resend_at; + + /* Now go through the Tx window and perform the retransmissions. We + * have to drop the lock for each send. If an ACK comes in whilst the + * lock is dropped, it may clear some of the retransmission markers for + * packets that it soft-ACKs. 
+ */ + seq = cursor + 1; + do { + ix = seq & RXRPC_RXTX_BUFF_MASK; + annotation = call->rxtx_annotations[ix]; + if (annotation != RXRPC_TX_ANNO_RETRANS) + continue; - if (post_ACK) { - /* post the final ACK message for userspace to pick up */ - _debug("post ACK"); - skb->mark = RXRPC_SKB_MARK_FINAL_ACK; - sp->call = call; - rxrpc_get_call_for_skb(call, skb); - spin_lock_bh(&call->lock); - if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0) - BUG(); + skb = call->rxtx_buffer[ix]; + rxrpc_get_skb(skb); spin_unlock_bh(&call->lock); - goto process_further; - } - -discard: - rxrpc_free_skb(skb); - goto process_further; - -protocol_error_unlock: - write_unlock_bh(&call->state_lock); -protocol_error: - rxrpc_free_skb(skb); - _leave(" = -EPROTO"); - return -EPROTO; -} - -/* - * post a message to the socket Rx queue for recvmsg() to pick up - */ -static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, - bool fatal) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - int ret; - - _enter("{%d,%lx},%u,%u,%d", - call->debug_id, call->flags, mark, error, fatal); - - /* remove timers and things for fatal messages */ - if (fatal) { - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - } + sp = rxrpc_skb(skb); - if (mark != RXRPC_SKB_MARK_NEW_CALL && - !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - _leave("[no userid]"); - return 0; - } + /* Each Tx packet needs a new serial number */ + sp->hdr.serial = atomic_inc_return(&call->conn->serial); - if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { - skb = alloc_skb(0, GFP_NOFS); - if (!skb) - return -ENOMEM; + whdr = (struct rxrpc_wire_header *)skb->head; + whdr->serial = htonl(sp->hdr.serial); - rxrpc_new_skb(skb); + if (rxrpc_send_data_packet(call->conn, skb) < 0) { + call->resend_at = now + 2; + rxrpc_free_skb(skb); + return; + } - skb->mark = mark; - - sp = rxrpc_skb(skb); - memset(sp, 0, sizeof(*sp)); - sp->error = error; - sp->call = call; - rxrpc_get_call_for_skb(call, skb); + if (rxrpc_is_client_call(call)) + rxrpc_expose_client_call(call); + sp->resend_at = now + rxrpc_resend_timeout; + rxrpc_free_skb(skb); spin_lock_bh(&call->lock); - ret = rxrpc_queue_rcv_skb(call, skb, true, fatal); - spin_unlock_bh(&call->lock); - BUG_ON(ret < 0); - } - return 0; + /* We need to clear the retransmit state, but there are two + * things we need to be aware of: A new ACK/NAK might have been + * received and the packet might have been hard-ACK'd (in which + * case it will no longer be in the buffer). + */ + if (after(seq, call->tx_hard_ack) && + (call->rxtx_annotations[ix] == RXRPC_TX_ANNO_RETRANS || + call->rxtx_annotations[ix] == RXRPC_TX_ANNO_NAK)) + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; + + if (after(call->tx_hard_ack, seq)) + seq = call->tx_hard_ack; + seq++; + } while (before_eq(seq, top)); + +out_unlock: + spin_unlock_bh(&call->lock); + _leave(""); } /* - * Handle background processing of incoming call packets and ACK / abort - * generation. A ref on the call is donated to us by whoever queued the work - * item. + * Handle retransmission and deferred ACK/abort generation. 
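+ *
+ * This runs in workqueue context.  After each packet sent the code loops
+ * back to recheck_state, since transmission may itself have completed or
+ * aborted the call.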
*/ void rxrpc_process_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); - struct rxrpc_wire_header whdr; - struct rxrpc_ackpacket ack; - struct rxrpc_ackinfo ackinfo; - struct msghdr msg; - struct kvec iov[5]; - enum rxrpc_call_event genbit; - unsigned long bits; - __be32 data, pad; - size_t len; - bool requeue = false; - int loop, nbit, ioc, ret, mtu; - u32 serial, abort_code = RX_PROTOCOL_ERROR; - u8 *acks = NULL; + unsigned long now; rxrpc_see_call(call); //printk("\n--------------------\n"); - _enter("{%d,%s,%lx} [%lu]", - call->debug_id, rxrpc_call_states[call->state], call->events, - (jiffies - call->creation_jif) / (HZ / 10)); - - if (call->state >= RXRPC_CALL_COMPLETE) { - rxrpc_put_call(call, rxrpc_call_put); - return; - } - - if (!call->conn) - goto skip_msg_init; - - /* there's a good chance we're going to have to send a message, so set - * one up in advance */ - msg.msg_name = &call->peer->srx.transport; - msg.msg_namelen = call->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; + _enter("{%d,%s,%lx}", + call->debug_id, rxrpc_call_states[call->state], call->events); - whdr.epoch = htonl(call->conn->proto.epoch); - whdr.cid = htonl(call->cid); - whdr.callNumber = htonl(call->call_id); - whdr.seq = 0; - whdr.type = RXRPC_PACKET_TYPE_ACK; - whdr.flags = call->conn->out_clientflag; - whdr.userStatus = 0; - whdr.securityIndex = call->conn->security_ix; - whdr._rsvd = 0; - whdr.serviceId = htons(call->service_id); - - memset(iov, 0, sizeof(iov)); - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); -skip_msg_init: - - /* deal with events of a final nature */ - if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) { - enum rxrpc_skb_mark mark; - - clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_ABORT, &call->events); - - if (call->completion == RXRPC_CALL_NETWORK_ERROR) { - mark = RXRPC_SKB_MARK_NET_ERROR; - _debug("post net error %d", call->error); - } else { - mark = RXRPC_SKB_MARK_LOCAL_ERROR; - _debug("post net local error %d", call->error); - } - - if (rxrpc_post_message(call, mark, call->error, true) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); - goto kill_ACKs; - } - - if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) { - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); - - clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_ABORT, &call->events); - - _debug("post conn abort"); - - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - call->error, true) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - goto kill_ACKs; - } - - if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) { - whdr.type = RXRPC_PACKET_TYPE_BUSY; - genbit = RXRPC_CALL_EV_REJECT_BUSY; - goto send_message; - } - - if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) { - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); - - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - call->error, true) < 0) - goto no_mem; - whdr.type = RXRPC_PACKET_TYPE_ABORT; - data = htonl(call->abort_code); - iov[1].iov_base = &data; - iov[1].iov_len = sizeof(data); - genbit = RXRPC_CALL_EV_ABORT; - goto send_message; - } - - if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) { - genbit = RXRPC_CALL_EV_ACK_FINAL; - - ack.bufferSpace = htons(8); - ack.maxSkew = 0; - ack.serial = 0; - ack.reason = RXRPC_ACK_IDLE; - ack.nAcks = 0; - 
call->ackr_reason = 0; - - spin_lock_bh(&call->lock); - ack.serial = htonl(call->ackr_serial); - ack.previousPacket = htonl(call->ackr_prev_seq); - ack.firstPacket = htonl(call->rx_data_eaten + 1); - spin_unlock_bh(&call->lock); - - pad = 0; - - iov[1].iov_base = &ack; - iov[1].iov_len = sizeof(ack); - iov[2].iov_base = &pad; - iov[2].iov_len = 3; - iov[3].iov_base = &ackinfo; - iov[3].iov_len = sizeof(ackinfo); - goto send_ACK; +recheck_state: + if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) { + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + goto recheck_state; } - if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) | - (1 << RXRPC_CALL_EV_RCVD_ABORT)) - ) { - u32 mark; - - if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events)) - mark = RXRPC_SKB_MARK_REMOTE_ABORT; - else - mark = RXRPC_SKB_MARK_BUSY; - - _debug("post abort/busy"); - rxrpc_clear_tx_window(call); - if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0) - goto no_mem; - - clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - goto kill_ACKs; + if (call->state == RXRPC_CALL_COMPLETE) { + del_timer_sync(&call->timer); + goto out_put; } - if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) { - _debug("do implicit ackall"); - rxrpc_clear_tx_window(call); - } - - if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) { + now = jiffies; + if (time_after_eq(now, call->expire_at)) { rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); - - _debug("post timeout"); - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - ETIME, true) < 0) - goto no_mem; - - clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); - goto kill_ACKs; + set_bit(RXRPC_CALL_EV_ABORT, &call->events); } - /* deal with assorted inbound messages */ - if (!skb_queue_empty(&call->rx_queue)) { - ret = rxrpc_process_rx_queue(call, &abort_code); - switch (ret) { - case 0: - case -EAGAIN: - break; - case -ENOMEM: - goto no_mem; - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EPROTO: - rxrpc_abort_call("PRO", call, 0, abort_code, -ret); - goto kill_ACKs; + if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || + time_after_eq(now, call->ack_at)) { + call->ack_at = call->expire_at; + if (call->ackr_reason) { + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + goto recheck_state; } } - /* handle resending */ - if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) - rxrpc_resend_timer(call); - if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events) || + time_after_eq(now, call->resend_at)) { rxrpc_resend(call); - - /* consider sending an ordinary ACK */ - if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { - _debug("send ACK: window: %d - %d { %lx }", - call->rx_data_eaten, call->ackr_win_top, - call->ackr_window[0]); - - if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST && - call->ackr_reason != RXRPC_ACK_PING_RESPONSE) { - /* ACK by sending reply DATA packet in this state */ - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - goto maybe_reschedule; - } - - genbit = RXRPC_CALL_EV_ACK; - - acks = kzalloc(call->ackr_win_top - call->rx_data_eaten, - GFP_NOFS); - if (!acks) - goto no_mem; - - //hdr.flags = RXRPC_SLOW_START_OK; - ack.bufferSpace = htons(8); - ack.maxSkew = 0; - - spin_lock_bh(&call->lock); - ack.reason = call->ackr_reason; - ack.serial = htonl(call->ackr_serial); - ack.previousPacket = htonl(call->ackr_prev_seq); - ack.firstPacket = htonl(call->rx_data_eaten + 1); - - ack.nAcks 
= 0; - for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { - nbit = loop * BITS_PER_LONG; - for (bits = call->ackr_window[loop]; bits; bits >>= 1 - ) { - _debug("- l=%d n=%d b=%lx", loop, nbit, bits); - if (bits & 1) { - acks[nbit] = RXRPC_ACK_TYPE_ACK; - ack.nAcks = nbit + 1; - } - nbit++; - } - } - call->ackr_reason = 0; - spin_unlock_bh(&call->lock); - - pad = 0; - - iov[1].iov_base = &ack; - iov[1].iov_len = sizeof(ack); - iov[2].iov_base = acks; - iov[2].iov_len = ack.nAcks; - iov[3].iov_base = &pad; - iov[3].iov_len = 3; - iov[4].iov_base = &ackinfo; - iov[4].iov_len = sizeof(ackinfo); - - switch (ack.reason) { - case RXRPC_ACK_REQUESTED: - case RXRPC_ACK_DUPLICATE: - case RXRPC_ACK_OUT_OF_SEQUENCE: - case RXRPC_ACK_EXCEEDS_WINDOW: - case RXRPC_ACK_NOSPACE: - case RXRPC_ACK_PING: - case RXRPC_ACK_PING_RESPONSE: - goto send_ACK_with_skew; - case RXRPC_ACK_DELAY: - case RXRPC_ACK_IDLE: - goto send_ACK; - } + goto recheck_state; } - /* handle completion of security negotiations on an incoming - * connection */ - if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) { - _debug("secured"); - spin_lock_bh(&call->lock); - - if (call->state == RXRPC_CALL_SERVER_SECURING) { - struct rxrpc_sock *rx; - _debug("securing"); - rcu_read_lock(); - rx = rcu_dereference(call->socket); - if (rx) { - write_lock(&rx->call_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags)) { - _debug("not released"); - call->state = RXRPC_CALL_SERVER_ACCEPTING; - list_move_tail(&call->accept_link, - &rx->acceptq); - } - write_unlock(&rx->call_lock); - } - rcu_read_unlock(); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); - read_unlock(&call->state_lock); - } - - spin_unlock_bh(&call->lock); - if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) - goto maybe_reschedule; - } - - /* post a notification of an acceptable connection to the app */ - if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) { - _debug("post accept"); - if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL, - 0, false) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); - goto maybe_reschedule; - } - - /* handle incoming call acceptance */ - if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) { - _debug("accepted"); - ASSERTCMP(call->rx_data_post, ==, 0); - call->rx_data_post = 1; - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); - read_unlock_bh(&call->state_lock); - } - - /* drain the out of sequence received packet queue into the packet Rx - * queue */ - if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) { - while (call->rx_data_post == call->rx_first_oos) - if (rxrpc_drain_rx_oos_queue(call) < 0) - break; - goto maybe_reschedule; - } + rxrpc_set_timer(call); /* other events may have been raised since we started checking */ - goto maybe_reschedule; - -send_ACK_with_skew: - ack.maxSkew = htons(call->ackr_skew); -send_ACK: - mtu = call->peer->if_mtu; - mtu -= call->peer->hdrsize; - ackinfo.maxMTU = htonl(mtu); - ackinfo.rwind = htonl(rxrpc_rx_window_size); - - /* permit the peer to send us jumbo packets if it wants to */ - ackinfo.rxMTU = htonl(rxrpc_rx_mtu); - ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max); - - serial = atomic_inc_return(&call->conn->serial); - whdr.serial = htonl(serial); - _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - serial, - ntohs(ack.maxSkew), - ntohl(ack.firstPacket), - 
ntohl(ack.previousPacket), - ntohl(ack.serial), - rxrpc_acks(ack.reason), - ack.nAcks); - - del_timer_sync(&call->ack_timer); - if (ack.nAcks > 0) - set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags); - goto send_message_2; - -send_message: - _debug("send message"); - - serial = atomic_inc_return(&call->conn->serial); - whdr.serial = htonl(serial); - _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial); -send_message_2: - - len = iov[0].iov_len; - ioc = 1; - if (iov[4].iov_len) { - ioc = 5; - len += iov[4].iov_len; - len += iov[3].iov_len; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[3].iov_len) { - ioc = 4; - len += iov[3].iov_len; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[2].iov_len) { - ioc = 3; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[1].iov_len) { - ioc = 2; - len += iov[1].iov_len; - } - - ret = kernel_sendmsg(call->conn->params.local->socket, - &msg, iov, ioc, len); - if (ret < 0) { - _debug("sendmsg failed: %d", ret); - if (call->state < RXRPC_CALL_COMPLETE) - requeue = true; - goto error; - } - - switch (genbit) { - case RXRPC_CALL_EV_ABORT: - clear_bit(genbit, &call->events); - clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - goto kill_ACKs; - - case RXRPC_CALL_EV_ACK_FINAL: - rxrpc_call_completed(call); - goto kill_ACKs; - - default: - clear_bit(genbit, &call->events); - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - case RXRPC_CALL_CLIENT_RECV_REPLY: - case RXRPC_CALL_SERVER_RECV_REQUEST: - case RXRPC_CALL_SERVER_ACK_REQUEST: - _debug("start ACK timer"); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, - call->ackr_skew, call->ackr_serial, - false); - default: - break; - } - goto maybe_reschedule; - } - -kill_ACKs: - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - -maybe_reschedule: - if (call->events || !skb_queue_empty(&call->rx_queue)) { - if (call->state < RXRPC_CALL_COMPLETE) - requeue = true; - } - -error: - kfree(acks); - - if ((requeue || call->events) && !work_pending(&call->processor)) { - _debug("jumpstart %x", call->conn->proto.cid); + if (call->events && call->state < RXRPC_CALL_COMPLETE) { __rxrpc_queue_call(call); - } else { - rxrpc_put_call(call, rxrpc_call_put); + goto out; } +out_put: + rxrpc_put_call(call, rxrpc_call_put); +out: _leave(""); - return; - -no_mem: - _debug("out of memory"); - goto maybe_reschedule; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index d233adc9b5e5..18ab13f82f6e 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -30,7 +30,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", - [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", @@ -43,7 +42,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { [RXRPC_CALL_SUCCEEDED] = "Complete", - [RXRPC_CALL_SERVER_BUSY] = "SvBusy ", [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", [RXRPC_CALL_LOCAL_ERROR] = "LocError", @@ -57,10 +55,8 @@ const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { [rxrpc_call_queued_ref] = "QUR", [rxrpc_call_seen] = "SEE", [rxrpc_call_got] = "GOT", - [rxrpc_call_got_skb] = "Gsk", [rxrpc_call_got_userid] = "Gus", 
[rxrpc_call_put] = "PUT", - [rxrpc_call_put_skb] = "Psk", [rxrpc_call_put_userid] = "Pus", [rxrpc_call_put_noqueue] = "PNQ", }; @@ -69,9 +65,15 @@ struct kmem_cache *rxrpc_call_jar; LIST_HEAD(rxrpc_calls); DEFINE_RWLOCK(rxrpc_call_lock); -static void rxrpc_call_life_expired(unsigned long _call); -static void rxrpc_ack_time_expired(unsigned long _call); -static void rxrpc_resend_time_expired(unsigned long _call); +static void rxrpc_call_timer_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *)_call; + + _enter("%d", call->debug_id); + + if (call->state < RXRPC_CALL_COMPLETE) + rxrpc_queue_call(call); +} /* * find an extant server call @@ -121,27 +123,24 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) if (!call) return NULL; - call->acks_winsz = 16; - call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long), + call->rxtx_buffer = kcalloc(RXRPC_RXTX_BUFF_SIZE, + sizeof(struct sk_buff *), gfp); - if (!call->acks_window) { - kmem_cache_free(rxrpc_call_jar, call); - return NULL; - } + if (!call->rxtx_buffer) + goto nomem; - setup_timer(&call->lifetimer, &rxrpc_call_life_expired, - (unsigned long) call); - setup_timer(&call->ack_timer, &rxrpc_ack_time_expired, - (unsigned long) call); - setup_timer(&call->resend_timer, &rxrpc_resend_time_expired, - (unsigned long) call); + call->rxtx_annotations = kcalloc(RXRPC_RXTX_BUFF_SIZE, sizeof(u8), gfp); + if (!call->rxtx_annotations) + goto nomem_2; + + setup_timer(&call->timer, rxrpc_call_timer_expired, + (unsigned long)call); INIT_WORK(&call->processor, &rxrpc_process_call); INIT_LIST_HEAD(&call->link); INIT_LIST_HEAD(&call->chan_wait_link); INIT_LIST_HEAD(&call->accept_link); - skb_queue_head_init(&call->rx_queue); - skb_queue_head_init(&call->rx_oos_queue); - skb_queue_head_init(&call->knlrecv_queue); + INIT_LIST_HEAD(&call->recvmsg_link); + INIT_LIST_HEAD(&call->sock_link); init_waitqueue_head(&call->waitq); spin_lock_init(&call->lock); rwlock_init(&call->state_lock); @@ -150,63 +149,52 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) memset(&call->sock_node, 0xed, sizeof(call->sock_node)); - call->rx_data_expect = 1; - call->rx_data_eaten = 0; - call->rx_first_oos = 0; - call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size; - call->creation_jif = jiffies; + /* Leave space in the ring to handle a maxed-out jumbo packet */ + call->rx_winsize = RXRPC_RXTX_BUFF_SIZE - 1 - 46; + call->tx_winsize = 16; + call->rx_expect_next = 1; return call; + +nomem_2: + kfree(call->rxtx_buffer); +nomem: + kmem_cache_free(rxrpc_call_jar, call); + return NULL; } /* * Allocate a new client call. */ -static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, - struct sockaddr_rxrpc *srx, +static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; _enter(""); - ASSERT(rx->local != NULL); - call = rxrpc_alloc_call(gfp); if (!call) return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; - call->rx_data_post = 1; call->service_id = srx->srx_service; - rcu_assign_pointer(call->socket, rx); _leave(" = %p", call); return call; } /* - * Begin client call. + * Initiate the call ack/resend/expiry timer. */ -static int rxrpc_begin_client_call(struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) +static void rxrpc_start_call_timer(struct rxrpc_call *call) { - int ret; - - /* Set up or get a connection record and set the protocol parameters, - * including channel number and call ID. 
- */ - ret = rxrpc_connect_call(call, cp, srx, gfp); - if (ret < 0) - return ret; - - spin_lock(&call->conn->params.peer->lock); - hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets); - spin_unlock(&call->conn->params.peer->lock); - - call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; - add_timer(&call->lifetimer); - return 0; + unsigned long expire_at; + + expire_at = jiffies + rxrpc_max_call_lifetime; + call->expire_at = expire_at; + call->ack_at = expire_at; + call->resend_at = expire_at; + call->timer.expires = expire_at; + add_timer(&call->timer); } /* @@ -226,7 +214,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, _enter("%p,%lx", rx, user_call_ID); - call = rxrpc_alloc_client_call(rx, srx, gfp); + call = rxrpc_alloc_client_call(srx, gfp); if (IS_ERR(call)) { _leave(" = %ld", PTR_ERR(call)); return call; @@ -255,19 +243,32 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, goto found_user_ID_now_present; } + rcu_assign_pointer(call->socket, rx); rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); + list_add(&call->sock_link, &rx->sock_calls); + write_unlock(&rx->call_lock); - write_lock_bh(&rxrpc_call_lock); + write_lock(&rxrpc_call_lock); list_add_tail(&call->link, &rxrpc_calls); - write_unlock_bh(&rxrpc_call_lock); + write_unlock(&rxrpc_call_lock); - ret = rxrpc_begin_client_call(call, cp, srx, gfp); + /* Set up or get a connection record and set the protocol parameters, + * including channel number and call ID. + */ + ret = rxrpc_connect_call(call, cp, srx, gfp); if (ret < 0) goto error; + spin_lock_bh(&call->conn->params.peer->lock); + hlist_add_head(&call->error_link, + &call->conn->params.peer->error_targets); + spin_unlock_bh(&call->conn->params.peer->lock); + + rxrpc_start_call_timer(call); + _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); _leave(" = %p [new]", call); @@ -279,9 +280,9 @@ error: write_unlock(&rx->call_lock); rxrpc_put_call(call, rxrpc_call_put_userid); - write_lock_bh(&rxrpc_call_lock); + write_lock(&rxrpc_call_lock); list_del_init(&call->link); - write_unlock_bh(&rxrpc_call_lock); + write_unlock(&rxrpc_call_lock); error_out: __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, @@ -303,142 +304,46 @@ found_user_ID_now_present: } /* - * set up an incoming call - * - called in process context with IRQs enabled + * Set up an incoming call. call->conn points to the connection. + * This is called in BH context and isn't allowed to fail. 
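
The "isn't allowed to fail" constraint above holds because every fallible step is hoisted into process context at preallocation time; the BH-side setup that follows is pure field assignment. A rough userspace sketch of that split (the demo_* names and simplified types are invented for illustration, not kernel API):

#include <stdint.h>
#include <stdlib.h>

struct demo_call { uint32_t cid, call_id; uint16_t service_id; };

/* Process context: allocation may fail, and the caller can cope. */
struct demo_call *demo_prealloc_call(void)
{
        return calloc(1, sizeof(struct demo_call));
}

/* BH context: nothing here can fail, so the function returns void. */
void demo_setup_incoming(struct demo_call *call, uint32_t cid,
                         uint32_t call_id, uint16_t service_id)
{
        call->cid = cid;
        call->call_id = call_id;
        call->service_id = service_id;
}
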
*/ -struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, - struct rxrpc_connection *conn, - struct sk_buff *skb) +void rxrpc_incoming_call(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct sk_buff *skb) { + struct rxrpc_connection *conn = call->conn; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_call *call, *candidate; - const void *here = __builtin_return_address(0); - u32 call_id, chan; - - _enter(",%d", conn->debug_id); - - ASSERT(rx != NULL); - - candidate = rxrpc_alloc_call(GFP_NOIO); - if (!candidate) - return ERR_PTR(-EBUSY); + u32 chan; - trace_rxrpc_call(candidate, rxrpc_call_new_service, - atomic_read(&candidate->usage), here, NULL); + _enter(",%d", call->conn->debug_id); - chan = sp->hdr.cid & RXRPC_CHANNELMASK; - candidate->conn = conn; - candidate->peer = conn->params.peer; - candidate->cid = sp->hdr.cid; - candidate->call_id = sp->hdr.callNumber; - candidate->security_ix = sp->hdr.securityIndex; - candidate->rx_data_post = 0; - candidate->state = RXRPC_CALL_SERVER_ACCEPTING; - candidate->flags |= (1 << RXRPC_CALL_IS_SERVICE); - if (conn->security_ix > 0) - candidate->state = RXRPC_CALL_SERVER_SECURING; - rcu_assign_pointer(candidate->socket, rx); - - spin_lock(&conn->channel_lock); - - /* set the channel for this call */ - call = rcu_dereference_protected(conn->channels[chan].call, - lockdep_is_held(&conn->channel_lock)); - - _debug("channel[%u] is %p", candidate->cid & RXRPC_CHANNELMASK, call); - if (call && call->call_id == sp->hdr.callNumber) { - /* already set; must've been a duplicate packet */ - _debug("extant call [%d]", call->state); - ASSERTCMP(call->conn, ==, conn); - - read_lock(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) - rxrpc_queue_call(call); - case RXRPC_CALL_REMOTELY_ABORTED: - read_unlock(&call->state_lock); - goto aborted_call; - default: - rxrpc_get_call(call, rxrpc_call_got); - read_unlock(&call->state_lock); - goto extant_call; - } - } - - if (call) { - /* it seems the channel is still in use from the previous call - * - ditch the old binding if its call is now complete */ - _debug("CALL: %u { %s }", - call->debug_id, rxrpc_call_states[call->state]); - - if (call->state == RXRPC_CALL_COMPLETE) { - __rxrpc_disconnect_call(conn, call); - } else { - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -EBUSY"); - return ERR_PTR(-EBUSY); - } - } - - /* check the call number isn't duplicate */ - _debug("check dup"); - call_id = sp->hdr.callNumber; - - /* We just ignore calls prior to the current call ID. Terminated calls - * are handled via the connection. + rcu_assign_pointer(call->socket, rx); + call->call_id = sp->hdr.callNumber; + call->service_id = sp->hdr.serviceId; + call->cid = sp->hdr.cid; + call->state = RXRPC_CALL_SERVER_ACCEPTING; + if (sp->hdr.securityIndex > 0) + call->state = RXRPC_CALL_SERVER_SECURING; + + /* Set the channel for this call. We don't get channel_lock as we're + * only defending against the data_ready handler (which we're called + * from) and the RESPONSE packet parser (which is only really + * interested in call_counter and can cope with a disagreement with the + * call pointer). 
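
For context on the channel arithmetic used just below (chan = sp->hdr.cid & RXRPC_CHANNELMASK): an rxrpc connection multiplexes RXRPC_MAXCALLS (four) concurrent calls, so the channel is simply the low bits of the connection ID. A minimal sketch, with invented demo_* names standing in for the kernel constants:

#include <stdint.h>

#define DEMO_MAXCALLS   4U                      /* calls per connection */
#define DEMO_CHANMASK   (DEMO_MAXCALLS - 1)     /* low two bits of the cid */

static unsigned int demo_channel_of(uint32_t cid)
{
        return cid & DEMO_CHANMASK;             /* yields 0..3 */
}
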
*/ - if (call_id <= conn->channels[chan].call_counter) - goto old_call; /* TODO: Just drop packet */ - - /* Temporary: Mirror the backlog prealloc ref (TODO: use prealloc) */ - rxrpc_get_call(candidate, rxrpc_call_got); - - /* make the call available */ - _debug("new call"); - call = candidate; - candidate = NULL; - conn->channels[chan].call_counter = call_id; + chan = sp->hdr.cid & RXRPC_CHANNELMASK; + conn->channels[chan].call_counter = call->call_id; + conn->channels[chan].call_id = call->call_id; rcu_assign_pointer(conn->channels[chan].call, call); - rxrpc_get_connection(conn); - rxrpc_get_peer(call->peer); - spin_unlock(&conn->channel_lock); spin_lock(&conn->params.peer->lock); hlist_add_head(&call->error_link, &conn->params.peer->error_targets); spin_unlock(&conn->params.peer->lock); - write_lock_bh(&rxrpc_call_lock); - list_add_tail(&call->link, &rxrpc_calls); - write_unlock_bh(&rxrpc_call_lock); - - call->service_id = conn->params.service_id; - _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); - call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; - add_timer(&call->lifetimer); - _leave(" = %p {%d} [new]", call, call->debug_id); - return call; - -extant_call: - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1); - return call; - -aborted_call: - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -ECONNABORTED"); - return ERR_PTR(-ECONNABORTED); - -old_call: - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -ECONNRESET [old]"); - return ERR_PTR(-ECONNRESET); + rxrpc_start_call_timer(call); + _leave(""); } /* @@ -497,25 +402,17 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) } /* - * Note the addition of a ref on a call for a socket buffer. + * Detach a call from its owning socket. 
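
One detail worth calling out in the release path that follows: after unlinking the call from the recvmsg queue, the code NULLs recvmsg_link.next/.prev rather than reinitialising them, so that list_empty() on the node can never again return true and rxrpc_notify_socket() can never requeue a released call. A toy model of the trick (demo_* names invented):

#include <stdbool.h>
#include <stddef.h>

struct demo_node { struct demo_node *next, *prev; };

/* How list_empty() tests a node: does it point at itself? */
static bool demo_list_empty(const struct demo_node *n)
{
        return n->next == n;
}

static void demo_detach_for_good(struct demo_node *n)
{
        n->prev->next = n->next;        /* plain unlink, as list_del() */
        n->next->prev = n->prev;
        n->next = NULL;                 /* poisoned: demo_list_empty(n) */
        n->prev = NULL;                 /* is now false forever */
}
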
*/ -void rxrpc_get_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) +void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { - const void *here = __builtin_return_address(0); - int n = atomic_inc_return(&call->usage); + struct rxrpc_connection *conn = call->conn; + bool put = false; + int i; - trace_rxrpc_call(call, rxrpc_call_got_skb, n, here, skb); -} + _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); -/* - * detach a call from a socket and set up for release - */ -void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) -{ - _enter("{%d,%d,%d,%d}", - call->debug_id, atomic_read(&call->usage), - atomic_read(&call->ackr_not_idle), - call->rx_first_oos); + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); rxrpc_see_call(call); @@ -524,80 +421,46 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) BUG(); spin_unlock_bh(&call->lock); - /* dissociate from the socket - * - the socket's ref on the call is passed to the death timer - */ - _debug("RELEASE CALL %p (%d)", call, call->debug_id); + del_timer_sync(&call->timer); - if (call->peer) { - spin_lock(&call->peer->lock); - hlist_del_init(&call->error_link); - spin_unlock(&call->peer->lock); - } + /* Make sure we don't get any more notifications */ + write_lock_bh(&rx->recvmsg_lock); - write_lock_bh(&rx->call_lock); - if (!list_empty(&call->accept_link)) { + if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", call, call->events, call->flags); - ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); - list_del_init(&call->accept_link); - sk_acceptq_removed(&rx->sk); - } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + list_del(&call->recvmsg_link); + put = true; + } + + /* list_empty() must return false in rxrpc_notify_socket() */ + call->recvmsg_link.next = NULL; + call->recvmsg_link.prev = NULL; + + write_unlock_bh(&rx->recvmsg_lock); + if (put) + rxrpc_put_call(call, rxrpc_call_put); + + write_lock(&rx->call_lock); + + if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { rb_erase(&call->sock_node, &rx->calls); memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); - clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); rxrpc_put_call(call, rxrpc_call_put_userid); } - write_unlock_bh(&rx->call_lock); - - /* free up the channel for reuse */ - if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK) { - clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); - rxrpc_call_completed(call); - } else { - write_lock_bh(&call->state_lock); - - if (call->state < RXRPC_CALL_COMPLETE) { - _debug("+++ ABORTING STATE %d +++\n", call->state); - __rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET); - clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); - } - - write_unlock_bh(&call->state_lock); - } - if (call->conn) + list_del(&call->sock_link); + write_unlock(&rx->call_lock); + + _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); + + if (conn) rxrpc_disconnect_call(call); - /* clean up the Rx queue */ - if (!skb_queue_empty(&call->rx_queue) || - !skb_queue_empty(&call->rx_oos_queue)) { - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - - _debug("purge Rx queues"); - - spin_lock_bh(&call->lock); - while ((skb = skb_dequeue(&call->rx_queue)) || - (skb = skb_dequeue(&call->rx_oos_queue))) { - spin_unlock_bh(&call->lock); - - sp = rxrpc_skb(skb); - _debug("- zap %s %%%u #%u", - 
rxrpc_pkts[sp->hdr.type], - sp->hdr.serial, sp->hdr.seq); - rxrpc_free_skb(skb); - spin_lock_bh(&call->lock); - } - spin_unlock_bh(&call->lock); + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { + rxrpc_free_skb(call->rxtx_buffer[i]); + call->rxtx_buffer[i] = NULL; } - rxrpc_purge_queue(&call->knlrecv_queue); - - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - del_timer_sync(&call->lifetimer); /* We have to release the prealloc backlog ref */ if (rxrpc_is_service_call(call)) @@ -611,28 +474,19 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) { struct rxrpc_call *call; - struct rb_node *p; _enter("%p", rx); - read_lock_bh(&rx->call_lock); - - /* kill the not-yet-accepted incoming calls */ - list_for_each_entry(call, &rx->secureq, accept_link) { - rxrpc_release_call(rx, call); - } - - list_for_each_entry(call, &rx->acceptq, accept_link) { - rxrpc_release_call(rx, call); - } - - /* mark all the calls as no longer wanting incoming packets */ - for (p = rb_first(&rx->calls); p; p = rb_next(p)) { - call = rb_entry(p, struct rxrpc_call, sock_node); + while (!list_empty(&rx->sock_calls)) { + call = list_entry(rx->sock_calls.next, + struct rxrpc_call, sock_link); + rxrpc_get_call(call, rxrpc_call_got); + rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); } - read_unlock_bh(&rx->call_lock); _leave(""); } @@ -651,23 +505,12 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); - rxrpc_cleanup_call(call); - } -} + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); -/* - * Release a call ref held by a socket buffer. 
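
The put side of the refcounting is also tightened here: when the last reference goes, the call must already have reached RXRPC_CALL_COMPLETE, and only then is it unlinked and destroyed. In miniature, using C11 atomics (demo_* names invented; a sketch, not the kernel implementation):

#include <assert.h>
#include <stdatomic.h>

struct demo_obj { atomic_int usage; int state; };

#define DEMO_STATE_COMPLETE     1

static void demo_cleanup(struct demo_obj *obj)
{
        /* unlink from global lists and free resources here */
}

static void demo_put(struct demo_obj *obj)
{
        int n = atomic_fetch_sub(&obj->usage, 1) - 1;

        assert(n >= 0);
        if (n == 0) {
                /* Last ref: completion must have happened first. */
                assert(obj->state == DEMO_STATE_COMPLETE);
                demo_cleanup(obj);
        }
}
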
- */ -void rxrpc_put_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) -{ - const void *here = __builtin_return_address(0); - int n; + write_lock(&rxrpc_call_lock); + list_del_init(&call->link); + write_unlock(&rxrpc_call_lock); - n = atomic_dec_return(&call->usage); - trace_rxrpc_call(call, rxrpc_call_put_skb, n, here, skb); - ASSERTCMP(n, >=, 0); - if (n == 0) { - _debug("call %d dead", call->debug_id); rxrpc_cleanup_call(call); } } @@ -679,9 +522,9 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) { struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); - rxrpc_purge_queue(&call->rx_queue); - rxrpc_purge_queue(&call->knlrecv_queue); rxrpc_put_peer(call->peer); + kfree(call->rxtx_buffer); + kfree(call->rxtx_annotations); kmem_cache_free(rxrpc_call_jar, call); } @@ -690,49 +533,24 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) */ void rxrpc_cleanup_call(struct rxrpc_call *call) { - _net("DESTROY CALL %d", call->debug_id); + int i; - write_lock_bh(&rxrpc_call_lock); - list_del_init(&call->link); - write_unlock_bh(&rxrpc_call_lock); + _net("DESTROY CALL %d", call->debug_id); memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); - del_timer_sync(&call->lifetimer); - del_timer_sync(&call->ack_timer); - del_timer_sync(&call->resend_timer); + del_timer_sync(&call->timer); ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); - ASSERT(!work_pending(&call->processor)); ASSERTCMP(call->conn, ==, NULL); - if (call->acks_window) { - _debug("kill Tx window %d", - CIRC_CNT(call->acks_head, call->acks_tail, - call->acks_winsz)); - smp_mb(); - while (CIRC_CNT(call->acks_head, call->acks_tail, - call->acks_winsz) > 0) { - struct rxrpc_skb_priv *sp; - unsigned long _skb; - - _skb = call->acks_window[call->acks_tail] & ~1; - sp = rxrpc_skb((struct sk_buff *)_skb); - _debug("+++ clear Tx %u", sp->hdr.seq); - rxrpc_free_skb((struct sk_buff *)_skb); - call->acks_tail = - (call->acks_tail + 1) & (call->acks_winsz - 1); - } - - kfree(call->acks_window); - } + /* Clean up the Rx/Tx buffer */ + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) + rxrpc_free_skb(call->rxtx_buffer[i]); rxrpc_free_skb(call->tx_pending); - rxrpc_purge_queue(&call->rx_queue); - ASSERT(skb_queue_empty(&call->rx_oos_queue)); - rxrpc_purge_queue(&call->knlrecv_queue); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } @@ -747,8 +565,8 @@ void __exit rxrpc_destroy_all_calls(void) if (list_empty(&rxrpc_calls)) return; - - write_lock_bh(&rxrpc_call_lock); + + write_lock(&rxrpc_call_lock); while (!list_empty(&rxrpc_calls)) { call = list_entry(rxrpc_calls.next, struct rxrpc_call, link); @@ -757,74 +575,15 @@ void __exit rxrpc_destroy_all_calls(void) rxrpc_see_call(call); list_del_init(&call->link); - pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n", + pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", call, atomic_read(&call->usage), - atomic_read(&call->ackr_not_idle), rxrpc_call_states[call->state], call->flags, call->events); - if (!skb_queue_empty(&call->rx_queue)) - pr_err("Rx queue occupied\n"); - if (!skb_queue_empty(&call->rx_oos_queue)) - pr_err("OOS queue occupied\n"); - write_unlock_bh(&rxrpc_call_lock); + write_unlock(&rxrpc_call_lock); cond_resched(); - write_lock_bh(&rxrpc_call_lock); + write_lock(&rxrpc_call_lock); } - write_unlock_bh(&rxrpc_call_lock); - _leave(""); -} - -/* - * handle call lifetime being exceeded - */ -static void rxrpc_call_life_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - 
_enter("{%d}", call->debug_id); - - rxrpc_see_call(call); - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); - rxrpc_queue_call(call); -} - -/* - * handle resend timer expiry - * - may not take call->state_lock as this can deadlock against del_timer_sync() - */ -static void rxrpc_resend_time_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - rxrpc_see_call(call); - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) - rxrpc_queue_call(call); -} - -/* - * handle ACK timer expiry - */ -static void rxrpc_ack_time_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - rxrpc_see_call(call); - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) - rxrpc_queue_call(call); + write_unlock(&rxrpc_call_lock); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 8c7938ba6a84..0691007cfc02 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -15,10 +15,6 @@ #include #include #include -#include -#include -#include -#include #include #include #include @@ -140,16 +136,10 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, u32 abort_code, int error) { struct rxrpc_call *call; - bool queue; - int i, bit; + int i; _enter("{%d},%x", conn->debug_id, abort_code); - if (compl == RXRPC_CALL_LOCALLY_ABORTED) - bit = RXRPC_CALL_EV_CONN_ABORT; - else - bit = RXRPC_CALL_EV_RCVD_ABORT; - spin_lock(&conn->channel_lock); for (i = 0; i < RXRPC_MAXCALLS; i++) { @@ -157,22 +147,13 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, conn->channels[i].call, lockdep_is_held(&conn->channel_lock)); if (call) { - rxrpc_see_call(call); if (compl == RXRPC_CALL_LOCALLY_ABORTED) trace_rxrpc_abort("CON", call->cid, call->call_id, 0, abort_code, error); - - write_lock_bh(&call->state_lock); - if (rxrpc_set_call_completion(call, compl, abort_code, - error)) { - set_bit(bit, &call->events); - queue = true; - } - write_unlock_bh(&call->state_lock); - if (queue) - rxrpc_queue_call(call); - + if (rxrpc_set_call_completion(call, compl, + abort_code, error)) + rxrpc_notify_socket(call); } } @@ -251,17 +232,18 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, /* * mark a call as being on a now-secured channel - * - must be called with softirqs disabled + * - must be called with BH's disabled. 
*/ static void rxrpc_call_is_secure(struct rxrpc_call *call) { _enter("%p", call); if (call) { - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events)) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); + write_lock_bh(&call->state_lock); + if (call->state == RXRPC_CALL_SERVER_SECURING) { + call->state = RXRPC_CALL_SERVER_ACCEPTING; + rxrpc_notify_socket(call); + } + write_unlock_bh(&call->state_lock); } } @@ -278,7 +260,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, int loop, ret; if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - kleave(" = -ECONNABORTED [%u]", conn->state); + _leave(" = -ECONNABORTED [%u]", conn->state); return -ECONNABORTED; } @@ -291,14 +273,14 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return 0; case RXRPC_PACKET_TYPE_ABORT: - if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) + if (skb_copy_bits(skb, sp->offset, &wtmp, sizeof(wtmp)) < 0) return -EPROTO; abort_code = ntohl(wtmp); _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); conn->state = RXRPC_CONN_REMOTELY_ABORTED; - rxrpc_abort_calls(conn, 0, RXRPC_CALL_REMOTELY_ABORTED, - abort_code); + rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, + abort_code, ECONNABORTED); return -ECONNABORTED; case RXRPC_PACKET_TYPE_CHALLENGE: @@ -323,14 +305,16 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { conn->state = RXRPC_CONN_SERVICE; + spin_unlock(&conn->state_lock); for (loop = 0; loop < RXRPC_MAXCALLS; loop++) rxrpc_call_is_secure( rcu_dereference_protected( conn->channels[loop].call, lockdep_is_held(&conn->channel_lock))); + } else { + spin_unlock(&conn->state_lock); } - spin_unlock(&conn->state_lock); spin_unlock(&conn->channel_lock); return 0; @@ -433,88 +417,3 @@ protocol_error: _leave(" [EPROTO]"); goto out; } - -/* - * put a packet up for transport-level abort - */ -void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) -{ - CHECK_SLAB_OKAY(&local->usage); - - skb_queue_tail(&local->reject_queue, skb); - rxrpc_queue_local(local); -} - -/* - * reject packets through the local endpoint - */ -void rxrpc_reject_packets(struct rxrpc_local *local) -{ - union { - struct sockaddr sa; - struct sockaddr_in sin; - } sa; - struct rxrpc_skb_priv *sp; - struct rxrpc_wire_header whdr; - struct sk_buff *skb; - struct msghdr msg; - struct kvec iov[2]; - size_t size; - __be32 code; - - _enter("%d", local->debug_id); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = &code; - iov[1].iov_len = sizeof(code); - size = sizeof(whdr) + sizeof(code); - - msg.msg_name = &sa; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - memset(&sa, 0, sizeof(sa)); - sa.sa.sa_family = local->srx.transport.family; - switch (sa.sa.sa_family) { - case AF_INET: - msg.msg_namelen = sizeof(sa.sin); - break; - default: - msg.msg_namelen = 0; - break; - } - - memset(&whdr, 0, sizeof(whdr)); - whdr.type = RXRPC_PACKET_TYPE_ABORT; - - while ((skb = skb_dequeue(&local->reject_queue))) { - rxrpc_see_skb(skb); - sp = rxrpc_skb(skb); - switch (sa.sa.sa_family) { - case AF_INET: - sa.sin.sin_port = udp_hdr(skb)->source; - sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; - code = htonl(skb->priority); - - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.serviceId = htons(sp->hdr.serviceId); - whdr.flags = 
sp->hdr.flags; - whdr.flags ^= RXRPC_CLIENT_INITIATED; - whdr.flags &= RXRPC_CLIENT_INITIATED; - - kernel_sendmsg(local->socket, &msg, iov, 2, size); - break; - - default: - break; - } - - rxrpc_free_skb(skb); - } - - _leave(""); -} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 8da82e3aa00e..ffa9addb97b2 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -169,7 +169,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, chan->last_abort = call->abort_code; chan->last_type = RXRPC_PACKET_TYPE_ABORT; } else { - chan->last_seq = call->rx_data_eaten; + chan->last_seq = call->rx_hard_ack; chan->last_type = RXRPC_PACKET_TYPE_ACK; } /* Sync with rxrpc_conn_retransmit(). */ @@ -191,6 +191,10 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; + spin_lock_bh(&conn->params.peer->lock); + hlist_del_init(&call->error_link); + spin_unlock_bh(&conn->params.peer->lock); + if (rxrpc_is_client_call(call)) return rxrpc_disconnect_client_call(call); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 189338a60457..83d54da4ce8b 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -65,9 +65,8 @@ done: * Insert a service connection into a peer's tree, thereby making it a target * for incoming packets. */ -static struct rxrpc_connection * -rxrpc_publish_service_conn(struct rxrpc_peer *peer, - struct rxrpc_connection *conn) +static void rxrpc_publish_service_conn(struct rxrpc_peer *peer, + struct rxrpc_connection *conn) { struct rxrpc_connection *cursor = NULL; struct rxrpc_conn_proto k = conn->proto; @@ -96,7 +95,7 @@ conn_published: set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags); write_sequnlock_bh(&peer->service_conn_lock); _leave(" = %d [new]", conn->debug_id); - return conn; + return; found_extant_conn: if (atomic_read(&cursor->usage) == 0) @@ -143,106 +142,30 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) } /* - * get a record of an incoming connection + * Set up an incoming connection. This is called in BH context with the RCU + * read lock held. */ -struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, - struct sockaddr_rxrpc *srx, - struct sk_buff *skb) +void rxrpc_new_incoming_connection(struct rxrpc_connection *conn, + struct sk_buff *skb) { - struct rxrpc_connection *conn; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_peer *peer; - const char *new = "old"; _enter(""); - peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); - if (!peer) { - _debug("no peer"); - return ERR_PTR(-EBUSY); - } - - ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED); - - rcu_read_lock(); - peer = rxrpc_lookup_peer_rcu(local, srx); - if (peer) { - conn = rxrpc_find_service_conn_rcu(peer, skb); - if (conn) { - if (sp->hdr.securityIndex != conn->security_ix) - goto security_mismatch_rcu; - if (rxrpc_get_connection_maybe(conn)) - goto found_extant_connection_rcu; - - /* The conn has expired but we can't remove it without - * the appropriate lock, so we attempt to replace it - * when we have a new candidate. - */ - } - - if (!rxrpc_get_peer_maybe(peer)) - peer = NULL; - } - rcu_read_unlock(); - - if (!peer) { - peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); - if (!peer) - goto enomem; - } - - /* We don't have a matching record yet. 
*/ - conn = rxrpc_alloc_connection(GFP_NOIO); - if (!conn) - goto enomem_peer; - conn->proto.epoch = sp->hdr.epoch; conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; - conn->params.local = local; - conn->params.peer = peer; conn->params.service_id = sp->hdr.serviceId; conn->security_ix = sp->hdr.securityIndex; conn->out_clientflag = 0; - conn->state = RXRPC_CONN_SERVICE; - if (conn->params.service_id) + if (conn->security_ix) conn->state = RXRPC_CONN_SERVICE_UNSECURED; - - rxrpc_get_local(local); - - /* We maintain an extra ref on the connection whilst it is on - * the rxrpc_connections list. - */ - atomic_set(&conn->usage, 2); - - write_lock(&rxrpc_connection_lock); - list_add_tail(&conn->link, &rxrpc_connections); - list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); - write_unlock(&rxrpc_connection_lock); + else + conn->state = RXRPC_CONN_SERVICE; /* Make the connection a target for incoming packets. */ - rxrpc_publish_service_conn(peer, conn); - - new = "new"; - -success: - _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid); - _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); - return conn; - -found_extant_connection_rcu: - rcu_read_unlock(); - goto success; - -security_mismatch_rcu: - rcu_read_unlock(); - _leave(" = -EKEYREJECTED"); - return ERR_PTR(-EKEYREJECTED); + rxrpc_publish_service_conn(conn->params.peer, conn); -enomem_peer: - rxrpc_put_peer(peer); -enomem: - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); + _net("CONNECTION new %d {%x}", conn->debug_id, conn->proto.cid); } /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 5906579060cd..afeba98004b1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1,6 +1,6 @@ /* RxRPC packet reception * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -27,549 +27,547 @@ #include #include "ar-internal.h" +static void rxrpc_proto_abort(const char *why, + struct rxrpc_call *call, rxrpc_seq_t seq) +{ + if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, EBADMSG)) { + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + rxrpc_queue_call(call); + } +} + /* - * queue a packet for recvmsg to pass to userspace - * - the caller must hold a lock on call->lock - * - must not be called with interrupts disabled (sk_filter() disables BH's) - * - eats the packet whether successful or not - * - there must be just one reference to the packet, which the caller passes to - * this function + * Apply a hard ACK by advancing the Tx window. 
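
The Tx side of the new scheme keeps a fixed power-of-two ring indexed by sequence number; hard-ACKing is just walking the hard-ack pointer forward and freeing each slot it passes. A self-contained sketch of that rotation (demo_* names invented; the wrap-safe comparison mirrors the kernel's before()):

#include <stdint.h>

#define DEMO_BUFF_SIZE  64U                     /* must be a power of two */
#define DEMO_BUFF_MASK  (DEMO_BUFF_SIZE - 1)

struct demo_tx {
        void *buffer[DEMO_BUFF_SIZE];           /* slot for seq is seq & mask */
        uint32_t tx_hard_ack;                   /* last seq the peer consumed */
};

static void demo_free_pkt(void *pkt)
{
        /* release the packet buffer */
}

/* Advance the hard-ACK point to 'to', releasing every slot it passes. */
static void demo_rotate_tx_window(struct demo_tx *tx, uint32_t to)
{
        while ((int32_t)(tx->tx_hard_ack - to) < 0) {   /* before(ack, to) */
                uint32_t ix = ++tx->tx_hard_ack & DEMO_BUFF_MASK;

                demo_free_pkt(tx->buffer[ix]);
                tx->buffer[ix] = NULL;
        }
}
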
*/ -int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, - bool force, bool terminal) +static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) { - struct rxrpc_skb_priv *sp; - struct rxrpc_sock *rx; - struct sock *sk; - int ret; + struct sk_buff *skb, *list = NULL; + int ix; - _enter(",,%d,%d", force, terminal); + spin_lock(&call->lock); - ASSERT(!irqs_disabled()); + while (before(call->tx_hard_ack, to)) { + call->tx_hard_ack++; + ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; + skb = call->rxtx_buffer[ix]; + rxrpc_see_skb(skb); + call->rxtx_buffer[ix] = NULL; + call->rxtx_annotations[ix] = 0; + skb->next = list; + list = skb; + } - sp = rxrpc_skb(skb); - ASSERTCMP(sp->call, ==, call); + spin_unlock(&call->lock); - /* if we've already posted the terminal message for a call, then we - * don't post any more */ - if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { - _debug("already terminated"); - ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE); + while (list) { + skb = list; + list = skb->next; + skb->next = NULL; rxrpc_free_skb(skb); - return 0; } +} - /* The socket may go away under us */ - ret = 0; - rcu_read_lock(); - rx = rcu_dereference(call->socket); - if (!rx) - goto out; - sk = &rx->sk; - if (sock_flag(sk, SOCK_DEAD)) - goto out; +/* + * End the transmission phase of a call. + * + * This occurs when we get an ACKALL packet, the first DATA packet of a reply, + * or a final ACK packet. + */ +static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) +{ + _enter(""); - if (!force) { - /* cast skb->rcvbuf to unsigned... It's pointless, but - * reduces number of warnings when compiling with -W - * --ANK */ -// ret = -ENOBUFS; -// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= -// (unsigned int) sk->sk_rcvbuf) -// goto out; - - ret = sk_filter(sk, skb); - if (ret < 0) - goto out; + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + return true; + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + case RXRPC_CALL_SERVER_AWAIT_ACK: + break; + default: + rxrpc_proto_abort(abort_why, call, call->tx_top); + return false; } - spin_lock_bh(&sk->sk_receive_queue.lock); - if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) && - !test_bit(RXRPC_CALL_RELEASED, &call->flags) && - sk->sk_state != RXRPC_CLOSE) { - skb->destructor = rxrpc_packet_destructor; - skb->dev = NULL; - skb->sk = sk; - atomic_add(skb->truesize, &sk->sk_rmem_alloc); - - if (terminal) { - _debug("<<<< TERMINAL MESSAGE >>>>"); - set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags); - } + rxrpc_rotate_tx_window(call, call->tx_top); - /* allow interception by a kernel service */ - if (skb->mark == RXRPC_SKB_MARK_NEW_CALL && - rx->notify_new_call) { - spin_unlock_bh(&sk->sk_receive_queue.lock); - skb_queue_tail(&call->knlrecv_queue, skb); - rx->notify_new_call(&rx->sk, NULL, 0); - } else if (call->notify_rx) { - spin_unlock_bh(&sk->sk_receive_queue.lock); - skb_queue_tail(&call->knlrecv_queue, skb); - call->notify_rx(&rx->sk, call, call->user_call_ID); - } else { - _net("post skb %p", skb); - __skb_queue_tail(&sk->sk_receive_queue, skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); + write_lock(&call->state_lock); - sk->sk_data_ready(sk); - } - skb = NULL; - } else { - spin_unlock_bh(&sk->sk_receive_queue.lock); + switch (call->state) { + default: + break; + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + break; + case RXRPC_CALL_SERVER_AWAIT_ACK: + __rxrpc_call_completed(call); + rxrpc_notify_socket(call); + break; } - ret = 0; -out: - 
rxrpc_free_skb(skb); - rcu_read_unlock(); + write_unlock(&call->state_lock); + _leave(" = ok"); + return true; +} + +/* + * Scan a jumbo packet to validate its structure and to work out how many + * subpackets it contains. + * + * A jumbo packet is a collection of consecutive packets glued together with + * little headers between that indicate how to change the initial header for + * each subpacket. + * + * RXRPC_JUMBO_PACKET must be set on all but the last subpacket - and all but + * the last are RXRPC_JUMBO_DATALEN in size. The last subpacket may be of any + * size. + */ +static bool rxrpc_validate_jumbo(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int offset = sp->offset; + unsigned int len = skb->data_len; + int nr_jumbo = 1; + u8 flags = sp->hdr.flags; + + do { + nr_jumbo++; + if (len - offset < RXRPC_JUMBO_SUBPKTLEN) + goto protocol_error; + if (flags & RXRPC_LAST_PACKET) + goto protocol_error; + offset += RXRPC_JUMBO_DATALEN; + if (skb_copy_bits(skb, offset, &flags, 1) < 0) + goto protocol_error; + offset += sizeof(struct rxrpc_jumbo_header); + } while (flags & RXRPC_JUMBO_PACKET); + + sp->nr_jumbo = nr_jumbo; + return true; - _leave(" = %d", ret); - return ret; +protocol_error: + return false; } /* - * process a DATA packet, posting the packet to the appropriate queue - * - eats the packet if successful + * Handle reception of a duplicate packet. + * + * We have to take care to avoid an attack here whereby we're given a series of + * jumbograms, each with a sequence number one before the preceding one and + * filled up to maximum UDP size. If they never send us the first packet in + * the sequence, they can cause us to have to hold on to around 2MiB of kernel + * space until the call times out. + * + * We limit the space usage by only accepting three duplicate jumbo packets per + * call. After that, we tell the other side we're no longer accepting jumbos + * (that information is encoded in the ACK packet). */ -static int rxrpc_fast_process_data(struct rxrpc_call *call, - struct sk_buff *skb, u32 seq) +static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq, + u8 annotation, bool *_jumbo_dup) { - struct rxrpc_skb_priv *sp; - bool terminal; - int ret, ackbit, ack; - u32 serial; - u16 skew; - u8 flags; + /* Discard normal packets that are duplicates. */ + if (annotation == 0) + return; - _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq); + /* Skip jumbo subpackets that are duplicates. When we've had three or + * more partially duplicate jumbo packets, we refuse to take any more + * jumbos for this call. + */ + if (!*_jumbo_dup) { + call->nr_jumbo_dup++; + *_jumbo_dup = true; + } +} - sp = rxrpc_skb(skb); - ASSERTCMP(sp->call, ==, NULL); - flags = sp->hdr.flags; - serial = sp->hdr.serial; - skew = skb->priority; +/* + * Process a DATA packet, adding the packet to the Rx ring. 
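
rxrpc_input_data() below publishes each packet into the same kind of ring on the Rx side, and the ordering matters: the annotation is written before the buffer slot, and rx_top is only advanced afterwards, so a reader that observes the new rx_top also observes the data (the smp_wmb()/smp_store_release() pair in the function). A C11 analogue of that publish step, assuming a single producer as in the BH handler (demo_* names invented):

#include <stdatomic.h>
#include <stdint.h>

#define DEMO_BUFF_SIZE  64U
#define DEMO_BUFF_MASK  (DEMO_BUFF_SIZE - 1)

struct demo_rx {
        void *_Atomic buffer[DEMO_BUFF_SIZE];
        uint8_t annotations[DEMO_BUFF_SIZE];
        _Atomic uint32_t rx_top;                /* highest seq present */
};

static void demo_rx_publish(struct demo_rx *rx, uint32_t seq,
                            void *pkt, uint8_t annotation)
{
        uint32_t ix = seq & DEMO_BUFF_MASK;
        uint32_t top = atomic_load_explicit(&rx->rx_top,
                                            memory_order_relaxed);

        rx->annotations[ix] = annotation;
        /* Release: orders the annotation store before the slot store. */
        atomic_store_explicit(&rx->buffer[ix], pkt, memory_order_release);
        if ((int32_t)(seq - top) > 0)           /* after(seq, rx_top) */
                /* Release: seeing this rx_top implies seeing the slot. */
                atomic_store_explicit(&rx->rx_top, seq,
                                      memory_order_release);
}
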
+ */ +static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, + u16 skew) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int offset = sp->offset; + unsigned int ix; + rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0; + rxrpc_seq_t seq = sp->hdr.seq, hard_ack; + bool immediate_ack = false, jumbo_dup = false, queued; + u16 len; + u8 ack = 0, flags, annotation = 0; - spin_lock(&call->lock); + _enter("{%u,%u},{%u,%u}", + call->rx_hard_ack, call->rx_top, skb->data_len, seq); - if (call->state > RXRPC_CALL_COMPLETE) - goto discard; + _proto("Rx DATA %%%u { #%u f=%02x }", + sp->hdr.serial, seq, sp->hdr.flags); - ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post); - ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv); - ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten); + if (call->state >= RXRPC_CALL_COMPLETE) + return; - if (seq < call->rx_data_post) { - _debug("dup #%u [-%u]", seq, call->rx_data_post); - ack = RXRPC_ACK_DUPLICATE; - ret = -ENOBUFS; - goto discard_and_ack; - } + /* Received data implicitly ACKs all of the request packets we sent + * when we're acting as a client. + */ + if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY && + !rxrpc_end_tx_phase(call, "ETD")) + return; - /* we may already have the packet in the out of sequence queue */ - ackbit = seq - (call->rx_data_eaten + 1); - ASSERTCMP(ackbit, >=, 0); - if (__test_and_set_bit(ackbit, call->ackr_window)) { - _debug("dup oos #%u [%u,%u]", - seq, call->rx_data_eaten, call->rx_data_post); - ack = RXRPC_ACK_DUPLICATE; - goto discard_and_ack; - } + call->ackr_prev_seq = seq; - if (seq >= call->ackr_win_top) { - _debug("exceed #%u [%u]", seq, call->ackr_win_top); - __clear_bit(ackbit, call->ackr_window); + hard_ack = READ_ONCE(call->rx_hard_ack); + if (after(seq, hard_ack + call->rx_winsize)) { ack = RXRPC_ACK_EXCEEDS_WINDOW; - goto discard_and_ack; + ack_serial = serial; + goto ack; } - if (seq == call->rx_data_expect) { - clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags); - call->rx_data_expect++; - } else if (seq > call->rx_data_expect) { - _debug("oos #%u [%u]", seq, call->rx_data_expect); - call->rx_data_expect = seq + 1; - if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) { - ack = RXRPC_ACK_OUT_OF_SEQUENCE; - goto enqueue_and_ack; + flags = sp->hdr.flags; + if (flags & RXRPC_JUMBO_PACKET) { + if (call->nr_jumbo_dup > 3) { + ack = RXRPC_ACK_NOSPACE; + ack_serial = serial; + goto ack; } - goto enqueue_packet; + annotation = 1; } - if (seq != call->rx_data_post) { - _debug("ahead #%u [%u]", seq, call->rx_data_post); - goto enqueue_packet; +next_subpacket: + queued = false; + ix = seq & RXRPC_RXTX_BUFF_MASK; + len = skb->data_len; + if (flags & RXRPC_JUMBO_PACKET) + len = RXRPC_JUMBO_DATALEN; + + if (flags & RXRPC_LAST_PACKET) { + if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) && + seq != call->rx_top) + return rxrpc_proto_abort("LSN", call, seq); + } else { + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && + after_eq(seq, call->rx_top)) + return rxrpc_proto_abort("LSA", call, seq); } - if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags)) - goto protocol_error; - - /* if the packet need security things doing to it, then it goes down - * the slow path */ - if (call->security_ix) - goto enqueue_packet; - - sp->call = call; - rxrpc_get_call_for_skb(call, skb); - terminal = ((flags & RXRPC_LAST_PACKET) && - !(flags & RXRPC_CLIENT_INITIATED)); - ret = rxrpc_queue_rcv_skb(call, skb, false, terminal); - if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOBUFS) { - 
__clear_bit(ackbit, call->ackr_window); - ack = RXRPC_ACK_NOSPACE; - goto discard_and_ack; + if (before_eq(seq, hard_ack)) { + ack = RXRPC_ACK_DUPLICATE; + ack_serial = serial; + goto skip; + } + + if (flags & RXRPC_REQUEST_ACK && !ack) { + ack = RXRPC_ACK_REQUESTED; + ack_serial = serial; + } + + if (call->rxtx_buffer[ix]) { + rxrpc_input_dup_data(call, seq, annotation, &jumbo_dup); + if (ack != RXRPC_ACK_DUPLICATE) { + ack = RXRPC_ACK_DUPLICATE; + ack_serial = serial; } - goto out; + immediate_ack = true; + goto skip; } - skb = NULL; - sp = NULL; - - _debug("post #%u", seq); - ASSERTCMP(call->rx_data_post, ==, seq); - call->rx_data_post++; - - if (flags & RXRPC_LAST_PACKET) - set_bit(RXRPC_CALL_RCVD_LAST, &call->flags); - - /* if we've reached an out of sequence packet then we need to drain - * that queue into the socket Rx queue now */ - if (call->rx_data_post == call->rx_first_oos) { - _debug("drain rx oos now"); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); + /* Queue the packet. We use a couple of memory barriers here as need + * to make sure that rx_top is perceived to be set after the buffer + * pointer and that the buffer pointer is set after the annotation and + * the skb data. + * + * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() + * and also rxrpc_fill_out_ack(). + */ + rxrpc_get_skb(skb); + call->rxtx_annotations[ix] = annotation; + smp_wmb(); + call->rxtx_buffer[ix] = skb; + if (after(seq, call->rx_top)) + smp_store_release(&call->rx_top, seq); + queued = true; + + if (after_eq(seq, call->rx_expect_next)) { + if (after(seq, call->rx_expect_next)) { + _net("OOS %u > %u", seq, call->rx_expect_next); + ack = RXRPC_ACK_OUT_OF_SEQUENCE; + ack_serial = serial; + } + call->rx_expect_next = seq + 1; } - spin_unlock(&call->lock); - atomic_inc(&call->ackr_not_idle); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, skew, serial, false); - _leave(" = 0 [posted]"); - return 0; +skip: + offset += len; + if (flags & RXRPC_JUMBO_PACKET) { + if (skb_copy_bits(skb, offset, &flags, 1) < 0) + return rxrpc_proto_abort("XJF", call, seq); + offset += sizeof(struct rxrpc_jumbo_header); + seq++; + serial++; + annotation++; + if (flags & RXRPC_JUMBO_PACKET) + annotation |= RXRPC_RX_ANNO_JLAST; + + _proto("Rx DATA Jumbo %%%u", serial); + goto next_subpacket; + } -protocol_error: - ret = -EBADMSG; -out: - spin_unlock(&call->lock); - _leave(" = %d", ret); - return ret; + if (queued && flags & RXRPC_LAST_PACKET && !ack) { + ack = RXRPC_ACK_DELAY; + ack_serial = serial; + } -discard_and_ack: - _debug("discard and ACK packet %p", skb); - __rxrpc_propose_ACK(call, ack, skew, serial, true); -discard: - spin_unlock(&call->lock); - rxrpc_free_skb(skb); - _leave(" = 0 [discarded]"); - return 0; +ack: + if (ack) + rxrpc_propose_ACK(call, ack, skew, ack_serial, + immediate_ack, true); -enqueue_and_ack: - __rxrpc_propose_ACK(call, ack, skew, serial, true); -enqueue_packet: - _net("defer skb %p", skb); - spin_unlock(&call->lock); - skb_queue_tail(&call->rx_queue, skb); - atomic_inc(&call->ackr_not_idle); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); - _leave(" = 0 [queued]"); - return 0; + if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) + rxrpc_notify_socket(call); + _leave(" [queued]"); } /* - * assume an implicit ACKALL of the transmission phase of a client 
socket upon - * reception of the first reply packet + * Process the extra information that may be appended to an ACK packet */ -static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) +static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, + struct rxrpc_ackinfo *ackinfo) { - write_lock_bh(&call->state_lock); - - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; - call->acks_latest = serial; - - _debug("implicit ACKALL %%%u", call->acks_latest); - set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events); - write_unlock_bh(&call->state_lock); + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_peer *peer; + unsigned int mtu; + + _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", + sp->hdr.serial, + ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU), + ntohl(ackinfo->rwind), ntohl(ackinfo->jumbo_max)); + + if (call->tx_winsize > ntohl(ackinfo->rwind)) + call->tx_winsize = ntohl(ackinfo->rwind); + + mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU)); + + peer = call->peer; + if (mtu < peer->maxdata) { + spin_lock_bh(&peer->lock); + peer->maxdata = mtu; + peer->mtu = mtu + peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); + } +} - if (try_to_del_timer_sync(&call->resend_timer) >= 0) { - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_EV_RESEND, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); +/* + * Process individual soft ACKs. + * + * Each ACK in the array corresponds to one packet and can be either an ACK or + * a NAK. If we get find an explicitly NAK'd packet we resend immediately; + * packets that lie beyond the end of the ACK list are scheduled for resend by + * the timer on the basis that the peer might just not have processed them at + * the time the ACK was sent. + */ +static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, + rxrpc_seq_t seq, int nr_acks) +{ + bool resend = false; + int ix; + + for (; nr_acks > 0; nr_acks--, seq++) { + ix = seq & RXRPC_RXTX_BUFF_MASK; + switch (*acks) { + case RXRPC_ACK_TYPE_ACK: + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_ACK; + break; + case RXRPC_ACK_TYPE_NACK: + if (call->rxtx_annotations[ix] == RXRPC_TX_ANNO_NAK) + continue; + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_NAK; + resend = true; + break; + default: + return rxrpc_proto_abort("SFT", call, 0); } - break; - - default: - write_unlock_bh(&call->state_lock); - break; } + + if (resend && + !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); } /* - * post an incoming packet to the nominated call to deal with - * - must get rid of the sk_buff, either by freeing it or by queuing it + * Process an ACK packet. + * + * ack.firstPacket is the sequence number of the first soft-ACK'd/NAK'd packet + * in the ACK array. Anything before that is hard-ACK'd and may be discarded. + * + * A hard-ACK means that a packet has been processed and may be discarded; a + * soft-ACK means that the packet may be discarded and retransmission + * requested. A phase is complete when all packets are hard-ACK'd. 
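
To make the hard/soft split above concrete: ack.firstPacket is the first sequence number the soft-ACK array describes, so everything strictly before it is hard-ACK'd and disposable, while the nAcks bytes that follow describe firstPacket, firstPacket + 1, and so on individually. A sketch, including the wrap-safe comparison the before()/after() helpers rely on (demo_/seq_ names invented):

#include <stdbool.h>
#include <stdint.h>

/* Sequence numbers wrap, so compare via signed difference. */
static bool seq_before(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) < 0;
}

static void demo_parse_ack(uint32_t first_packet, unsigned int n_acks,
                           uint32_t *hard_ack, uint32_t *soft_end)
{
        *hard_ack = first_packet - 1;      /* <= this: free the buffers */
        *soft_end = first_packet + n_acks; /* soft region ends here */
}

/* True if 'seq' falls in the individually ACK'd/NAK'd (soft) region. */
static bool demo_in_soft_region(uint32_t seq, uint32_t first_packet,
                                unsigned int n_acks)
{
        return !seq_before(seq, first_packet) &&
                seq_before(seq, first_packet + n_acks);
}
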
*/ -void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) +static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, + u16 skew) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - __be32 wtmp; - u32 abort_code; - - _enter("%p,%p", call, skb); - - ASSERT(!irqs_disabled()); - -#if 0 // INJECT RX ERROR - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - static int skip = 0; - if (++skip == 3) { - printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n"); - skip = 0; - goto free_packet; - } + union { + struct rxrpc_ackpacket ack; + struct rxrpc_ackinfo info; + u8 acks[RXRPC_MAXACKS]; + } buf; + rxrpc_seq_t first_soft_ack, hard_ack; + int nr_acks, offset; + + _enter(""); + + if (skb_copy_bits(skb, sp->offset, &buf.ack, sizeof(buf.ack)) < 0) { + _debug("extraction failure"); + return rxrpc_proto_abort("XAK", call, 0); } -#endif - - /* request ACK generation for any ACK or DATA packet that requests - * it */ - if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - _proto("ACK Requested on %%%u", sp->hdr.serial); + sp->offset += sizeof(buf.ack); + + first_soft_ack = ntohl(buf.ack.firstPacket); + hard_ack = first_soft_ack - 1; + nr_acks = buf.ack.nAcks; + + _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + sp->hdr.serial, + ntohs(buf.ack.maxSkew), + first_soft_ack, + ntohl(buf.ack.previousPacket), + ntohl(buf.ack.serial), + rxrpc_acks(buf.ack.reason), + buf.ack.nAcks); + + if (buf.ack.reason == RXRPC_ACK_PING) { + _proto("Rx ACK %%%u PING Request", sp->hdr.serial); + rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, + skew, sp->hdr.serial, true, true); + } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, - skb->priority, sp->hdr.serial, false); + skew, sp->hdr.serial, true, true); } - switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_ABORT: - _debug("abort"); - - if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) - goto protocol_error; - - abort_code = ntohl(wtmp); - _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); - - if (__rxrpc_set_call_completion(call, - RXRPC_CALL_REMOTELY_ABORTED, - abort_code, ECONNABORTED)) { - set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - rxrpc_queue_call(call); - } - goto free_packet; - - case RXRPC_PACKET_TYPE_BUSY: - _proto("Rx BUSY %%%u", sp->hdr.serial); - - if (rxrpc_is_service_call(call)) - goto protocol_error; + offset = sp->offset + nr_acks + 3; + if (skb->data_len >= offset + sizeof(buf.info)) { + if (skb_copy_bits(skb, offset, &buf.info, sizeof(buf.info)) < 0) + return rxrpc_proto_abort("XAI", call, 0); + rxrpc_input_ackinfo(call, skb, &buf.info); + } - write_lock_bh(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_CLIENT_SEND_REQUEST: - __rxrpc_set_call_completion(call, - RXRPC_CALL_SERVER_BUSY, - 0, EBUSY); - set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); - rxrpc_queue_call(call); - case RXRPC_CALL_SERVER_BUSY: - goto free_packet_unlock; - default: - goto protocol_error_locked; - } + if (first_soft_ack == 0) + return rxrpc_proto_abort("AK0", call, 0); + /* Ignore ACKs unless we are or have just been transmitting. 
*/ + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + case RXRPC_CALL_SERVER_SEND_REPLY: + case RXRPC_CALL_SERVER_AWAIT_ACK: + break; default: - _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); - goto protocol_error; - - case RXRPC_PACKET_TYPE_DATA: - _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - - if (sp->hdr.seq == 0) - goto protocol_error; - - call->ackr_prev_seq = sp->hdr.seq; + return; + } - /* received data implicitly ACKs all of the request packets we - * sent when we're acting as a client */ - if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) - rxrpc_assume_implicit_ackall(call, sp->hdr.serial); + /* Discard any out-of-order or duplicate ACKs. */ + if ((int)sp->hdr.serial - (int)call->acks_latest <= 0) { + _debug("discard ACK %d <= %d", + sp->hdr.serial, call->acks_latest); + return; + } + call->acks_latest = sp->hdr.serial; - switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) { - case 0: - skb = NULL; - goto done; + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && + hard_ack == call->tx_top) { + rxrpc_end_tx_phase(call, "ETA"); + return; + } - default: - BUG(); + if (before(hard_ack, call->tx_hard_ack) || + after(hard_ack, call->tx_top)) + return rxrpc_proto_abort("AKW", call, 0); - /* data packet received beyond the last packet */ - case -EBADMSG: - goto protocol_error; - } + if (after(hard_ack, call->tx_hard_ack)) + rxrpc_rotate_tx_window(call, hard_ack); - case RXRPC_PACKET_TYPE_ACKALL: - case RXRPC_PACKET_TYPE_ACK: - /* ACK processing is done in process context */ - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) { - skb_queue_tail(&call->rx_queue, skb); - rxrpc_queue_call(call); - skb = NULL; - } - read_unlock_bh(&call->state_lock); - goto free_packet; - } + if (after(first_soft_ack, call->tx_top)) + return; -protocol_error: - _debug("protocol error"); - write_lock_bh(&call->state_lock); -protocol_error_locked: - if (__rxrpc_abort_call("FPR", call, 0, RX_PROTOCOL_ERROR, EPROTO)) - rxrpc_queue_call(call); -free_packet_unlock: - write_unlock_bh(&call->state_lock); -free_packet: - rxrpc_free_skb(skb); -done: - _leave(""); + if (nr_acks > call->tx_top - first_soft_ack + 1) + nr_acks = first_soft_ack - call->tx_top + 1; + if (skb_copy_bits(skb, sp->offset, buf.acks, nr_acks) < 0) + return rxrpc_proto_abort("XSA", call, 0); + rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks); } /* - * split up a jumbo data packet + * Process an ACKALL packet. 
*/ -static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, - struct sk_buff *jumbo) +static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) { - struct rxrpc_jumbo_header jhdr; - struct rxrpc_skb_priv *sp; - struct sk_buff *part; - - _enter(",{%u,%u}", jumbo->data_len, jumbo->len); - - sp = rxrpc_skb(jumbo); - - do { - sp->hdr.flags &= ~RXRPC_JUMBO_PACKET; - - /* make a clone to represent the first subpacket in what's left - * of the jumbo packet */ - part = skb_clone(jumbo, GFP_ATOMIC); - if (!part) { - /* simply ditch the tail in the event of ENOMEM */ - pskb_trim(jumbo, RXRPC_JUMBO_DATALEN); - break; - } - rxrpc_new_skb(part); - - pskb_trim(part, RXRPC_JUMBO_DATALEN); - - if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN)) - goto protocol_error; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0) - goto protocol_error; - if (!pskb_pull(jumbo, sizeof(jhdr))) - BUG(); + _proto("Rx ACKALL %%%u", sp->hdr.serial); - sp->hdr.seq += 1; - sp->hdr.serial += 1; - sp->hdr.flags = jhdr.flags; - sp->hdr._rsvd = ntohs(jhdr._rsvd); + rxrpc_end_tx_phase(call, "ETL"); +} - _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1); +/* + * Process an ABORT packet. + */ +static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + __be32 wtmp; + u32 abort_code = RX_CALL_DEAD; - rxrpc_fast_process_packet(call, part); - part = NULL; + _enter(""); - } while (sp->hdr.flags & RXRPC_JUMBO_PACKET); + if (skb->len >= 4 && + skb_copy_bits(skb, sp->offset, &wtmp, sizeof(wtmp)) >= 0) + abort_code = ntohl(wtmp); - rxrpc_fast_process_packet(call, jumbo); - _leave(""); - return; + _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); -protocol_error: - _debug("protocol error"); - rxrpc_free_skb(part); - if (rxrpc_abort_call("PJP", call, sp->hdr.seq, - RX_PROTOCOL_ERROR, EPROTO)) - rxrpc_queue_call(call); - rxrpc_free_skb(jumbo); - _leave(""); + if (rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, + abort_code, ECONNABORTED)) + rxrpc_notify_socket(call); } /* - * post an incoming packet to the appropriate call/socket to deal with - * - must get rid of the sk_buff, either by freeing it or by queuing it + * Process an incoming call packet. 
*/ -static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, - struct rxrpc_call *call, - struct sk_buff *skb) +static void rxrpc_input_call_packet(struct rxrpc_call *call, + struct sk_buff *skb, u16 skew) { - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); _enter("%p,%p", call, skb); - sp = rxrpc_skb(skb); - - _debug("extant call [%d]", call->state); - - read_lock(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_COMPLETE: - switch (call->completion) { - case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, - &call->events)) { - rxrpc_queue_call(call); - goto free_unlock; - } - default: - goto dead_call; - case RXRPC_CALL_SUCCEEDED: - if (rxrpc_is_service_call(call)) - goto dead_call; - goto resend_final_ack; - } - - case RXRPC_CALL_CLIENT_FINAL_ACK: - goto resend_final_ack; + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_DATA: + rxrpc_input_data(call, skb, skew); + break; - default: + case RXRPC_PACKET_TYPE_ACK: + rxrpc_input_ack(call, skb, skew); break; - } - read_unlock(&call->state_lock); + case RXRPC_PACKET_TYPE_BUSY: + _proto("Rx BUSY %%%u", sp->hdr.serial); - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - sp->hdr.flags & RXRPC_JUMBO_PACKET) - rxrpc_process_jumbo_packet(call, skb); - else - rxrpc_fast_process_packet(call, skb); + /* Just ignore BUSY packets from the server; the retry and + * lifespan timers will take care of business. BUSY packets + * from the client don't make sense. + */ + break; - goto done; + case RXRPC_PACKET_TYPE_ABORT: + rxrpc_input_abort(call, skb); + break; -resend_final_ack: - _debug("final ack again"); - set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - rxrpc_queue_call(call); - goto free_unlock; + case RXRPC_PACKET_TYPE_ACKALL: + rxrpc_input_ackall(call, skb); + break; -dead_call: - if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { - skb->priority = RX_CALL_DEAD; - rxrpc_reject_packet(conn->params.local, skb); - goto unlock; + default: + _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); + break; } -free_unlock: - rxrpc_free_skb(skb); -unlock: - read_unlock(&call->state_lock); -done: + _leave(""); } @@ -600,6 +598,17 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, rxrpc_queue_local(local); } +/* + * put a packet up for transport-level abort + */ +static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) +{ + CHECK_SLAB_OKAY(&local->usage); + + skb_queue_tail(&local->reject_queue, skb); + rxrpc_queue_local(local); +} + /* * Extract the wire header from a packet and translate the byte order. */ @@ -611,8 +620,6 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) /* dig out the RxRPC connection details */ if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) return -EBADMSG; - if (!pskb_pull(skb, sizeof(whdr))) - BUG(); memset(sp, 0, sizeof(*sp)); sp->hdr.epoch = ntohl(whdr.epoch); @@ -626,6 +633,7 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) sp->hdr.securityIndex = whdr.securityIndex; sp->hdr._rsvd = ntohs(whdr._rsvd); sp->hdr.serviceId = ntohs(whdr.serviceId); + sp->offset = sizeof(whdr); return 0; } @@ -637,19 +645,22 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) * shut down and the local endpoint from going away, thus sk_user_data will not * be cleared until this function returns. 
*/ -void rxrpc_data_ready(struct sock *sk) +void rxrpc_data_ready(struct sock *udp_sk) { struct rxrpc_connection *conn; + struct rxrpc_channel *chan; + struct rxrpc_call *call; struct rxrpc_skb_priv *sp; - struct rxrpc_local *local = sk->sk_user_data; + struct rxrpc_local *local = udp_sk->sk_user_data; struct sk_buff *skb; + unsigned int channel; int ret, skew; - _enter("%p", sk); + _enter("%p", udp_sk); ASSERT(!irqs_disabled()); - skb = skb_recv_datagram(sk, 0, 1, &ret); + skb = skb_recv_datagram(udp_sk, 0, 1, &ret); if (!skb) { if (ret == -EAGAIN) return; @@ -695,111 +706,122 @@ void rxrpc_data_ready(struct sock *sk) goto bad_message; } - if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) { + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_VERSION: rxrpc_post_packet_to_local(local, skb); goto out; - } - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) - goto bad_message; + case RXRPC_PACKET_TYPE_BUSY: + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) + goto discard; + + case RXRPC_PACKET_TYPE_DATA: + if (sp->hdr.callNumber == 0) + goto bad_message; + if (sp->hdr.flags & RXRPC_JUMBO_PACKET && + !rxrpc_validate_jumbo(skb)) + goto bad_message; + break; + } rcu_read_lock(); conn = rxrpc_find_connection_rcu(local, skb); - if (!conn) { - skb->priority = 0; - goto cant_route_call; - } + if (conn) { + if (sp->hdr.securityIndex != conn->security_ix) + goto wrong_security; - /* Note the serial number skew here */ - skew = (int)sp->hdr.serial - (int)conn->hi_serial; - if (skew >= 0) { - if (skew > 0) - conn->hi_serial = sp->hdr.serial; - skb->priority = 0; - } else { - skew = -skew; - skb->priority = min(skew, 65535); - } + if (sp->hdr.callNumber == 0) { + /* Connection-level packet */ + _debug("CONN %p {%d}", conn, conn->debug_id); + rxrpc_post_packet_to_conn(conn, skb); + goto out_unlock; + } + + /* Note the serial number skew here */ + skew = (int)sp->hdr.serial - (int)conn->hi_serial; + if (skew >= 0) { + if (skew > 0) + conn->hi_serial = sp->hdr.serial; + } else { + skew = -skew; + skew = min(skew, 65535); + } - if (sp->hdr.callNumber == 0) { - /* Connection-level packet */ - _debug("CONN %p {%d}", conn, conn->debug_id); - rxrpc_post_packet_to_conn(conn, skb); - goto out_unlock; - } else { /* Call-bound packets are routed by connection channel. */ - unsigned int channel = sp->hdr.cid & RXRPC_CHANNELMASK; - struct rxrpc_channel *chan = &conn->channels[channel]; - struct rxrpc_call *call; + channel = sp->hdr.cid & RXRPC_CHANNELMASK; + chan = &conn->channels[channel]; /* Ignore really old calls */ if (sp->hdr.callNumber < chan->last_call) goto discard_unlock; if (sp->hdr.callNumber == chan->last_call) { - /* For the previous service call, if completed - * successfully, we discard all further packets. + /* For the previous service call, if completed successfully, we + * discard all further packets. */ if (rxrpc_conn_is_service(conn) && (chan->last_type == RXRPC_PACKET_TYPE_ACK || sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)) goto discard_unlock; - /* But otherwise we need to retransmit the final packet - * from data cached in the connection record. + /* But otherwise we need to retransmit the final packet from + * data cached in the connection record. 
*/ rxrpc_post_packet_to_conn(conn, skb); goto out_unlock; } call = rcu_dereference(chan->call); - if (!call || atomic_read(&call->usage) == 0) - goto cant_route_call; + } else { + skew = 0; + call = NULL; + } - rxrpc_see_call(call); - rxrpc_post_packet_to_call(conn, call, skb); - goto out_unlock; + if (!call || atomic_read(&call->usage) == 0) { + if (!(sp->hdr.type & RXRPC_CLIENT_INITIATED) || + sp->hdr.callNumber == 0 || + sp->hdr.type != RXRPC_PACKET_TYPE_DATA) + goto bad_message_unlock; + if (sp->hdr.seq != 1) + goto discard_unlock; + call = rxrpc_new_incoming_call(local, conn, skb); + if (!call) { + rcu_read_unlock(); + goto reject_packet; + } } + rxrpc_input_call_packet(call, skb, skew); + goto discard_unlock; + discard_unlock: - rxrpc_free_skb(skb); -out_unlock: rcu_read_unlock(); +discard: + rxrpc_free_skb(skb); out: trace_rxrpc_rx_done(0, 0); return; -cant_route_call: +out_unlock: rcu_read_unlock(); + goto out; - _debug("can't route call"); - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && - sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - if (sp->hdr.seq == 1) { - _debug("first packet"); - skb_queue_tail(&local->accept_queue, skb); - rxrpc_queue_work(&local->processor); - _leave(" [incoming]"); - goto out; - } - skb->priority = RX_INVALID_OPERATION; - } else { - skb->priority = RX_CALL_DEAD; - } - - if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { - _debug("reject type %d",sp->hdr.type); - goto reject_packet; - } else { - rxrpc_free_skb(skb); - } - _leave(" [no call]"); - return; +wrong_security: + rcu_read_unlock(); + trace_rxrpc_abort("SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RXKADINCONSISTENCY, EBADMSG); + skb->priority = RXKADINCONSISTENCY; + goto post_abort; +bad_message_unlock: + rcu_read_unlock(); bad_message: + trace_rxrpc_abort("BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_PROTOCOL_ERROR, EBADMSG); skb->priority = RX_PROTOCOL_ERROR; +post_abort: + skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; reject_packet: trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_reject_packet(local, skb); diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index a4aba0246731..7d4375e557e6 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -30,14 +30,18 @@ static int none_secure_packet(struct rxrpc_call *call, return 0; } -static int none_verify_packet(struct rxrpc_call *call, - struct sk_buff *skb, - rxrpc_seq_t seq, - u16 expected_cksum) +static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, + rxrpc_seq_t seq, u16 expected_cksum) { return 0; } +static void none_locate_data(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ +} + static int none_respond_to_challenge(struct rxrpc_connection *conn, struct sk_buff *skb, u32 *_abort_code) @@ -79,6 +83,7 @@ const struct rxrpc_security rxrpc_no_security = { .prime_packet_security = none_prime_packet_security, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, + .locate_data = none_locate_data, .respond_to_challenge = none_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index bcc6593b4cdb..cdd58e6e9fbd 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -98,7 +98,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: - if (skb_copy_bits(skb, 0, &v, 1) < 0) + if (skb_copy_bits(skb, sp->offset, &v, 1) < 0) return; 
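The sp->offset changes in this and the surrounding hunks all stem from rxrpc_extract_header() no longer pulling the wire header off the skb: the header stays in place and sp->offset records where the payload begins, so a shared skb (a jumbo packet in particular) is never mutated by one of its readers. A stand-alone sketch of the parse-by-offset idea, using hypothetical types rather than the kernel API:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical stand-ins for the wire header and per-packet state. */
struct wire_header { uint32_t epoch, cid, call_number, seq, serial; };
struct pkt_priv { unsigned int offset; };

/* Record where the payload starts instead of consuming the header.
 * The buffer itself is left untouched, so other readers (later jumbo
 * subpackets, retries) can still parse it from the beginning.
 */
static int extract_header(const uint8_t *buf, size_t len,
			  struct wire_header *hdr, struct pkt_priv *priv)
{
	if (len < sizeof(*hdr))
		return -1;			/* short packet */
	memcpy(hdr, buf, sizeof(*hdr));		/* cf. skb_copy_bits(skb, 0, ...) */
	priv->offset = sizeof(*hdr);		/* payload reads start here */
	return 0;
}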
_proto("Rx VERSION { %02x }", v); if (v == 0) diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 610916f4ae34..782b9adf67cb 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -77,7 +77,6 @@ static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx) INIT_WORK(&local->processor, rxrpc_local_processor); INIT_HLIST_HEAD(&local->services); init_rwsem(&local->defrag_sem); - skb_queue_head_init(&local->accept_queue); skb_queue_head_init(&local->reject_queue); skb_queue_head_init(&local->event_queue); local->client_conns = RB_ROOT; @@ -308,7 +307,6 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local) /* At this point, there should be no more packets coming in to the * local endpoint. */ - rxrpc_purge_queue(&local->accept_queue); rxrpc_purge_queue(&local->reject_queue); rxrpc_purge_queue(&local->event_queue); @@ -332,11 +330,6 @@ static void rxrpc_local_processor(struct work_struct *work) if (atomic_read(&local->usage) == 0) return rxrpc_local_destroyer(local); - if (!skb_queue_empty(&local->accept_queue)) { - rxrpc_accept_incoming_calls(local); - again = true; - } - if (!skb_queue_empty(&local->reject_queue)) { rxrpc_reject_packets(local); again = true; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 39e7cc37c392..fd096f742e4b 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -50,7 +50,7 @@ unsigned int rxrpc_idle_ack_delay = 0.5 * HZ; * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further * packets. */ -unsigned int rxrpc_rx_window_size = 32; +unsigned int rxrpc_rx_window_size = RXRPC_RXTX_BUFF_SIZE - 46; /* * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8756d74fd74b..719a4c23f09d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include "ar-internal.h" @@ -38,20 +40,38 @@ struct rxrpc_pkt_buffer { static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, struct rxrpc_pkt_buffer *pkt) { + rxrpc_seq_t hard_ack, top, seq; + int ix; u32 mtu, jmax; u8 *ackp = pkt->acks; + /* Barrier against rxrpc_input_data(). */ + hard_ack = READ_ONCE(call->rx_hard_ack); + top = smp_load_acquire(&call->rx_top); + pkt->ack.bufferSpace = htons(8); - pkt->ack.maxSkew = htons(0); - pkt->ack.firstPacket = htonl(call->rx_data_eaten + 1); + pkt->ack.maxSkew = htons(call->ackr_skew); + pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_prev_seq); pkt->ack.serial = htonl(call->ackr_serial); - pkt->ack.reason = RXRPC_ACK_IDLE; - pkt->ack.nAcks = 0; + pkt->ack.reason = call->ackr_reason; + pkt->ack.nAcks = top - hard_ack; + + if (after(top, hard_ack)) { + seq = hard_ack + 1; + do { + ix = seq & RXRPC_RXTX_BUFF_MASK; + if (call->rxtx_buffer[ix]) + *ackp++ = RXRPC_ACK_TYPE_ACK; + else + *ackp++ = RXRPC_ACK_TYPE_NACK; + seq++; + } while (before_eq(seq, top)); + } - mtu = call->peer->if_mtu; - mtu -= call->peer->hdrsize; - jmax = rxrpc_rx_jumbo_max; + mtu = call->conn->params.peer->if_mtu; + mtu -= call->conn->params.peer->hdrsize; + jmax = (call->nr_jumbo_dup > 3) ? 
1 : rxrpc_rx_jumbo_max; pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu); pkt->ackinfo.maxMTU = htonl(mtu); pkt->ackinfo.rwind = htonl(rxrpc_rx_window_size); @@ -60,11 +80,11 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; - return 3; + return top - hard_ack + 3; } /* - * Send a final ACK or ABORT call packet. + * Send an ACK or ABORT call packet. */ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) { @@ -158,6 +178,19 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); + if (ret < 0 && call->state < RXRPC_CALL_COMPLETE) { + switch (pkt->whdr.type) { + case RXRPC_PACKET_TYPE_ACK: + rxrpc_propose_ACK(call, pkt->ack.reason, + ntohs(pkt->ack.maxSkew), + ntohl(pkt->ack.serial), + true, true); + break; + case RXRPC_PACKET_TYPE_ABORT: + break; + } + } + out: rxrpc_put_connection(conn); kfree(pkt); @@ -233,3 +266,77 @@ send_fragmentable: _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata); return ret; } + +/* + * reject packets through the local endpoint + */ +void rxrpc_reject_packets(struct rxrpc_local *local) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + } sa; + struct rxrpc_skb_priv *sp; + struct rxrpc_wire_header whdr; + struct sk_buff *skb; + struct msghdr msg; + struct kvec iov[2]; + size_t size; + __be32 code; + + _enter("%d", local->debug_id); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = &code; + iov[1].iov_len = sizeof(code); + size = sizeof(whdr) + sizeof(code); + + msg.msg_name = &sa; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + memset(&sa, 0, sizeof(sa)); + sa.sa.sa_family = local->srx.transport.family; + switch (sa.sa.sa_family) { + case AF_INET: + msg.msg_namelen = sizeof(sa.sin); + break; + default: + msg.msg_namelen = 0; + break; + } + + memset(&whdr, 0, sizeof(whdr)); + whdr.type = RXRPC_PACKET_TYPE_ABORT; + + while ((skb = skb_dequeue(&local->reject_queue))) { + rxrpc_see_skb(skb); + sp = rxrpc_skb(skb); + switch (sa.sa.sa_family) { + case AF_INET: + sa.sin.sin_port = udp_hdr(skb)->source; + sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + code = htonl(skb->priority); + + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.serviceId = htons(sp->hdr.serviceId); + whdr.flags = sp->hdr.flags; + whdr.flags ^= RXRPC_CLIENT_INITIATED; + whdr.flags &= RXRPC_CLIENT_INITIATED; + + kernel_sendmsg(local->socket, &msg, iov, 2, size); + break; + + default: + break; + } + + rxrpc_free_skb(skb); + } + + _leave(""); +} diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 27b9ecad007e..c8948936c6fc 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -129,15 +129,14 @@ void rxrpc_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } + rxrpc_new_skb(skb); serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); - kfree_skb(skb); + rxrpc_free_skb(skb); return; } - rxrpc_new_skb(skb); - rcu_read_lock(); peer = rxrpc_lookup_peer_icmp_rcu(local, skb); if (peer && !rxrpc_get_peer_maybe(peer)) @@ -249,7 +248,6 @@ void rxrpc_peer_error_distributor(struct work_struct *work) container_of(work, struct rxrpc_peer, error_distributor); struct rxrpc_call *call; enum rxrpc_call_completion compl; - bool queue; int error; _enter(""); @@ -272,15 +270,8 @@ void rxrpc_peer_error_distributor(struct 
work_struct *work) hlist_del_init(&call->error_link); rxrpc_see_call(call); - queue = false; - write_lock(&call->state_lock); - if (__rxrpc_set_call_completion(call, compl, 0, error)) { - set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); - queue = true; - } - write_unlock(&call->state_lock); - if (queue) - rxrpc_queue_call(call); + if (rxrpc_set_call_completion(call, compl, 0, error)) + rxrpc_notify_socket(call); } spin_unlock_bh(&peer->lock); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index aebc73ac16dc..2efe29a4c232 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -198,6 +198,32 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) return peer; } +/* + * Initialise peer record. + */ +static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key) +{ + rxrpc_assess_MTU_size(peer); + peer->mtu = peer->if_mtu; + + if (peer->srx.transport.family == AF_INET) { + peer->hdrsize = sizeof(struct iphdr); + switch (peer->srx.transport_type) { + case SOCK_DGRAM: + peer->hdrsize += sizeof(struct udphdr); + break; + default: + BUG(); + break; + } + } else { + BUG(); + } + + peer->hdrsize += sizeof(struct rxrpc_wire_header); + peer->maxdata = peer->mtu - peer->hdrsize; +} + /* * Set up a new peer. */ @@ -214,29 +240,39 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, if (peer) { peer->hash_key = hash_key; memcpy(&peer->srx, srx, sizeof(*srx)); + rxrpc_init_peer(peer, hash_key); + } - rxrpc_assess_MTU_size(peer); - peer->mtu = peer->if_mtu; - - if (srx->transport.family == AF_INET) { - peer->hdrsize = sizeof(struct iphdr); - switch (srx->transport_type) { - case SOCK_DGRAM: - peer->hdrsize += sizeof(struct udphdr); - break; - default: - BUG(); - break; - } - } else { - BUG(); - } + _leave(" = %p", peer); + return peer; +} - peer->hdrsize += sizeof(struct rxrpc_wire_header); - peer->maxdata = peer->mtu - peer->hdrsize; +/* + * Set up a new incoming peer. The address is prestored in the preallocated + * peer. 
+ */ +struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, + struct rxrpc_peer *prealloc) +{ + struct rxrpc_peer *peer; + unsigned long hash_key; + + hash_key = rxrpc_peer_hash_key(local, &prealloc->srx); + prealloc->local = local; + rxrpc_init_peer(prealloc, hash_key); + + spin_lock(&rxrpc_peer_hash_lock); + + /* Need to check that we aren't racing with someone else */ + peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + peer = prealloc; + hash_add_rcu(rxrpc_peer_hash, &peer->hash_link, hash_key); } - _leave(" = %p", peer); + spin_unlock(&rxrpc_peer_hash_lock); return peer; } @@ -272,7 +308,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, return NULL; } - spin_lock(&rxrpc_peer_hash_lock); + spin_lock_bh(&rxrpc_peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); @@ -282,7 +318,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, hash_add_rcu(rxrpc_peer_hash, &candidate->hash_link, hash_key); - spin_unlock(&rxrpc_peer_hash_lock); + spin_unlock_bh(&rxrpc_peer_hash_lock); if (peer) kfree(candidate); @@ -307,9 +343,9 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer) { ASSERT(hlist_empty(&peer->error_targets)); - spin_lock(&rxrpc_peer_hash_lock); + spin_lock_bh(&rxrpc_peer_hash_lock); hash_del_rcu(&peer->hash_link); - spin_unlock(&rxrpc_peer_hash_lock); + spin_unlock_bh(&rxrpc_peer_hash_lock); kfree_rcu(peer, rcu); } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 6876ffb3b410..20d0b5c6f81b 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -19,319 +19,479 @@ #include "ar-internal.h" /* - * receive a message from an RxRPC socket - * - we need to be careful about two or more threads calling recvmsg - * simultaneously + * Post a call for attention by the socket or kernel service. Further + * notifications are suppressed by putting recvmsg_link on a dummy queue. */ -int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +void rxrpc_notify_socket(struct rxrpc_call *call) { - struct rxrpc_skb_priv *sp; - struct rxrpc_call *call = NULL, *continue_call = NULL; - struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - struct sk_buff *skb; - long timeo; - int copy, ret, ullen, offset, copied = 0; - u32 abort_code; + struct rxrpc_sock *rx; + struct sock *sk; - DEFINE_WAIT(wait); + _enter("%d", call->debug_id); - _enter(",,,%zu,%d", len, flags); + if (!list_empty(&call->recvmsg_link)) + return; + + rcu_read_lock(); + + rx = rcu_dereference(call->socket); + sk = &rx->sk; + if (rx && sk->sk_state < RXRPC_CLOSE) { + if (call->notify_rx) { + call->notify_rx(sk, call, call->user_call_ID); + } else { + write_lock_bh(&rx->recvmsg_lock); + if (list_empty(&call->recvmsg_link)) { + rxrpc_get_call(call, rxrpc_call_got); + list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); + } + write_unlock_bh(&rx->recvmsg_lock); - if (flags & (MSG_OOB | MSG_TRUNC)) - return -EOPNOTSUPP; + if (!sock_flag(sk, SOCK_DEAD)) { + _debug("call %ps", sk->sk_data_ready); + sk->sk_data_ready(sk); + } + } + } - ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long); + rcu_read_unlock(); + _leave(""); +} - timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); - msg->msg_flags |= MSG_MORE; +/* + * Pass a call terminating message to userspace. 
+ */ +static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) +{ + u32 tmp = 0; + int ret; - lock_sock(&rx->sk); + switch (call->completion) { + case RXRPC_CALL_SUCCEEDED: + ret = 0; + if (rxrpc_is_service_call(call)) + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &tmp); + break; + case RXRPC_CALL_REMOTELY_ABORTED: + tmp = call->abort_code; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); + break; + case RXRPC_CALL_LOCALLY_ABORTED: + tmp = call->abort_code; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); + break; + case RXRPC_CALL_NETWORK_ERROR: + tmp = call->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &tmp); + break; + case RXRPC_CALL_LOCAL_ERROR: + tmp = call->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp); + break; + default: + pr_err("Invalid terminal call state %u\n", call->state); + BUG(); + break; + } - for (;;) { - /* return immediately if a client socket has no outstanding - * calls */ - if (RB_EMPTY_ROOT(&rx->calls)) { - if (copied) - goto out; - if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) { - release_sock(&rx->sk); - if (continue_call) - rxrpc_put_call(continue_call, - rxrpc_call_put); - return -ENODATA; - } - } + return ret; +} - /* get the next message on the Rx queue */ - skb = skb_peek(&rx->sk.sk_receive_queue); - if (!skb) { - /* nothing remains on the queue */ - if (copied && - (flags & MSG_PEEK || timeo == 0)) - goto out; +/* + * Pass back notification of a new call. The call is added to the + * to-be-accepted list. This means that the next call to be accepted might not + * be the last call seen awaiting acceptance, but unless we leave this on the + * front of the queue and block all other messages until someone gives us a + * user_ID for it, there's not a lot we can do. 
+ */ +static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, int flags) +{ + int tmp = 0, ret; - /* wait for a message to turn up */ - release_sock(&rx->sk); - prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, - TASK_INTERRUPTIBLE); - ret = sock_error(&rx->sk); - if (ret) - goto wait_error; - - if (skb_queue_empty(&rx->sk.sk_receive_queue)) { - if (signal_pending(current)) - goto wait_interrupted; - timeo = schedule_timeout(timeo); - } - finish_wait(sk_sleep(&rx->sk), &wait); - lock_sock(&rx->sk); - continue; - } + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &tmp); - peek_next_packet: - rxrpc_see_skb(skb); - sp = rxrpc_skb(skb); - call = sp->call; - ASSERT(call != NULL); - rxrpc_see_call(call); - - _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]); - - /* make sure we wait for the state to be updated in this call */ - spin_lock_bh(&call->lock); - spin_unlock_bh(&call->lock); - - if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { - _debug("packet from released call"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - continue; - } + if (ret == 0 && !(flags & MSG_PEEK)) { + _debug("to be accepted"); + write_lock_bh(&rx->recvmsg_lock); + list_del_init(&call->recvmsg_link); + write_unlock_bh(&rx->recvmsg_lock); - /* determine whether to continue last data receive */ - if (continue_call) { - _debug("maybe cont"); - if (call != continue_call || - skb->mark != RXRPC_SKB_MARK_DATA) { - release_sock(&rx->sk); - rxrpc_put_call(continue_call, rxrpc_call_put); - _leave(" = %d [noncont]", copied); - return copied; - } - } + write_lock(&rx->call_lock); + list_add_tail(&call->accept_link, &rx->to_be_accepted); + write_unlock(&rx->call_lock); + } - rxrpc_get_call(call, rxrpc_call_got); + return ret; +} - /* copy the peer address and timestamp */ - if (!continue_call) { - if (msg->msg_name) { - size_t len = - sizeof(call->conn->params.peer->srx); - memcpy(msg->msg_name, - &call->conn->params.peer->srx, len); - msg->msg_namelen = len; - } - sock_recv_timestamp(msg, &rx->sk, skb); - } +/* + * End the packet reception phase. 
+ */ +static void rxrpc_end_rx_phase(struct rxrpc_call *call) +{ + _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); - /* receive the message */ - if (skb->mark != RXRPC_SKB_MARK_DATA) - goto receive_non_data_message; + if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + } else { + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, false); + } - _debug("recvmsg DATA #%u { %d, %d }", - sp->hdr.seq, skb->len, sp->offset); + write_lock_bh(&call->state_lock); - if (!continue_call) { - /* only set the control data once per recvmsg() */ - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - ullen, &call->user_call_ID); - if (ret < 0) - goto copy_error; - ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); - } + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + __rxrpc_call_completed(call); + break; - ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); - ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); - call->rx_data_recv = sp->hdr.seq; + case RXRPC_CALL_SERVER_RECV_REQUEST: + call->state = RXRPC_CALL_SERVER_ACK_REQUEST; + break; + default: + break; + } - ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); + write_unlock_bh(&call->state_lock); +} - offset = sp->offset; - copy = skb->len - offset; - if (copy > len - copied) - copy = len - copied; +/* + * Discard a packet we've used up and advance the Rx window by one. + */ +static void rxrpc_rotate_rx_window(struct rxrpc_call *call) +{ + struct sk_buff *skb; + rxrpc_seq_t hard_ack, top; + int ix; + + _enter("%d", call->debug_id); + + hard_ack = call->rx_hard_ack; + top = smp_load_acquire(&call->rx_top); + ASSERT(before(hard_ack, top)); + + hard_ack++; + ix = hard_ack & RXRPC_RXTX_BUFF_MASK; + skb = call->rxtx_buffer[ix]; + rxrpc_see_skb(skb); + call->rxtx_buffer[ix] = NULL; + call->rxtx_annotations[ix] = 0; + /* Barrier against rxrpc_input_data(). */ + smp_store_release(&call->rx_hard_ack, hard_ack); - ret = skb_copy_datagram_msg(skb, offset, msg, copy); + rxrpc_free_skb(skb); + _debug("%u,%u,%lx", hard_ack, top, call->flags); + if (hard_ack == top && test_bit(RXRPC_CALL_RX_LAST, &call->flags)) + rxrpc_end_rx_phase(call); +} + +/* + * Decrypt and verify a (sub)packet. The packet's length may be changed due to + * padding, but if this is the case, the packet length will be resident in the + * socket buffer. Note that we can't modify the master skb info as the skb may + * be the home to multiple subpackets. + */ +static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + u8 annotation, + unsigned int offset, unsigned int len) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + rxrpc_seq_t seq = sp->hdr.seq; + u16 cksum = sp->hdr.cksum; + + _enter(""); + + /* For all but the head jumbo subpacket, the security checksum is in a + * jumbo header immediately prior to the data. + */ + if ((annotation & RXRPC_RX_ANNO_JUMBO) > 1) { + __be16 tmp; + if (skb_copy_bits(skb, offset - 2, &tmp, 2) < 0) + BUG(); + cksum = ntohs(tmp); + seq += (annotation & RXRPC_RX_ANNO_JUMBO) - 1; + } + + return call->conn->security->verify_packet(call, skb, offset, len, + seq, cksum); +} + +/* + * Locate the data within a packet. This is complicated by: + * + * (1) An skb may contain a jumbo packet - so we have to find the appropriate + * subpacket. 
+ * + * (2) The (sub)packets may be encrypted and, if so, the encrypted portion + * contains an extra header which includes the true length of the data, + * excluding any encrypted padding. + */ +static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, + u8 *_annotation, + unsigned int *_offset, unsigned int *_len) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int offset = *_offset; + unsigned int len = *_len; + int ret; + u8 annotation = *_annotation; + + if (offset > 0) + return 0; + + /* Locate the subpacket */ + offset = sp->offset; + len = skb->len - sp->offset; + if ((annotation & RXRPC_RX_ANNO_JUMBO) > 0) { + offset += (((annotation & RXRPC_RX_ANNO_JUMBO) - 1) * + RXRPC_JUMBO_SUBPKTLEN); + len = (annotation & RXRPC_RX_ANNO_JLAST) ? + skb->len - offset : RXRPC_JUMBO_SUBPKTLEN; + } + + if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) { + ret = rxrpc_verify_packet(call, skb, annotation, offset, len); if (ret < 0) - goto copy_error; + return ret; + *_annotation |= RXRPC_RX_ANNO_VERIFIED; + } - /* handle piecemeal consumption of data packets */ - _debug("copied %d+%d", copy, copied); + *_offset = offset; + *_len = len; + call->conn->security->locate_data(call, skb, _offset, _len); + return 0; +} - offset += copy; - copied += copy; +/* + * Deliver messages to a call. This keeps processing packets until the buffer + * is filled and we find either more DATA (returns 0) or the end of the DATA + * (returns 1). If more packets are required, it returns -EAGAIN. + */ +static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, + struct msghdr *msg, struct iov_iter *iter, + size_t len, int flags, size_t *_offset) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + rxrpc_seq_t hard_ack, top, seq; + size_t remain; + bool last; + unsigned int rx_pkt_offset, rx_pkt_len; + int ix, copy, ret = 0; + + _enter(""); + + rx_pkt_offset = call->rx_pkt_offset; + rx_pkt_len = call->rx_pkt_len; + + /* Barriers against rxrpc_input_data(). */ + hard_ack = call->rx_hard_ack; + top = smp_load_acquire(&call->rx_top); + for (seq = hard_ack + 1; before_eq(seq, top); seq++) { + ix = seq & RXRPC_RXTX_BUFF_MASK; + skb = call->rxtx_buffer[ix]; + if (!skb) + break; + smp_rmb(); + rxrpc_see_skb(skb); + sp = rxrpc_skb(skb); - if (!(flags & MSG_PEEK)) - sp->offset = offset; + if (msg) + sock_recv_timestamp(msg, sock->sk, skb); + + ret = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], + &rx_pkt_offset, &rx_pkt_len); + _debug("recvmsg %x DATA #%u { %d, %d }", + sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); + + /* We have to handle short, empty and used-up DATA packets. */ + remain = len - *_offset; + copy = rx_pkt_len; + if (copy > remain) + copy = remain; + if (copy > 0) { + ret = skb_copy_datagram_iter(skb, rx_pkt_offset, iter, + copy); + if (ret < 0) + goto out; + + /* handle piecemeal consumption of data packets */ + _debug("copied %d @%zu", copy, *_offset); + + rx_pkt_offset += copy; + rx_pkt_len -= copy; + *_offset += copy; + } - if (sp->offset < skb->len) { + if (rx_pkt_len > 0) { _debug("buffer full"); - ASSERTCMP(copied, ==, len); + ASSERTCMP(*_offset, ==, len); break; } - /* we transferred the whole data packet */ + /* The whole packet has been transferred. 
*/ + last = sp->hdr.flags & RXRPC_LAST_PACKET; if (!(flags & MSG_PEEK)) - rxrpc_kernel_data_consumed(call, skb); - - if (sp->hdr.flags & RXRPC_LAST_PACKET) { - _debug("last"); - if (rxrpc_conn_is_client(call->conn)) { - /* last byte of reply received */ - ret = copied; - goto terminal_message; - } + rxrpc_rotate_rx_window(call); + rx_pkt_offset = 0; + rx_pkt_len = 0; - /* last bit of request received */ - if (!(flags & MSG_PEEK)) { - _debug("eat packet"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != - skb) - BUG(); - rxrpc_free_skb(skb); - } - msg->msg_flags &= ~MSG_MORE; - break; - } + ASSERTIFCMP(last, seq, ==, top); + } - /* move on to the next data message */ - _debug("next"); - if (!continue_call) - continue_call = sp->call; - else - rxrpc_put_call(call, rxrpc_call_put); - call = NULL; - - if (flags & MSG_PEEK) { - _debug("peek next"); - skb = skb->next; - if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue) - break; - goto peek_next_packet; - } + if (after(seq, top)) { + ret = -EAGAIN; + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags)) + ret = 1; + } +out: + if (!(flags & MSG_PEEK)) { + call->rx_pkt_offset = rx_pkt_offset; + call->rx_pkt_len = rx_pkt_len; + } + _leave(" = %d [%u/%u]", ret, seq, top); + return ret; +} - _debug("eat packet"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); +/* + * Receive a message from an RxRPC socket + * - we need to be careful about two or more threads calling recvmsg + * simultaneously + */ +int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct rxrpc_call *call; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct list_head *l; + size_t copied = 0; + long timeo; + int ret; + + DEFINE_WAIT(wait); + + _enter(",,,%zu,%d", len, flags); + + if (flags & (MSG_OOB | MSG_TRUNC)) + return -EOPNOTSUPP; + + timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); + +try_again: + lock_sock(&rx->sk); + + /* Return immediately if a client socket has no outstanding calls */ + if (RB_EMPTY_ROOT(&rx->calls) && + list_empty(&rx->recvmsg_q) && + rx->sk.sk_state != RXRPC_SERVER_LISTENING) { + release_sock(&rx->sk); + return -ENODATA; } - /* end of non-terminal data packet reception for the moment */ - _debug("end rcv data"); -out: - release_sock(&rx->sk); - if (call) - rxrpc_put_call(call, rxrpc_call_put); - if (continue_call) - rxrpc_put_call(continue_call, rxrpc_call_put); - _leave(" = %d [data]", copied); - return copied; - - /* handle non-DATA messages such as aborts, incoming connections and - * final ACKs */ -receive_non_data_message: - _debug("non-data"); - - if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) { - _debug("RECV NEW CALL"); - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code); - if (ret < 0) - goto copy_error; - if (!(flags & MSG_PEEK)) { - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); + if (list_empty(&rx->recvmsg_q)) { + ret = -EWOULDBLOCK; + if (timeo == 0) + goto error_no_call; + + release_sock(&rx->sk); + + /* Wait for something to happen */ + prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, + TASK_INTERRUPTIBLE); + ret = sock_error(&rx->sk); + if (ret) + goto wait_error; + + if (list_empty(&rx->recvmsg_q)) { + if (signal_pending(current)) + goto wait_interrupted; + timeo = schedule_timeout(timeo); } - goto out; + finish_wait(sk_sleep(&rx->sk), &wait); + goto try_again; } - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - ullen, &call->user_call_ID); - if (ret < 0) - goto copy_error; - 
ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + /* Find the next call and dequeue it if we're not just peeking. If we + * do dequeue it, that comes with a ref that we will need to release. + */ + write_lock_bh(&rx->recvmsg_lock); + l = rx->recvmsg_q.next; + call = list_entry(l, struct rxrpc_call, recvmsg_link); + if (!(flags & MSG_PEEK)) + list_del_init(&call->recvmsg_link); + else + rxrpc_get_call(call, rxrpc_call_got); + write_unlock_bh(&rx->recvmsg_lock); - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: + _debug("recvmsg call %p", call); + + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); - case RXRPC_SKB_MARK_FINAL_ACK: - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code); - break; - case RXRPC_SKB_MARK_BUSY: - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code); - break; - case RXRPC_SKB_MARK_REMOTE_ABORT: - abort_code = call->abort_code; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); - break; - case RXRPC_SKB_MARK_LOCAL_ABORT: - abort_code = call->abort_code; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); - if (call->error) { - abort_code = call->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, - &abort_code); + + if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + if (flags & MSG_CMSG_COMPAT) { + unsigned int id32 = call->user_call_ID; + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned int), &id32); + } else { + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned long), + &call->user_call_ID); } + if (ret < 0) + goto error; + } + + if (msg->msg_name) { + size_t len = sizeof(call->conn->params.peer->srx); + memcpy(msg->msg_name, &call->conn->params.peer->srx, len); + msg->msg_namelen = len; + } + + switch (call->state) { + case RXRPC_CALL_SERVER_ACCEPTING: + ret = rxrpc_recvmsg_new_call(rx, call, msg, flags); break; - case RXRPC_SKB_MARK_NET_ERROR: - _debug("RECV NET ERROR %d", sp->error); - abort_code = sp->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code); - break; - case RXRPC_SKB_MARK_LOCAL_ERROR: - _debug("RECV LOCAL ERROR %d", sp->error); - abort_code = sp->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, - &abort_code); + case RXRPC_CALL_CLIENT_RECV_REPLY: + case RXRPC_CALL_SERVER_RECV_REQUEST: + case RXRPC_CALL_SERVER_ACK_REQUEST: + ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len, + flags, &copied); + if (ret == -EAGAIN) + ret = 0; break; default: - pr_err("Unknown packet mark %u\n", skb->mark); - BUG(); + ret = 0; break; } if (ret < 0) - goto copy_error; - -terminal_message: - _debug("terminal"); - msg->msg_flags &= ~MSG_MORE; - msg->msg_flags |= MSG_EOR; + goto error; - if (!(flags & MSG_PEEK)) { - _net("free terminal skb %p", skb); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - rxrpc_release_call(rx, call); + if (call->state == RXRPC_CALL_COMPLETE) { + ret = rxrpc_recvmsg_term(call, msg); + if (ret < 0) + goto error; + if (!(flags & MSG_PEEK)) + rxrpc_release_call(rx, call); + msg->msg_flags |= MSG_EOR; + ret = 1; } - release_sock(&rx->sk); - rxrpc_put_call(call, rxrpc_call_put); - if (continue_call) - rxrpc_put_call(continue_call, rxrpc_call_put); - _leave(" = %d", ret); - return ret; + if (ret == 0) + msg->msg_flags |= MSG_MORE; + else + msg->msg_flags &= ~MSG_MORE; + ret = copied; -copy_error: - _debug("copy error"); - release_sock(&rx->sk); +error: rxrpc_put_call(call, rxrpc_call_put); - if (continue_call) - rxrpc_put_call(continue_call, rxrpc_call_put); 
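The user_call_ID attached via put_cmsg() above reaches userspace as an ordinary control message. A minimal sketch of the receiving side, assuming the SOL_RXRPC and RXRPC_USER_CALL_ID definitions are visible to userspace (their header location has varied across kernel releases) and a native, non-compat caller:

#include <string.h>
#include <sys/socket.h>
#include <linux/rxrpc.h>	/* SOL_RXRPC, RXRPC_USER_CALL_ID */

/* Walk the control messages returned by recvmsg() and pull out the
 * user call ID the kernel attached to identify the call.
 */
static int get_user_call_id(struct msghdr *msg, unsigned long *_id)
{
	struct cmsghdr *cmsg;

	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_RXRPC &&
		    cmsg->cmsg_type == RXRPC_USER_CALL_ID &&
		    cmsg->cmsg_len == CMSG_LEN(sizeof(*_id))) {
			memcpy(_id, CMSG_DATA(cmsg), sizeof(*_id));
			return 0;
		}
	}
	return -1;	/* no call ID present */
}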
+error_no_call: + release_sock(&rx->sk); _leave(" = %d", ret); return ret; @@ -339,85 +499,8 @@ wait_interrupted: ret = sock_intr_errno(timeo); wait_error: finish_wait(sk_sleep(&rx->sk), &wait); - if (continue_call) - rxrpc_put_call(continue_call, rxrpc_call_put); - if (copied) - copied = ret; - _leave(" = %d [waitfail %d]", copied, ret); - return copied; - -} - -/* - * Deliver messages to a call. This keeps processing packets until the buffer - * is filled and we find either more DATA (returns 0) or the end of the DATA - * (returns 1). If more packets are required, it returns -EAGAIN. - * - * TODO: Note that this is hacked in at the moment and will be replaced. - */ -static int temp_deliver_data(struct socket *sock, struct rxrpc_call *call, - struct iov_iter *iter, size_t size, - size_t *_offset) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - size_t remain; - int ret, copy; - - _enter("%d", call->debug_id); - -next: - local_bh_disable(); - skb = skb_dequeue(&call->knlrecv_queue); - local_bh_enable(); - if (!skb) { - if (test_bit(RXRPC_CALL_RX_NO_MORE, &call->flags)) - return 1; - _leave(" = -EAGAIN [empty]"); - return -EAGAIN; - } - - sp = rxrpc_skb(skb); - _debug("dequeued %p %u/%zu", skb, sp->offset, size); - - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: - remain = size - *_offset; - if (remain > 0) { - copy = skb->len - sp->offset; - if (copy > remain) - copy = remain; - ret = skb_copy_datagram_iter(skb, sp->offset, iter, - copy); - if (ret < 0) - goto requeue_and_leave; - - /* handle piecemeal consumption of data packets */ - sp->offset += copy; - *_offset += copy; - } - - if (sp->offset < skb->len) - goto partially_used_skb; - - /* We consumed the whole packet */ - ASSERTCMP(sp->offset, ==, skb->len); - if (sp->hdr.flags & RXRPC_LAST_PACKET) - set_bit(RXRPC_CALL_RX_NO_MORE, &call->flags); - rxrpc_kernel_data_consumed(call, skb); - rxrpc_free_skb(skb); - goto next; - - default: - rxrpc_free_skb(skb); - goto next; - } - -partially_used_skb: - ASSERTCMP(*_offset, ==, size); - ret = 0; -requeue_and_leave: - skb_queue_head(&call->knlrecv_queue, skb); + release_sock(&rx->sk); + _leave(" = %d [wait]", ret); return ret; } @@ -453,8 +536,9 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, struct kvec iov; int ret; - _enter("{%d,%s},%zu,%d", - call->debug_id, rxrpc_call_states[call->state], size, want_more); + _enter("{%d,%s},%zu/%zu,%d", + call->debug_id, rxrpc_call_states[call->state], + *_offset, size, want_more); ASSERTCMP(*_offset, <=, size); ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING); @@ -469,7 +553,8 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, case RXRPC_CALL_CLIENT_RECV_REPLY: case RXRPC_CALL_SERVER_RECV_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: - ret = temp_deliver_data(sock, call, &iter, size, _offset); + ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0, + _offset); if (ret < 0) goto out; @@ -494,7 +579,6 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, goto call_complete; default: - *_offset = 0; ret = -EINPROGRESS; goto out; } diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 3777432df10b..ae392558829d 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -317,6 +317,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, * decrypt partial encryption on a packet (level 1 security) */ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, rxrpc_seq_t seq) { struct rxkad_level1_hdr 
sechdr; @@ -330,18 +331,20 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, _enter(""); - if (skb->len < 8) { + if (len < 8) { rxrpc_abort_call("V1H", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } - /* we want to decrypt the skbuff in-place */ + /* Decrypt the skbuff in-place. TODO: We really want to decrypt + * directly into the target buffer. + */ nsg = skb_cow_data(skb, 0, &trailer); if (nsg < 0 || nsg > 16) goto nomem; sg_init_table(sg, nsg); - skb_to_sgvec(skb, sg, 0, 8); + skb_to_sgvec(skb, sg, offset, 8); /* start the decryption afresh */ memset(&iv, 0, sizeof(iv)); @@ -353,12 +356,12 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_zero(req); /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) { + if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) { rxrpc_abort_call("XV1", call, seq, RXKADDATALEN, EPROTO); goto protocol_error; } - if (!skb_pull(skb, sizeof(sechdr))) - BUG(); + offset += sizeof(sechdr); + len -= sizeof(sechdr); buf = ntohl(sechdr.data_size); data_size = buf & 0xffff; @@ -371,18 +374,16 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, goto protocol_error; } - /* shorten the packet to remove the padding */ - if (data_size > skb->len) { + if (data_size > len) { rxrpc_abort_call("V1L", call, seq, RXKADDATALEN, EPROTO); goto protocol_error; } - if (data_size < skb->len) - skb->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; protocol_error: + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); _leave(" = -EPROTO"); return -EPROTO; @@ -395,6 +396,7 @@ nomem: * wholly decrypt a packet (level 2 security) */ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, rxrpc_seq_t seq) { const struct rxrpc_key_token *token; @@ -409,12 +411,14 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, _enter(",{%d}", skb->len); - if (skb->len < 8) { + if (len < 8) { rxrpc_abort_call("V2H", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } - /* we want to decrypt the skbuff in-place */ + /* Decrypt the skbuff in-place. TODO: We really want to decrypt + * directly into the target buffer. 
+ */ nsg = skb_cow_data(skb, 0, &trailer); if (nsg < 0) goto nomem; @@ -427,7 +431,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, } sg_init_table(sg, nsg); - skb_to_sgvec(skb, sg, 0, skb->len); + skb_to_sgvec(skb, sg, offset, len); /* decrypt from the session key */ token = call->conn->params.key->payload.data[0]; @@ -435,19 +439,19 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x); + skcipher_request_set_crypt(req, sg, sg, len, iv.x); crypto_skcipher_decrypt(req); skcipher_request_zero(req); if (sg != _sg) kfree(sg); /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) { + if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) { rxrpc_abort_call("XV2", call, seq, RXKADDATALEN, EPROTO); goto protocol_error; } - if (!skb_pull(skb, sizeof(sechdr))) - BUG(); + offset += sizeof(sechdr); + len -= sizeof(sechdr); buf = ntohl(sechdr.data_size); data_size = buf & 0xffff; @@ -460,17 +464,16 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, goto protocol_error; } - if (data_size > skb->len) { + if (data_size > len) { rxrpc_abort_call("V2L", call, seq, RXKADDATALEN, EPROTO); goto protocol_error; } - if (data_size < skb->len) - skb->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; protocol_error: + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); _leave(" = -EPROTO"); return -EPROTO; @@ -484,6 +487,7 @@ nomem: * jumbo packet). */ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, rxrpc_seq_t seq, u16 expected_cksum) { SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); @@ -521,6 +525,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, if (cksum != expected_cksum) { rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); _leave(" = -EPROTO [csum failed]"); return -EPROTO; } @@ -529,14 +534,60 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, case RXRPC_SECURITY_PLAIN: return 0; case RXRPC_SECURITY_AUTH: - return rxkad_verify_packet_1(call, skb, seq); + return rxkad_verify_packet_1(call, skb, offset, len, seq); case RXRPC_SECURITY_ENCRYPT: - return rxkad_verify_packet_2(call, skb, seq); + return rxkad_verify_packet_2(call, skb, offset, len, seq); default: return -ENOANO; } } +/* + * Locate the data contained in a packet that was partially encrypted. + */ +static void rxkad_locate_data_1(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ + struct rxkad_level1_hdr sechdr; + + if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0) + BUG(); + *_offset += sizeof(sechdr); + *_len = ntohl(sechdr.data_size) & 0xffff; +} + +/* + * Locate the data contained in a packet that was completely encrypted. + */ +static void rxkad_locate_data_2(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ + struct rxkad_level2_hdr sechdr; + + if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0) + BUG(); + *_offset += sizeof(sechdr); + *_len = ntohl(sechdr.data_size) & 0xffff; +} + +/* + * Locate the data contained in an already decrypted packet. 
+ */ +static void rxkad_locate_data(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ + switch (call->conn->params.security_level) { + case RXRPC_SECURITY_AUTH: + rxkad_locate_data_1(call, skb, _offset, _len); + return; + case RXRPC_SECURITY_ENCRYPT: + rxkad_locate_data_2(call, skb, _offset, _len); + return; + default: + return; + } +} + /* * issue a challenge */ @@ -704,7 +755,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, struct rxkad_challenge challenge; struct rxkad_response resp __attribute__((aligned(8))); /* must be aligned for crypto */ - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); u32 version, nonce, min_level, abort_code; int ret; @@ -722,8 +773,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, } abort_code = RXKADPACKETSHORT; - sp = rxrpc_skb(skb); - if (skb_copy_bits(skb, 0, &challenge, sizeof(challenge)) < 0) + if (skb_copy_bits(skb, sp->offset, &challenge, sizeof(challenge)) < 0) goto protocol_error; version = ntohl(challenge.version); @@ -969,7 +1019,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, { struct rxkad_response response __attribute__((aligned(8))); /* must be aligned for crypto */ - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; time_t expiry; void *ticket; @@ -980,7 +1030,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, 0, &response, sizeof(response)) < 0) + if (skb_copy_bits(skb, sp->offset, &response, sizeof(response)) < 0) goto protocol_error; if (!pskb_pull(skb, sizeof(response))) BUG(); @@ -988,7 +1038,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, version = ntohl(response.version); ticket_len = ntohl(response.ticket_len); kvno = ntohl(response.kvno); - sp = rxrpc_skb(skb); _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }", sp->hdr.serial, version, kvno, ticket_len); @@ -1010,7 +1059,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, return -ENOMEM; abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, 0, ticket, ticket_len) < 0) + if (skb_copy_bits(skb, sp->offset, ticket, ticket_len) < 0) goto protocol_error_free; ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key, @@ -1135,6 +1184,7 @@ const struct rxrpc_security rxkad = { .prime_packet_security = rxkad_prime_packet_security, .secure_packet = rxkad_secure_packet, .verify_packet = rxkad_verify_packet, + .locate_data = rxkad_locate_data, .issue_challenge = rxkad_issue_challenge, .respond_to_challenge = rxkad_respond_to_challenge, .verify_response = rxkad_verify_response, diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 5d79d5a9c944..82d8134e9287 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -130,20 +130,20 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) } /* find the service */ - read_lock_bh(&local->services_lock); + read_lock(&local->services_lock); hlist_for_each_entry(rx, &local->services, listen_link) { if (rx->srx.srx_service == conn->params.service_id) goto found_service; } /* the service appears to have died */ - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); _leave(" = -ENOENT"); return -ENOENT; found_service: if (!rx->securities) { - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); _leave(" = 
-ENOKEY"); return -ENOKEY; } @@ -152,13 +152,13 @@ found_service: kref = keyring_search(make_key_ref(rx->securities, 1UL), &key_type_rxrpc_s, kdesc); if (IS_ERR(kref)) { - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); _leave(" = %ld [search]", PTR_ERR(kref)); return PTR_ERR(kref); } key = key_ref_to_ptr(kref); - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); conn->server_key = key; conn->security = sec; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 9a4af992fcdf..cba236575073 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include "ar-internal.h" @@ -38,19 +37,20 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, DECLARE_WAITQUEUE(myself, current); int ret; - _enter(",{%d},%ld", - CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), - call->acks_winsz), - *timeo); + _enter(",{%u,%u,%u}", + call->tx_hard_ack, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); for (;;) { set_current_state(TASK_INTERRUPTIBLE); ret = 0; - if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), - call->acks_winsz) > 0) + if (call->tx_top - call->tx_hard_ack < call->tx_winsize) break; + if (call->state >= RXRPC_CALL_COMPLETE) { + ret = -call->error; + break; + } if (signal_pending(current)) { ret = sock_intr_errno(*timeo); break; @@ -68,36 +68,44 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, } /* - * attempt to schedule an instant Tx resend + * Schedule an instant Tx resend. */ -static inline void rxrpc_instant_resend(struct rxrpc_call *call) +static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix) { - read_lock_bh(&call->state_lock); - if (try_to_del_timer_sync(&call->resend_timer) >= 0) { - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) + spin_lock_bh(&call->lock); + + if (call->state < RXRPC_CALL_COMPLETE) { + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; + if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) rxrpc_queue_call(call); } - read_unlock_bh(&call->state_lock); + + spin_unlock_bh(&call->lock); } /* - * queue a packet for transmission, set the resend timer and attempt - * to send the packet immediately + * Queue a DATA packet for transmission, set the resend timeout and send the + * packet immediately */ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, bool last) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - int ret; + rxrpc_seq_t seq = sp->hdr.seq; + int ret, ix; + + _net("queue skb %p [%d]", skb, seq); - _net("queue skb %p [%d]", skb, call->acks_head); + ASSERTCMP(seq, ==, call->tx_top + 1); - ASSERT(call->acks_window != NULL); - call->acks_window[call->acks_head] = (unsigned long) skb; + ix = seq & RXRPC_RXTX_BUFF_MASK; + rxrpc_get_skb(skb); + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; smp_wmb(); - call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1); + call->rxtx_buffer[ix] = skb; + call->tx_top = seq; + if (last) + set_bit(RXRPC_CALL_TX_LAST, &call->flags); if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { _debug("________awaiting reply/ACK__________"); @@ -121,34 +129,17 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - sp->need_resend = false; - sp->resend_at = jiffies + rxrpc_resend_timeout; - 
if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) { - _debug("run timer"); - call->resend_timer.expires = sp->resend_at; - add_timer(&call->resend_timer); - } - - /* attempt to cancel the rx-ACK timer, deferring reply transmission if - * we're ACK'ing the request phase of an incoming call */ - ret = -EAGAIN; - if (try_to_del_timer_sync(&call->ack_timer) >= 0) { - /* the packet may be freed by rxrpc_process_call() before this - * returns */ - if (rxrpc_is_client_call(call)) - rxrpc_expose_client_call(call); - ret = rxrpc_send_data_packet(call->conn, skb); - _net("sent skb %p", skb); - } else { - _debug("failed to delete ACK timer"); - } + if (seq == 1 && rxrpc_is_client_call(call)) + rxrpc_expose_client_call(call); + sp->resend_at = jiffies + rxrpc_resend_timeout; + ret = rxrpc_send_data_packet(call->conn, skb); if (ret < 0) { _debug("need instant resend %d", ret); - sp->need_resend = true; - rxrpc_instant_resend(call); + rxrpc_instant_resend(call, ix); } + rxrpc_free_skb(skb); _leave(""); } @@ -212,9 +203,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); - if (CIRC_SPACE(call->acks_head, - ACCESS_ONCE(call->acks_tail), - call->acks_winsz) <= 0) { + if (call->tx_top - call->tx_hard_ack >= + call->tx_winsize) { ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; @@ -313,7 +303,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, memset(skb_put(skb, pad), 0, pad); } - seq = atomic_inc_return(&call->sequence); + seq = call->tx_top + 1; sp->hdr.epoch = conn->proto.epoch; sp->hdr.cid = call->cid; @@ -329,9 +319,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, sp->hdr.flags = conn->out_clientflag; if (msg_data_left(msg) == 0 && !more) sp->hdr.flags |= RXRPC_LAST_PACKET; - else if (CIRC_SPACE(call->acks_head, - ACCESS_ONCE(call->acks_tail), - call->acks_winsz) > 1) + else if (call->tx_top - call->tx_hard_ack < + call->tx_winsize) sp->hdr.flags |= RXRPC_MORE_PACKETS; if (more && seq & 1) sp->hdr.flags |= RXRPC_REQUEST_ACK; @@ -358,7 +347,7 @@ out: call_terminated: rxrpc_free_skb(skb); _leave(" = %d", -call->error); - return ret; + return -call->error; maybe_error: if (copied) @@ -451,29 +440,6 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, return 0; } -/* - * abort a call, sending an ABORT packet to the peer - */ -static void rxrpc_send_abort(struct rxrpc_call *call, const char *why, - u32 abort_code, int error) -{ - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - write_lock_bh(&call->state_lock); - - if (__rxrpc_abort_call(why, call, 0, abort_code, error)) { - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - rxrpc_queue_call(call); - } - - write_unlock_bh(&call->state_lock); -} - /* * Create a new client call for sendmsg(). 
*/ @@ -549,7 +515,6 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) return PTR_ERR(call); } - rxrpc_see_call(call); _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, call->state, call->conn); @@ -557,8 +522,10 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* it's too late for this call */ ret = -ESHUTDOWN; } else if (cmd == RXRPC_CMD_SEND_ABORT) { - rxrpc_send_abort(call, "CMD", abort_code, ECONNABORTED); ret = 0; + if (rxrpc_abort_call("CMD", call, 0, abort_code, ECONNABORTED)) + ret = rxrpc_send_call_packet(call, + RXRPC_PACKET_TYPE_ABORT); } else if (cmd != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; } else if (rxrpc_is_client_call(call) && @@ -639,7 +606,8 @@ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, lock_sock(sock->sk); - rxrpc_send_abort(call, why, abort_code, error); + if (rxrpc_abort_call(why, call, 0, abort_code, error)) + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); release_sock(sock->sk); _leave(""); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 9b8f8456d3bf..620d9ccaf3c1 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -18,133 +18,6 @@ #include #include "ar-internal.h" -/* - * set up for the ACK at the end of the receive phase when we discard the final - * receive phase data packet - * - called with softirqs disabled - */ -static void rxrpc_request_final_ACK(struct rxrpc_call *call) -{ - /* the call may be aborted before we have a chance to ACK it */ - write_lock(&call->state_lock); - - switch (call->state) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - call->state = RXRPC_CALL_CLIENT_FINAL_ACK; - _debug("request final ACK"); - - set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - if (try_to_del_timer_sync(&call->ack_timer) >= 0) - rxrpc_queue_call(call); - break; - - case RXRPC_CALL_SERVER_RECV_REQUEST: - call->state = RXRPC_CALL_SERVER_ACK_REQUEST; - default: - break; - } - - write_unlock(&call->state_lock); -} - -/* - * drop the bottom ACK off of the call ACK window and advance the window - */ -static void rxrpc_hard_ACK_data(struct rxrpc_call *call, struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - int loop; - u32 seq; - - spin_lock_bh(&call->lock); - - _debug("hard ACK #%u", sp->hdr.seq); - - for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { - call->ackr_window[loop] >>= 1; - call->ackr_window[loop] |= - call->ackr_window[loop + 1] << (BITS_PER_LONG - 1); - } - - seq = sp->hdr.seq; - ASSERTCMP(seq, ==, call->rx_data_eaten + 1); - call->rx_data_eaten = seq; - - if (call->ackr_win_top < UINT_MAX) - call->ackr_win_top++; - - ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, - call->rx_data_post, >=, call->rx_data_recv); - ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, - call->rx_data_recv, >=, call->rx_data_eaten); - - if (sp->hdr.flags & RXRPC_LAST_PACKET) { - rxrpc_request_final_ACK(call); - } else if (atomic_dec_and_test(&call->ackr_not_idle) && - test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) { - /* We previously soft-ACK'd some received packets that have now - * been consumed, so send a hard-ACK if no more packets are - * immediately forthcoming to allow the transmitter to free up - * its Tx bufferage. - */ - _debug("send Rx idle ACK"); - __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, - skb->priority, sp->hdr.serial, false); - } - - spin_unlock_bh(&call->lock); -} - -/** - * rxrpc_kernel_data_consumed - Record consumption of data message - * @call: The call to which the message pertains. 
- * @skb: Message holding data - * - * Record the consumption of a data message and generate an ACK if appropriate. - * The call state is shifted if this was the final packet. The caller must be - * in process context with no spinlocks held. - * - * TODO: Actually generate the ACK here rather than punting this to the - * workqueue. - */ -void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - _enter("%d,%p{%u}", call->debug_id, skb, sp->hdr.seq); - - ASSERTCMP(sp->call, ==, call); - ASSERTCMP(sp->hdr.type, ==, RXRPC_PACKET_TYPE_DATA); - - /* TODO: Fix the sequence number tracking */ - ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); - ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); - ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); - - call->rx_data_recv = sp->hdr.seq; - rxrpc_hard_ACK_data(call, skb); -} - -/* - * Destroy a packet that has an RxRPC control buffer - */ -void rxrpc_packet_destructor(struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_call *call = sp->call; - - _enter("%p{%p}", skb, call); - - if (call) { - rxrpc_put_call_for_skb(call, skb); - sp->call = NULL; - } - - if (skb->sk) - sock_rfree(skb); - _leave(""); -} - /* * Note the existence of a new-to-us socket buffer (allocated or dequeued). */ -- cgit v1.2.3 From d545caca827b65aab557a9e9dcdcf1e5a3823c2d Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Thu, 8 Sep 2016 00:42:25 +0900 Subject: net: inet: diag: expose the socket mark to privileged processes. This adds the capability for a process that has CAP_NET_ADMIN on a socket to see the socket mark in socket dumps. Commit a52e95abf772 ("net: diag: allow socket bytecode filters to match socket marks") recently gave privileged processes the ability to filter socket dumps based on mark. This patch is complementary: it ensures that the mark is also passed to userspace in the socket's netlink attributes. It is useful for tools like ss which display information about sockets. Tested: https://android-review.googlesource.com/270210 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- include/linux/inet_diag.h | 4 ++-- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 49 ++++++++++++++++++++++++++++-------------- net/ipv4/udp_diag.c | 10 +++++---- net/sctp/sctp_diag.c | 20 +++++++++++------ 5 files changed, 56 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index feb04ea20f11..65da430e260f 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -37,7 +37,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh); + const struct nlmsghdr *unlh, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, @@ -56,7 +56,7 @@ void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk); int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, - struct user_namespace *user_ns); + struct user_namespace *user_ns, bool net_admin); extern int inet_diag_register(const struct inet_diag_handler *handler); extern void inet_diag_unregister(const struct inet_diag_handler *handler); diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 5581206a08ae..b5c366f87b3e 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -123,6 +123,7 @@ enum { INET_DIAG_LOCALS, INET_DIAG_PEERS, INET_DIAG_PAD, + INET_DIAG_MARK, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index abfbe492ebfe..e4d16fc5bbb3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -99,6 +99,7 @@ static size_t inet_sk_attr_size(void) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) @@ -109,7 +110,8 @@ static size_t inet_sk_attr_size(void) int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, - struct user_namespace *user_ns) + struct user_namespace *user_ns, + bool net_admin) { const struct inet_sock *inet = inet_sk(sk); @@ -136,6 +138,9 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, } #endif + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark)) + goto errout; + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); @@ -149,7 +154,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; @@ -175,7 +181,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 0; r->idiag_retrans = 0; - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) goto errout; if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { @@ -274,10 +280,11 @@ static int inet_csk_diag_fill(struct sock *sk, const struct inet_diag_req_v2 *req, struct user_namespace 
*user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { - return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, - user_ns, portid, seq, nlmsg_flags, unlh); + return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns, + portid, seq, nlmsg_flags, unlh, net_admin); } static int inet_twsk_diag_fill(struct sock *sk, @@ -319,8 +326,9 @@ static int inet_twsk_diag_fill(struct sock *sk, static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { + struct request_sock *reqsk = inet_reqsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; @@ -334,7 +342,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, inet_diag_msg_common_fill(r, sk); r->idiag_state = TCP_SYN_RECV; r->idiag_timer = 1; - r->idiag_retrans = inet_reqsk(sk)->num_retrans; + r->idiag_retrans = reqsk->num_retrans; BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != offsetof(struct sock, sk_cookie)); @@ -346,6 +354,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, r->idiag_uid = 0; r->idiag_inode = 0; + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + inet_rsk(reqsk)->ir_mark)) + return -EMSGSIZE; + nlmsg_end(skb, nlh); return 0; } @@ -354,7 +366,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, const struct inet_diag_req_v2 *r, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) return inet_twsk_diag_fill(sk, skb, portid, seq, @@ -362,10 +374,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); } struct sock *inet_diag_find_one_icsk(struct net *net, @@ -435,7 +447,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); @@ -796,7 +809,8 @@ static int inet_csk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - const struct nlattr *bc) + const struct nlattr *bc, + bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -804,7 +818,8 @@ static int inet_csk_diag_dump(struct sock *sk, return inet_csk_diag_fill(sk, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, + net_admin); } static void twsk_build_assert(void) @@ -840,6 +855,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; u32 idiag_states = r->idiag_states; + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; @@ -880,7 +896,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, cb->args[3] > 0) goto next_listen; - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { + if 
(inet_csk_diag_dump(sk, skb, cb, r, + bc, net_admin) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -948,7 +965,7 @@ skip_listen_ht: sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh); + cb->nlh, net_admin); if (res < 0) { spin_unlock_bh(lock); goto done; diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 58b79c0c0d69..9a89c10a55f0 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -20,7 +20,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc) + struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, return inet_sk_diag_fill(sk, NULL, skb, req, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, @@ -76,7 +76,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, err = inet_sk_diag_fill(sk, NULL, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); @@ -97,6 +98,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); int num, s_num, slot, s_slot; @@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index f3508aa75815..807158e32f5f 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -106,7 +106,8 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, int portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct list_head *addr_list; @@ -133,7 +134,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, r->idiag_retrans = 0; } - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) goto errout; if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) { @@ -203,6 +204,7 @@ struct sctp_comm_param { struct netlink_callback *cb; const struct inet_diag_req_v2 *r; const struct nlmsghdr *nlh; + bool net_admin; }; static size_t inet_assoc_attr_size(struct sctp_association *asoc) @@ -219,6 +221,7 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + nla_total_size(addrlen * asoc->peer.transport_count) + nla_total_size(addrlen * addrcnt) + nla_total_size(sizeof(struct inet_diag_meminfo)) @@ -256,7 +259,8 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p) err = inet_sctp_diag_fill(sk, assoc, 
rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + commp->net_admin); release_sock(sk); if (err < 0) { WARN_ON(err == -EMSGSIZE); @@ -310,7 +314,8 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh) < 0) { + NLM_F_MULTI, cb->nlh, + commp->net_admin) < 0) { cb->args[3] = 1; err = 2; goto release; @@ -320,7 +325,8 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) if (inet_sctp_diag_fill(sk, assoc, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) { + cb->nlh->nlmsg_seq, 0, cb->nlh, + commp->net_admin) < 0) { err = 2; goto release; } @@ -375,7 +381,7 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh) < 0) { + cb->nlh, commp->net_admin) < 0) { err = 2; goto out; } @@ -412,6 +418,7 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb, .skb = in_skb, .r = req, .nlh = nlh, + .net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN), }; if (req->sdiag_family == AF_INET) { @@ -447,6 +454,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, .skb = skb, .cb = cb, .r = r, + .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; /* eps hashtable dumps -- cgit v1.2.3 From e7e31ca43d6bedf1c551b1f9c7e78d51c9a45790 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Wed, 7 Sep 2016 19:07:58 +0300 Subject: net/mlx5e: Move an_disable_cap bit to a new position The previous an_disable_cap position, bit31, is deprecated and must not be used by the driver with newer firmware. New firmware will advertise the same capability in bit29. The old capability didn't allow setting more than one protocol for a specific speed when autoneg is off; newer firmware will allow this, and it is indicated in the new capability location. Signed-off-by: Bodong Wang Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 21bc4557b67a..d1f9a581aca8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -6710,9 +6710,10 @@ struct mlx5_ifc_pude_reg_bits { }; struct mlx5_ifc_ptys_reg_bits { - u8 an_disable_cap[0x1]; + u8 reserved_at_0[0x1]; u8 an_disable_admin[0x1]; - u8 reserved_at_2[0x6]; + u8 an_disable_cap[0x1]; + u8 reserved_at_3[0x5]; u8 local_port[0x8]; u8 reserved_at_10[0xd]; u8 proto_mask[0x3]; -- cgit v1.2.3 From 8c146bb9d59aa2ac45222171916ece186c4b3943 Mon Sep 17 00:00:00 2001 From: Thomas F Herbert Date: Wed, 7 Sep 2016 12:56:57 -0400 Subject: openvswitch: 802.1ad uapi changes. openvswitch: Add support for 802.1ad. Change the description of the VLAN tpid field. Signed-off-by: Thomas F Herbert Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 54c3b4f4aceb..59ed3992c760 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -605,13 +605,13 @@ struct ovs_action_push_mpls { * @vlan_tci: Tag control identifier (TCI) to push.
The CFI bit must be set * (but it will not be set in the 802.1Q header that is pushed). * - * The @vlan_tpid value is typically %ETH_P_8021Q. The only acceptable TPID - * values are those that the kernel module also parses as 802.1Q headers, to - * prevent %OVS_ACTION_ATTR_PUSH_VLAN followed by %OVS_ACTION_ATTR_POP_VLAN - * from having surprising results. + * The @vlan_tpid value is typically %ETH_P_8021Q or %ETH_P_8021AD. + * The only acceptable TPID values are those that the kernel module also parses + * as 802.1Q or 802.1AD headers, to prevent %OVS_ACTION_ATTR_PUSH_VLAN followed + * by %OVS_ACTION_ATTR_POP_VLAN from having surprising results. */ struct ovs_action_push_vlan { - __be16 vlan_tpid; /* 802.1Q TPID. */ + __be16 vlan_tpid; /* 802.1Q or 802.1ad TPID. */ __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */ }; @@ -721,9 +721,10 @@ enum ovs_nat_attr { * is copied from the value to the packet header field, rest of the bits are * left unchanged. The non-masked value bits must be passed in as zeroes. * Masking is not supported for the %OVS_KEY_ATTR_TUNNEL attribute. - * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q header onto the - * packet. - * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet. + * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q or 802.1ad header + * onto the packet. + * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q or 802.1ad header + * from the packet. * @OVS_ACTION_ATTR_SAMPLE: Probabilitically executes actions, as specified in * the nested %OVS_SAMPLE_ATTR_* attributes. * @OVS_ACTION_ATTR_PUSH_MPLS: Push a new MPLS label stack entry onto the -- cgit v1.2.3 From fe19c4f971a55cea3be442d8032a5f6021702791 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Wed, 7 Sep 2016 12:56:58 -0400 Subject: vlan: Check for vlan ethernet types for 802.1Q or 802.1ad This is to simplify using double-tagged vlans. This function allows all valid vlan ethertypes to be checked in a single function call. Also replace some instances that check for both ETH_P_8021Q and ETH_P_8021AD. Patch based on one originally by Thomas F Herbert. Signed-off-by: Thomas F Herbert Signed-off-by: Eric Garver Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 49d4aef1f789..3319d97d789d 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -272,6 +272,23 @@ static inline int vlan_get_encap_level(struct net_device *dev) } #endif +/** + * eth_type_vlan - check for valid vlan ether type. + * @ethertype: ether type to check + * + * Returns true if the ether type is a vlan ether type.
+ */ +static inline bool eth_type_vlan(__be16 ethertype) +{ + switch (ethertype) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + return true; + default: + return false; + } +} + static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { @@ -425,8 +442,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb->data; - if (veth->h_vlan_proto != htons(ETH_P_8021Q) && - veth->h_vlan_proto != htons(ETH_P_8021AD)) + if (!eth_type_vlan(veth->h_vlan_proto)) return -EINVAL; *vlan_tci = ntohs(veth->h_vlan_TCI); @@ -488,7 +504,7 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, * present at mac_len - VLAN_HLEN (if mac_len > 0), or at * ETH_HLEN otherwise */ - if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (eth_type_vlan(type)) { if (vlan_depth) { if (WARN_ON(vlan_depth < VLAN_HLEN)) return 0; @@ -506,8 +522,7 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, vh = (struct vlan_hdr *)(skb->data + vlan_depth); type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; - } while (type == htons(ETH_P_8021Q) || - type == htons(ETH_P_8021AD)); + } while (eth_type_vlan(type)); } if (depth) @@ -572,8 +587,7 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, static inline bool skb_vlan_tagged(const struct sk_buff *skb) { if (!skb_vlan_tag_present(skb) && - likely(skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD))) + likely(!eth_type_vlan(skb->protocol))) return false; return true; @@ -593,15 +607,14 @@ static inline bool skb_vlan_tagged_multi(const struct sk_buff *skb) if (!skb_vlan_tag_present(skb)) { struct vlan_ethhdr *veh; - if (likely(protocol != htons(ETH_P_8021Q) && - protocol != htons(ETH_P_8021AD))) + if (likely(!eth_type_vlan(protocol))) return false; veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } - if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) + if (!eth_type_vlan(protocol)) return false; return true; -- cgit v1.2.3 From 9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a Mon Sep 17 00:00:00 2001 From: Yaogong Wang Date: Wed, 7 Sep 2016 14:49:28 -0700 Subject: tcp: use an RB tree for ooo receive queue Over the years, TCP BDP has increased by several orders of magnitude, and some people are considering reaching the 2 Gbytes limit. Even with the current window scale limit of 14, ~1 Gbytes maps to ~740,000 MSS. In the presence of packet losses (or reorders), TCP stores incoming packets into an out-of-order queue, and the number of skbs sitting there waiting for the missing packets to be received can be in the 10^5 range. Most packets are appended to the tail of this queue, and when packets can finally be transferred to the receive queue, we scan the queue from its head. However, in the presence of heavy losses, we might have to find an arbitrary point in this queue, involving a linear scan for every incoming packet, throwing away cpu caches. This patch converts it to an RB tree, to get bounded latencies. Yaogong wrote a preliminary patch about 2 years ago. Eric did the rebase, added the ofo_last_skb cache, and handled polishing and tests.
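As a self-contained sketch of that scheme (not kernel code: an unbalanced binary search tree stands in for the kernel's self-balancing rbtree, overlap trimming and coalescing are omitted, and every identifier except the ooo_last tail cache, the analogue of ooo_last_skb, is invented for illustration):

#include <stdio.h>
#include <stdlib.h>

struct seg {
	unsigned int seq;		/* start sequence of the segment */
	struct seg *left, *right;
};

static struct seg *ooo_root;	/* out-of-order segments, ordered by seq */
static struct seg *ooo_last;	/* cached right-most node (highest seq) */

static void ooo_insert(unsigned int seq)
{
	struct seg *n = calloc(1, sizeof(*n)), **p = &ooo_root;

	n->seq = seq;
	/* Fast path: segment lands beyond the cached tail, append in O(1). */
	if (ooo_last && seq > ooo_last->seq) {
		ooo_last->right = n;
		ooo_last = n;
		return;
	}
	/* Slow path: ordinary ordered descent, O(log N) when balanced. */
	while (*p)
		p = seq < (*p)->seq ? &(*p)->left : &(*p)->right;
	*p = n;
	if (!ooo_last || seq > ooo_last->seq)
		ooo_last = n;
}

int main(void)
{
	unsigned int seqs[] = { 3000, 1000, 2000, 4000, 5000 };
	int i;

	for (i = 0; i < 5; i++)
		ooo_insert(seqs[i]);
	printf("highest pending seq: %u\n", ooo_last->seq);	/* 5000 */
	return 0;
}

The cached tail pointer is what keeps the common case cheap (segments arriving in order behind a hole); only genuinely out-of-order arrivals pay for a tree descent.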
Tested with network dropping between 1 and 10 % packets, with good success (about 30 % increase of throughput in stress tests) Next step would be to also use an RB tree for the write queue at sender side ;) Signed-off-by: Yaogong Wang Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Ilpo Järvinen Acked-By: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 + include/linux/tcp.h | 7 +- include/net/tcp.h | 2 +- net/core/skbuff.c | 19 +++ net/ipv4/tcp.c | 4 +- net/ipv4/tcp_input.c | 330 +++++++++++++++++++++++++++-------------------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 1 - 8 files changed, 218 insertions(+), 149 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cfb7219be665..4c5662f05bda 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2402,6 +2402,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } +void skb_rbtree_purge(struct rb_root *root); + void *netdev_alloc_frag(unsigned int fragsz); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 7be9b1242354..c723a465125d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -281,10 +281,9 @@ struct tcp_sock { struct sk_buff* lost_skb_hint; struct sk_buff *retransmit_skb_hint; - /* OOO segments go in this list. Note that socket lock must be held, - * as we do not use sk_buff_head lock. - */ - struct sk_buff_head out_of_order_queue; + /* OOO segments go in this rbtree. Socket lock must be held. */ + struct rb_root out_of_order_queue; + struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d6ae36512429..fdfbedd61c67 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -640,7 +640,7 @@ static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (skb_queue_empty(&tp->out_of_order_queue) && + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3864b4b68fa1..1e329d411242 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2444,6 +2444,25 @@ void skb_queue_purge(struct sk_buff_head *list) } EXPORT_SYMBOL(skb_queue_purge); +/** + * skb_rbtree_purge - empty a skb rbtree + * @root: root of the rbtree to empty + * + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from + * the list and one reference dropped. This function does not take + * any lock. Synchronization should be handled by the caller (e.g., TCP + * out-of-order queue is protected by the socket lock). 
+ */ +void skb_rbtree_purge(struct rb_root *root) +{ + struct sk_buff *skb, *next; + + rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) + kfree_skb(skb); + + *root = RB_ROOT; +} + /** * skb_queue_head - queue a buffer at the list head * @list: list to use diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 77311a92275c..a13fcb369f52 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -380,7 +380,7 @@ void tcp_init_sock(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - __skb_queue_head_init(&tp->out_of_order_queue); + tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); @@ -2243,7 +2243,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8cd02c0b056c..a5934c4c8cd4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4108,7 +4108,7 @@ void tcp_fin(struct sock *sk) /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_mem_reclaim(sk); @@ -4268,7 +4268,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) int this_sack; /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ - if (skb_queue_empty(&tp->out_of_order_queue)) { + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; return; } @@ -4344,10 +4344,13 @@ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; + bool fin, fragstolen, eaten; struct sk_buff *skb, *tail; - bool fragstolen, eaten; + struct rb_node *p; - while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { + p = rb_first(&tp->out_of_order_queue); + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4357,9 +4360,10 @@ static void tcp_ofo_queue(struct sock *sk) dsack_high = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } + p = rb_next(p); + rb_erase(&skb->rbnode, &tp->out_of_order_queue); - __skb_unlink(skb, &tp->out_of_order_queue); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); tcp_drop(sk, skb); continue; @@ -4371,12 +4375,19 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); + fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - tcp_fin(sk); - if (eaten) + else kfree_skb_partial(skb, fragstolen); + + if (unlikely(fin)) { + tcp_fin(sk); + /* tcp_fin() purges tp->out_of_order_queue, + * so we must end this loop right now. 
+ */ + break; + } } } @@ -4403,8 +4414,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct rb_node **p, *q, *parent; struct sk_buff *skb1; u32 seq, end_seq; + bool fragstolen; tcp_ecn_check_ce(tp, skb); @@ -4419,88 +4432,85 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); + seq = TCP_SKB_CB(skb)->seq; + end_seq = TCP_SKB_CB(skb)->end_seq; SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + tp->rcv_nxt, seq, end_seq); - skb1 = skb_peek_tail(&tp->out_of_order_queue); - if (!skb1) { + p = &tp->out_of_order_queue.rb_node; + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; - tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; - tp->selective_acks[0].end_seq = - TCP_SKB_CB(skb)->end_seq; + tp->selective_acks[0].start_seq = seq; + tp->selective_acks[0].end_seq = end_seq; } - __skb_queue_head(&tp->out_of_order_queue, skb); + rb_link_node(&skb->rbnode, NULL, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + tp->ooo_last_skb = skb; goto end; } - seq = TCP_SKB_CB(skb)->seq; - end_seq = TCP_SKB_CB(skb)->end_seq; - - if (seq == TCP_SKB_CB(skb1)->end_seq) { - bool fragstolen; - - if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); - } else { - tcp_grow_window(sk, skb); - kfree_skb_partial(skb, fragstolen); - skb = NULL; + /* In the typical case, we are adding an skb to the end of the list. + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. + */ + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { +coalesce_done: + tcp_grow_window(sk, skb); + kfree_skb_partial(skb, fragstolen); + skb = NULL; + goto add_sack; + } + + /* Find place to insert this segment. Handle overlaps on the way. */ + parent = NULL; + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(seq, TCP_SKB_CB(skb1)->seq)) { + p = &parent->rb_left; + continue; } - - if (!tp->rx_opt.num_sacks || - tp->selective_acks[0].end_seq != seq) - goto add_sack; - - /* Common case: data arrive in order after hole. */ - tp->selective_acks[0].end_seq = end_seq; - goto end; - } - - /* Find place to insert this segment. */ - while (1) { - if (!after(TCP_SKB_CB(skb1)->seq, seq)) - break; - if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { - skb1 = NULL; - break; + if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; + } + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); + } else { + /* skb's seq == skb1's seq and skb covers skb1. + * Replace skb1 with skb. 
+ */ + rb_replace_node(&skb1->rbnode, &skb->rbnode, + &tp->out_of_order_queue); + tcp_dsack_extend(sk, + TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb1); + goto add_sack; + } + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + goto coalesce_done; } - skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); + p = &parent->rb_right; } - /* Do skb overlap to previous one? */ - if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { - if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - /* All the bits are present. Drop. */ - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb); - skb = NULL; - tcp_dsack_set(sk, seq, end_seq); - goto add_sack; - } - if (after(seq, TCP_SKB_CB(skb1)->seq)) { - /* Partial overlap. */ - tcp_dsack_set(sk, seq, - TCP_SKB_CB(skb1)->end_seq); - } else { - if (skb_queue_is_first(&tp->out_of_order_queue, - skb1)) - skb1 = NULL; - else - skb1 = skb_queue_prev( - &tp->out_of_order_queue, - skb1); - } - } - if (!skb1) - __skb_queue_head(&tp->out_of_order_queue, skb); - else - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + /* Insert segment into RB tree. */ + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); - /* And clean segments covered by new one as whole. */ - while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { - skb1 = skb_queue_next(&tp->out_of_order_queue, skb); + /* Remove other segments covered by skb. */ + while ((q = rb_next(&skb->rbnode)) != NULL) { + skb1 = rb_entry(q, struct sk_buff, rbnode); if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; @@ -4509,12 +4519,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq); break; } - __skb_unlink(skb1, &tp->out_of_order_queue); + rb_erase(&skb1->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop(sk, skb1); } + /* If there is no skb after us, we are the last_skb ! */ + if (!q) + tp->ooo_last_skb = skb; add_sack: if (tcp_is_sack(tp)) @@ -4651,13 +4664,13 @@ queue_and_out: if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); - if (!skb_queue_empty(&tp->out_of_order_queue)) { + if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC2581. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) inet_csk(sk)->icsk_ack.pingpong = 0; } @@ -4711,48 +4724,76 @@ drop: tcp_data_queue_ofo(sk, skb); } +static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) +{ + if (list) + return !skb_queue_is_last(list, skb) ? 
skb->next : NULL; + + return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); +} + static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *list) + struct sk_buff_head *list, + struct rb_root *root) { - struct sk_buff *next = NULL; + struct sk_buff *next = tcp_skb_next(skb, list); - if (!skb_queue_is_last(list, skb)) - next = skb_queue_next(list, skb); + if (list) + __skb_unlink(skb, list); + else + rb_erase(&skb->rbnode, root); - __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); return next; } +/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ +static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sk_buff *skb1; + + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, root); +} + /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * - * If tail is NULL, this means until the end of the list. + * If tail is NULL, this means until the end of the queue. * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ static void -tcp_collapse(struct sock *sk, struct sk_buff_head *list, - struct sk_buff *head, struct sk_buff *tail, - u32 start, u32 end) +tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, + struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { - struct sk_buff *skb, *n; + struct sk_buff *skb = head, *n; + struct sk_buff_head tmp; bool end_of_skbs; /* First, check that queue is collapsible and find - * the point where collapsing can be useful. */ - skb = head; + * the point where collapsing can be useful. + */ restart: - end_of_skbs = true; - skb_queue_walk_from_safe(list, skb, n) { - if (skb == tail) - break; + for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { + n = tcp_skb_next(skb, list); + /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb) break; goto restart; @@ -4770,13 +4811,10 @@ restart: break; } - if (!skb_queue_is_last(list, skb)) { - struct sk_buff *next = skb_queue_next(list, skb); - if (next != tail && - TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { - end_of_skbs = false; - break; - } + if (n && n != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { + end_of_skbs = false; + break; } /* Decided to skip this, advance start seq. */ @@ -4786,17 +4824,22 @@ restart: (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; + __skb_queue_head_init(&tmp); + while (before(start, end)) { int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) - return; + break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_queue_before(list, skb, nskb); + if (list) + __skb_queue_before(list, skb, nskb); + else + __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. 
*/ @@ -4814,14 +4857,17 @@ restart: start += size; } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) - return; + goto end; } } } +end: + skb_queue_walk_safe(&tmp, skb, n) + tcp_rbtree_insert(root, skb); } /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs @@ -4830,43 +4876,43 @@ restart: static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); - struct sk_buff *head; + struct sk_buff *skb, *head; + struct rb_node *p; u32 start, end; - if (!skb) + p = rb_first(&tp->out_of_order_queue); + skb = rb_entry_safe(p, struct sk_buff, rbnode); +new_range: + if (!skb) { + p = rb_last(&tp->out_of_order_queue); + /* Note: This is possible p is NULL here. We do not + * use rb_entry_safe(), as ooo_last_skb is valid only + * if rbtree is not empty. + */ + tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); return; - + } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; - head = skb; - - for (;;) { - struct sk_buff *next = NULL; - if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) - next = skb_queue_next(&tp->out_of_order_queue, skb); - skb = next; + for (head = skb;;) { + skb = tcp_skb_next(skb, NULL); - /* Segment is terminated when we see gap or when - * we are at the end of all the queue. */ + /* Range is terminated when we see a gap or when + * we are at the queue end. + */ if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, &tp->out_of_order_queue, + tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); - head = skb; - if (!skb) - break; - /* Start new segment */ + goto new_range; + } + + if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) end = TCP_SKB_CB(skb)->end_seq; - } else { - if (before(TCP_SKB_CB(skb)->seq, start)) - start = TCP_SKB_CB(skb)->seq; - if (after(TCP_SKB_CB(skb)->end_seq, end)) - end = TCP_SKB_CB(skb)->end_seq; - } } } @@ -4883,20 +4929,24 @@ static void tcp_collapse_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + struct rb_node *node, *prev; - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); - - while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) { - tcp_drop(sk, skb); + node = &tp->ooo_last_skb->rbnode; + do { + prev = rb_prev(node); + rb_erase(node, &tp->out_of_order_queue); + tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; - } + node = prev; + } while (node); + tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. 
When a connection @@ -4930,7 +4980,7 @@ static int tcp_prune_queue(struct sock *sk) tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) - tcp_collapse(sk, &sk->sk_receive_queue, + tcp_collapse(sk, &sk->sk_receive_queue, NULL, skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); @@ -5035,7 +5085,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ - (ofo_possible && skb_peek(&tp->out_of_order_queue))) { + (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { /* Then ack it now */ tcp_send_ack(sk); } else { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a75bf48d7950..04b989328558 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1845,7 +1845,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_write_queue_purge(sk); /* Cleans up our, hopefully empty, out_of_order_queue. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); #ifdef CONFIG_TCP_MD5SIG /* Clean up the MD5 key list, if any */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4b95ec4ed2c8..f63c73dc0acb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -488,7 +488,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd_cnt = 0; tcp_init_xmit_timers(newsk); - __skb_queue_head_init(&newtp->out_of_order_queue); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; -- cgit v1.2.3 From 34a3d4b2d1f1b7c81af79f6f93a6cef4c3a0f54a Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 8 Sep 2016 13:55:56 -0400 Subject: xfrm: fix header file comment reference to struct xfrm_replay_state_esn Reported-by: Paul Wouters Signed-off-by: Richard Guy Briggs Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 143338978b48..1fc62b239f1b 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -298,7 +298,7 @@ enum xfrm_attr_type_t { XFRMA_ALG_AUTH_TRUNC, /* struct xfrm_algo_auth */ XFRMA_MARK, /* struct xfrm_mark */ XFRMA_TFCPAD, /* __u32 */ - XFRMA_REPLAY_ESN_VAL, /* struct xfrm_replay_esn */ + XFRMA_REPLAY_ESN_VAL, /* struct xfrm_replay_state_esn */ XFRMA_SA_EXTRA_FLAGS, /* __u32 */ XFRMA_PROTO, /* __u8 */ XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ -- cgit v1.2.3 From 3e9b3112ec74f192eaab976c3889e34255cae940 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 31 Aug 2016 12:46:44 +0100 Subject: add basic register-field manipulation macros A common approach to accessing register fields is to define structures or sets of macros containing a mask and shift pair. Operations on the register are then performed as follows: field = (reg >> shift) & mask; reg &= ~(mask << shift); reg |= (field & mask) << shift; Defining shift and mask separately is tedious. Ivo van Doorn came up with an idea of computing them at compilation time based on a single shifted mask (later refined by Felix) which can be used like this: #define REG_FIELD 0x000ff000 field = FIELD_GET(REG_FIELD, reg); reg &= ~REG_FIELD; reg |= FIELD_PREP(REG_FIELD, field); FIELD_{GET,PREP} macros take care of finding out what the appropriate shift is based on a compile-time ffs operation.
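To make the mechanics concrete, here is a stand-alone user-space rendering of the two macros with the compile-time sanity checks stripped out (so it builds with any compiler providing __builtin_ffsll(); the real header wraps these expressions in BUILD_BUG_ON() guards):

#include <stdio.h>

#define __bf_shf(x)		(__builtin_ffsll(x) - 1)
#define FIELD_PREP(mask, val)	(((unsigned long long)(val) << __bf_shf(mask)) & (mask))
#define FIELD_GET(mask, reg)	(((reg) & (mask)) >> __bf_shf(mask))

#define REG_FIELD 0x000ff000ULL	/* bits 19:12, as in the example above */

int main(void)
{
	unsigned long long reg = 0;

	reg &= ~REG_FIELD;			/* clear the field */
	reg |= FIELD_PREP(REG_FIELD, 0xab);	/* reg == 0x000ab000 */
	printf("reg = 0x%08llx, field = 0x%llx\n",
	       reg, FIELD_GET(REG_FIELD, reg));	/* field == 0xab */
	return 0;
}

The single shifted-mask definition suffices because __builtin_ffsll() recovers the shift (12 here) from the lowest set bit of the mask at compile time.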
GENMASK can be used to define registers (which is usually less error-prone and easier to match with datasheets). This approach is the most convenient I've seen, so to limit code duplication let's move the macros to a global header file. Attempts to use static inlines instead of macros failed due to false positive triggering of BUILD_BUG_ON()s, especially with GCC < 6.0. Signed-off-by: Jakub Kicinski Reviewed-by: Dinan Gunawardena Signed-off-by: Kalle Valo --- include/linux/bitfield.h | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/bug.h | 3 ++ 2 files changed, 96 insertions(+) create mode 100644 include/linux/bitfield.h (limited to 'include') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h new file mode 100644 index 000000000000..f6505d83069d --- /dev/null +++ b/include/linux/bitfield.h @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2014 Felix Fietkau + * Copyright (C) 2004 - 2009 Ivo van Doorn + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_BITFIELD_H +#define _LINUX_BITFIELD_H + +#include + +/* + * Bitfield access macros + * + * FIELD_{GET,PREP} macros take as first parameter shifted mask + * from which they extract the base mask and shift amount. + * Mask must be a compilation time constant. + * + * Example: + * + * #define REG_FIELD_A GENMASK(6, 0) + * #define REG_FIELD_B BIT(7) + * #define REG_FIELD_C GENMASK(15, 8) + * #define REG_FIELD_D GENMASK(31, 16) + * + * Get: + * a = FIELD_GET(REG_FIELD_A, reg); + * b = FIELD_GET(REG_FIELD_B, reg); + * + * Set: + * reg = FIELD_PREP(REG_FIELD_A, 1) | + * FIELD_PREP(REG_FIELD_B, 0) | + * FIELD_PREP(REG_FIELD_C, c) | + * FIELD_PREP(REG_FIELD_D, 0x40); + * + * Modify: + * reg &= ~REG_FIELD_C; + * reg |= FIELD_PREP(REG_FIELD_C, c); + */ + +#define __bf_shf(x) (__builtin_ffsll(x) - 1) + +#define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx) \ + ({ \ + BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \ + _pfx "mask is not constant"); \ + BUILD_BUG_ON_MSG(!(_mask), _pfx "mask is zero"); \ + BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ + ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \ + _pfx "value too large for the field"); \ + BUILD_BUG_ON_MSG((_mask) > (typeof(_reg))~0ull, \ + _pfx "type of reg too small for mask"); \ + __BUILD_BUG_ON_NOT_POWER_OF_2((_mask) + \ + (1ULL << __bf_shf(_mask))); \ + }) + +/** + * FIELD_PREP() - prepare a bitfield element + * @_mask: shifted mask defining the field's length and position + * @_val: value to put in the field + * + * FIELD_PREP() masks and shifts up the value. The result should + * be combined with other fields of the bitfield using logical OR. + */ +#define FIELD_PREP(_mask, _val) \ + ({ \ + __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: "); \ + ((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \ + }) + +/** + * FIELD_GET() - extract a bitfield element + * @_mask: shifted mask defining the field's length and position + * @_reg: 32bit value of entire bitfield + * + * FIELD_GET() extracts the field specified by @_mask from the + * bitfield passed in as @_reg by masking and shifting it down.
+ */ +#define FIELD_GET(_mask, _reg) \ + ({ \ + __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \ + (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ + }) + +#endif diff --git a/include/linux/bug.h b/include/linux/bug.h index e51b0709e78d..292d6a10b0c2 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -13,6 +13,7 @@ enum bug_trap_type { struct pt_regs; #ifdef __CHECKER__ +#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) #define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) #define BUILD_BUG_ON_ZERO(e) (0) #define BUILD_BUG_ON_NULL(e) ((void*)0) @@ -24,6 +25,8 @@ struct pt_regs; #else /* __CHECKER__ */ /* Force a compilation error if a constant expression is not a power of 2 */ +#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \ + BUILD_BUG_ON(((n) & ((n) - 1)) != 0) #define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) -- cgit v1.2.3 From 634faf3686900ccdee87b77e2c56df8b2159912b Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Mon, 5 Sep 2016 11:42:12 +0100 Subject: brcmfmac: add support for bcm4339 chip with modalias sdio:c00v02D0d4339 The driver already supports the bcm4339 chipset but only for the variant that shares the same modalias as the bcm4335, i.e. sdio:c00v02D0d4335. It turns out that there are also bcm4339 devices out there that have a more distinguishable modalias sdio:c00v02D0d4339. Reported-by: Steve deRosier Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 1 + drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 3 ++- include/linux/mmc/sdio_ids.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c index f549c25608d6..03404cbe9237 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c @@ -1101,6 +1101,7 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = { BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43341), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43362), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4335_4339), + BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4339), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43430), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4345), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4354), diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 68ab3ac15650..589a49cd9cd5 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -3757,7 +3757,8 @@ static u32 brcmf_sdio_buscore_read32(void *ctx, u32 addr) u32 val, rev; val = brcmf_sdiod_regrl(sdiodev, addr, NULL); - if (sdiodev->func[0]->device == SDIO_DEVICE_ID_BROADCOM_4335_4339 && + if ((sdiodev->func[0]->device == SDIO_DEVICE_ID_BROADCOM_4335_4339 || + sdiodev->func[0]->device == SDIO_DEVICE_ID_BROADCOM_4339) && addr == CORE_CC_REG(SI_ENUM_BASE, chipid)) { rev = (val & CID_REV_MASK) >> CID_REV_SHIFT; if (rev >= 2) { diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 0d126aeb3ec0..d43ef96bf075 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -32,6 +32,7 @@ #define SDIO_DEVICE_ID_BROADCOM_43340 0xa94c #define SDIO_DEVICE_ID_BROADCOM_43341 0xa94d #define
SDIO_DEVICE_ID_BROADCOM_4335_4339 0x4335 +#define SDIO_DEVICE_ID_BROADCOM_4339 0x4339 #define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 #define SDIO_DEVICE_ID_BROADCOM_43430 0xa9a6 #define SDIO_DEVICE_ID_BROADCOM_4345 0x4345 -- cgit v1.2.3 From 2545e5da080b4839dd859e3b09343a884f6ab0e3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Aug 2016 16:36:37 -0400 Subject: asm-generic: make copy_from_user() zero the destination properly ... in all cases, including the failing access_ok() Note that some architectures using asm-generic/uaccess.h have __copy_from_user() not zeroing the tail on failure halfway through. This variant works either way. Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- include/asm-generic/uaccess.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index 1bfa602958f2..04e21a41796a 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -257,11 +257,13 @@ extern int __get_user_bad(void) __attribute__((noreturn)); static inline long copy_from_user(void *to, const void __user * from, unsigned long n) { + unsigned long res = n; might_fault(); - if (access_ok(VERIFY_READ, from, n)) - return __copy_from_user(to, from, n); - else - return n; + if (likely(access_ok(VERIFY_READ, from, n))) + res = __copy_from_user(to, from, n); + if (unlikely(res)) + memset(to + (n - res), 0, res); + return res; } static inline long copy_to_user(void __user *to, -- cgit v1.2.3 From f035a51536af9802f55d8c79bd87f184ebffb093 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:29 +0200 Subject: bpf: add BPF_SIZEOF and BPF_FIELD_SIZEOF macros Add BPF_SIZEOF() and BPF_FIELD_SIZEOF() macros to improve the code a bit which otherwise often result in overly long bytes_to_bpf_size(sizeof()) and bytes_to_bpf_size(FIELD_SIZEOF()) lines. So place them into a macro helper instead. Moreover, we currently have a BUILD_BUG_ON(BPF_FIELD_SIZEOF()) check in convert_bpf_extensions(), but we should rather make that generic as well and add a BUILD_BUG_ON() test in all BPF_SIZEOF()/BPF_FIELD_SIZEOF() users to detect any rewriter size issues at compile time. Note, there are currently none, but we want to assert that it stays this way. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 14 ++++++++++++++ kernel/trace/bpf_trace.c | 12 ++++++------ net/core/filter.c | 15 +++++++-------- 3 files changed, 27 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index a16439b99fd9..7fabad8dc3fc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -314,6 +314,20 @@ struct bpf_prog_aux; bpf_size; \ }) +#define BPF_SIZEOF(type) \ + ({ \ + const int __size = bytes_to_bpf_size(sizeof(type)); \ + BUILD_BUG_ON(__size < 0); \ + __size; \ + }) + +#define BPF_FIELD_SIZEOF(type, field) \ + ({ \ + const int __size = bytes_to_bpf_size(FIELD_SIZEOF(type, field)); \ + BUILD_BUG_ON(__size < 0); \ + __size; \ + }) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. 
*/ struct compat_sock_fprog { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d3869b03d9fe..e63d7d435796 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -583,18 +583,18 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, switch (ctx_off) { case offsetof(struct bpf_perf_event_data, sample_period): BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)), - dst_reg, src_reg, + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), dst_reg, src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, offsetof(struct perf_sample_data, period)); break; default: - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)), - dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + regs), dst_reg, src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)), - dst_reg, dst_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); break; } diff --git a/net/core/filter.c b/net/core/filter.c index 628ed8c7d38d..120c813ef030 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -233,9 +233,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); - BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ @@ -2685,7 +2684,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, case offsetof(struct __sk_buff, ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), dst_reg, src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); @@ -2750,7 +2749,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, break; case offsetof(struct __sk_buff, data): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), dst_reg, src_reg, offsetof(struct sk_buff, data)); break; @@ -2759,8 +2758,8 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, ctx_off -= offsetof(struct __sk_buff, data_end); ctx_off += offsetof(struct sk_buff, cb); ctx_off += offsetof(struct bpf_skb_data_end, data_end); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)), - dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, + ctx_off); break; case offsetof(struct __sk_buff, tc_index): @@ -2795,12 +2794,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, switch (ctx_off) { case offsetof(struct xdp_md, data): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), dst_reg, src_reg, offsetof(struct xdp_buff, data)); break; case 
offsetof(struct xdp_md, data_end): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), dst_reg, src_reg, offsetof(struct xdp_buff, data_end)); break; -- cgit v1.2.3 From f3694e00123802d688180e7ae90b240669910e3c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:31 +0200 Subject: bpf: add BPF_CALL_x macros for declaring helpers This work adds BPF_CALL_() macros and converts all the eBPF helper functions to use them, in a similar fashion to what we do with the SYSCALL_DEFINE() macros that are used today. Motivation for this is to hide all the register handling and all necessary casts from the user, so that it is done automatically in the background when adding a BPF_CALL_() call. This makes current helpers easier to review, makes future helpers easier to write, avoids getting the casting mess wrong, and allows for extending all helpers at once (e.g. build-time checks). It also makes it easier to spot in code reviews when unused registers are accidentally touched in the code, which would break compatibility with existing programs. BPF_CALL_() internals are quite similar to SYSCALL_DEFINE() ones with some fundamental differences, for example, for generating the actual helper function that carries all u64 regs, we need to fill unused regs, so that we always end up with 5 u64 regs as arguments. I reviewed several 0-5 generated BPF_CALL_() variants of the .i results and they all look as expected. No sparse issue spotted. We also let this sit for a few days with Fengguang's kbuild test robot, and there were no issues seen. On s390, it barked on the "uses dynamic stack allocation" notice, which is an old one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion to the call wrapper, just telling that the perf raw record/frag sits on stack (gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests and they were fine as well. All eBPF helpers are now converted to use these macros, getting rid of a good chunk of all the raw castings. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 50 ++++++++++++++++++ kernel/bpf/core.c | 2 +- kernel/bpf/helpers.c | 46 +++++------------ kernel/bpf/stackmap.c | 5 +- kernel/trace/bpf_trace.c | 75 +++++++++++++-------------- net/core/filter.c | 129 ++++++++++++++++++----------------------------- 6 files changed, 149 insertions(+), 158 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7fabad8dc3fc..1f09c521adfe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -328,6 +328,56 @@ struct bpf_prog_aux; __size; \ }) +#define __BPF_MAP_0(m, v, ...) v +#define __BPF_MAP_1(m, v, t, a, ...) m(t, a) +#define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) +#define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__) +#define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__) +#define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__) + +#define __BPF_REG_0(...) __BPF_PAD(5) +#define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4) +#define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3) +#define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2) +#define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1) +#define __BPF_REG_5(...) 
__BPF_MAP(5, __VA_ARGS__) + +#define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__) +#define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__) + +#define __BPF_CAST(t, a) \ + (__force t) \ + (__force \ + typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long), \ + (unsigned long)0, (t)0))) a +#define __BPF_V void +#define __BPF_N + +#define __BPF_DECL_ARGS(t, a) t a +#define __BPF_DECL_REGS(t, a) u64 a + +#define __BPF_PAD(n) \ + __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ + u64, __ur_3, u64, __ur_4, u64, __ur_5) + +#define BPF_CALL_x(x, name, ...) \ + static __always_inline \ + u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ + u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ + u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ + { \ + return ____##name(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ + } \ + static __always_inline \ + u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) + +#define BPF_CALL_0(name, ...) BPF_CALL_x(0, name, __VA_ARGS__) +#define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) +#define BPF_CALL_2(name, ...) BPF_CALL_x(2, name, __VA_ARGS__) +#define BPF_CALL_3(name, ...) BPF_CALL_x(3, name, __VA_ARGS__) +#define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) +#define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 03fd23d4d587..7b7baaed9ed4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1018,7 +1018,7 @@ void bpf_user_rnd_init_once(void) prandom_init_once(&bpf_user_rnd_state); } -u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_user_rnd_u32) { /* Should someone ever have the rather unwise idea to use some * of the registers passed into this function, then note that diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6df73bd1ba34..a5b8bf8cfcfd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -16,6 +16,7 @@ #include #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -26,24 +27,10 @@ * if program is allowed to access maps, so check rcu_read_lock_held in * all three functions. 
*/ -static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - /* verifier checked that R1 contains a valid pointer to bpf_map - * and R2 points to a program stack and map->key_size bytes were - * initialized - */ - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value; - WARN_ON_ONCE(!rcu_read_lock_held()); - - value = map->ops->map_lookup_elem(map, key); - - /* lookup() returns either pointer to element value or NULL - * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type - */ - return (unsigned long) value; + return (unsigned long) map->ops->map_lookup_elem(map, key); } const struct bpf_func_proto bpf_map_lookup_elem_proto = { @@ -54,15 +41,11 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; -static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, + void *, value, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value = (void *) (unsigned long) r3; - WARN_ON_ONCE(!rcu_read_lock_held()); - - return map->ops->map_update_elem(map, key, value, r4); + return map->ops->map_update_elem(map, key, value, flags); } const struct bpf_func_proto bpf_map_update_elem_proto = { @@ -75,13 +58,9 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - WARN_ON_ONCE(!rcu_read_lock_held()); - return map->ops->map_delete_elem(map, key); } @@ -99,7 +78,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_smp_processor_id) { return smp_processor_id(); } @@ -110,7 +89,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); @@ -122,7 +101,7 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -138,7 +117,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_uid_gid) { struct task_struct *task = current; kuid_t uid; @@ -158,10 +137,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) { struct task_struct *task = current; - char *buf = (char *) (long) r1; if (unlikely(!task)) goto err_clear; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index bf4495fcd25d..732ae16d12b7 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -116,10 +116,9 @@ free_smap: return ERR_PTR(err); } -u64 bpf_get_stackid(u64 r1, u64 r2, 
u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e63d7d435796..5dcb99281259 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -61,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); -static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { - void *dst = (void *) (long) r1; - int ret, size = (int) r2; - void *unsafe_ptr = (void *) (long) r3; + int ret; ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) @@ -83,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, + u32, size) { - void *unsafe_ptr = (void *) (long) r1; - void *src = (void *) (long) r2; - int size = (int) r3; - /* * Ensure we're in user context which is safe for the helper to * run. This helper has no business in a kthread. @@ -130,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ -static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) { - char *fmt = (char *) (long) r1; bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; @@ -178,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) switch (fmt_cnt) { case 1: - unsafe_addr = r3; - r3 = (long) buf; + unsafe_addr = arg1; + arg1 = (long) buf; break; case 2: - unsafe_addr = r4; - r4 = (long) buf; + unsafe_addr = arg2; + arg2 = (long) buf; break; case 3: - unsafe_addr = r5; - r5 = (long) buf; + unsafe_addr = arg3; + arg3 = (long) buf; break; } buf[0] = 0; @@ -209,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) } return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, - mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, - mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); + mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, + mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, + mod[2] == 2 ? arg3 : mod[2] == 1 ? 
(long) arg3 : (u32) arg3); } static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -233,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; @@ -312,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, return 0; } -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { - struct pt_regs *regs = (struct pt_regs *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; - void *data = (void *)(long) r4; struct perf_raw_record raw = { .frag = { .size = size, @@ -367,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, return __bpf_perf_event_output(regs, map, flags, &raw); } -static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_task) { return (long) current; } @@ -378,16 +370,13 @@ static const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { - struct bpf_map *map = (struct bpf_map *)(long)r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; - u32 idx = (u32)r2; if (unlikely(in_interrupt())) return -EINVAL; - if (unlikely(idx >= array->map.max_entries)) return -E2BIG; @@ -481,16 +470,17 @@ static struct bpf_prog_type_list kprobe_tl = { .type = BPF_PROG_TYPE_KPROBE, }; -static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + /* * r1 points to perf tracepoint buffer where first 8 bytes are hidden * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it - * from there and call the same bpf_perf_event_output() helper + * from there and call the same bpf_perf_event_output() helper inline. */ - u64 ctx = *(long *)(uintptr_t)r1; - - return bpf_perf_event_output(ctx, r2, index, r4, size); + return ____bpf_perf_event_output(regs, map, flags, data, size); } static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { @@ -504,11 +494,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg5_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags) { - u64 ctx = *(long *)(uintptr_t)r1; + struct pt_regs *regs = *(struct pt_regs **)tp_buff; - return bpf_get_stackid(ctx, r2, r3, r4, r5); + /* + * Same comment as in bpf_perf_event_output_tp(), only that this time + * the other helper's function body cannot be inlined due to being + * external, thus we need to call raw helper function. 
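+ * (The trailing 0, 0 in the call below merely fill the raw helper's unused + * r4/r5 argument slots.)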
+ */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); } static const struct bpf_func_proto bpf_get_stackid_proto_tp = { diff --git a/net/core/filter.c b/net/core/filter.c index d6d9bb89ce3a..298b146b47e7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -94,14 +94,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) } EXPORT_SYMBOL(sk_filter_trim_cap); -static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) { - return skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff(skb); } -static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -120,9 +119,8 @@ static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -145,7 +143,7 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_0(__get_raw_cpu_id) { return raw_smp_processor_id(); } @@ -1376,12 +1374,9 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); } -static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) +BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; - void *from = (void *) (long) r3; - unsigned int len = (unsigned int) r4; void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) @@ -1416,12 +1411,9 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, + void *, to, u32, len) { - const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1; - unsigned int offset = (unsigned int) r2; - void *to = (void *)(unsigned long) r3; - unsigned int len = (unsigned int) r4; void *ptr; if (unlikely(offset > 0xffff)) @@ -1449,10 +1441,9 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) @@ -1494,12 +1485,11 @@ static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; - unsigned int offset = 
(unsigned int) r2; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | @@ -1547,12 +1537,11 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) +BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, + __be32 *, to, u32, to_size, __wsum, seed) { struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); - u64 diff_size = from_size + to_size; - __be32 *from = (__be32 *) (long) r1; - __be32 *to = (__be32 *) (long) r3; + u32 diff_size = from_size + to_size; int i, j = 0; /* This is quite flexible, some examples: @@ -1610,9 +1599,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) return ret; } -static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; struct net_device *dev; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1648,7 +1636,7 @@ struct redirect_info { static DEFINE_PER_CPU(struct redirect_info, redirect_info); -static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); @@ -1687,9 +1675,9 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { - return task_get_classid((struct sk_buff *) (unsigned long) r1); + return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { @@ -1699,9 +1687,9 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { - return dst_tclassid((struct sk_buff *) (unsigned long) r1); + return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { @@ -1711,14 +1699,14 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. Later access to hash * can then use the inline skb->hash via context directly * instead of calling this helper again. 
*/ - return skb_get_hash((struct sk_buff *) (unsigned long) r1); + return skb_get_hash(skb); } static const struct bpf_func_proto bpf_get_hash_recalc_proto = { @@ -1728,10 +1716,9 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, + u16, vlan_tci) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 vlan_proto = (__force __be16) r2; int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && @@ -1756,9 +1743,8 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { }; EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); -static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; int ret; bpf_push_mac_rcsum(skb); @@ -1933,10 +1919,9 @@ static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) return -ENOTSUPP; } -static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, + u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 proto = (__force __be16) r2; int ret; if (unlikely(flags)) @@ -1973,11 +1958,8 @@ static const struct bpf_func_proto bpf_skb_change_proto_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u32 pkt_type = r2; - /* We only allow a restricted subset to be changed for now. */ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || !skb_pkt_type_ok(pkt_type))) @@ -2028,12 +2010,11 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) return __skb_trim_rcsum(skb, new_len); } -static u64 bpf_skb_change_tail(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) { - struct sk_buff *skb = (struct sk_buff *)(long) r1; u32 max_len = __bpf_skb_max_len(skb); u32 min_len = __bpf_skb_min_len(skb); - u32 new_len = (u32) r2; int ret; if (unlikely(flags || new_len > max_len || new_len < min_len)) @@ -2113,13 +2094,10 @@ static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, return 0; } -static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, - u64 meta_size) +BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) { - struct sk_buff *skb = (struct sk_buff *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; - void *meta = (void *)(long) r4; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; @@ -2146,10 +2124,9 @@ static unsigned short bpf_tunnel_key_af(u64 flags) return flags & BPF_F_TUNINFO_IPV6 ? 
AF_INET6 : AF_INET; } -static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, + u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; void *to_orig = to; @@ -2214,10 +2191,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u8 *to = (u8 *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); int err; @@ -2252,10 +2227,9 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { static struct metadata_dst __percpu *md_dst; -static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, + const struct bpf_tunnel_key *, from, u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; struct metadata_dst *md = this_cpu_ptr(md_dst); u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; @@ -2273,7 +2247,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) */ memcpy(compat, from, size); memset(compat + size, 0, sizeof(compat) - size); - from = (struct bpf_tunnel_key *)compat; + from = (const struct bpf_tunnel_key *) compat; break; default: return -EINVAL; @@ -2323,10 +2297,9 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, + const u8 *, from, u32, size) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u8 *from = (u8 *) (long) r2; struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); @@ -2372,23 +2345,20 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } -static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, + u32, idx) { - struct sk_buff *skb = (struct sk_buff *)(long)r1; - struct bpf_map *map = (struct bpf_map *)(long)r2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; struct sock *sk; - u32 i = (u32)r3; sk = skb->sk; if (!sk || !sk_fullsock(sk)) return -ENOENT; - - if (unlikely(i >= array->map.max_entries)) + if (unlikely(idx >= array->map.max_entries)) return -E2BIG; - cgrp = READ_ONCE(array->ptrs[i]); + cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; @@ -2411,13 +2381,10 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, return 0; } -static u64 bpf_xdp_event_output(u64 r1, u64 r2, u64 flags, u64 r4, - u64 meta_size) +BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) { - struct xdp_buff *xdp = (struct xdp_buff *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; - void *meta = (void *)(long) r4; if (unlikely(flags & 
~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; -- cgit v1.2.3 From ba63f23d69a3a10e7e527a02702023da68ef8a6d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Sep 2016 14:20:38 -0700 Subject: fscrypto: require write access to mount to set encryption policy Since setting an encryption policy requires writing metadata to the filesystem, it should be guarded by mnt_want_write/mnt_drop_write. Otherwise, a user could cause a write to a frozen or readonly filesystem. This was handled correctly by f2fs but not by ext4. Make fscrypt_process_policy() handle it rather than relying on the filesystem to get it right. Signed-off-by: Eric Biggers Cc: stable@vger.kernel.org # 4.1+; check fs/{ext4,f2fs} Signed-off-by: Theodore Ts'o Acked-by: Jaegeuk Kim --- fs/crypto/policy.c | 38 +++++++++++++++++++++++++------------- fs/ext4/ioctl.c | 2 +- fs/f2fs/file.c | 9 +-------- include/linux/fscrypto.h | 5 ++--- 4 files changed, 29 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index f96547f83cab..ed115acb5dee 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -11,6 +11,7 @@ #include #include #include +#include static int inode_has_encryption_context(struct inode *inode) { @@ -92,31 +93,42 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct inode *inode, +int fscrypt_process_policy(struct file *filp, const struct fscrypt_policy *policy) { + struct inode *inode = file_inode(filp); + int ret; + if (!inode_owner_or_capable(inode)) return -EACCES; if (policy->version != 0) return -EINVAL; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!inode_has_encryption_context(inode)) { if (!S_ISDIR(inode->i_mode)) - return -EINVAL; - if (!inode->i_sb->s_cop->empty_dir) - return -EOPNOTSUPP; - if (!inode->i_sb->s_cop->empty_dir(inode)) - return -ENOTEMPTY; - return create_encryption_context_from_policy(inode, policy); + ret = -EINVAL; + else if (!inode->i_sb->s_cop->empty_dir) + ret = -EOPNOTSUPP; + else if (!inode->i_sb->s_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = create_encryption_context_from_policy(inode, + policy); + } else if (!is_encryption_context_consistent_with_policy(inode, + policy)) { + printk(KERN_WARNING + "%s: Policy inconsistent with encryption context\n", + __func__); + ret = -EINVAL; } - if (is_encryption_context_consistent_with_policy(inode, policy)) - return 0; - - printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", - __func__); - return -EINVAL; + mnt_drop_write_file(filp); + return ret; } EXPORT_SYMBOL(fscrypt_process_policy); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 10686fd67fb4..1bb7df5e4536 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -776,7 +776,7 @@ resizefs_out: (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; - return fscrypt_process_policy(inode, &policy); + return fscrypt_process_policy(filp, &policy); #else return -EOPNOTSUPP; #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 47abb96098e4..28f4f4cbb8d8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1757,21 +1757,14 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - int ret; if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; - ret = mnt_want_write_file(filp); - if (ret) - 
return ret; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - ret = fscrypt_process_policy(inode, &policy); - mnt_drop_write_file(filp); - return ret; + return fscrypt_process_policy(filp, &policy); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index cfa6cde25f8e..76cff18bb032 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -274,8 +274,7 @@ extern void fscrypt_restore_control_page(struct page *); extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, unsigned int); /* policy.c */ -extern int fscrypt_process_policy(struct inode *, - const struct fscrypt_policy *); +extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); extern int fscrypt_has_permitted_context(struct inode *, struct inode *); extern int fscrypt_inherit_context(struct inode *, struct inode *, @@ -345,7 +344,7 @@ static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, } /* policy.c */ -static inline int fscrypt_notsupp_process_policy(struct inode *i, +static inline int fscrypt_notsupp_process_policy(struct file *f, const struct fscrypt_policy *p) { return -EOPNOTSUPP; -- cgit v1.2.3 From d817f432c2ab7639a4f69de73eafdc55e57c45ad Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 8 Sep 2016 16:23:45 +0300 Subject: net/ip_tunnels: Introduce tunnel_id_to_key32() and key32_to_tunnel_id() Add utility functions to convert a 32-bit key into a 64-bit tunnel id and vice versa. These functions will be used instead of cloning code in GRE and VXLAN, and in tc act_tunnel_key, which will be introduced in a following patch in this patchset. Signed-off-by: Amir Vadai Signed-off-by: Hadar Hen Zion Reviewed-by: Shmulik Ladkani Acked-by: Jiri Benc Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 4 ++-- include/net/ip_tunnels.h | 19 +++++++++++++++++++ include/net/vxlan.h | 18 ------------------ net/ipv4/ip_gre.c | 23 ++--------------------- 4 files changed, 23 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 199dec033cf8..4bfeb9765c55 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1291,7 +1291,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) struct metadata_dst *tun_dst; tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY, - vxlan_vni_to_tun_id(vni), sizeof(*md)); + key32_to_tunnel_id(vni), sizeof(*md)); if (!tun_dst) goto drop; @@ -1945,7 +1945,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto drop; } dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; - vni = vxlan_tun_id_to_vni(info->key.tun_id); + vni = tunnel_id_to_key32(info->key.tun_id); remote_ip.sa.sa_family = ip_tunnel_info_af(info); if (remote_ip.sa.sa_family == AF_INET) { remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index a5e7035fb93f..e598c639aa6f 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -222,6 +222,25 @@ static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? 
AF_INET6 : AF_INET; } +static inline __be64 key32_to_tunnel_id(__be32 key) +{ +#ifdef __BIG_ENDIAN + return (__force __be64)key; +#else + return (__force __be64)((__force u64)key << 32); +#endif +} + +/* Returns the least-significant 32 bits of a __be64. */ +static inline __be32 tunnel_id_to_key32(__be64 tun_id) +{ +#ifdef __BIG_ENDIAN + return (__force __be32)tun_id; +#else + return (__force __be32)((__force u64)tun_id >> 32); +#endif +} + #ifdef CONFIG_INET int ip_tunnel_init(struct net_device *dev); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index b96d0360c095..0255613a54a4 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -350,24 +350,6 @@ static inline __be32 vxlan_vni_field(__be32 vni) #endif } -static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id) -{ -#if defined(__BIG_ENDIAN) - return (__force __be32)tun_id; -#else - return (__force __be32)((__force u64)tun_id >> 32); -#endif -} - -static inline __be64 vxlan_vni_to_tun_id(__be32 vni) -{ -#if defined(__BIG_ENDIAN) - return (__force __be64)vni; -#else - return (__force __be64)((u64)(__force u32)vni << 32); -#endif -} - static inline size_t vxlan_rco_start(__be32 vni_field) { return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 113cc43df789..576f705d8180 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -246,25 +246,6 @@ static void gre_err(struct sk_buff *skb, u32 info) ipgre_err(skb, info, &tpi); } -static __be64 key_to_tunnel_id(__be32 key) -{ -#ifdef __BIG_ENDIAN - return (__force __be64)((__force u32)key); -#else - return (__force __be64)((__force u64)key << 32); -#endif -} - -/* Returns the least-significant 32 bits of a __be64. */ -static __be32 tunnel_id_to_key(__be64 x) -{ -#ifdef __BIG_ENDIAN - return (__force __be32)x; -#else - return (__force __be32)((__force u64)x >> 32); -#endif -} - static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) { @@ -290,7 +271,7 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, __be64 tun_id; flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); - tun_id = key_to_tunnel_id(tpi->key); + tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) return PACKET_REJECT; @@ -446,7 +427,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); gre_build_header(skb, tunnel_hlen, flags, proto, - tunnel_id_to_key(tun_info->key.tun_id), 0); + tunnel_id_to_key32(tun_info->key.tun_id), 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; -- cgit v1.2.3 From 2ff378b7474feac1ec665d01e4dfc6907cccc11c Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 8 Sep 2016 16:23:46 +0300 Subject: net/dst: Utility functions to build dst_metadata without supplying an skb Extract __ip_tun_set_dst() and __ipv6_tun_set_dst() out of ip_tun_rx_dst() and ipv6_tun_rx_dst(), to be used without supplying an skb. Signed-off-by: Amir Vadai Signed-off-by: Hadar Hen Zion Acked-by: Jiri Pirko Reviewed-by: Shmulik Ladkani Signed-off-by: David S. 
Miller --- include/net/dst_metadata.h | 52 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 5db9f5910428..6965c8f68ade 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -112,12 +112,13 @@ static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb return &dst->u.tun_info; } -static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, - __be16 flags, - __be64 tunnel_id, - int md_size) +static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, + __be32 daddr, + __u8 tos, __u8 ttl, + __be16 flags, + __be64 tunnel_id, + int md_size) { - const struct iphdr *iph = ip_hdr(skb); struct metadata_dst *tun_dst; tun_dst = tun_rx_dst(md_size); @@ -125,17 +126,30 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, return NULL; ip_tunnel_key_init(&tun_dst->u.tun_info.key, - iph->saddr, iph->daddr, iph->tos, iph->ttl, + saddr, daddr, tos, ttl, 0, 0, 0, tunnel_id, flags); return tun_dst; } -static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, +static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, __be16 flags, __be64 tunnel_id, int md_size) { - const struct ipv6hdr *ip6h = ipv6_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); + + return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, + flags, tunnel_id, md_size); +} + +static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u8 tos, __u8 ttl, + __be32 label, + __be16 flags, + __be64 tunnel_id, + int md_size) +{ struct metadata_dst *tun_dst; struct ip_tunnel_info *info; @@ -150,14 +164,26 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, info->key.tp_src = 0; info->key.tp_dst = 0; - info->key.u.ipv6.src = ip6h->saddr; - info->key.u.ipv6.dst = ip6h->daddr; + info->key.u.ipv6.src = *saddr; + info->key.u.ipv6.dst = *daddr; - info->key.tos = ipv6_get_dsfield(ip6h); - info->key.ttl = ip6h->hop_limit; - info->key.label = ip6_flowlabel(ip6h); + info->key.tos = tos; + info->key.ttl = ttl; + info->key.label = label; return tun_dst; } +static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, + __be16 flags, + __be64 tunnel_id, + int md_size) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + + return __ipv6_tun_set_dst(&ip6h->saddr, &ip6h->daddr, + ipv6_get_dsfield(ip6h), ip6h->hop_limit, + ip6_flowlabel(ip6h), flags, tunnel_id, + md_size); +} #endif /* __NET_DST_METADATA_H */ -- cgit v1.2.3 From bc3103f1ed405de587fa43d8b0671e615505a700 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 8 Sep 2016 16:23:47 +0300 Subject: net/sched: cls_flower: Classify packet in ip tunnels Introduce classifying by metadata extracted by the tunnel device. Outer header fields - source/dest ip and tunnel id - are extracted from the metadata when classifying. For example, the following will add a filter on the ingress Qdisc of the shared vxlan device 'vxlan0' to match packets with outer src ip 11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. Matching packets are forwarded to the tap device 'vnet0' (after the metadata is released): $ tc filter add dev vxlan0 protocol ip parent ffff: \ flower \ enc_src_ip 11.11.0.2 \ enc_dst_ip 11.11.0.1 \ enc_key_id 11 \ dst_ip 11.11.11.1 \ action tunnel_key release \ action mirred egress redirect dev vnet0 The tunnel_key action will be introduced in the next patch in this series. 
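As an aside, the enc_key_id above is compared in its 32-bit big-endian form; a minimal sketch of the conversion involved (illustrative only, using the key32_to_tunnel_id()/tunnel_id_to_key32() helpers added earlier in this series):

	/* Illustrative sketch, not part of the patch: a key id of 11
	 * round-trips losslessly through the 64-bit tunnel id carried
	 * in the tunnel metadata.
	 */
	__be32 vni = htonl(11);				/* as passed in TCA_FLOWER_KEY_ENC_KEY_ID */
	__be64 tun_id = key32_to_tunnel_id(vni);	/* as stored in the metadata */
	WARN_ON(tunnel_id_to_key32(tun_id) != vni);	/* the round trip is lossless */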
Signed-off-by: Amir Vadai Signed-off-by: Hadar Hen Zion Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 11 +++++ net/sched/cls_flower.c | 100 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 51b5b247fb5a..f9c287c67eae 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -431,6 +431,17 @@ enum { TCA_FLOWER_KEY_VLAN_ID, TCA_FLOWER_KEY_VLAN_PRIO, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + + TCA_FLOWER_KEY_ENC_KEY_ID, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index cf9ad5b50889..b084b2aab2d7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -23,9 +23,13 @@ #include #include +#include +#include + struct fl_flow_key { int indev_ifindex; struct flow_dissector_key_control control; + struct flow_dissector_key_control enc_control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; struct flow_dissector_key_vlan vlan; @@ -35,6 +39,11 @@ struct fl_flow_key { struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; + struct flow_dissector_key_keyid enc_key_id; + union { + struct flow_dissector_key_ipv4_addrs enc_ipv4; + struct flow_dissector_key_ipv6_addrs enc_ipv6; + }; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -124,11 +133,31 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_fl_filter *f; struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; + struct ip_tunnel_info *info; if (!atomic_read(&head->ht.nelems)) return -1; fl_clear_masked_range(&skb_key, &head->mask); + + info = skb_tunnel_info(skb); + if (info) { + struct ip_tunnel_key *key = &info->key; + + switch (ip_tunnel_info_af(info)) { + case AF_INET: + skb_key.enc_ipv4.src = key->u.ipv4.src; + skb_key.enc_ipv4.dst = key->u.ipv4.dst; + break; + case AF_INET6: + skb_key.enc_ipv6.src = key->u.ipv6.src; + skb_key.enc_ipv6.dst = key->u.ipv6.dst; + break; + } + + skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id); + } + skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, * so do it rather here. 
@@ -297,7 +326,15 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_VLAN_ID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_VLAN_PRIO] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_VLAN_ETH_TYPE] = { .type = NLA_U16 }, - + [TCA_FLOWER_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_DST] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, }; static void fl_set_key_val(struct nlattr **tb, @@ -409,6 +446,40 @@ static int fl_set_key(struct net *net, struct nlattr **tb, sizeof(key->tp.dst)); } + if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || + tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) { + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + fl_set_key_val(tb, &key->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC, + &mask->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK, + sizeof(key->enc_ipv4.src)); + fl_set_key_val(tb, &key->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST, + &mask->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK, + sizeof(key->enc_ipv4.dst)); + } + + if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] || + tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) { + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + fl_set_key_val(tb, &key->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC, + &mask->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK, + sizeof(key->enc_ipv6.src)); + fl_set_key_val(tb, &key->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST, + &mask->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK, + sizeof(key->enc_ipv6.dst)); + } + + fl_set_key_val(tb, &key->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID, + &mask->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID, + sizeof(key->enc_key_id.keyid)); + return 0; } @@ -821,6 +892,33 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->tp.dst)))) goto nla_put_failure; + if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && + (fl_dump_key_val(skb, &key->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK, + sizeof(key->enc_ipv4.src)) || + fl_dump_key_val(skb, &key->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST, &mask->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK, + sizeof(key->enc_ipv4.dst)))) + goto nla_put_failure; + else if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS && + (fl_dump_key_val(skb, &key->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC, &mask->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK, + sizeof(key->enc_ipv6.src)) || + fl_dump_key_val(skb, &key->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST, + &mask->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK, + sizeof(key->enc_ipv6.dst)))) + goto nla_put_failure; + + if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID, + &mask->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID, + sizeof(key->enc_key_id))) + goto nla_put_failure; + nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); if (tcf_exts_dump(skb, &f->exts)) -- cgit v1.2.3 From d0f6dd8a914f42c6f1a3a8c08caa16559d3d9a1b Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 8 Sep 2016 16:23:48 +0300 Subject: net/sched: Introduce act_tunnel_key This action could be used before redirecting 
packets to a shared tunnel device, or when redirecting packets arriving from such a device. The action will release the metadata created by the tunnel device (decap), or set the metadata with the specified values for the encap operation. For example, the following flower filter will forward all ICMP packets destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before redirecting, metadata for the vxlan tunnel is created using the tunnel_key action and its arguments: $ tc filter add dev net0 protocol ip parent ffff: \ flower \ ip_proto 1 \ dst_ip 11.11.11.2 \ action tunnel_key set \ src_ip 11.11.0.1 \ dst_ip 11.11.0.2 \ id 11 \ action mirred egress redirect dev vxlan0 Signed-off-by: Amir Vadai Signed-off-by: Hadar Hen Zion Reviewed-by: Shmulik Ladkani Acked-by: Jamal Hadi Salim Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tc_act/tc_tunnel_key.h | 30 +++ include/uapi/linux/tc_act/tc_tunnel_key.h | 41 ++++ net/sched/Kconfig | 11 + net/sched/Makefile | 1 + net/sched/act_tunnel_key.c | 351 ++++++++++++++++++++++++++++++ 5 files changed, 434 insertions(+) create mode 100644 include/net/tc_act/tc_tunnel_key.h create mode 100644 include/uapi/linux/tc_act/tc_tunnel_key.h create mode 100644 net/sched/act_tunnel_key.c (limited to 'include') diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h new file mode 100644 index 000000000000..253f8da6c2a6 --- /dev/null +++ b/include/net/tc_act/tc_tunnel_key.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2016, Amir Vadai + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef __LINUX_TC_TUNNEL_KEY_H +#define __LINUX_TC_TUNNEL_KEY_H + +#include + +#define TCA_ACT_TUNNEL_KEY 17 + +#define TCA_TUNNEL_KEY_ACT_SET 1 +#define TCA_TUNNEL_KEY_ACT_RELEASE 2 + +struct tc_tunnel_key { + tc_gen; + int t_action; +}; + +enum { + TCA_TUNNEL_KEY_UNSPEC, + TCA_TUNNEL_KEY_TM, + TCA_TUNNEL_KEY_PARMS, + TCA_TUNNEL_KEY_ENC_IPV4_SRC, /* be32 */ + TCA_TUNNEL_KEY_ENC_IPV4_DST, /* be32 */ + TCA_TUNNEL_KEY_ENC_IPV6_SRC, /* struct in6_addr */ + TCA_TUNNEL_KEY_ENC_IPV6_DST, /* struct in6_addr */ + TCA_TUNNEL_KEY_ENC_KEY_ID, /* be64 */ + TCA_TUNNEL_KEY_PAD, + __TCA_TUNNEL_KEY_MAX, +}; + +#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index ccf931b3b94c..72e3426fa48f 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -761,6 +761,17 @@ config NET_ACT_IFE To compile this code as a module, choose M here: the module will be called act_ife. +config NET_ACT_TUNNEL_KEY + tristate "IP tunnel metadata manipulation" + depends on NET_CLS_ACT + ---help--- + Say Y here to set/release ip tunnel metadata. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_tunnel_key. + config NET_IFE_SKBMARK tristate "Support to encoding decoding skb mark on IFE action" depends on NET_ACT_IFE diff --git a/net/sched/Makefile b/net/sched/Makefile index ae088a5a9d95..b9d046b9535a 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o +obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c new file mode 100644 index 000000000000..dceff7412dc3 --- /dev/null +++ b/net/sched/act_tunnel_key.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2016, Amir Vadai + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define TUNNEL_KEY_TAB_MASK 15 + +static int tunnel_key_net_id; +static struct tc_action_ops act_tunnel_key_ops; + +static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params; + int action; + + rcu_read_lock(); + + params = rcu_dereference(t->params); + + tcf_lastuse_update(&t->tcf_tm); + bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); + action = params->action; + + switch (params->tcft_action) { + case TCA_TUNNEL_KEY_ACT_RELEASE: + skb_dst_drop(skb); + break; + case TCA_TUNNEL_KEY_ACT_SET: + skb_dst_drop(skb); + skb_dst_set(skb, dst_clone(¶ms->tcft_enc_metadata->dst)); + break; + default: + WARN_ONCE(1, "Bad tunnel_key action %d.\n", + params->tcft_action); + break; + } + + rcu_read_unlock(); + + return action; +} + +static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { + [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) }, + [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_TUNNEL_KEY_ENC_IPV4_DST] = { .type = NLA_U32 }, + [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, +}; + +static int tunnel_key_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; + struct tcf_tunnel_key_params *params_old; + struct tcf_tunnel_key_params *params_new; + struct metadata_dst *metadata = NULL; + struct tc_tunnel_key *parm; + struct tcf_tunnel_key *t; + bool exists = false; + __be64 key_id; + int ret = 0; + int err; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + switch (parm->t_action) { + case TCA_TUNNEL_KEY_ACT_RELEASE: + break; + case TCA_TUNNEL_KEY_ACT_SET: + if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { + ret = -EINVAL; + goto err_out; + } + + key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID])); + + if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && + tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { + __be32 saddr; + __be32 daddr; + + saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]); + daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); + + metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, + TUNNEL_KEY, key_id, 0); + } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && + tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { + struct in6_addr saddr; + struct in6_addr daddr; + + saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); + daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); + + metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0, + TUNNEL_KEY, key_id, 0); + } + + if (!metadata) { + ret = -EINVAL; + goto err_out; + } + + metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; + break; + default: + goto err_out; + } + + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + &act_tunnel_key_ops, bind, true); + if (ret) + return ret; + + ret = ACT_P_CREATED; + } else { + tcf_hash_release(*a, bind); + if (!ovr) + return 
-EEXIST; + } + + t = to_tunnel_key(*a); + + ASSERT_RTNL(); + params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); + if (unlikely(!params_new)) { + if (ret == ACT_P_CREATED) + tcf_hash_release(*a, bind); + return -ENOMEM; + } + + params_old = rtnl_dereference(t->params); + + params_new->action = parm->action; + params_new->tcft_action = parm->t_action; + params_new->tcft_enc_metadata = metadata; + + rcu_assign_pointer(t->params, params_new); + + if (params_old) + kfree_rcu(params_old, rcu); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, *a); + + return ret; + +err_out: + if (exists) + tcf_hash_release(*a, bind); + return ret; +} + +static void tunnel_key_release(struct tc_action *a, int bind) +{ + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params; + + rcu_read_lock(); + params = rcu_dereference(t->params); + + if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) + dst_release(¶ms->tcft_enc_metadata->dst); + + kfree_rcu(params, rcu); + + rcu_read_unlock(); +} + +static int tunnel_key_dump_addresses(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + unsigned short family = ip_tunnel_info_af(info); + + if (family == AF_INET) { + __be32 saddr = info->key.u.ipv4.src; + __be32 daddr = info->key.u.ipv4.dst; + + if (!nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_SRC, saddr) && + !nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_DST, daddr)) + return 0; + } + + if (family == AF_INET6) { + const struct in6_addr *saddr6 = &info->key.u.ipv6.src; + const struct in6_addr *daddr6 = &info->key.u.ipv6.dst; + + if (!nla_put_in6_addr(skb, + TCA_TUNNEL_KEY_ENC_IPV6_SRC, saddr6) && + !nla_put_in6_addr(skb, + TCA_TUNNEL_KEY_ENC_IPV6_DST, daddr6)) + return 0; + } + + return -EINVAL; +} + +static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params; + struct tc_tunnel_key opt = { + .index = t->tcf_index, + .refcnt = t->tcf_refcnt - ref, + .bindcnt = t->tcf_bindcnt - bind, + }; + struct tcf_t tm; + int ret = -1; + + rcu_read_lock(); + params = rcu_dereference(t->params); + + opt.t_action = params->tcft_action; + opt.action = params->action; + + if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { + struct ip_tunnel_key *key = + ¶ms->tcft_enc_metadata->u.tun_info.key; + __be32 key_id = tunnel_id_to_key32(key->tun_id); + + if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || + tunnel_key_dump_addresses(skb, + ¶ms->tcft_enc_metadata->u.tun_info)) + goto nla_put_failure; + } + + tcf_tm_dump(&tm, &t->tcf_tm); + if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), + &tm, TCA_TUNNEL_KEY_PAD)) + goto nla_put_failure; + + ret = skb->len; + goto out; + +nla_put_failure: + nlmsg_trim(skb, b); +out: + rcu_read_unlock(); + + return ret; +} + +static int tunnel_key_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops); +} + +static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_hash_search(tn, a, index); +} + +static struct tc_action_ops act_tunnel_key_ops = { + .kind = "tunnel_key", + .type = TCA_ACT_TUNNEL_KEY, + .owner = THIS_MODULE, 
+ .act = tunnel_key_act, + .dump = tunnel_key_dump, + .init = tunnel_key_init, + .cleanup = tunnel_key_release, + .walk = tunnel_key_walker, + .lookup = tunnel_key_search, + .size = sizeof(struct tcf_tunnel_key), +}; + +static __net_init int tunnel_key_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tc_action_net_init(tn, &act_tunnel_key_ops, TUNNEL_KEY_TAB_MASK); +} + +static void __net_exit tunnel_key_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations tunnel_key_net_ops = { + .init = tunnel_key_init_net, + .exit = tunnel_key_exit_net, + .id = &tunnel_key_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init tunnel_key_init_module(void) +{ + return tcf_register_action(&act_tunnel_key_ops, &tunnel_key_net_ops); +} + +static void __exit tunnel_key_cleanup_module(void) +{ + tcf_unregister_action(&act_tunnel_key_ops, &tunnel_key_net_ops); +} + +module_init(tunnel_key_init_module); +module_exit(tunnel_key_cleanup_module); + +MODULE_AUTHOR("Amir Vadai "); +MODULE_DESCRIPTION("ip tunnel manipulation actions"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 6b6adee3dad25bbe568ee24fc843372d02fb425f Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Fri, 9 Sep 2016 17:35:18 +0300 Subject: net/mlx5: SRIOV core code refactoring Simplify the code and make it modular and symmetric. Split SRIOV enable/disable into two levels: device level and PCI level. When the user enables/disables SRIOV (via the sriov_configure driver callback) we enable/disable both device and PCI SRIOV. On driver load/unload we enable/disable (on demand) only device SRIOV, keeping PCI SRIOV enabled for the next driver load. On an internal/PCI error, VFs are kept enabled on the PCI level and the reset is done only at the device level. Signed-off-by: Mohamad Haj Yahia Signed-off-by: Saeed Mahameed Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 12 +- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 + drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 216 +++++++++------------ include/linux/mlx5/driver.h | 2 - 4 files changed, 101 insertions(+), 131 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c132ef1faefe..baba53fb58b7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1180,8 +1180,7 @@ out: return 0; err_sriov: - if (mlx5_sriov_cleanup(dev)) - dev_err(&dev->pdev->dev, "sriov cleanup failed\n"); + mlx5_sriov_cleanup(dev); #ifdef CONFIG_MLX5_CORE_EN mlx5_eswitch_cleanup(dev->priv.eswitch); @@ -1241,19 +1240,14 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) { int err = 0; - err = mlx5_sriov_cleanup(dev); - if (err) { - dev_warn(&dev->pdev->dev, "%s: sriov cleanup failed - abort\n", - __func__); - return err; - } - mutex_lock(&dev->intf_state_mutex); if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) { dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n", __func__); goto out; } + + mlx5_sriov_cleanup(dev); mlx5_unregister_device(dev); #ifdef CONFIG_MLX5_CORE_EN mlx5_eswitch_cleanup(dev->priv.eswitch); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 714b71bed2be..7dd14cf73181 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -89,6 +89,8 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, unsigned long param); void mlx5_enter_error_state(struct mlx5_core_dev *dev); void mlx5_disable_device(struct mlx5_core_dev *dev); +int mlx5_sriov_init(struct mlx5_core_dev *dev); +void mlx5_sriov_cleanup(struct mlx5_core_dev *dev); int mlx5_core_sriov_configure(struct pci_dev *dev, int num_vfs); bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev); int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index 78e789245183..72a8215a2d02 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -44,108 +44,132 @@ bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev) return !!sriov->num_vfs; } -static void enable_vfs(struct mlx5_core_dev *dev, int num_vfs) +static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) { struct mlx5_core_sriov *sriov = &dev->priv.sriov; int err; int vf; - for (vf = 1; vf <= num_vfs; vf++) { - err = mlx5_core_enable_hca(dev, vf); + if (sriov->enabled_vfs) { + mlx5_core_warn(dev, + "failed to enable SRIOV on device, already enabled with %d vfs\n", + sriov->enabled_vfs); + return -EBUSY; + } + +#ifdef CONFIG_MLX5_CORE_EN + err = mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs, SRIOV_LEGACY); + if (err) { + mlx5_core_warn(dev, + "failed to enable eswitch SRIOV (%d)\n", err); + return err; + } +#endif + + for (vf = 0; vf < num_vfs; vf++) { + err = mlx5_core_enable_hca(dev, vf + 1); if (err) { - mlx5_core_warn(dev, "failed to enable VF %d\n", vf - 1); - } else { - sriov->vfs_ctx[vf - 1].enabled = 1; - mlx5_core_dbg(dev, "successfully enabled VF %d\n", vf - 1); + mlx5_core_warn(dev, "failed to enable VF %d (%d)\n", vf, err); + continue; } + sriov->vfs_ctx[vf].enabled = 1; + 
sriov->enabled_vfs++; + mlx5_core_dbg(dev, "successfully enabled VF* %d\n", vf); + } + + return 0; } -static void disable_vfs(struct mlx5_core_dev *dev, int num_vfs) +static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev) { struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int err; int vf; - for (vf = 1; vf <= num_vfs; vf++) { - if (sriov->vfs_ctx[vf - 1].enabled) { - if (mlx5_core_disable_hca(dev, vf)) - mlx5_core_warn(dev, "failed to disable VF %d\n", vf - 1); - else - sriov->vfs_ctx[vf - 1].enabled = 0; + if (!sriov->enabled_vfs) + return; + + for (vf = 0; vf < sriov->num_vfs; vf++) { + if (!sriov->vfs_ctx[vf].enabled) + continue; + err = mlx5_core_disable_hca(dev, vf + 1); + if (err) { + mlx5_core_warn(dev, "failed to disable VF %d\n", vf); + continue; } + sriov->vfs_ctx[vf].enabled = 0; + sriov->enabled_vfs--; } + +#ifdef CONFIG_MLX5_CORE_EN + mlx5_eswitch_disable_sriov(dev->priv.eswitch); +#endif + + if (mlx5_wait_for_vf_pages(dev)) + mlx5_core_warn(dev, "timeout reclaiming VFs pages\n"); } -static int mlx5_core_create_vfs(struct pci_dev *pdev, int num_vfs) +static int mlx5_pci_enable_sriov(struct pci_dev *pdev, int num_vfs) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); - int err; - - if (pci_num_vf(pdev)) - pci_disable_sriov(pdev); - - enable_vfs(dev, num_vfs); + int err = 0; - err = pci_enable_sriov(pdev, num_vfs); - if (err) { - dev_warn(&pdev->dev, "enable sriov failed %d\n", err); - goto ex; + if (pci_num_vf(pdev)) { + mlx5_core_warn(dev, "Unable to enable pci sriov, already enabled\n"); + return -EBUSY; } - return 0; + err = pci_enable_sriov(pdev, num_vfs); + if (err) + mlx5_core_warn(dev, "pci_enable_sriov failed : %d\n", err); -ex: - disable_vfs(dev, num_vfs); return err; } -static int mlx5_core_sriov_enable(struct pci_dev *pdev, int num_vfs) +static void mlx5_pci_disable_sriov(struct pci_dev *pdev) +{ + pci_disable_sriov(pdev); +} + +static int mlx5_sriov_enable(struct pci_dev *pdev, int num_vfs) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); struct mlx5_core_sriov *sriov = &dev->priv.sriov; - int err; + int err = 0; - kfree(sriov->vfs_ctx); - sriov->vfs_ctx = kcalloc(num_vfs, sizeof(*sriov->vfs_ctx), GFP_ATOMIC); - if (!sriov->vfs_ctx) - return -ENOMEM; + err = mlx5_device_enable_sriov(dev, num_vfs); + if (err) { + mlx5_core_warn(dev, "mlx5_device_enable_sriov failed : %d\n", err); + return err; + } - sriov->enabled_vfs = num_vfs; - err = mlx5_core_create_vfs(pdev, num_vfs); + err = mlx5_pci_enable_sriov(pdev, num_vfs); if (err) { - kfree(sriov->vfs_ctx); - sriov->vfs_ctx = NULL; + mlx5_core_warn(dev, "mlx5_pci_enable_sriov failed : %d\n", err); + mlx5_device_disable_sriov(dev); return err; } + sriov->num_vfs = num_vfs; + return 0; } -static void mlx5_core_init_vfs(struct mlx5_core_dev *dev, int num_vfs) +static void mlx5_sriov_disable(struct pci_dev *pdev) { + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); struct mlx5_core_sriov *sriov = &dev->priv.sriov; - sriov->num_vfs = num_vfs; -} - -static void mlx5_core_cleanup_vfs(struct mlx5_core_dev *dev) -{ - struct mlx5_core_sriov *sriov; - - sriov = &dev->priv.sriov; - disable_vfs(dev, sriov->num_vfs); - - if (mlx5_wait_for_vf_pages(dev)) - mlx5_core_warn(dev, "timeout claiming VFs pages\n"); - + mlx5_pci_disable_sriov(pdev); + mlx5_device_disable_sriov(dev); sriov->num_vfs = 0; } int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); - struct mlx5_core_sriov *sriov = &dev->priv.sriov; - int err; + int err = 0; mlx5_core_dbg(dev, 
"requested num_vfs %d\n", num_vfs); if (!mlx5_core_is_pf(dev)) @@ -156,92 +180,44 @@ int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs) return -EINVAL; } - mlx5_core_cleanup_vfs(dev); - - if (!num_vfs) { -#ifdef CONFIG_MLX5_CORE_EN - mlx5_eswitch_disable_sriov(dev->priv.eswitch); -#endif - kfree(sriov->vfs_ctx); - sriov->vfs_ctx = NULL; - if (!pci_vfs_assigned(pdev)) - pci_disable_sriov(pdev); - else - mlx5_core_info(dev, "unloading PF driver while leaving orphan VFs\n"); - return 0; - } - - err = mlx5_core_sriov_enable(pdev, num_vfs); - if (err) { - mlx5_core_warn(dev, "mlx5_core_sriov_enable failed %d\n", err); - return err; - } + if (num_vfs) + err = mlx5_sriov_enable(pdev, num_vfs); + else + mlx5_sriov_disable(pdev); - mlx5_core_init_vfs(dev, num_vfs); -#ifdef CONFIG_MLX5_CORE_EN - mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs, SRIOV_LEGACY); -#endif - - return num_vfs; -} - -static int sync_required(struct pci_dev *pdev) -{ - struct mlx5_core_dev *dev = pci_get_drvdata(pdev); - struct mlx5_core_sriov *sriov = &dev->priv.sriov; - int cur_vfs = pci_num_vf(pdev); - - if (cur_vfs != sriov->num_vfs) { - mlx5_core_warn(dev, "current VFs %d, registered %d - sync needed\n", - cur_vfs, sriov->num_vfs); - return 1; - } - - return 0; + return err ? err : num_vfs; } int mlx5_sriov_init(struct mlx5_core_dev *dev) { struct mlx5_core_sriov *sriov = &dev->priv.sriov; struct pci_dev *pdev = dev->pdev; - int cur_vfs; + int total_vfs; if (!mlx5_core_is_pf(dev)) return 0; - if (!sync_required(dev->pdev)) - return 0; - - cur_vfs = pci_num_vf(pdev); - sriov->vfs_ctx = kcalloc(cur_vfs, sizeof(*sriov->vfs_ctx), GFP_KERNEL); + total_vfs = pci_sriov_get_totalvfs(pdev); + sriov->num_vfs = pci_num_vf(pdev); + sriov->vfs_ctx = kcalloc(total_vfs, sizeof(*sriov->vfs_ctx), GFP_KERNEL); if (!sriov->vfs_ctx) return -ENOMEM; - sriov->enabled_vfs = cur_vfs; - - mlx5_core_init_vfs(dev, cur_vfs); -#ifdef CONFIG_MLX5_CORE_EN - if (cur_vfs) - mlx5_eswitch_enable_sriov(dev->priv.eswitch, cur_vfs, - SRIOV_LEGACY); -#endif - - enable_vfs(dev, cur_vfs); + /* If sriov VFs exist in PCI level, enable them in device level */ + if (!sriov->num_vfs) + return 0; + mlx5_device_enable_sriov(dev, sriov->num_vfs); return 0; } -int mlx5_sriov_cleanup(struct mlx5_core_dev *dev) +void mlx5_sriov_cleanup(struct mlx5_core_dev *dev) { - struct pci_dev *pdev = dev->pdev; - int err; + struct mlx5_core_sriov *sriov = &dev->priv.sriov; if (!mlx5_core_is_pf(dev)) - return 0; + return; - err = mlx5_core_sriov_configure(pdev, 0); - if (err) - return err; - - return 0; + mlx5_device_disable_sriov(dev); + kfree(sriov->vfs_ctx); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5cb9fa7aec61..0d7aedfce1d7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -828,8 +828,6 @@ void mlx5_pagealloc_init(struct mlx5_core_dev *dev); void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); int mlx5_pagealloc_start(struct mlx5_core_dev *dev); void mlx5_pagealloc_stop(struct mlx5_core_dev *dev); -int mlx5_sriov_init(struct mlx5_core_dev *dev); -int mlx5_sriov_cleanup(struct mlx5_core_dev *dev); void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, s32 npages); int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); -- cgit v1.2.3 From 737a234bb6384800a5b632be85c6b0ad6221d137 Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Fri, 9 Sep 2016 17:35:19 +0300 Subject: net/mlx5: Introduce attach/detach to interface API Add attach/detach callbacks to interface 
API. This is crucial for implementing a seamless reset flow which releases the hardware and its resources upon detach while keeping software structures and state (e.g. netdev), and then resets and reallocates the needed hardware resources upon attach. Signed-off-by: Mohamad Haj Yahia Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 149 +++++++++++++++++++++---- include/linux/mlx5/driver.h | 2 + 2 files changed, 131 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index baba53fb58b7..108d8f2e73e1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -81,6 +81,7 @@ struct mlx5_device_context { struct list_head list; struct mlx5_interface *intf; void *context; + unsigned long state; }; enum { @@ -778,6 +779,11 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev) return -ENOTSUPP; } +enum { + MLX5_INTERFACE_ADDED, + MLX5_INTERFACE_ATTACHED, +}; + static void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) { struct mlx5_device_context *dev_ctx; @@ -786,12 +792,15 @@ static void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) if (!mlx5_lag_intf_add(intf, priv)) return; - dev_ctx = kmalloc(sizeof(*dev_ctx), GFP_KERNEL); + dev_ctx = kzalloc(sizeof(*dev_ctx), GFP_KERNEL); if (!dev_ctx) return; - dev_ctx->intf = intf; + dev_ctx->intf = intf; dev_ctx->context = intf->add(dev); + set_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); + if (intf->attach) + set_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state); if (dev_ctx->context) { spin_lock_irq(&priv->ctx_lock); @@ -802,21 +811,114 @@ static void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) } } +static struct mlx5_device_context *mlx5_get_device(struct mlx5_interface *intf, + struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf == intf) + return dev_ctx; + return NULL; +} + static void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv) { struct mlx5_device_context *dev_ctx; struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); - list_for_each_entry(dev_ctx, &priv->ctx_list, list) - if (dev_ctx->intf == intf) { - spin_lock_irq(&priv->ctx_lock); - list_del(&dev_ctx->list); - spin_unlock_irq(&priv->ctx_lock); + dev_ctx = mlx5_get_device(intf, priv); + if (!dev_ctx) + return; + + spin_lock_irq(&priv->ctx_lock); + list_del(&dev_ctx->list); + spin_unlock_irq(&priv->ctx_lock); + + if (test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) + intf->remove(dev, dev_ctx->context); - intf->remove(dev, dev_ctx->context); - kfree(dev_ctx); + kfree(dev_ctx); +} + +static void mlx5_attach_interface(struct mlx5_interface *intf, struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); + + dev_ctx = mlx5_get_device(intf, priv); + if (!dev_ctx) + return; + + if (intf->attach) { + if (test_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state)) return; + intf->attach(dev, dev_ctx->context); + set_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state); + } else { + if (test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) + return; + dev_ctx->context = intf->add(dev); + set_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); + } +} + +static void mlx5_attach_device(struct mlx5_core_dev *dev) +{ +
struct mlx5_priv *priv = &dev->priv; + struct mlx5_interface *intf; + + mutex_lock(&mlx5_intf_mutex); + list_for_each_entry(intf, &intf_list, list) + mlx5_attach_interface(intf, priv); + mutex_unlock(&mlx5_intf_mutex); +} + +static void mlx5_detach_interface(struct mlx5_interface *intf, struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); + + dev_ctx = mlx5_get_device(intf, priv); + if (!dev_ctx) + return; + + if (intf->detach) { + if (!test_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state)) + return; + intf->detach(dev, dev_ctx->context); + clear_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state); + } else { + if (!test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) + return; + intf->remove(dev, dev_ctx->context); + clear_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); + } +} + +static void mlx5_detach_device(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_interface *intf; + + mutex_lock(&mlx5_intf_mutex); + list_for_each_entry(intf, &intf_list, list) + mlx5_detach_interface(intf, priv); + mutex_unlock(&mlx5_intf_mutex); +} + +static bool mlx5_device_registered(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv; + bool found = false; + + mutex_lock(&mlx5_intf_mutex); + list_for_each_entry(priv, &mlx5_dev_list, dev_list) + if (priv == &dev->priv) + found = true; + mutex_unlock(&mlx5_intf_mutex); + + return found; } static int mlx5_register_device(struct mlx5_core_dev *dev) @@ -1162,16 +1264,16 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) goto err_sriov; } - err = mlx5_register_device(dev); - if (err) { - dev_err(&pdev->dev, "mlx5_register_device failed %d\n", err); - goto err_reg_dev; + if (mlx5_device_registered(dev)) { + mlx5_attach_device(dev); + } else { + err = mlx5_register_device(dev); + if (err) { + dev_err(&pdev->dev, "mlx5_register_device failed %d\n", err); + goto err_reg_dev; + } } - err = request_module_nowait(MLX5_IB_MOD); - if (err) - pr_info("failed request module on %s\n", MLX5_IB_MOD); - clear_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state); set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); out: @@ -1247,12 +1349,13 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) goto out; } + if (mlx5_device_registered(dev)) + mlx5_detach_device(dev); + mlx5_sriov_cleanup(dev); - mlx5_unregister_device(dev); #ifdef CONFIG_MLX5_CORE_EN mlx5_eswitch_cleanup(dev->priv.eswitch); #endif - mlx5_cleanup_rl_table(dev); mlx5_cleanup_fs(dev); mlx5_cleanup_mkey_table(dev); @@ -1364,6 +1467,9 @@ static int init_one(struct pci_dev *pdev, dev_err(&pdev->dev, "mlx5_load_one failed with error code %d\n", err); goto clean_health; } + err = request_module_nowait(MLX5_IB_MOD); + if (err) + pr_info("failed request module on %s\n", MLX5_IB_MOD); err = devlink_register(devlink, &pdev->dev); if (err) @@ -1391,11 +1497,14 @@ static void remove_one(struct pci_dev *pdev) struct mlx5_priv *priv = &dev->priv; devlink_unregister(devlink); + mlx5_unregister_device(dev); + if (mlx5_unload_one(dev, priv)) { dev_err(&dev->pdev->dev, "mlx5_unload_one failed\n"); mlx5_health_cleanup(dev); return; } + mlx5_health_cleanup(dev); mlx5_pci_close(dev, priv); pci_set_drvdata(pdev, NULL); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 0d7aedfce1d7..85c4786427e4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -930,6 +930,8 @@ enum { struct mlx5_interface { void * (*add)(struct mlx5_core_dev 
*dev); void (*remove)(struct mlx5_core_dev *dev, void *context); + int (*attach)(struct mlx5_core_dev *dev, void *context); + void (*detach)(struct mlx5_core_dev *dev, void *context); void (*event)(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param); void * (*get_dev)(void *context); -- cgit v1.2.3 From 3a8963acc70e69606729404713cfa9a03b58b18c Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 9 Sep 2016 12:45:24 -0700 Subject: Revert "hv_netvsc: make inline functions static" These functions are used by other code misc-next tree. This reverts commit 30d1de08c87ddde6f73936c3350e7e153988fe02. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc.c | 85 +-------------------------------------------- include/linux/hyperv.h | 84 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 84 deletions(-) (limited to 'include') diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 2a9ccc4d9e3c..ff05b9b0837f 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -33,89 +33,6 @@ #include "hyperv_net.h" -/* - * An API to support in-place processing of incoming VMBUS packets. - */ -#define VMBUS_PKT_TRAILER 8 - -static struct vmpacket_descriptor * -get_next_pkt_raw(struct vmbus_channel *channel) -{ - struct hv_ring_buffer_info *ring_info = &channel->inbound; - u32 read_loc = ring_info->priv_read_index; - void *ring_buffer = hv_get_ring_buffer(ring_info); - struct vmpacket_descriptor *cur_desc; - u32 packetlen; - u32 dsize = ring_info->ring_datasize; - u32 delta = read_loc - ring_info->ring_buffer->read_index; - u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta); - - if (bytes_avail_toread < sizeof(struct vmpacket_descriptor)) - return NULL; - - if ((read_loc + sizeof(*cur_desc)) > dsize) - return NULL; - - cur_desc = ring_buffer + read_loc; - packetlen = cur_desc->len8 << 3; - - /* - * If the packet under consideration is wrapping around, - * return failure. - */ - if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > (dsize - 1)) - return NULL; - - return cur_desc; -} - -/* - * A helper function to step through packets "in-place" - * This API is to be called after each successful call - * get_next_pkt_raw(). - */ -static void put_pkt_raw(struct vmbus_channel *channel, - struct vmpacket_descriptor *desc) -{ - struct hv_ring_buffer_info *ring_info = &channel->inbound; - u32 read_loc = ring_info->priv_read_index; - u32 packetlen = desc->len8 << 3; - u32 dsize = ring_info->ring_datasize; - - BUG_ON((read_loc + packetlen + VMBUS_PKT_TRAILER) > dsize); - - /* - * Include the packet trailer. - */ - ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER; -} - -/* - * This call commits the read index and potentially signals the host. - * Here is the pattern for using the "in-place" consumption APIs: - * - * while (get_next_pkt_raw() { - * process the packet "in-place"; - * put_pkt_raw(); - * } - * if (packets processed in place) - * commit_rd_index(); - */ -static void commit_rd_index(struct vmbus_channel *channel) -{ - struct hv_ring_buffer_info *ring_info = &channel->inbound; - /* - * Make sure all reads are done before we update the read index since - * the writer may start writing to the read area once the read index - * is updated. 
- */ - virt_rmb(); - ring_info->ring_buffer->read_index = ring_info->priv_read_index; - - if (hv_need_to_signal_on_read(ring_info)) - vmbus_set_event(channel); -} - /* * Switch the data path from the synthetic interface to the VF * interface. @@ -840,7 +757,7 @@ static u32 netvsc_copy_to_send_buf(struct netvsc_device *net_device, return msg_size; } -static int netvsc_send_pkt( +static inline int netvsc_send_pkt( struct hv_device *device, struct hv_netvsc_packet *packet, struct netvsc_device *net_device, diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b01c8c3dd531..5df444b1ac18 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1429,4 +1429,88 @@ static inline bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi) return false; } +/* + * An API to support in-place processing of incoming VMBUS packets. + */ +#define VMBUS_PKT_TRAILER 8 + +static inline struct vmpacket_descriptor * +get_next_pkt_raw(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + u32 read_loc = ring_info->priv_read_index; + void *ring_buffer = hv_get_ring_buffer(ring_info); + struct vmpacket_descriptor *cur_desc; + u32 packetlen; + u32 dsize = ring_info->ring_datasize; + u32 delta = read_loc - ring_info->ring_buffer->read_index; + u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta); + + if (bytes_avail_toread < sizeof(struct vmpacket_descriptor)) + return NULL; + + if ((read_loc + sizeof(*cur_desc)) > dsize) + return NULL; + + cur_desc = ring_buffer + read_loc; + packetlen = cur_desc->len8 << 3; + + /* + * If the packet under consideration is wrapping around, + * return failure. + */ + if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > (dsize - 1)) + return NULL; + + return cur_desc; +} + +/* + * A helper function to step through packets "in-place" + * This API is to be called after each successful call + * get_next_pkt_raw(). + */ +static inline void put_pkt_raw(struct vmbus_channel *channel, + struct vmpacket_descriptor *desc) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + u32 read_loc = ring_info->priv_read_index; + u32 packetlen = desc->len8 << 3; + u32 dsize = ring_info->ring_datasize; + + if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > dsize) + BUG(); + /* + * Include the packet trailer. + */ + ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER; +} + +/* + * This call commits the read index and potentially signals the host. + * Here is the pattern for using the "in-place" consumption APIs: + * + * while (get_next_pkt_raw() { + * process the packet "in-place"; + * put_pkt_raw(); + * } + * if (packets processed in place) + * commit_rd_index(); + */ +static inline void commit_rd_index(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + /* + * Make sure all reads are done before we update the read index since + * the writer may start writing to the read area once the read index + * is updated. + */ + virt_rmb(); + ring_info->ring_buffer->read_index = ring_info->priv_read_index; + + if (hv_need_to_signal_on_read(ring_info)) + vmbus_set_event(channel); +} + + #endif /* _HYPERV_H */ -- cgit v1.2.3 From 9ee0034b8f49aaaa7e7c2da8db1038915db99c19 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:52 -0700 Subject: net: flow: Add l3mdev flow update Add l3mdev hook to set FLOWI_FLAG_SKIP_NH_OIF flag and update oif/iif in flow struct if its oif or iif points to a device enslaved to an L3 Master device. 
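As an illustration (a minimal sketch, not part of the patch; the device names and ifindex values are hypothetical), suppose eth1 with ifindex 4 is enslaved to the L3 master device vrf-blue with ifindex 3. The adjustment the new hook performs on an IPv4 lookup then amounts to:

	struct flowi4 fl4 = {
		.flowi4_oif = 4,                 /* caller still passes the slave ifindex */
		.daddr      = htonl(0x0a000001), /* 10.0.0.1 */
	};

	/* __fib_lookup() (see the fib_rules.c hunk below) now calls
	 * l3mdev_update_flow() before fib_rules_lookup(), so by the time
	 * the FIB rules run the flow has been rewritten in place:
	 *
	 *	fl4.flowi4_oif == 3                        (master ifindex)
	 *	fl4.flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF  (set)
	 *
	 * and the lookup matches the l3mdev FIB rule, steering it to the
	 * master's table.
	 */
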
Only 1 needs to be converted to match the l3mdev FIB rule. This moves the flow adjustment for l3mdev to a single point catching all lookups. It is redundant for existing hooks (those are removed in later patches) but is needed for missed lookups such as PMTU updates. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 6 ++++++ net/ipv4/fib_rules.c | 3 +++ net/ipv6/fib6_rules.c | 3 +++ net/l3mdev/l3mdev.c | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index e90095091aa0..81e175e80537 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -49,6 +49,8 @@ struct l3mdev_ops { int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg); +void l3mdev_update_flow(struct net *net, struct flowi *fl); + int l3mdev_master_ifindex_rcu(const struct net_device *dev); static inline int l3mdev_master_ifindex(struct net_device *dev) { @@ -290,6 +292,10 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, { return 1; } +static inline +void l3mdev_update_flow(struct net *net, struct flowi *fl) +{ +} #endif #endif /* _NET_L3MDEV_H_ */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 6e9ea69e5f75..770bebed6b28 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -56,6 +56,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, }; int err; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi4_to_flowi(flp)); + err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); #ifdef CONFIG_IP_ROUTE_CLASSID if (arg.rule) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 5857c1fc8b67..eea23b57c6a5 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -38,6 +38,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, .flags = FIB_LOOKUP_NOREF, }; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi6_to_flowi(fl6)); + fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index c4a1c3e84e12..43610e5acc4e 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -222,3 +222,38 @@ out: return rc; } + +void l3mdev_update_flow(struct net *net, struct flowi *fl) +{ + struct net_device *dev; + int ifindex; + + rcu_read_lock(); + + if (fl->flowi_oif) { + dev = dev_get_by_index_rcu(net, fl->flowi_oif); + if (dev) { + ifindex = l3mdev_master_ifindex_rcu(dev); + if (ifindex) { + fl->flowi_oif = ifindex; + fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; + goto out; + } + } + } + + if (fl->flowi_iif) { + dev = dev_get_by_index_rcu(net, fl->flowi_iif); + if (dev) { + ifindex = l3mdev_master_ifindex_rcu(dev); + if (ifindex) { + fl->flowi_iif = ifindex; + fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; + } + } + } + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(l3mdev_update_flow); -- cgit v1.2.3 From a8e3e1a9f02094145580ea7920c6a1d9aabd5539 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:53 -0700 Subject: net: l3mdev: Add hook to output path This patch adds the infrastructure to the output path to pass an skb to an l3mdev device if it has a hook registered. This is the Tx parallel to l3mdev_ip{6}_rcv in the receive path and is the basis for removing the existing hook that returns the vrf dst on the fib lookup. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/l3mdev.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_output.c | 8 ++++++++ net/ipv6/ip6_output.c | 8 ++++++++ net/ipv6/output_core.c | 7 +++++++ net/ipv6/raw.c | 7 +++++++ 5 files changed, 78 insertions(+) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 81e175e80537..53d5274920e3 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -11,6 +11,7 @@ #ifndef _NET_L3MDEV_H_ #define _NET_L3MDEV_H_ +#include #include /** @@ -18,6 +19,10 @@ * * @l3mdev_fib_table: Get FIB table id to use for lookups * + * @l3mdev_l3_rcv: Hook in L3 receive path + * + * @l3mdev_l3_out: Hook in L3 output path + * * @l3mdev_get_rtable: Get cached IPv4 rtable (dst_entry) for device * * @l3mdev_get_saddr: Get source address for a flow @@ -29,6 +34,9 @@ struct l3mdev_ops { u32 (*l3mdev_fib_table)(const struct net_device *dev); struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev, struct sk_buff *skb, u16 proto); + struct sk_buff * (*l3mdev_l3_out)(struct net_device *dev, + struct sock *sk, struct sk_buff *skb, + u16 proto); /* IPv4 ops */ struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev, @@ -201,6 +209,34 @@ struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) return l3mdev_l3_rcv(skb, AF_INET6); } +static inline +struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) +{ + struct net_device *dev = skb_dst(skb)->dev; + + if (netif_is_l3_slave(dev)) { + struct net_device *master; + + master = netdev_master_upper_dev_get_rcu(dev); + if (master && master->l3mdev_ops->l3mdev_l3_out) + skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, + skb, proto); + } + + return skb; +} + +static inline +struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) +{ + return l3mdev_l3_out(sk, skb, AF_INET); +} + +static inline +struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) +{ + return l3mdev_l3_out(sk, skb, AF_INET6); +} #else static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev) @@ -286,6 +322,18 @@ struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) return skb; } +static inline +struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) +{ + return skb; +} + +static inline +struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) +{ + return skb; +} + static inline int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b913f5bf0757..41e10e34769c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -99,6 +99,14 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) iph->tot_len = htons(skb->len); ip_send_check(iph); + + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_out(sk, skb); + if (unlikely(!skb)) + return 0; + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 993fd9666f1b..6ea6caace3a8 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -236,6 +236,14 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); + + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = 
l3mdev_ip6_out((struct sock *)sk, skb); + if (unlikely(!skb)) + return 0; + /* hooks should never assume socket lock is held. * we promote our socket to non const */ diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 462f2a76b5c2..7cca8ac66fe9 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -148,6 +148,13 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) ipv6_hdr(skb)->payload_len = htons(len); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out(sk, skb); + if (unlikely(!skb)) + return 0; + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 590dd1f7746f..54404f08efcc 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -653,6 +653,13 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, if (err) goto error_fault; + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out(sk, skb); + if (unlikely(!skb)) + return 0; + IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); -- cgit v1.2.3 From 5f02ce24c2696fec33f2a5dfcf753996f5fdd211 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:54 -0700 Subject: net: l3mdev: Allow the l3mdev to be a loopback Allow an L3 master device to act as the loopback for that L3 domain. For IPv4 the device can also have the address 127.0.0.1. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 6 +++--- net/ipv4/route.c | 8 ++++++-- net/ipv6/route.c | 12 ++++++++++-- 3 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 53d5274920e3..3ee110518584 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -90,7 +90,7 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) } static inline -const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) +struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) { /* netdev_master_upper_dev_get_rcu calls * list_first_or_null_rcu to walk the upper dev list. 
@@ -99,7 +99,7 @@ const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) * typecast to remove the const */ struct net_device *dev = (struct net_device *)_dev; - const struct net_device *master; + struct net_device *master; if (!dev) return NULL; @@ -254,7 +254,7 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) } static inline -const struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) +struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) { return NULL; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3e992783c1d0..f49b2c534e92 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2018,7 +2018,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, return ERR_PTR(-EINVAL); if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) - if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + if (ipv4_is_loopback(fl4->saddr) && + !(dev_out->flags & IFF_LOOPBACK) && + !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl4->daddr)) @@ -2302,7 +2304,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, else fl4->saddr = fl4->daddr; } - dev_out = net->loopback_dev; + + /* L3 master device is the loopback for that domain */ + dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 09d43ff11a8d..2c681113c055 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2558,8 +2558,16 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, { u32 tb_id; struct net *net = dev_net(idev->dev); - struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, - DST_NOCOUNT); + struct net_device *dev = net->loopback_dev; + struct rt6_info *rt; + + /* use L3 Master device as loopback for host routes if device + * is enslaved and address is not link local or multicast + */ + if (!rt6_need_strict(addr)) + dev = l3mdev_master_dev_rcu(idev->dev) ? : dev; + + rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); if (!rt) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 4c1feac58e06270321cc500b85c2d94a11495775 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:56 -0700 Subject: net: vrf: Flip IPv6 output path from FIB lookup hook to out hook Flip the IPv6 output path to use the l3mdev tx out hook. The VRF dst is not returned on the first FIB lookup. Instead, the dst on the skb is switched at the beginning of the IPv6 output processing to send the packet to the VRF driver on xmit. Link scope addresses (linklocal and multicast) need special handling: specifically the oif the flow struct can not be changed because we want the lookup tied to the enslaved interface. ie., the source address and the returned route MUST point to the interface scope passed in. Convert the existing vrf_get_rt6_dst to handle only link scope addresses. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- drivers/net/vrf.c | 124 ++++++++++++++++++++++++++++++++++----------------- include/net/l3mdev.h | 8 ++-- net/ipv6/route.c | 11 +++-- net/l3mdev/l3mdev.c | 15 +++---- 4 files changed, 100 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 08540b96ec18..f5372edf6edc 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -137,6 +137,20 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, } #if IS_ENABLED(CONFIG_IPV6) +static int vrf_ip6_local_out(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int err; + + err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, + sk, skb, NULL, skb_dst(skb)->dev, dst_output); + + if (likely(err == 1)) + err = dst_output(net, sk, skb); + + return err; +} + static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { @@ -207,7 +221,7 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); - ret = ip6_local_out(net, skb->sk, skb); + ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; else @@ -391,6 +405,43 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } +/* set dst on skb to send packet to us via dev_xmit path. Allows + * packet to go through device based features such as qdisc, netfilter + * hooks and packet sockets with skb->dev set to vrf device. + */ +static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb) +{ + struct net_vrf *vrf = netdev_priv(vrf_dev); + struct dst_entry *dst = NULL; + struct rt6_info *rt6; + + /* don't divert link scope packets */ + if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) + return skb; + + rcu_read_lock(); + + rt6 = rcu_dereference(vrf->rt6); + if (likely(rt6)) { + dst = &rt6->dst; + dst_hold(dst); + } + + rcu_read_unlock(); + + if (unlikely(!dst)) { + vrf_tx_error(vrf_dev, skb); + return NULL; + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + return skb; +} + /* holding rtnl */ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { @@ -477,6 +528,13 @@ out: return rc; } #else +static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb) +{ + return skb; +} + static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } @@ -587,6 +645,8 @@ static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, switch (proto) { case AF_INET: return vrf_ip_out(vrf_dev, sk, skb); + case AF_INET6: + return vrf_ip6_out(vrf_dev, sk, skb); } return skb; @@ -1031,53 +1091,33 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, } #if IS_ENABLED(CONFIG_IPV6) -static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, - struct flowi6 *fl6) +/* send to link-local or multicast address via interface enslaved to + * VRF device. 
Force lookup to VRF table without changing flow struct + */ +static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, + struct flowi6 *fl6) { - bool need_strict = rt6_need_strict(&fl6->daddr); - struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); + int flags = RT6_LOOKUP_F_IFACE; struct dst_entry *dst = NULL; struct rt6_info *rt; - /* send to link-local or multicast address */ - if (need_strict) { - int flags = RT6_LOOKUP_F_IFACE; - - /* VRF device does not have a link-local address and - * sending packets to link-local or mcast addresses over - * a VRF device does not make sense - */ - if (fl6->flowi6_oif == dev->ifindex) { - struct dst_entry *dst = &net->ipv6.ip6_null_entry->dst; - - dst_hold(dst); - return dst; - } - - if (!ipv6_addr_any(&fl6->saddr)) - flags |= RT6_LOOKUP_F_HAS_SADDR; - - rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); - if (rt) - dst = &rt->dst; - - } else if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { - - rcu_read_lock(); - - rt = rcu_dereference(vrf->rt6); - if (likely(rt)) { - dst = &rt->dst; - dst_hold(dst); - } - - rcu_read_unlock(); + /* VRF device does not have a link-local address and + * sending packets to link-local or mcast addresses over + * a VRF device does not make sense + */ + if (fl6->flowi6_oif == dev->ifindex) { + dst = &net->ipv6.ip6_null_entry->dst; + dst_hold(dst); + return dst; } - /* make sure oif is set to VRF device for lookup */ - if (!need_strict) - fl6->flowi6_oif = dev->ifindex; + if (!ipv6_addr_any(&fl6->saddr)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + + rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); + if (rt) + dst = &rt->dst; return dst; } @@ -1130,7 +1170,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) - .l3mdev_get_rt6_dst = vrf_get_rt6_dst, + .l3mdev_link_scope_lookup = vrf_link_scope_lookup, .l3mdev_get_saddr6 = vrf_get_saddr6, #endif }; diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 3ee110518584..51aab20a4d0a 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -27,7 +27,7 @@ * * @l3mdev_get_saddr: Get source address for a flow * - * @l3mdev_get_rt6_dst: Get cached IPv6 rt6_info (dst_entry) for device + * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ struct l3mdev_ops { @@ -45,7 +45,7 @@ struct l3mdev_ops { struct flowi4 *fl4); /* IPv6 ops */ - struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev, + struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); int (*l3mdev_get_saddr6)(struct net_device *dev, const struct sock *sk, @@ -177,7 +177,7 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4); -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6); +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); int l3mdev_get_saddr6(struct net *net, const struct sock *sk, struct flowi6 *fl6); @@ -299,7 +299,7 @@ static inline int l3mdev_get_saddr(struct net *net, int ifindex, } static inline -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6) +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { return NULL; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2c681113c055..87e0a01ce744 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1188,12 +1188,15 
@@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { - struct dst_entry *dst; bool any_src; - dst = l3mdev_get_rt6_dst(net, fl6); - if (dst) - return dst; + if (rt6_need_strict(&fl6->daddr)) { + struct dst_entry *dst; + + dst = l3mdev_link_scope_lookup(net, fl6); + if (dst) + return dst; + } fl6->flowi6_iif = LOOPBACK_IFINDEX; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 43610e5acc4e..ac9d928d0a9e 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -100,15 +100,14 @@ u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); /** - * l3mdev_get_rt6_dst - IPv6 route lookup based on flow. Returns - * cached route for L3 master device if relevant - * to flow + * l3mdev_link_scope_lookup - IPv6 route lookup based on flow for link + * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup */ -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, - struct flowi6 *fl6) +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, + struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; @@ -121,15 +120,15 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, dev = netdev_master_upper_dev_get_rcu(dev); if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_rt6_dst) - dst = dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6); + dev->l3mdev_ops->l3mdev_link_scope_lookup) + dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); rcu_read_unlock(); } return dst; } -EXPORT_SYMBOL_GPL(l3mdev_get_rt6_dst); +EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); /** * l3mdev_get_saddr - get source address for a flow based on an interface -- cgit v1.2.3 From d66f6c0a8f3c0bcc4ee7a9b1da4b0ebe7ee555a3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:58 -0700 Subject: net: ipv4: Remove l3mdev_get_saddr No longer needed Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 38 -------------------------------------- include/net/l3mdev.h | 12 ------------ include/net/route.h | 10 ---------- net/ipv4/raw.c | 6 ------ net/ipv4/udp.c | 6 ------ net/l3mdev/l3mdev.c | 31 ------------------------------- 6 files changed, 103 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index f5372edf6edc..9ad2a169485f 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -863,43 +863,6 @@ static struct rtable *vrf_get_rtable(const struct net_device *dev, return rth; } -/* called under rcu_read_lock */ -static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4) -{ - struct fib_result res = { .tclassid = 0 }; - struct net *net = dev_net(dev); - u32 orig_tos = fl4->flowi4_tos; - u8 flags = fl4->flowi4_flags; - u8 scope = fl4->flowi4_scope; - u8 tos = RT_FL_TOS(fl4); - int rc; - - if (unlikely(!fl4->daddr)) - return 0; - - fl4->flowi4_flags |= FLOWI_FLAG_SKIP_NH_OIF; - fl4->flowi4_iif = LOOPBACK_IFINDEX; - /* make sure oif is set to VRF device for lookup */ - fl4->flowi4_oif = dev->ifindex; - fl4->flowi4_tos = tos & IPTOS_RT_MASK; - fl4->flowi4_scope = ((tos & RTO_ONLINK) ? - RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); - - rc = fib_lookup(net, fl4, &res, 0); - if (!rc) { - if (res.type == RTN_LOCAL) - fl4->saddr = res.fi->fib_prefsrc ? 
: fl4->daddr; - else - fib_select_path(net, &res, fl4, -1); - } - - fl4->flowi4_flags = flags; - fl4->flowi4_tos = orig_tos; - fl4->flowi4_scope = scope; - - return rc; -} - static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return 0; @@ -1166,7 +1129,6 @@ static int vrf_get_saddr6(struct net_device *dev, const struct sock *sk, static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, .l3mdev_get_rtable = vrf_get_rtable, - .l3mdev_get_saddr = vrf_get_saddr, .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 51aab20a4d0a..1129e1d8cd6e 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -25,8 +25,6 @@ * * @l3mdev_get_rtable: Get cached IPv4 rtable (dst_entry) for device * - * @l3mdev_get_saddr: Get source address for a flow - * * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ @@ -41,8 +39,6 @@ struct l3mdev_ops { /* IPv4 ops */ struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev, const struct flowi4 *fl4); - int (*l3mdev_get_saddr)(struct net_device *dev, - struct flowi4 *fl4); /* IPv6 ops */ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, @@ -175,8 +171,6 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) return rc; } -int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4); - struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); int l3mdev_get_saddr6(struct net *net, const struct sock *sk, struct flowi6 *fl6); @@ -292,12 +286,6 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) return false; } -static inline int l3mdev_get_saddr(struct net *net, int ifindex, - struct flowi4 *fl4) -{ - return 0; -} - static inline struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { diff --git a/include/net/route.h b/include/net/route.h index ad777d79af94..0429d47cad25 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -285,15 +284,6 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, ip_route_connect_init(fl4, dst, src, tos, oif, protocol, sport, dport, sk); - if (!src && oif) { - int rc; - - rc = l3mdev_get_saddr(net, oif, fl4); - if (rc < 0) - return ERR_PTR(rc); - - src = fl4->saddr; - } if (!dst || !src) { rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 438f50c1a676..90a85c955872 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -606,12 +606,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (inet->hdrincl ? 
FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); - if (!saddr && ipc.oif) { - err = l3mdev_get_saddr(net, ipc.oif, &fl4); - if (err < 0) - goto done; - } - if (!inet->hdrincl) { rfv.msg = msg; rfv.hlen = 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 058c31286ce1..7d96dc2d3d08 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1021,12 +1021,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flow_flags, faddr, saddr, dport, inet->inet_sport); - if (!saddr && ipc.oif) { - err = l3mdev_get_saddr(net, ipc.oif, fl4); - if (err < 0) - goto out; - } - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index ac9d928d0a9e..be40df60703c 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -130,37 +130,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, } EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); -/** - * l3mdev_get_saddr - get source address for a flow based on an interface - * enslaved to an L3 master device - * @net: network namespace for device index lookup - * @ifindex: Interface index - * @fl4: IPv4 flow struct - */ - -int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) -{ - struct net_device *dev; - int rc = 0; - - if (ifindex) { - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, ifindex); - if (dev && netif_is_l3_slave(dev)) - dev = netdev_master_upper_dev_get_rcu(dev); - - if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_saddr) - rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4); - - rcu_read_unlock(); - } - - return rc; -} -EXPORT_SYMBOL_GPL(l3mdev_get_saddr); - int l3mdev_get_saddr6(struct net *net, const struct sock *sk, struct flowi6 *fl6) { -- cgit v1.2.3 From 8a966fc016b67d2a8ab4a83d22ded8cde032a0eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:59 -0700 Subject: net: ipv6: Remove l3mdev_get_saddr6 No longer needed Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 41 ----------------------------------------- include/net/l3mdev.h | 11 ----------- net/ipv6/ip6_output.c | 9 +-------- net/l3mdev/l3mdev.c | 24 ------------------------ 4 files changed, 1 insertion(+), 84 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 9ad2a169485f..3a34f547c578 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1084,46 +1084,6 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, return dst; } - -/* called under rcu_read_lock */ -static int vrf_get_saddr6(struct net_device *dev, const struct sock *sk, - struct flowi6 *fl6) -{ - struct net *net = dev_net(dev); - struct dst_entry *dst; - struct rt6_info *rt; - int err; - - if (rt6_need_strict(&fl6->daddr)) { - rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, - RT6_LOOKUP_F_IFACE); - if (unlikely(!rt)) - return 0; - - dst = &rt->dst; - } else { - __u8 flags = fl6->flowi6_flags; - - fl6->flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; - fl6->flowi6_flags |= FLOWI_FLAG_SKIP_NH_OIF; - - dst = ip6_route_output(net, sk, fl6); - rt = (struct rt6_info *)dst; - - fl6->flowi6_flags = flags; - } - - err = dst->error; - if (!err) { - err = ip6_route_get_saddr(net, rt, &fl6->daddr, - sk ? 
inet6_sk(sk)->srcprefs : 0, - &fl6->saddr); - } - - dst_release(dst); - - return err; -} #endif static const struct l3mdev_ops vrf_l3mdev_ops = { @@ -1133,7 +1093,6 @@ static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_link_scope_lookup = vrf_link_scope_lookup, - .l3mdev_get_saddr6 = vrf_get_saddr6, #endif }; diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 1129e1d8cd6e..a5e506eb51de 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -43,9 +43,6 @@ struct l3mdev_ops { /* IPv6 ops */ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); - int (*l3mdev_get_saddr6)(struct net_device *dev, - const struct sock *sk, - struct flowi6 *fl6); }; #ifdef CONFIG_NET_L3_MASTER_DEV @@ -172,8 +169,6 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) } struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); -int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6); static inline struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto) @@ -292,12 +287,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) return NULL; } -static inline int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6) -{ - return 0; -} - static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1cb41b365048..6001e781164e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -926,13 +926,6 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, int err; int flags = 0; - if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif && - (!*dst || !(*dst)->error)) { - err = l3mdev_get_saddr6(net, sk, fl6); - if (err) - goto out_err; - } - /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, * the route-specific preferred source forces the @@ -1024,7 +1017,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, out_err_release: dst_release(*dst); *dst = NULL; -out_err: + if (err == -ENETUNREACH) IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); return err; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index be40df60703c..8da86ceca33d 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -130,30 +130,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, } EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); -int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6) -{ - struct net_device *dev; - int rc = 0; - - if (fl6->flowi6_oif) { - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); - if (dev && netif_is_l3_slave(dev)) - dev = netdev_master_upper_dev_get_rcu(dev); - - if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_saddr6) - rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6); - - rcu_read_unlock(); - } - - return rc; -} -EXPORT_SYMBOL_GPL(l3mdev_get_saddr6); - /** * l3mdev_fib_rule_match - Determine if flowi references an * L3 master device -- cgit v1.2.3 From ca28b8f2b8f316b9973693c72770c98da3e9500e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:10:00 -0700 Subject: net: l3mdev: Remove l3mdev_fib_oif No longer used Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/l3mdev.h | 29 ----------------------------- 1 file changed, 29 deletions(-) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index a5e506eb51de..a586035c97cb 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -107,26 +107,6 @@ struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) return master; } -/* get index of an interface to use for FIB lookups. For devices - * enslaved to an L3 master device FIB lookups are based on the - * master index - */ -static inline int l3mdev_fib_oif_rcu(struct net_device *dev) -{ - return l3mdev_master_ifindex_rcu(dev) ? : dev->ifindex; -} - -static inline int l3mdev_fib_oif(struct net_device *dev) -{ - int oif; - - rcu_read_lock(); - oif = l3mdev_fib_oif_rcu(dev); - rcu_read_unlock(); - - return oif; -} - u32 l3mdev_fib_table_rcu(const struct net_device *dev); u32 l3mdev_fib_table_by_index(struct net *net, int ifindex); static inline u32 l3mdev_fib_table(const struct net_device *dev) @@ -248,15 +228,6 @@ struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) return NULL; } -static inline int l3mdev_fib_oif_rcu(struct net_device *dev) -{ - return dev ? dev->ifindex : 0; -} -static inline int l3mdev_fib_oif(struct net_device *dev) -{ - return dev ? dev->ifindex : 0; -} - static inline u32 l3mdev_fib_table_rcu(const struct net_device *dev) { return 0; -- cgit v1.2.3 From afb460fe0ef0af6d98ed51006153acb01278df2d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:10:01 -0700 Subject: net: l3mdev: remove get_rtable method No longer used Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 21 --------------------- include/net/l3mdev.h | 21 --------------------- 2 files changed, 42 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 3a34f547c578..ccce59fbb2b3 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -843,26 +843,6 @@ static u32 vrf_fib_table(const struct net_device *dev) return vrf->tb_id; } -static struct rtable *vrf_get_rtable(const struct net_device *dev, - const struct flowi4 *fl4) -{ - struct rtable *rth = NULL; - - if (!(fl4->flowi4_flags & FLOWI_FLAG_L3MDEV_SRC)) { - struct net_vrf *vrf = netdev_priv(dev); - - rcu_read_lock(); - - rth = rcu_dereference(vrf->rth); - if (likely(rth)) - dst_hold(&rth->dst); - - rcu_read_unlock(); - } - - return rth; -} - static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return 0; @@ -1088,7 +1068,6 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, - .l3mdev_get_rtable = vrf_get_rtable, .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index a586035c97cb..3832099289c5 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -23,8 +23,6 @@ * * @l3mdev_l3_out: Hook in L3 output path * - * @l3mdev_get_rtable: Get cached IPv4 rtable (dst_entry) for device - * * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ @@ -36,10 +34,6 @@ struct l3mdev_ops { struct sock *sk, struct sk_buff *skb, u16 proto); - /* IPv4 ops */ - struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev, - const struct flowi4 *fl4); - /* IPv6 ops */ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); @@ -120,15 +114,6 @@ static inline 
u32 l3mdev_fib_table(const struct net_device *dev) return tb_id; } -static inline struct rtable *l3mdev_get_rtable(const struct net_device *dev, - const struct flowi4 *fl4) -{ - if (netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_get_rtable) - return dev->l3mdev_ops->l3mdev_get_rtable(dev, fl4); - - return NULL; -} - static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { struct net_device *dev; @@ -241,12 +226,6 @@ static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) return 0; } -static inline struct rtable *l3mdev_get_rtable(const struct net_device *dev, - const struct flowi4 *fl4) -{ - return NULL; -} - static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { return false; -- cgit v1.2.3 From c71ad3d45a5e928e617ca436f3ce88bb773fb766 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:10:02 -0700 Subject: net: flow: Remove FLOWI_FLAG_L3MDEV_SRC flag No longer used Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 5 ++--- include/net/flow.h | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index ccce59fbb2b3..55674b0e65b7 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -165,7 +165,7 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, .flowlabel = ip6_flowinfo(iph), .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, - .flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF, + .flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF, }; int ret = NET_XMIT_DROP; struct dst_entry *dst; @@ -265,8 +265,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, .flowi4_oif = vrf_dev->ifindex, .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_L3MDEV_SRC | - FLOWI_FLAG_SKIP_NH_OIF, + .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_SKIP_NH_OIF, .daddr = ip4h->daddr, }; struct net *net = dev_net(vrf_dev); diff --git a/include/net/flow.h b/include/net/flow.h index d47ef4bb5423..035aa7716967 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -34,8 +34,7 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 -#define FLOWI_FLAG_L3MDEV_SRC 0x04 -#define FLOWI_FLAG_SKIP_NH_OIF 0x08 +#define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; }; -- cgit v1.2.3 From 5a1f044b5048e834f936fbb33a93e5d8410779ec Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 29 Aug 2016 23:25:14 +0300 Subject: cfg80211: clarify the requirements of .disconnect() cfg80211 expects the .disconnect() handler to call cfg80211_disconnected() when done. Make this requirement more explicit. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9c23f4d33e06..d5e7f690bad9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2423,7 +2423,8 @@ struct cfg80211_qos_map { * cases, the result of roaming is indicated with a call to * cfg80211_roamed() or cfg80211_roamed_bss(). * (invoked with the wireless_dev mutex held) - * @disconnect: Disconnect from the BSS/ESS. + * @disconnect: Disconnect from the BSS/ESS. Once done, call + * cfg80211_disconnected().
* (invoked with the wireless_dev mutex held) * * @join_ibss: Join the specified IBSS (or create if necessary). Once done, call -- cgit v1.2.3 From 480dd46b9d6812e5fb7172c305ee0f1154c26eed Mon Sep 17 00:00:00 2001 From: Maxim Altshul Date: Mon, 22 Aug 2016 17:14:04 +0300 Subject: mac80211: RX BA support for sta max_rx_aggregation_subframes The ability to change the max_rx_aggregation frames is useful in interoperability (IOP) cases. There exist some devices (latest mobile phones and some APs) that tend not to respect a BA session's maximum size (in Kbps). These devices won't respect the AMPDU size that was negotiated during association (even though they do respect the maximal number of packets). This violation is characterized by a valid number of packets in a single AMPDU. Even so, the total size will exceed the size negotiated during association. Eventually, this will cause some undefined behavior, which in turn causes the hw to drop packets, causing the throughput to plummet. This patch makes the subframe limitation be held by each station, instead of only by the hw. Signed-off-by: Maxim Altshul Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ net/mac80211/agg-rx.c | 7 +++++-- net/mac80211/sta_info.c | 3 +++ 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index cca510a585c3..a1457ca2a30c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1735,6 +1735,9 @@ struct ieee80211_sta_rates { * @supp_rates: Bitmap of supported rates (per band) * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities + * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU + * that this station is allowed to transmit to us. + * Can be modified by driver.
* @wme: indicates whether the STA supports QoS/WME (if local devices does, * otherwise always false) * @drv_priv: data area for driver use, will always be aligned to @@ -1775,6 +1778,7 @@ struct ieee80211_sta { u16 aid; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; + u8 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; u8 max_sp; diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index a9aff6079c42..282e99bdb301 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -298,10 +298,13 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, buf_size = IEEE80211_MAX_AMPDU_BUF; /* make sure the size doesn't exceed the maximum supported by the hw */ - if (buf_size > local->hw.max_rx_aggregation_subframes) - buf_size = local->hw.max_rx_aggregation_subframes; + if (buf_size > sta->sta.max_rx_aggregation_subframes) + buf_size = sta->sta.max_rx_aggregation_subframes; params.buf_size = buf_size; + ht_dbg(sta->sdata, "AddBA Req buf_size=%d for %pM\n", + buf_size, sta->sta.addr); + /* examine state machine */ mutex_lock(&sta->ampdu_mlme.mtx); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 19f14c907d74..5e70fa52e1ff 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -340,6 +340,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, memcpy(sta->addr, addr, ETH_ALEN); memcpy(sta->sta.addr, addr, ETH_ALEN); + sta->sta.max_rx_aggregation_subframes = + local->hw.max_rx_aggregation_subframes; + sta->local = local; sta->sdata = sdata; sta->rx_stats.last_rx = jiffies; -- cgit v1.2.3 From 99ee7cae3bf3ce04e90d7b193d9f4f59a7044d91 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Aug 2016 23:25:17 +0300 Subject: mac80211: add support for radiotap timestamp field Use the existing device timestamp from the RX status information to add support for the new radiotap timestamp field. Currently only 32-bit counters are supported, but we also add the radiotap mactime where applicable. This new field allows more flexibility in where the timestamp is taken etc. The non-timestamp data in the field is taken from a new field in the hw struct. Signed-off-by: Johannes Berg --- include/net/ieee80211_radiotap.h | 21 +++++++++++++++++++++ include/net/mac80211.h | 12 ++++++++++++ net/mac80211/main.c | 3 +++ net/mac80211/rx.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+) (limited to 'include') diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index b0fd9476c538..ba07b9d8ed63 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -190,6 +190,10 @@ struct ieee80211_radiotap_header { * IEEE80211_RADIOTAP_VHT u16, u8, u8, u8[4], u8, u8, u16 * * Contains VHT information about this frame. + * + * IEEE80211_RADIOTAP_TIMESTAMP u64, u16, u8, u8 variable + * + * Contains timestamp information for this frame. 
*/ enum ieee80211_radiotap_type { IEEE80211_RADIOTAP_TSFT = 0, @@ -214,6 +218,7 @@ enum ieee80211_radiotap_type { IEEE80211_RADIOTAP_MCS = 19, IEEE80211_RADIOTAP_AMPDU_STATUS = 20, IEEE80211_RADIOTAP_VHT = 21, + IEEE80211_RADIOTAP_TIMESTAMP = 22, /* valid in every it_present bitmap, even vendor namespaces */ IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE = 29, @@ -321,6 +326,22 @@ enum ieee80211_radiotap_type { #define IEEE80211_RADIOTAP_CODING_LDPC_USER2 0x04 #define IEEE80211_RADIOTAP_CODING_LDPC_USER3 0x08 +/* For IEEE80211_RADIOTAP_TIMESTAMP */ +#define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_MASK 0x000F +#define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_MS 0x0000 +#define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_US 0x0001 +#define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_NS 0x0003 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_MASK 0x00F0 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_BEGIN_MDPU 0x0000 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_MPDU 0x0010 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_PPDU 0x0020 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_PLCP_SIG_ACQ 0x0030 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_UNKNOWN 0x00F0 + +#define IEEE80211_RADIOTAP_TIMESTAMP_FLAG_64BIT 0x00 +#define IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT 0x01 +#define IEEE80211_RADIOTAP_TIMESTAMP_FLAG_ACCURACY 0x02 + /* helpers */ static inline int ieee80211_get_radiotap_len(unsigned char *data) { diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a1457ca2a30c..08bac23c8de1 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2145,6 +2145,14 @@ enum ieee80211_hw_flags { * the default is _GI | _BANDWIDTH. * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_* values. * + * @radiotap_timestamp: Information for the radiotap timestamp field; if the + * 'units_pos' member is set to a non-negative value it must be set to + * a combination of a IEEE80211_RADIOTAP_TIMESTAMP_UNIT_* and a + * IEEE80211_RADIOTAP_TIMESTAMP_SPOS_* value, and then the timestamp + * field will be added and populated from the &struct ieee80211_rx_status + * device_timestamp. If the 'accuracy' member is non-negative, it's put + * into the accuracy radiotap field and the accuracy known flag is set. + * * @netdev_features: netdev features to be set in each netdev created * from this HW. Note that not all features are usable with mac80211, * other features will be rejected during HW registration. 
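The @radiotap_timestamp documentation above boils down to filling two hw fields before registration. A minimal sketch, assuming a hypothetical driver whose device timestamps are in microseconds, sampled at PLCP signal acquisition, with a known 22-unit accuracy (all illustrative values, not requirements):

static void drv_setup_radiotap_timestamp(struct ieee80211_hw *hw)
{
	/* advertise the unit/position of the device_timestamp carried in
	 * struct ieee80211_rx_status; mac80211 then emits the radiotap field
	 */
	hw->radiotap_timestamp.units_pos =
		IEEE80211_RADIOTAP_TIMESTAMP_UNIT_US |
		IEEE80211_RADIOTAP_TIMESTAMP_SPOS_PLCP_SIG_ACQ;
	hw->radiotap_timestamp.accuracy = 22;	/* in the chosen units */
}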
@@ -2188,6 +2196,10 @@ struct ieee80211_hw { u8 offchannel_tx_hw_queue; u8 radiotap_mcs_details; u16 radiotap_vht_details; + struct { + int units_pos; + s16 accuracy; + } radiotap_timestamp; netdev_features_t netdev_features; u8 uapsd_queues; u8 uapsd_max_sp_len; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index d00ea9b13f49..ac053a9df36d 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -660,6 +660,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, ieee80211_roc_setup(local); + local->hw.radiotap_timestamp.units_pos = -1; + local->hw.radiotap_timestamp.accuracy = -1; + return &local->hw; err_free: wiphy_free(wiphy); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6a265aa73a46..284f0f25e22e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -180,6 +180,11 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, len += 12; } + if (local->hw.radiotap_timestamp.units_pos >= 0) { + len = ALIGN(len, 8); + len += 12; + } + if (status->chains) { /* antenna and antenna signal fields */ len += 2 * hweight8(status->chains); @@ -447,6 +452,31 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, pos += 2; } + if (local->hw.radiotap_timestamp.units_pos >= 0) { + u16 accuracy = 0; + u8 flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT; + + rthdr->it_present |= + cpu_to_le32(1 << IEEE80211_RADIOTAP_TIMESTAMP); + + /* ensure 8 byte alignment */ + while ((pos - (u8 *)rthdr) & 7) + pos++; + + put_unaligned_le64(status->device_timestamp, pos); + pos += sizeof(u64); + + if (local->hw.radiotap_timestamp.accuracy >= 0) { + accuracy = local->hw.radiotap_timestamp.accuracy; + flags |= IEEE80211_RADIOTAP_TIMESTAMP_FLAG_ACCURACY; + } + put_unaligned_le16(accuracy, pos); + pos += sizeof(u16); + + *pos++ = local->hw.radiotap_timestamp.units_pos; + *pos++ = flags; + } + for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) { *pos++ = status->chain_signal[chain]; *pos++ = chain; -- cgit v1.2.3 From 4440a2ab3b9f40dddbe006331ef0659c76859296 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 13 Sep 2016 08:49:18 +0800 Subject: netfilter: synproxy: Check oom when adding synproxy and seqadj ct extensions When memory is exhausted, nfct_seqadj_ext_add may fail to add the synproxy and seqadj extensions. The function nf_ct_seqadj_init doesn't check whether nfct_seqadj() returned a valid seqadj pointer. Now drop the packet directly when we fail to add the seqadj extension, to avoid dereferencing a NULL pointer in nf_ct_seqadj_init() from init_conntrack().
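The shape of the fix, distilled into a sketch (example_init_synproxy() is an illustrative helper name; the actual patch below adds nf_ct_add_synproxy() and frees the conntrack entry on failure):

static int example_init_synproxy(struct nf_conn *ct, const struct nf_conn *tmpl)
{
	/* extension add can fail under memory pressure; check before use */
	if (tmpl && nfct_synproxy(tmpl)) {
		if (!nfct_seqadj_ext_add(ct) || !nfct_synproxy_ext_add(ct))
			return -ENOMEM;	/* caller drops the packet */
	}
	return 0;
}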
Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_synproxy.h | 14 ++++++++++++++ net/netfilter/nf_conntrack_core.c | 6 +++--- net/netfilter/nf_nat_core.c | 3 ++- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index 6793614e6502..e6937318546c 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -27,6 +27,20 @@ static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct) #endif } +static inline bool nf_ct_add_synproxy(struct nf_conn *ct, + const struct nf_conn *tmpl) +{ + if (tmpl && nfct_synproxy(tmpl)) { + if (!nfct_seqadj_ext_add(ct)) + return false; + + if (!nfct_synproxy_ext_add(ct)) + return false; + } + + return true; +} + struct synproxy_stats { unsigned int syn_received; unsigned int cookie_invalid; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index dd2c43abf9e2..9934b0c93c1e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1035,9 +1035,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (IS_ERR(ct)) return (struct nf_conntrack_tuple_hash *)ct; - if (tmpl && nfct_synproxy(tmpl)) { - nfct_seqadj_ext_add(ct); - nfct_synproxy_ext_add(ct); + if (!nf_ct_add_synproxy(ct, tmpl)) { + nf_conntrack_free(ct); + return ERR_PTR(-ENOMEM); } timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 19c081e1b328..ecee105bbada 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -441,7 +441,8 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_DST_NAT; if (nfct_help(ct)) - nfct_seqadj_ext_add(ct); + if (!nfct_seqadj_ext_add(ct)) + return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { -- cgit v1.2.3 From ebf9ff753c041b296241990aef76163bbb2cc9c8 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 13 Sep 2016 15:58:28 +0200 Subject: genirq: Provide irq_gc_{lock_irqsave,unlock_irqrestore}() helpers Some irqchip drivers need to take the generic chip lock outside of the irq context. Provide the irq_gc_{lock_irqsave,unlock_irqrestore}() helpers to allow one to disable irqs while entering a critical section protected by gc->lock. Note that we do not provide optimized version of these helpers for !SMP, because they are not called from the hot-path. [ tglx: Added a comment when these helpers should be [not] used ] Signed-off-by: Boris Brezillon Cc: Jason Cooper Cc: Marc Zyngier Cc: Nicolas Ferre Cc: stable@vger.kernel.org Cc: Alexandre Belloni Link: http://lkml.kernel.org/r/1473775109-4192-1-git-send-email-boris.brezillon@free-electrons.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index b52424eaa0ed..0ac26c892fe2 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -945,6 +945,16 @@ static inline void irq_gc_lock(struct irq_chip_generic *gc) { } static inline void irq_gc_unlock(struct irq_chip_generic *gc) { } #endif +/* + * The irqsave variants are for usage in non interrupt code. Do not use + * them in irq_chip callbacks. Use irq_gc_lock() instead. 
+ */ +#define irq_gc_lock_irqsave(gc, flags) \ + raw_spin_lock_irqsave(&(gc)->lock, flags) + +#define irq_gc_unlock_irqrestore(gc, flags) \ + raw_spin_unlock_irqrestore(&(gc)->lock, flags) + static inline void irq_reg_writel(struct irq_chip_generic *gc, u32 val, int reg_offset) { -- cgit v1.2.3 From 9ad18b75c2f6e4a78ce204e79f37781f8815c0fa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Aug 2016 23:19:01 -0400 Subject: asm-generic: make get_user() clear the destination on errors both for access_ok() failures and for faults halfway through Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- include/asm-generic/uaccess.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index 04e21a41796a..32901d11f8c4 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -230,14 +230,18 @@ extern int __put_user_bad(void) __attribute__((noreturn)); might_fault(); \ access_ok(VERIFY_READ, __p, sizeof(*ptr)) ? \ __get_user((x), (__typeof__(*(ptr)) *)__p) : \ - -EFAULT; \ + ((x) = (__typeof__(*(ptr)))0,-EFAULT); \ }) #ifndef __get_user_fn static inline int __get_user_fn(size_t size, const void __user *ptr, void *x) { - size = __copy_from_user(x, ptr, size); - return size ? -EFAULT : size; + size_t n = __copy_from_user(x, ptr, size); + if (unlikely(n)) { + memset(x + (size - n), 0, n); + return -EFAULT; + } + return 0; } #define __get_user_fn(sz, u, k) __get_user_fn(sz, u, k) -- cgit v1.2.3 From 6cfeaf5125d425043d44002d0a1a8a147be582bf Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Wed, 14 Sep 2016 11:00:26 +0100 Subject: cpu/hotplug: Include linux/types.h in linux/cpuhotplug.h The linux/cpuhotplug.h header makes use of the bool type, but wasn't including linux/types.h to ensure that the type is defined. Fix this by including linux/types.h in preparation for including linux/cpuhotplug.h in a file that doesn't do so already. Signed-off-by: Paul Burton Cc: linux-mips@linux-mips.org Cc: Richard Cochran Cc: Sebastian Andrzej Siewior Cc: Ralf Baechle Cc: Anna-Maria Gleixner Link: http://lkml.kernel.org/r/20160914100027.20945-1-paul.burton@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 242bf530edfc..34bd80512a0c 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -1,6 +1,8 @@ #ifndef __CPUHOTPLUG_H #define __CPUHOTPLUG_H +#include <linux/types.h> + enum cpuhp_state { CPUHP_OFFLINE, CPUHP_CREATE_THREADS, -- cgit v1.2.3 From c7e9dbcf09bddd01568113103d62423d8894eabd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 10:03:00 +0200 Subject: mac80211: remove sta_remove_debugfs driver callback No drivers implement this, relying either on the recursive directory removal to remove their debugfs, or not having any to start with. Remove the dead driver callback.
Signed-off-by: Johannes Berg --- include/net/mac80211.h | 11 ++--------- net/mac80211/debugfs_sta.c | 4 ---- net/mac80211/driver-ops.h | 15 --------------- 3 files changed, 2 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 08bac23c8de1..d9c8ccd6b4e6 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3101,11 +3101,8 @@ enum ieee80211_reconfig_type { * * @sta_add_debugfs: Drivers can use this callback to add debugfs files * when a station is added to mac80211's station list. This callback - * and @sta_remove_debugfs should be within a CONFIG_MAC80211_DEBUGFS - * conditional. This callback can sleep. - * - * @sta_remove_debugfs: Remove the debugfs files which were added using - * @sta_add_debugfs. This callback can sleep. + * should be within a CONFIG_MAC80211_DEBUGFS conditional. This + * callback can sleep. * * @sta_notify: Notifies low level driver about power state transition of an * associated station, AP, IBSS/WDS/mesh peer etc. For a VIF operating @@ -3501,10 +3498,6 @@ struct ieee80211_ops { struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct dentry *dir); - void (*sta_remove_debugfs)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct ieee80211_sta *sta, - struct dentry *dir); #endif void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd, struct ieee80211_sta *sta); diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index fb2693582e40..a2fcdb47a0e6 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -544,10 +544,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) void ieee80211_sta_debugfs_remove(struct sta_info *sta) { - struct ieee80211_local *local = sta->local; - struct ieee80211_sub_if_data *sdata = sta->sdata; - - drv_sta_remove_debugfs(local, sdata, &sta->sta, sta->debugfs_dir); debugfs_remove_recursive(sta->debugfs_dir); sta->debugfs_dir = NULL; } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index c39f93b48791..fe35a1c0dc86 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -499,21 +499,6 @@ static inline void drv_sta_add_debugfs(struct ieee80211_local *local, local->ops->sta_add_debugfs(&local->hw, &sdata->vif, sta, dir); } - -static inline void drv_sta_remove_debugfs(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, - struct dentry *dir) -{ - might_sleep(); - - sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); - - if (local->ops->sta_remove_debugfs) - local->ops->sta_remove_debugfs(&local->hw, &sdata->vif, - sta, dir); -} #endif static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, -- cgit v1.2.3 From e8a24cd4b87247beedb1addc7b683422092047e5 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Wed, 14 Sep 2016 12:48:32 +0530 Subject: mac80211: allow driver to handle packet-loss mechanism mac80211 triggers the CQM packet-loss mechanism based on consecutive MSDU failures. Drivers like ath10k have their own connection monitoring algorithm, offloaded to firmware, for triggering station kickout. In case of station kickout, the driver will report low-ack status via the mac80211 API (ieee80211_report_low_ack). This flag will enable the driver to rely completely on firmware events for station kickout and bypass the mac80211 packet-loss mechanism.
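In driver terms this splits into two small pieces; a hedged sketch (both function names here are hypothetical, only the flag and ieee80211_report_low_ack() are mac80211 API):

static void drv_init_hw_flags(struct ieee80211_hw *hw)
{
	/* tell mac80211 to skip its CQM packet-loss accounting */
	ieee80211_hw_set(hw, REPORTS_LOW_ACK);
}

static void drv_handle_fw_kickout(struct ieee80211_sta *sta, u32 num_packets)
{
	/* translate a firmware station-kickout event into a low-ack report */
	ieee80211_report_low_ack(sta, num_packets);
}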
Signed-off-by: Rajkumar Manoharan Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ net/mac80211/debugfs.c | 1 + net/mac80211/status.c | 6 ++++++ 3 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d9c8ccd6b4e6..5296100f3889 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2018,6 +2018,11 @@ struct ieee80211_txq { * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list * skbs, needed for zero-copy software A-MSDU. * + * @IEEE80211_HW_REPORTS_LOW_ACK: The driver (or firmware) reports low ack event + * by ieee80211_report_low_ack() based on its own algorithm. For such + * drivers, mac80211 packet loss mechanism will not be triggered and driver + * is completely depending on firmware event for station kickout. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2058,6 +2063,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_USES_RSS, IEEE80211_HW_TX_AMSDU, IEEE80211_HW_TX_FRAG_LIST, + IEEE80211_HW_REPORTS_LOW_ACK, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 5bbb470f335f..8ca62b6bb02a 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -201,6 +201,7 @@ static const char *hw_flag_names[] = { FLAG(USES_RSS), FLAG(TX_AMSDU), FLAG(TX_FRAG_LIST), + FLAG(REPORTS_LOW_ACK), #undef FLAG }; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index fabd9ff710d9..ea39f8a7baf3 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -557,6 +557,12 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, static void ieee80211_lost_packet(struct sta_info *sta, struct ieee80211_tx_info *info) { + /* If driver relies on its own algorithm for station kickout, skip + * mac80211 packet loss mechanism. + */ + if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK)) + return; + /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) -- cgit v1.2.3 From 86da71b57383d40993cb90baafb3735cffe5d800 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 12 Sep 2016 20:13:09 -0400 Subject: net_sched: Introduce skbmod action This action is intended to be an upgrade from a usability perspective over pedit (as well as operational debuggability). Compare this: sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \ u32 match ip protocol 1 0xff flowid 1:2 \ action pedit munge offset -14 u8 set 0x02 \ munge offset -13 u8 set 0x15 \ munge offset -12 u8 set 0x15 \ munge offset -11 u8 set 0x15 \ munge offset -10 u16 set 0x1515 \ pipe to: sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \ u32 match ip protocol 1 0xff flowid 1:2 \ action skbmod dmac 02:15:15:15:15:15 Also try to do a MAC address swap with pedit or, worse, try to debug a policy with destination mac, source mac and ethertype. Then make a few rules out of those and you'll get my point. In the future, common pedit use cases can be migrated to this action (for example, different fields in IPv4/6, transports like tcp/udp/sctp etc). For this first cut, this allows modifying the basic ethernet header. The most important ethernet use case at the moment is when redirecting or mirroring packets to a remote machine.
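Distilled to its per-packet effect, the action is a guarded ethernet-header rewrite; a sketch of the semantics (skbmod_apply() is an illustrative name only; the real tcf_skbmod_run() in the patch below additionally handles RCU, stats and skb writability):

static void skbmod_apply(struct ethhdr *eth, const struct tcf_skbmod_params *p)
{
	u8 tmp[ETH_ALEN];

	if (p->flags & SKBMOD_F_DMAC)
		ether_addr_copy(eth->h_dest, p->eth_dst);
	if (p->flags & SKBMOD_F_SMAC)
		ether_addr_copy(eth->h_source, p->eth_src);
	if (p->flags & SKBMOD_F_ETYPE)
		eth->h_proto = p->eth_type;	/* stored in network byte order */
	if (p->flags & SKBMOD_F_SWAPMAC) {
		ether_addr_copy(tmp, eth->h_dest);
		ether_addr_copy(eth->h_dest, eth->h_source);
		ether_addr_copy(eth->h_source, tmp);
	}
}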
The dst mac address needs a re-write so that it doesn't get dropped by or confuse an interconnecting (learning) switch, or get dropped by a target machine (which looks at the dst mac). And at times, when flipping the packet back, a swap of the MAC addresses is needed. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_skbmod.h | 30 ++++ include/uapi/linux/tc_act/tc_skbmod.h | 39 +++++ net/sched/Kconfig | 11 ++ net/sched/Makefile | 1 + net/sched/act_skbmod.c | 301 ++++++++++++++++++++++++++++++++++ 5 files changed, 382 insertions(+) create mode 100644 include/net/tc_act/tc_skbmod.h create mode 100644 include/uapi/linux/tc_act/tc_skbmod.h create mode 100644 net/sched/act_skbmod.c (limited to 'include') diff --git a/include/net/tc_act/tc_skbmod.h b/include/net/tc_act/tc_skbmod.h new file mode 100644 index 000000000000..644a2116b47b --- /dev/null +++ b/include/net/tc_act/tc_skbmod.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2016, Jamal Hadi Salim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. +*/ + +#ifndef __NET_TC_SKBMOD_H +#define __NET_TC_SKBMOD_H + +#include <net/act_api.h> +#include <linux/tc_act/tc_skbmod.h> + +struct tcf_skbmod_params { + struct rcu_head rcu; + u64 flags; /*up to 64 types of operations; extend if needed */ + u8 eth_dst[ETH_ALEN]; + u16 eth_type; + u8 eth_src[ETH_ALEN]; +}; + +struct tcf_skbmod { + struct tc_action common; + struct tcf_skbmod_params __rcu *skbmod_p; +}; +#define to_skbmod(a) ((struct tcf_skbmod *)a) + +#endif /* __NET_TC_SKBMOD_H */ diff --git a/include/uapi/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h new file mode 100644 index 000000000000..10fc07da6c69 --- /dev/null +++ b/include/uapi/linux/tc_act/tc_skbmod.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, Jamal Hadi Salim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. +*/ + +#ifndef __LINUX_TC_SKBMOD_H +#define __LINUX_TC_SKBMOD_H + +#include <linux/pkt_cls.h> + +#define TCA_ACT_SKBMOD 15 + +#define SKBMOD_F_DMAC 0x1 +#define SKBMOD_F_SMAC 0x2 +#define SKBMOD_F_ETYPE 0x4 +#define SKBMOD_F_SWAPMAC 0x8 + +struct tc_skbmod { + tc_gen; + __u64 flags; +}; + +enum { + TCA_SKBMOD_UNSPEC, + TCA_SKBMOD_TM, + TCA_SKBMOD_PARMS, + TCA_SKBMOD_DMAC, + TCA_SKBMOD_SMAC, + TCA_SKBMOD_ETYPE, + TCA_SKBMOD_PAD, + __TCA_SKBMOD_MAX +}; +#define TCA_SKBMOD_MAX (__TCA_SKBMOD_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 72e3426fa48f..7795d5a3f79a 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -749,6 +749,17 @@ config NET_ACT_CONNMARK To compile this code as a module, choose M here: the module will be called act_connmark. +config NET_ACT_SKBMOD + tristate "skb data modification action" + depends on NET_CLS_ACT + ---help--- + Say Y here to allow modification of skb data + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_skbmod.
+ config NET_ACT_IFE tristate "Inter-FE action based on IETF ForCES InterFE LFB" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index b9d046b9535a..148ae0d5ac2c 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o +obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c new file mode 100644 index 000000000000..e7d96381c908 --- /dev/null +++ b/net/sched/act_skbmod.c @@ -0,0 +1,301 @@ +/* + * net/sched/act_skbmod.c skb data modifier + * + * Copyright (c) 2016 Jamal Hadi Salim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SKBMOD_TAB_MASK 15 + +static int skbmod_net_id; +static struct tc_action_ops act_skbmod_ops; + +#define MAX_EDIT_LEN ETH_HLEN +static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_skbmod *d = to_skbmod(a); + int action; + struct tcf_skbmod_params *p; + u64 flags; + int err; + + tcf_lastuse_update(&d->tcf_tm); + bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + + /* XXX: if you are going to edit more fields beyond ethernet header + * (example when you add IP header replacement or vlan swap) + * then MAX_EDIT_LEN needs to change appropriately + */ + err = skb_ensure_writable(skb, MAX_EDIT_LEN); + if (unlikely(err)) { /* best policy is to drop on the floor */ + qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); + return TC_ACT_SHOT; + } + + rcu_read_lock(); + action = READ_ONCE(d->tcf_action); + if (unlikely(action == TC_ACT_SHOT)) { + qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); + rcu_read_unlock(); + return action; + } + + p = rcu_dereference(d->skbmod_p); + flags = p->flags; + if (flags & SKBMOD_F_DMAC) + ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst); + if (flags & SKBMOD_F_SMAC) + ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src); + if (flags & SKBMOD_F_ETYPE) + eth_hdr(skb)->h_proto = p->eth_type; + rcu_read_unlock(); + + if (flags & SKBMOD_F_SWAPMAC) { + u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */ + /*XXX: I am sure we can come up with more efficient swapping*/ + ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest); + ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source); + ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr); + } + + return action; +} + +static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { + [TCA_SKBMOD_PARMS] = { .len = sizeof(struct tc_skbmod) }, + [TCA_SKBMOD_DMAC] = { .len = ETH_ALEN }, + [TCA_SKBMOD_SMAC] = { .len = ETH_ALEN }, + [TCA_SKBMOD_ETYPE] = { .type = NLA_U16 }, +}; + +static int tcf_skbmod_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + struct nlattr *tb[TCA_SKBMOD_MAX + 1]; + struct tcf_skbmod_params *p, *p_old; + struct tc_skbmod *parm; + struct 
tcf_skbmod *d; + bool exists = false; + u8 *daddr = NULL; + u8 *saddr = NULL; + u16 eth_type = 0; + u32 lflags = 0; + int ret = 0, err; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy); + if (err < 0) + return err; + + if (!tb[TCA_SKBMOD_PARMS]) + return -EINVAL; + + if (tb[TCA_SKBMOD_DMAC]) { + daddr = nla_data(tb[TCA_SKBMOD_DMAC]); + lflags |= SKBMOD_F_DMAC; + } + + if (tb[TCA_SKBMOD_SMAC]) { + saddr = nla_data(tb[TCA_SKBMOD_SMAC]); + lflags |= SKBMOD_F_SMAC; + } + + if (tb[TCA_SKBMOD_ETYPE]) { + eth_type = nla_get_u16(tb[TCA_SKBMOD_ETYPE]); + lflags |= SKBMOD_F_ETYPE; + } + + parm = nla_data(tb[TCA_SKBMOD_PARMS]); + if (parm->flags & SKBMOD_F_SWAPMAC) + lflags = SKBMOD_F_SWAPMAC; + + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + if (!lflags) + return -EINVAL; + + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + &act_skbmod_ops, bind, true); + if (ret) + return ret; + + ret = ACT_P_CREATED; + } else { + tcf_hash_release(*a, bind); + if (!ovr) + return -EEXIST; + } + + d = to_skbmod(*a); + + ASSERT_RTNL(); + p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); + if (unlikely(!p)) { + if (ovr) + tcf_hash_release(*a, bind); + return -ENOMEM; + } + + p->flags = lflags; + d->tcf_action = parm->action; + + p_old = rtnl_dereference(d->skbmod_p); + + if (ovr) + spin_lock_bh(&d->tcf_lock); + + if (lflags & SKBMOD_F_DMAC) + ether_addr_copy(p->eth_dst, daddr); + if (lflags & SKBMOD_F_SMAC) + ether_addr_copy(p->eth_src, saddr); + if (lflags & SKBMOD_F_ETYPE) + p->eth_type = htons(eth_type); + + rcu_assign_pointer(d->skbmod_p, p); + if (ovr) + spin_unlock_bh(&d->tcf_lock); + + if (p_old) + kfree_rcu(p_old, rcu); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, *a); + return ret; +} + +static void tcf_skbmod_cleanup(struct tc_action *a, int bind) +{ + struct tcf_skbmod *d = to_skbmod(a); + struct tcf_skbmod_params *p; + + p = rcu_dereference_protected(d->skbmod_p, 1); + kfree_rcu(p, rcu); +} + +static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + struct tcf_skbmod *d = to_skbmod(a); + unsigned char *b = skb_tail_pointer(skb); + struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); + struct tc_skbmod opt = { + .index = d->tcf_index, + .refcnt = d->tcf_refcnt - ref, + .bindcnt = d->tcf_bindcnt - bind, + .action = d->tcf_action, + }; + struct tcf_t t; + + opt.flags = p->flags; + if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if ((p->flags & SKBMOD_F_DMAC) && + nla_put(skb, TCA_SKBMOD_DMAC, ETH_ALEN, p->eth_dst)) + goto nla_put_failure; + if ((p->flags & SKBMOD_F_SMAC) && + nla_put(skb, TCA_SKBMOD_SMAC, ETH_ALEN, p->eth_src)) + goto nla_put_failure; + if ((p->flags & SKBMOD_F_ETYPE) && + nla_put_u16(skb, TCA_SKBMOD_ETYPE, ntohs(p->eth_type))) + goto nla_put_failure; + + tcf_tm_dump(&t, &d->tcf_tm); + if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) + goto nla_put_failure; + + return skb->len; +nla_put_failure: + rcu_read_unlock(); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops); +} + +static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return 
tcf_hash_search(tn, a, index); +} + +static struct tc_action_ops act_skbmod_ops = { + .kind = "skbmod", + .type = TCA_ACT_SKBMOD, + .owner = THIS_MODULE, + .act = tcf_skbmod_run, + .dump = tcf_skbmod_dump, + .init = tcf_skbmod_init, + .cleanup = tcf_skbmod_cleanup, + .walk = tcf_skbmod_walker, + .lookup = tcf_skbmod_search, + .size = sizeof(struct tcf_skbmod), +}; + +static __net_init int skbmod_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tc_action_net_init(tn, &act_skbmod_ops, SKBMOD_TAB_MASK); +} + +static void __net_exit skbmod_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations skbmod_net_ops = { + .init = skbmod_init_net, + .exit = skbmod_exit_net, + .id = &skbmod_net_id, + .size = sizeof(struct tc_action_net), +}; + +MODULE_AUTHOR("Jamal Hadi Salim, "); +MODULE_DESCRIPTION("SKB data mod-ing"); +MODULE_LICENSE("GPL"); + +static int __init skbmod_init_module(void) +{ + return tcf_register_action(&act_skbmod_ops, &skbmod_net_ops); +} + +static void __exit skbmod_cleanup_module(void) +{ + tcf_unregister_action(&act_skbmod_ops, &skbmod_net_ops); +} + +module_init(skbmod_init_module); +module_exit(skbmod_cleanup_module); -- cgit v1.2.3 From aa72d708373dacfa690960b336543b867784b350 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 15 Sep 2016 15:28:22 +0300 Subject: net/sched: cls_flower: Support masking for matching on tcp/udp ports Add the definitions for src/dst udp/tcp port masks and use them when setting && dumping the relevant keys. Signed-off-by: Or Gerlitz Signed-off-by: Paul Blakey Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 4 ++++ net/sched/cls_flower.c | 20 ++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index f9c287c67eae..60ea2a084880 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -442,6 +442,10 @@ enum { TCA_FLOWER_KEY_ENC_IPV6_DST, /* struct in6_addr */ TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_TCP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_DST_MASK, /* be16 */ __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b084b2aab2d7..027523c82797 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -335,6 +335,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_TCP_SRC_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -432,17 +436,17 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (key->basic.ip_proto == IPPROTO_TCP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK, sizeof(key->tp.src)); fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, 
sizeof(key->tp.dst)); } else if (key->basic.ip_proto == IPPROTO_UDP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, sizeof(key->tp.src)); fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, sizeof(key->tp.dst)); } @@ -877,18 +881,18 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (key->basic.ip_proto == IPPROTO_TCP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; else if (key->basic.ip_proto == IPPROTO_UDP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; -- cgit v1.2.3 From 37a6c1512314d2439ef7136d773d5a470e0996b9 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 15 Sep 2016 15:28:24 +0300 Subject: net/sched: cls_flower: Specify vlan attributes format in the UAPI header Specify the format (size and endianness) for the vlan attributes. Signed-off-by: Or Gerlitz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 60ea2a084880..8915b61bbf83 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -428,9 +428,9 @@ enum { TCA_FLOWER_KEY_UDP_DST, /* be16 */ TCA_FLOWER_FLAGS, - TCA_FLOWER_KEY_VLAN_ID, - TCA_FLOWER_KEY_VLAN_PRIO, - TCA_FLOWER_KEY_VLAN_ETH_TYPE, + TCA_FLOWER_KEY_VLAN_ID, /* be16 */ + TCA_FLOWER_KEY_VLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */ TCA_FLOWER_KEY_ENC_KEY_ID, /* be32 */ TCA_FLOWER_KEY_ENC_IPV4_SRC, /* be32 */ -- cgit v1.2.3 From cafdc45c949b9963cbfb8fe3a68d0ab16b0208ce Mon Sep 17 00:00:00 2001 From: John Crispin Date: Thu, 15 Sep 2016 16:26:40 +0200 Subject: net-next: dsa: add Qualcomm tag RX/TX handler Add support for the 2-byte Qualcomm tag that gigabit switches such as the QCA8337/N might insert when receiving packets, or that we need to insert while targeting specific switch ports. The tag is inserted directly behind the ethernet header. Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: John Crispin Signed-off-by: David S.
Miller --- include/net/dsa.h | 1 + net/dsa/Kconfig | 3 ++ net/dsa/Makefile | 1 + net/dsa/dsa.c | 3 ++ net/dsa/dsa_priv.h | 2 + net/dsa/tag_qca.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 148 insertions(+) create mode 100644 net/dsa/tag_qca.c (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9d97c5214341..7556646db2d3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -26,6 +26,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_TRAILER, DSA_TAG_PROTO_EDSA, DSA_TAG_PROTO_BRCM, + DSA_TAG_PROTO_QCA, DSA_TAG_LAST, /* MUST BE LAST */ }; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index ff7736f7ff42..96e47c539bee 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -38,4 +38,7 @@ config NET_DSA_TAG_EDSA config NET_DSA_TAG_TRAILER bool +config NET_DSA_TAG_QCA + bool + endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 8af4ded70f1c..a3380ed0e0be 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -7,3 +7,4 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o +dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index d8d267e9a872..66e31acfcad8 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -53,6 +53,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #endif #ifdef CONFIG_NET_DSA_TAG_BRCM [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_QCA + [DSA_TAG_PROTO_QCA] = &qca_netdev_ops, #endif [DSA_TAG_PROTO_NONE] = &none_ops, }; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 00077a9c97f4..6cfd7388834e 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -81,5 +81,7 @@ extern const struct dsa_device_ops trailer_netdev_ops; /* tag_brcm.c */ extern const struct dsa_device_ops brcm_netdev_ops; +/* tag_qca.c */ +extern const struct dsa_device_ops qca_netdev_ops; #endif diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c new file mode 100644 index 000000000000..0c90cacee7aa --- /dev/null +++ b/net/dsa/tag_qca.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include "dsa_priv.h" + +#define QCA_HDR_LEN 2 +#define QCA_HDR_VERSION 0x2 + +#define QCA_HDR_RECV_VERSION_MASK GENMASK(15, 14) +#define QCA_HDR_RECV_VERSION_S 14 +#define QCA_HDR_RECV_PRIORITY_MASK GENMASK(13, 11) +#define QCA_HDR_RECV_PRIORITY_S 11 +#define QCA_HDR_RECV_TYPE_MASK GENMASK(10, 6) +#define QCA_HDR_RECV_TYPE_S 6 +#define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3) +#define QCA_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0) + +#define QCA_HDR_XMIT_VERSION_MASK GENMASK(15, 14) +#define QCA_HDR_XMIT_VERSION_S 14 +#define QCA_HDR_XMIT_PRIORITY_MASK GENMASK(13, 11) +#define QCA_HDR_XMIT_PRIORITY_S 11 +#define QCA_HDR_XMIT_CONTROL_MASK GENMASK(10, 8) +#define QCA_HDR_XMIT_CONTROL_S 8 +#define QCA_HDR_XMIT_FROM_CPU BIT(7) +#define QCA_HDR_XMIT_DP_BIT_MASK GENMASK(6, 0) + +static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + u16 *phdr, hdr; + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + if (skb_cow_head(skb, 0) < 0) + goto out_free; + + skb_push(skb, QCA_HDR_LEN); + + memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN); + phdr = (u16 *)(skb->data + 2 * ETH_ALEN); + + /* Set the version field, and set destination port information */ + hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S | + QCA_HDR_XMIT_FROM_CPU | + BIT(p->port); + + *phdr = htons(hdr); + + return skb; + +out_free: + kfree_skb(skb); + return NULL; +} + +static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + u8 ver; + int port; + __be16 *phdr, hdr; + + if (unlikely(!dst)) + goto out_drop; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; + + if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) + goto out_drop; + + /* The QCA header is added by the switch between src addr and Ethertype + * At this point, skb->data points to ethertype so header should be + * right before + */ + phdr = (__be16 *)(skb->data - 2); + hdr = ntohs(*phdr); + + /* Make sure the version is correct */ + ver = (hdr & QCA_HDR_RECV_VERSION_MASK) >> QCA_HDR_RECV_VERSION_S; + if (unlikely(ver != QCA_HDR_VERSION)) + goto out_drop; + + /* Remove QCA tag and recalculate checksum */ + skb_pull_rcsum(skb, QCA_HDR_LEN); + memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN, + ETH_HLEN - QCA_HDR_LEN); + + /* This protocol doesn't support cascading multiple switches so it's + * safe to assume the switch is first in the tree + */ + ds = dst->ds[0]; + if (!ds) + goto out_drop; + + /* Get source port information */ + port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK); + if (!ds->ports[port].netdev) + goto out_drop; + + /* Update skb & forward the frame accordingly */ + skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->dev = ds->ports[port].netdev; + skb->protocol = eth_type_trans(skb, skb->dev); + + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; + + netif_receive_skb(skb); + + return 0; + +out_drop: + kfree_skb(skb); +out: + return 0; +} + +const struct dsa_device_ops qca_netdev_ops = { + .xmit = qca_tag_xmit, + .rcv = qca_tag_rcv, +}; -- cgit v1.2.3 From fbd05e4a6e82fd573d3aa79e284e424b8d78c149 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Thu, 15 Sep 2016 18:15:09 +0300 Subject: cfg80211: add helper to find an IE that matches a byte-array There are a few places where an IE that matches not only the EID, but also other bytes inside the element, needs to be 
found. To simplify that and reduce the amount of similar code, implement a new helper function to match the EID and an extra array of bytes. Additionally, simplify cfg80211_find_vendor_ie() by using the new match function. Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 33 +++++++++++++++++++++++++++- net/wireless/scan.c | 58 +++++++++++++++++++++++--------------------------- 2 files changed, 59 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d5e7f690bad9..533cb6410678 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3946,6 +3946,34 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, unsigned int cfg80211_classify8021d(struct sk_buff *skb, struct cfg80211_qos_map *qos_map); +/** + * cfg80211_find_ie_match - match information element and byte array in data + * + * @eid: element ID + * @ies: data consisting of IEs + * @len: length of data + * @match: byte array to match + * @match_len: number of bytes in the match array + * @match_offset: offset in the IE where the byte array should match. + * If match_len is zero, this must also be set to zero. + * Otherwise this must be set to 2 or more, because the first + * byte is the element id, which is already compared to eid, and + * the second byte is the IE length. + * + * Return: %NULL if the element ID could not be found or if + * the element is invalid (claims to be longer than the given + * data) or if the byte array doesn't match, or a pointer to the first + * byte of the requested element, that is the byte containing the + * element ID. + * + * Note: There are no checks on the element length other than + * having to fit into the given data and being large enough for the + * byte array to match. + */ +const u8 *cfg80211_find_ie_match(u8 eid, const u8 *ies, int len, + const u8 *match, int match_len, + int match_offset); + /** * cfg80211_find_ie - find information element in data * @@ -3961,7 +3989,10 @@ unsigned int cfg80211_classify8021d(struct sk_buff *skb, * Note: There are no checks on the element length other than * having to fit into the given data. */ -const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len); +static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) +{ + return cfg80211_find_ie_match(eid, ies, len, NULL, 0, 0); +} /** * cfg80211_find_vendor_ie - find vendor specific information element in data diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 0358e12be54b..b5bd58d0f731 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -352,52 +352,48 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *rdev) __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE); } -const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) +const u8 *cfg80211_find_ie_match(u8 eid, const u8 *ies, int len, + const u8 *match, int match_len, + int match_offset) { - while (len > 2 && ies[0] != eid) { + /* match_offset can't be smaller than 2, unless match_len is + * zero, in which case match_offset must be zero as well. 
+ */ + if (WARN_ON((match_len && match_offset < 2) || + (!match_len && match_offset))) + return NULL; + + while (len >= 2 && len >= ies[1] + 2) { + if ((ies[0] == eid) && + (ies[1] + 2 >= match_offset + match_len) && + !memcmp(ies + match_offset, match, match_len)) + return ies; + len -= ies[1] + 2; ies += ies[1] + 2; } - if (len < 2) - return NULL; - if (len < 2 + ies[1]) - return NULL; - return ies; + + return NULL; } -EXPORT_SYMBOL(cfg80211_find_ie); +EXPORT_SYMBOL(cfg80211_find_ie_match); const u8 *cfg80211_find_vendor_ie(unsigned int oui, int oui_type, const u8 *ies, int len) { - struct ieee80211_vendor_ie *ie; - const u8 *pos = ies, *end = ies + len; - int ie_oui; + const u8 *ie; + u8 match[] = { oui >> 16, oui >> 8, oui, oui_type }; + int match_len = (oui_type < 0) ? 3 : sizeof(match); if (WARN_ON(oui_type > 0xff)) return NULL; - while (pos < end) { - pos = cfg80211_find_ie(WLAN_EID_VENDOR_SPECIFIC, pos, - end - pos); - if (!pos) - return NULL; - - ie = (struct ieee80211_vendor_ie *)pos; - - /* make sure we can access ie->len */ - BUILD_BUG_ON(offsetof(struct ieee80211_vendor_ie, len) != 1); + ie = cfg80211_find_ie_match(WLAN_EID_VENDOR_SPECIFIC, ies, len, + match, match_len, 2); - if (ie->len < sizeof(*ie)) - goto cont; + if (ie && (ie[1] < 4)) + return NULL; - ie_oui = ie->oui[0] << 16 | ie->oui[1] << 8 | ie->oui[2]; - if (ie_oui == oui && - (oui_type < 0 || ie->oui_type == oui_type)) - return pos; -cont: - pos += 2 + ie->len; - } - return NULL; + return ie; } EXPORT_SYMBOL(cfg80211_find_vendor_ie); -- cgit v1.2.3 From a3868bfc8d5b0f36c784deab644ee1d2b0e6974b Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Print the packet type name in the Rx packet trace Print a symbolic packet type name for each valid received packet in the trace output, not just a number. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 5 +++-- net/rxrpc/ar-internal.h | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index ea3b10ed91a8..0a30c673509c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -93,11 +93,12 @@ TRACE_EVENT(rxrpc_rx_packet, memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); ), - TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x", + TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s", __entry->hdr.epoch, __entry->hdr.cid, __entry->hdr.callNumber, __entry->hdr.serviceId, __entry->hdr.serial, __entry->hdr.seq, - __entry->hdr.type, __entry->hdr.flags) + __entry->hdr.type, __entry->hdr.flags, + __entry->hdr.type <= 15 ? 
rxrpc_pkts[__entry->hdr.type] : "?UNK") ); TRACE_EVENT(rxrpc_rx_done, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e78c40b37db5..0f6fafa2c271 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -551,6 +551,9 @@ enum rxrpc_call_trace { extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; +extern const char *const rxrpc_pkts[]; +extern const char *rxrpc_acks(u8 reason); + #include <trace/events/rxrpc.h> /* @@ -851,11 +854,8 @@ extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; extern unsigned int rxrpc_resend_timeout; -extern const char *const rxrpc_pkts[]; extern const s8 rxrpc_ack_priority[]; -extern const char *rxrpc_acks(u8 reason); - /* * output.c */ -- cgit v1.2.3 From 363deeab6d0f308d33d011323661ae9cf5f9f8d6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:14 +0100 Subject: rxrpc: Add connection tracepoint and client conn state tracepoint Add a pair of tracepoints, one to track rxrpc_connection struct ref counting and the other to track the client connection cache state. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 60 ++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 76 +++++++++++++++++++++++++--------------- net/rxrpc/call_accept.c | 4 +++ net/rxrpc/call_object.c | 2 -- net/rxrpc/conn_client.c | 82 ++++++++++++++++++++++++++++++-------------- net/rxrpc/conn_event.c | 2 +- net/rxrpc/conn_object.c | 72 ++++++++++++++++++++++++++++++++++++-- net/rxrpc/conn_service.c | 4 +++ net/rxrpc/misc.c | 31 +++++++++++++++++ 9 files changed, 274 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 0a30c673509c..c0c496c83f31 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -16,6 +16,66 @@ #include <linux/tracepoint.h> +TRACE_EVENT(rxrpc_conn, + TP_PROTO(struct rxrpc_connection *conn, enum rxrpc_conn_trace op, + int usage, const void *where), + + TP_ARGS(conn, op, usage, where), + + TP_STRUCT__entry( + __field(struct rxrpc_connection *, conn ) + __field(int, op ) + __field(int, usage ) + __field(const void *, where ) + ), + + TP_fast_assign( + __entry->conn = conn; + __entry->op = op; + __entry->usage = usage; + __entry->where = where; + ), + + TP_printk("C=%p %s u=%d sp=%pSR", + __entry->conn, + rxrpc_conn_traces[__entry->op], + __entry->usage, + __entry->where) + ); + +TRACE_EVENT(rxrpc_client, + TP_PROTO(struct rxrpc_connection *conn, int channel, + enum rxrpc_client_trace op), + + TP_ARGS(conn, channel, op), + + TP_STRUCT__entry( + __field(struct rxrpc_connection *, conn ) + __field(u32, cid ) + __field(int, channel ) + __field(int, usage ) + __field(enum rxrpc_client_trace, op ) + __field(enum rxrpc_conn_cache_state, cs ) + ), + + TP_fast_assign( + __entry->conn = conn; + __entry->channel = channel; + __entry->usage = atomic_read(&conn->usage); + __entry->op = op; + __entry->cid = conn->proto.cid; + __entry->cs = conn->cache_state; + ), + + TP_printk("C=%p h=%2d %s %s i=%08x u=%d", + __entry->conn, + __entry->channel, + rxrpc_client_traces[__entry->op], + rxrpc_conn_cache_states[__entry->cs], + __entry->cid, + __entry->usage) + ); + TRACE_EVENT(rxrpc_call, TP_PROTO(struct rxrpc_call *call, enum rxrpc_call_trace op, int usage, const void *where, const void *aux), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 4a73c20d9436..6ca40eea3022 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -314,6 +314,7 @@ enum rxrpc_conn_cache_state { RXRPC_CONN_CLIENT_ACTIVE, /* Conn
is on active list, doing calls */ RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */ RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */ + RXRPC_CONN__NR_CACHE_STATES }; /* @@ -533,6 +534,44 @@ struct rxrpc_call { rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ }; +enum rxrpc_conn_trace { + rxrpc_conn_new_client, + rxrpc_conn_new_service, + rxrpc_conn_queued, + rxrpc_conn_seen, + rxrpc_conn_got, + rxrpc_conn_put_client, + rxrpc_conn_put_service, + rxrpc_conn__nr_trace +}; + +extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4]; + +enum rxrpc_client_trace { + rxrpc_client_activate_chans, + rxrpc_client_alloc, + rxrpc_client_chan_activate, + rxrpc_client_chan_disconnect, + rxrpc_client_chan_pass, + rxrpc_client_chan_unstarted, + rxrpc_client_cleanup, + rxrpc_client_count, + rxrpc_client_discard, + rxrpc_client_duplicate, + rxrpc_client_exposed, + rxrpc_client_replace, + rxrpc_client_to_active, + rxrpc_client_to_culled, + rxrpc_client_to_idle, + rxrpc_client_to_inactive, + rxrpc_client_to_waiting, + rxrpc_client_uncount, + rxrpc_client__nr_trace +}; + +extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7]; +extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5]; + enum rxrpc_call_trace { rxrpc_call_new_client, rxrpc_call_new_service, @@ -734,7 +773,11 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_connection(struct rxrpc_connection *); -void __rxrpc_put_connection(struct rxrpc_connection *); +bool rxrpc_queue_conn(struct rxrpc_connection *); +void rxrpc_see_connection(struct rxrpc_connection *); +void rxrpc_get_connection(struct rxrpc_connection *); +struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *); +void rxrpc_put_service_conn(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) @@ -747,38 +790,15 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) return !rxrpc_conn_is_client(conn); } -static inline void rxrpc_get_connection(struct rxrpc_connection *conn) -{ - atomic_inc(&conn->usage); -} - -static inline -struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn) -{ - return atomic_inc_not_zero(&conn->usage) ? 
conn : NULL; -} - static inline void rxrpc_put_connection(struct rxrpc_connection *conn) { if (!conn) return; - if (rxrpc_conn_is_client(conn)) { - if (atomic_dec_and_test(&conn->usage)) - rxrpc_put_client_conn(conn); - } else { - if (atomic_dec_return(&conn->usage) == 1) - __rxrpc_put_connection(conn); - } -} - -static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) -{ - if (!rxrpc_get_connection_maybe(conn)) - return false; - if (!rxrpc_queue_work(&conn->processor)) - rxrpc_put_connection(conn); - return true; + if (rxrpc_conn_is_client(conn)) + rxrpc_put_client_conn(conn); + else + rxrpc_put_service_conn(conn); } /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 323b8da50163..3e474508ba75 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -85,6 +85,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, b->conn_backlog[head] = conn; smp_store_release(&b->conn_backlog_head, (head + 1) & (size - 1)); + + trace_rxrpc_conn(conn, rxrpc_conn_new_service, + atomic_read(&conn->usage), here); } /* Now it gets complicated, because calls get registered with the @@ -290,6 +293,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, rxrpc_get_local(local); conn->params.local = local; conn->params.peer = peer; + rxrpc_see_connection(conn); rxrpc_new_incoming_connection(conn, skb); } else { rxrpc_get_connection(conn); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 0df9d1af8edb..54f30482a7fd 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -479,8 +479,6 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) struct rxrpc_call, accept_link); list_del(&call->accept_link); rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); - rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 226bc910e556..c76a125df891 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -105,6 +105,14 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *); static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap, rxrpc_discard_expired_client_conns); +const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = { + [RXRPC_CONN_CLIENT_INACTIVE] = "Inac", + [RXRPC_CONN_CLIENT_WAITING] = "Wait", + [RXRPC_CONN_CLIENT_ACTIVE] = "Actv", + [RXRPC_CONN_CLIENT_CULLED] = "Cull", + [RXRPC_CONN_CLIENT_IDLE] = "Idle", +}; + /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. 
The @@ -220,6 +228,9 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) rxrpc_get_local(conn->params.local); key_get(conn->params.key); + trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage), + __builtin_return_address(0)); + trace_rxrpc_client(conn, -1, rxrpc_client_alloc); _leave(" = %p", conn); return conn; @@ -385,6 +396,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call, rb_replace_node(&conn->client_node, &candidate->client_node, &local->client_conns); + trace_rxrpc_client(conn, -1, rxrpc_client_replace); goto candidate_published; } } @@ -409,8 +421,11 @@ found_extant_conn: _debug("found conn"); spin_unlock(&local->client_conns_lock); - rxrpc_put_connection(candidate); - candidate = NULL; + if (candidate) { + trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate); + rxrpc_put_connection(candidate); + candidate = NULL; + } spin_lock(&conn->channel_lock); call->conn = conn; @@ -433,6 +448,7 @@ error: */ static void rxrpc_activate_conn(struct rxrpc_connection *conn) { + trace_rxrpc_client(conn, -1, rxrpc_client_to_active); conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE; rxrpc_nr_active_client_conns++; list_move_tail(&conn->cache_link, &rxrpc_active_client_conns); @@ -462,8 +478,10 @@ static void rxrpc_animate_client_conn(struct rxrpc_connection *conn) spin_lock(&rxrpc_client_conn_cache_lock); nr_conns = rxrpc_nr_client_conns; - if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) + if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) { + trace_rxrpc_client(conn, -1, rxrpc_client_count); rxrpc_nr_client_conns = nr_conns + 1; + } switch (conn->cache_state) { case RXRPC_CONN_CLIENT_ACTIVE: @@ -494,6 +512,7 @@ activate_conn: wait_for_capacity: _debug("wait"); + trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); conn->cache_state = RXRPC_CONN_CLIENT_WAITING; list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns); goto out_unlock; @@ -524,6 +543,8 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, struct rxrpc_call, chan_wait_link); u32 call_id = chan->call_counter + 1; + trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); + write_lock_bh(&call->state_lock); call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; write_unlock_bh(&call->state_lock); @@ -563,6 +584,8 @@ static void rxrpc_activate_channels(struct rxrpc_connection *conn) _enter("%d", conn->debug_id); + trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans); + if (conn->cache_state != RXRPC_CONN_CLIENT_ACTIVE || conn->active_chans == RXRPC_ACTIVE_CHANS_MASK) return; @@ -657,10 +680,13 @@ int rxrpc_connect_call(struct rxrpc_call *call, * had a chance at re-use (the per-connection security negotiation is * expensive). 
*/ -static void rxrpc_expose_client_conn(struct rxrpc_connection *conn) +static void rxrpc_expose_client_conn(struct rxrpc_connection *conn, + unsigned int channel) { - if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) + if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { + trace_rxrpc_client(conn, channel, rxrpc_client_exposed); rxrpc_get_connection(conn); + } } /* @@ -669,9 +695,9 @@ static void rxrpc_expose_client_conn(struct rxrpc_connection *conn) */ void rxrpc_expose_client_call(struct rxrpc_call *call) { + unsigned int channel = call->cid & RXRPC_CHANNELMASK; struct rxrpc_connection *conn = call->conn; - struct rxrpc_channel *chan = - &conn->channels[call->cid & RXRPC_CHANNELMASK]; + struct rxrpc_channel *chan = &conn->channels[channel]; if (!test_and_set_bit(RXRPC_CALL_EXPOSED, &call->flags)) { /* Mark the call ID as being used. If the callNumber counter @@ -682,7 +708,7 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) chan->call_counter++; if (chan->call_counter >= INT_MAX) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - rxrpc_expose_client_conn(conn); + rxrpc_expose_client_conn(conn, channel); } } @@ -695,6 +721,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) struct rxrpc_connection *conn = call->conn; struct rxrpc_channel *chan = &conn->channels[channel]; + trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); call->conn = NULL; spin_lock(&conn->channel_lock); @@ -709,6 +736,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); list_del_init(&call->chan_wait_link); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_unstarted); + /* We must deactivate or idle the connection if it's now * waiting for nothing. */ @@ -739,7 +768,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) /* See if we can pass the channel directly to another call. */ if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE && !list_empty(&conn->waiting_calls)) { - _debug("pass chan"); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); goto out_2; } @@ -762,7 +791,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) goto out; } - _debug("pass chan 2"); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); goto out; @@ -794,7 +823,7 @@ idle_connection: * immediately or moved to the idle list for a short while. 
*/ if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { - _debug("make idle"); + trace_rxrpc_client(conn, channel, rxrpc_client_to_idle); conn->idle_timestamp = jiffies; conn->cache_state = RXRPC_CONN_CLIENT_IDLE; list_move_tail(&conn->cache_link, &rxrpc_idle_client_conns); @@ -804,7 +833,7 @@ idle_connection: &rxrpc_client_conn_reap, rxrpc_conn_idle_client_expiry); } else { - _debug("make inactive"); + trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive); conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; list_del_init(&conn->cache_link); } @@ -821,6 +850,8 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) struct rxrpc_local *local = conn->params.local; unsigned int nr_conns; + trace_rxrpc_client(conn, -1, rxrpc_client_cleanup); + if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) { spin_lock(&local->client_conns_lock); if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, @@ -834,6 +865,7 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE); if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) { + trace_rxrpc_client(conn, -1, rxrpc_client_uncount); spin_lock(&rxrpc_client_conn_cache_lock); nr_conns = --rxrpc_nr_client_conns; @@ -863,20 +895,18 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) */ void rxrpc_put_client_conn(struct rxrpc_connection *conn) { - struct rxrpc_connection *next; + const void *here = __builtin_return_address(0); + int n; do { - _enter("%p{u=%d,d=%d}", - conn, atomic_read(&conn->usage), conn->debug_id); - - next = rxrpc_put_one_client_conn(conn); - - if (!next) - break; - conn = next; - } while (atomic_dec_and_test(&conn->usage)); - - _leave(""); + n = atomic_dec_return(&conn->usage); + trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here); + if (n > 0) + return; + ASSERTCMP(n, >=, 0); + + conn = rxrpc_put_one_client_conn(conn); + } while (conn); } /* @@ -907,9 +937,11 @@ static void rxrpc_cull_active_client_conns(void) ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_ACTIVE); if (list_empty(&conn->waiting_calls)) { + trace_rxrpc_client(conn, -1, rxrpc_client_to_culled); conn->cache_state = RXRPC_CONN_CLIENT_CULLED; list_del_init(&conn->cache_link); } else { + trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); conn->cache_state = RXRPC_CONN_CLIENT_WAITING; list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns); @@ -983,7 +1015,7 @@ next: goto not_yet_expired; } - _debug("discard conn %d", conn->debug_id); + trace_rxrpc_client(conn, -1, rxrpc_client_discard); if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags)) BUG(); conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 0691007cfc02..a43f4c94a88d 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -377,7 +377,7 @@ void rxrpc_process_connection(struct work_struct *work) u32 abort_code = RX_PROTOCOL_ERROR; int ret; - _enter("{%d}", conn->debug_id); + rxrpc_see_connection(conn); if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) rxrpc_secure_connection(conn); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index bb1f29280aea..3b55aee0c436 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -246,11 +246,77 @@ void rxrpc_kill_connection(struct rxrpc_connection *conn) } /* - * release a virtual connection + * Queue a connection's work processor, getting a ref to pass to the work + * queue. 
*/ -void __rxrpc_put_connection(struct rxrpc_connection *conn) +bool rxrpc_queue_conn(struct rxrpc_connection *conn) { - rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + const void *here = __builtin_return_address(0); + int n = __atomic_add_unless(&conn->usage, 1, 0); + if (n == 0) + return false; + if (rxrpc_queue_work(&conn->processor)) + trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here); + else + rxrpc_put_connection(conn); + return true; +} + +/* + * Note the re-emergence of a connection. + */ +void rxrpc_see_connection(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + if (conn) { + int n = atomic_read(&conn->usage); + + trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here); + } +} + +/* + * Get a ref on a connection. + */ +void rxrpc_get_connection(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(&conn->usage); + + trace_rxrpc_conn(conn, rxrpc_conn_got, n, here); +} + +/* + * Try to get a ref on a connection. + */ +struct rxrpc_connection * +rxrpc_get_connection_maybe(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + + if (conn) { + int n = __atomic_add_unless(&conn->usage, 1, 0); + if (n > 0) + trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here); + else + conn = NULL; + } + return conn; +} + +/* + * Release a service connection + */ +void rxrpc_put_service_conn(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + int n; + + n = atomic_dec_return(&conn->usage); + trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); + ASSERTCMP(n, >=, 0); + if (n == 0) + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); } /* diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 83d54da4ce8b..eef551f40dc2 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -136,6 +136,10 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) list_add_tail(&conn->link, &rxrpc_connections); list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); write_unlock(&rxrpc_connection_lock); + + trace_rxrpc_conn(conn, rxrpc_conn_new_service, + atomic_read(&conn->usage), + __builtin_return_address(0)); } return conn; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 8b910780f1ac..598064d3bdd2 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -101,3 +101,34 @@ const char *rxrpc_acks(u8 reason) reason = ARRAY_SIZE(str) - 1; return str[reason]; } + +const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = { + [rxrpc_conn_new_client] = "NWc", + [rxrpc_conn_new_service] = "NWs", + [rxrpc_conn_queued] = "QUE", + [rxrpc_conn_seen] = "SEE", + [rxrpc_conn_got] = "GOT", + [rxrpc_conn_put_client] = "PTc", + [rxrpc_conn_put_service] = "PTs", +}; + +const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = { + [rxrpc_client_activate_chans] = "Activa", + [rxrpc_client_alloc] = "Alloc ", + [rxrpc_client_chan_activate] = "ChActv", + [rxrpc_client_chan_disconnect] = "ChDisc", + [rxrpc_client_chan_pass] = "ChPass", + [rxrpc_client_chan_unstarted] = "ChUnst", + [rxrpc_client_cleanup] = "Clean ", + [rxrpc_client_count] = "Count ", + [rxrpc_client_discard] = "Discar", + [rxrpc_client_duplicate] = "Duplic", + [rxrpc_client_exposed] = "Expose", + [rxrpc_client_replace] = "Replac", + [rxrpc_client_to_active] = "->Actv", + [rxrpc_client_to_culled] = "->Cull", + [rxrpc_client_to_idle] = "->Idle", + [rxrpc_client_to_inactive] = "->Inac", + [rxrpc_client_to_waiting] = "->Wait", + [rxrpc_client_uncount] = 
"Uncoun", +}; -- cgit v1.2.3 From a124fe3ee5d82f2c9a9b8818ed5cb9f61685f1d3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to follow the life of a packet in the Tx buffer Add a tracepoint to follow the insertion of a packet into the transmit buffer, its transmission and its rotation out of the buffer. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 26 ++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 12 ++++++++++++ net/rxrpc/input.c | 2 ++ net/rxrpc/misc.c | 9 +++++++++ net/rxrpc/sendmsg.c | 9 ++++++++- 5 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index c0c496c83f31..ffc74b3e5b76 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -208,6 +208,32 @@ TRACE_EVENT(rxrpc_abort, __entry->abort_code, __entry->error, __entry->why) ); +TRACE_EVENT(rxrpc_transmit, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_transmit_trace why), + + TP_ARGS(call, why), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_transmit_trace, why ) + __field(rxrpc_seq_t, tx_hard_ack ) + __field(rxrpc_seq_t, tx_top ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->tx_hard_ack = call->tx_hard_ack; + __entry->tx_top = call->tx_top; + ), + + TP_printk("c=%p %s f=%08x n=%u", + __entry->call, + rxrpc_transmit_traces[__entry->why], + __entry->tx_hard_ack + 1, + __entry->tx_top - __entry->tx_hard_ack) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6ca40eea3022..afa5dcc05fe0 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -593,6 +593,18 @@ enum rxrpc_call_trace { extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; +enum rxrpc_transmit_trace { + rxrpc_transmit_wait, + rxrpc_transmit_queue, + rxrpc_transmit_queue_reqack, + rxrpc_transmit_queue_last, + rxrpc_transmit_rotate, + rxrpc_transmit_end, + rxrpc_transmit__nr_trace +}; + +extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c1f83d22f9b7..c7eb5104e91a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -59,6 +59,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) spin_unlock(&call->lock); + trace_rxrpc_transmit(call, rxrpc_transmit_rotate); wake_up(&call->waitq); while (list) { @@ -107,6 +108,7 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) } write_unlock(&call->state_lock); + trace_rxrpc_transmit(call, rxrpc_transmit_end); _leave(" = ok"); return true; } diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 598064d3bdd2..dca89995f03e 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -132,3 +132,12 @@ const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = { [rxrpc_client_to_waiting] = "->Wait", [rxrpc_client_uncount] = "Uncoun", }; + +const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = { + [rxrpc_transmit_wait] = "WAI", + [rxrpc_transmit_queue] = "QUE", + [rxrpc_transmit_queue_reqack] = "QRA", + [rxrpc_transmit_queue_last] = "QLS", + [rxrpc_transmit_rotate] = "ROT", + [rxrpc_transmit_end] = "END", +}; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 8bfddf4e338c..28d8f73cf11d 100644 --- a/net/rxrpc/sendmsg.c +++ 
b/net/rxrpc/sendmsg.c @@ -56,6 +56,7 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, break; } + trace_rxrpc_transmit(call, rxrpc_transmit_wait); release_sock(&rx->sk); *timeo = schedule_timeout(*timeo); lock_sock(&rx->sk); @@ -104,8 +105,14 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, smp_wmb(); call->rxtx_buffer[ix] = skb; call->tx_top = seq; - if (last) + if (last) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); + trace_rxrpc_transmit(call, rxrpc_transmit_queue_last); + } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { + trace_rxrpc_transmit(call, rxrpc_transmit_queue_reqack); + } else { + trace_rxrpc_transmit(call, rxrpc_transmit_queue); + } if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { _debug("________awaiting reply/ACK__________"); -- cgit v1.2.3 From ec71eb9ada34f8d1a58b7c35d906c59411295445 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to log received ACK packets Add a tracepoint to log information from received ACK packets. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 26 ++++++++++++++++++++++++++ net/rxrpc/input.c | 2 ++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index ffc74b3e5b76..2b19f3fa5174 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -234,6 +234,32 @@ TRACE_EVENT(rxrpc_transmit, __entry->tx_top - __entry->tx_hard_ack) ); +TRACE_EVENT(rxrpc_rx_ack, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t first, u8 reason, u8 n_acks), + + TP_ARGS(call, first, reason, n_acks), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_seq_t, first ) + __field(u8, reason ) + __field(u8, n_acks ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->first = first; + __entry->reason = reason; + __entry->n_acks = n_acks; + ), + + TP_printk("c=%p %s f=%08x n=%u", + __entry->call, + rxrpc_acks(__entry->reason), + __entry->first, + __entry->n_acks) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c7eb5104e91a..7b18ca124978 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -440,6 +440,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; + trace_rxrpc_rx_ack(call, first_soft_ack, buf.ack.reason, nr_acks); + _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", sp->hdr.serial, ntohs(buf.ack.maxSkew), -- cgit v1.2.3 From f3639df2d90bc919328c459b3c7c49ed5667a52f Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to log ACK transmission Add a tracepoint to log information about ACK transmission. 
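An aside on the name lookup both the rx_ack and tx_ack format strings rely on: rxrpc_acks() resolves the raw reason byte through a fixed table and clamps out-of-range values to a final catch-all slot rather than indexing past the end (the clamp is visible in the misc.c context above). A self-contained userspace C sketch of the same pattern; the table contents here are illustrative stand-ins:

#include <stdio.h>

/* Stand-in ACK-reason name table; the final "-?-" entry catches any
 * reason value the table does not know about. */
static const char *const ack_names[] = {
	"---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
	"IDL", "-?-",
};

static const char *ack_name(unsigned int reason)
{
	/* Clamp rather than index out of bounds, as rxrpc_acks() does. */
	if (reason >= sizeof(ack_names) / sizeof(ack_names[0]))
		reason = sizeof(ack_names) / sizeof(ack_names[0]) - 1;
	return ack_names[reason];
}

int main(void)
{
	printf("%s %s\n", ack_name(2), ack_name(200)); /* prints "DUP -?-" */
	return 0;
}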
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 30 ++++++++++++++++++++++++++++++ net/rxrpc/conn_event.c | 3 +++ net/rxrpc/output.c | 7 ++++++- 3 files changed, 39 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 2b19f3fa5174..d545d692ae22 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -260,6 +260,36 @@ TRACE_EVENT(rxrpc_rx_ack, __entry->n_acks) ); +TRACE_EVENT(rxrpc_tx_ack, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t first, + rxrpc_serial_t serial, u8 reason, u8 n_acks), + + TP_ARGS(call, first, serial, reason, n_acks), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_seq_t, first ) + __field(rxrpc_serial_t, serial ) + __field(u8, reason ) + __field(u8, n_acks ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->first = first; + __entry->serial = serial; + __entry->reason = reason; + __entry->n_acks = n_acks; + ), + + TP_printk("c=%p %s f=%08x r=%08x n=%u", + __entry->call, + rxrpc_acks(__entry->reason), + __entry->first, + __entry->serial, + __entry->n_acks) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a43f4c94a88d..9b19c51831aa 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -98,6 +98,9 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.info.rwind = htonl(rxrpc_rx_window_size); pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); len += sizeof(pkt.ack) + sizeof(pkt.info); + + trace_rxrpc_tx_ack(NULL, chan->last_seq, 0, + RXRPC_ACK_DUPLICATE, 0); break; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 0b21ed859de7..2c9daeadce87 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -38,12 +38,14 @@ struct rxrpc_pkt_buffer { static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, struct rxrpc_pkt_buffer *pkt) { + rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top, seq; int ix; u32 mtu, jmax; u8 *ackp = pkt->acks; /* Barrier against rxrpc_input_data(). */ + serial = call->ackr_serial; hard_ack = READ_ONCE(call->rx_hard_ack); top = smp_load_acquire(&call->rx_top); @@ -51,7 +53,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ack.maxSkew = htons(call->ackr_skew); pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_prev_seq); - pkt->ack.serial = htonl(call->ackr_serial); + pkt->ack.serial = htonl(serial); pkt->ack.reason = call->ackr_reason; pkt->ack.nAcks = top - hard_ack; @@ -75,6 +77,9 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ackinfo.rwind = htonl(call->rx_winsize); pkt->ackinfo.jumbo_max = htonl(jmax); + trace_rxrpc_tx_ack(call, hard_ack + 1, serial, call->ackr_reason, + top - hard_ack); + *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; -- cgit v1.2.3 From 58dc63c998ea3c5a27e2bf9251eddbf0977056a6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to follow packets in the Rx buffer Add a tracepoint to follow the life of packets that get added to a call's receive buffer.
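The w=%08x-%08x window this tracepoint prints is the range from rx_hard_ack to rx_top, and a sequence number maps to a ring slot by masking with the buffer size. A minimal userspace sketch of that wrap-safe arithmetic; the buffer size below is an assumed stand-in, not the kernel's constant:

#include <stdint.h>
#include <stdio.h>

#define RXTX_BUFF_SIZE 64u		/* power of two; illustrative only */
#define RXTX_BUFF_MASK (RXTX_BUFF_SIZE - 1)

typedef uint32_t seq_t;			/* sequence numbers wrap at 2^32 */

int main(void)
{
	seq_t hard_ack = 0xfffffffe;	/* last consumed packet, near wrap */
	seq_t top = hard_ack + 3;	/* wraps round to 0x00000001 */

	/* Unsigned subtraction stays correct across the wrap. */
	printf("pending: %u\n", (unsigned int)(top - hard_ack)); /* 3 */

	/* Each in-window seq maps to a ring slot via the mask. */
	for (seq_t seq = hard_ack + 1; seq != (seq_t)(top + 1); seq++)
		printf("seq %08x -> slot %u\n", (unsigned int)seq,
		       (unsigned int)(seq & RXTX_BUFF_MASK));
	return 0;
}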
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 33 +++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 12 ++++++++++++ net/rxrpc/call_accept.c | 3 +++ net/rxrpc/input.c | 6 +++++- net/rxrpc/misc.c | 9 +++++++++ net/rxrpc/recvmsg.c | 11 +++++++++++ 6 files changed, 73 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d545d692ae22..7dd5f0188681 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -290,6 +290,39 @@ TRACE_EVENT(rxrpc_tx_ack, __entry->n_acks) ); +TRACE_EVENT(rxrpc_receive, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_receive_trace why, + rxrpc_serial_t serial, rxrpc_seq_t seq), + + TP_ARGS(call, why, serial, seq), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_receive_trace, why ) + __field(rxrpc_serial_t, serial ) + __field(rxrpc_seq_t, seq ) + __field(rxrpc_seq_t, hard_ack ) + __field(rxrpc_seq_t, top ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->serial = serial; + __entry->seq = seq; + __entry->hard_ack = call->rx_hard_ack; + __entry->top = call->rx_top; + ), + + TP_printk("c=%p %s r=%08x q=%08x w=%08x-%08x", + __entry->call, + rxrpc_receive_traces[__entry->why], + __entry->serial, + __entry->seq, + __entry->hard_ack, + __entry->top) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index afa5dcc05fe0..e5d2f2fb8e41 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -605,6 +605,18 @@ enum rxrpc_transmit_trace { extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4]; +enum rxrpc_receive_trace { + rxrpc_receive_incoming, + rxrpc_receive_queue, + rxrpc_receive_queue_last, + rxrpc_receive_front, + rxrpc_receive_rotate, + rxrpc_receive_end, + rxrpc_receive__nr_trace +}; + +extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 3e474508ba75..a8d39d7cf42c 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -367,6 +367,9 @@ found_service: goto out; } + trace_rxrpc_receive(call, rxrpc_receive_incoming, + sp->hdr.serial, sp->hdr.seq); + /* Make the call live. 
*/ rxrpc_incoming_call(rx, call, skb); conn = call->conn; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 7b18ca124978..b690220533c6 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -284,8 +284,12 @@ next_subpacket: call->rxtx_buffer[ix] = skb; if (after(seq, call->rx_top)) smp_store_release(&call->rx_top, seq); - if (flags & RXRPC_LAST_PACKET) + if (flags & RXRPC_LAST_PACKET) { set_bit(RXRPC_CALL_RX_LAST, &call->flags); + trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); + } else { + trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq); + } queued = true; if (after_eq(seq, call->rx_expect_next)) { diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index dca89995f03e..db5f1d54fc90 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -141,3 +141,12 @@ const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = { [rxrpc_transmit_rotate] = "ROT", [rxrpc_transmit_end] = "END", }; + +const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = { + [rxrpc_receive_incoming] = "INC", + [rxrpc_receive_queue] = "QUE", + [rxrpc_receive_queue_last] = "QLS", + [rxrpc_receive_front] = "FRN", + [rxrpc_receive_rotate] = "ROT", + [rxrpc_receive_end] = "END", +}; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 8b8d7e14f800..22d51087c580 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -134,6 +134,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) { _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); + trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top); ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { @@ -167,6 +168,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; + rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; u8 flags; int ix; @@ -183,6 +185,10 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_see_skb(skb); sp = rxrpc_skb(skb); flags = sp->hdr.flags; + serial = sp->hdr.serial; + if (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) + serial += (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - 1; + call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; /* Barrier against rxrpc_input_data(). */ @@ -191,6 +197,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_free_skb(skb); _debug("%u,%u,%02x", hard_ack, top, flags); + trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); if (flags & RXRPC_LAST_PACKET) rxrpc_end_rx_phase(call); } @@ -309,6 +316,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rxrpc_see_skb(skb); sp = rxrpc_skb(skb); + if (!(flags & MSG_PEEK)) + trace_rxrpc_receive(call, rxrpc_receive_front, + sp->hdr.serial, seq); + if (msg) sock_recv_timestamp(msg, sock->sk, skb); -- cgit v1.2.3 From 849979051cbc9352857d8bb31895ae55afe19d96 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 11:13:31 +0100 Subject: rxrpc: Add a tracepoint to follow what recvmsg does Add a tracepoint to follow what recvmsg does within AF_RXRPC. 
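One detail worth pulling out of the recvmsg diff below: a packet that does not fit into the caller's buffer is resumed on the next read by keeping the in-packet position (rx_pkt_offset and rx_pkt_len) in the call, which is what the CONT and FULL events mark. A toy userspace model of that resume logic; the struct shapes and names here are invented for illustration:

#include <stdio.h>
#include <string.h>

struct pkt {
	const char *data;
	unsigned int len;
};

struct cursor {
	unsigned int pkt_offset;	/* persists between reads, like rx_pkt_offset */
};

/* Copy at most space bytes; the cursor is left at the resume point. */
static unsigned int read_some(struct cursor *c, const struct pkt *p,
			      char *buf, unsigned int space)
{
	unsigned int n = p->len - c->pkt_offset;

	if (n > space)
		n = space;		/* buffer full: stop mid-packet */
	memcpy(buf, p->data + c->pkt_offset, n);
	c->pkt_offset += n;
	return n;
}

int main(void)
{
	struct pkt p = { "hello world", 11 };
	struct cursor c = { 0 };
	char buf[5];
	unsigned int n;

	while ((n = read_some(&c, &p, buf, sizeof(buf))) > 0)
		printf("got %u bytes: %.*s\n", n, (int)n, buf); /* 5, 5, 1 */
	return 0;
}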
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 34 ++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 17 +++++++++++++++++ net/rxrpc/misc.c | 14 ++++++++++++++ net/rxrpc/recvmsg.c | 34 ++++++++++++++++++++++++++-------- 4 files changed, 91 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 7dd5f0188681..58732202e9f0 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -323,6 +323,40 @@ TRACE_EVENT(rxrpc_receive, __entry->top) ); +TRACE_EVENT(rxrpc_recvmsg, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_recvmsg_trace why, + rxrpc_seq_t seq, unsigned int offset, unsigned int len, + int ret), + + TP_ARGS(call, why, seq, offset, len, ret), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_recvmsg_trace, why ) + __field(rxrpc_seq_t, seq ) + __field(unsigned int, offset ) + __field(unsigned int, len ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->seq = seq; + __entry->offset = offset; + __entry->len = len; + __entry->ret = ret; + ), + + TP_printk("c=%p %s q=%08x o=%u l=%u ret=%d", + __entry->call, + rxrpc_recvmsg_traces[__entry->why], + __entry->seq, + __entry->offset, + __entry->len, + __entry->ret) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e5d2f2fb8e41..a17341d2df3d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -617,6 +617,23 @@ enum rxrpc_receive_trace { extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4]; +enum rxrpc_recvmsg_trace { + rxrpc_recvmsg_enter, + rxrpc_recvmsg_wait, + rxrpc_recvmsg_dequeue, + rxrpc_recvmsg_hole, + rxrpc_recvmsg_next, + rxrpc_recvmsg_cont, + rxrpc_recvmsg_full, + rxrpc_recvmsg_data_return, + rxrpc_recvmsg_terminal, + rxrpc_recvmsg_to_be_accepted, + rxrpc_recvmsg_return, + rxrpc_recvmsg__nr_trace +}; + +extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index db5f1d54fc90..c7065d893d1e 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -150,3 +150,17 @@ const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = { [rxrpc_receive_rotate] = "ROT", [rxrpc_receive_end] = "END", }; + +const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = { + [rxrpc_recvmsg_enter] = "ENTR", + [rxrpc_recvmsg_wait] = "WAIT", + [rxrpc_recvmsg_dequeue] = "DEQU", + [rxrpc_recvmsg_hole] = "HOLE", + [rxrpc_recvmsg_next] = "NEXT", + [rxrpc_recvmsg_cont] = "CONT", + [rxrpc_recvmsg_full] = "FULL", + [rxrpc_recvmsg_data_return] = "DATA", + [rxrpc_recvmsg_terminal] = "TERM", + [rxrpc_recvmsg_to_be_accepted] = "TBAC", + [rxrpc_recvmsg_return] = "RETN", +}; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 22d51087c580..b62a08151895 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -94,6 +94,8 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) break; } + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_terminal, call->rx_hard_ack, + call->rx_pkt_offset, call->rx_pkt_len, ret); return ret; } @@ -124,6 +126,7 @@ static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); } + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_to_be_accepted, 1, 0, 0, ret); return ret; } @@ -310,8 +313,11 @@ static int rxrpc_recvmsg_data(struct 
socket *sock, struct rxrpc_call *call, for (seq = hard_ack + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - if (!skb) + if (!skb) { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq, + rx_pkt_offset, rx_pkt_len, 0); break; + } smp_rmb(); rxrpc_see_skb(skb); sp = rxrpc_skb(skb); @@ -327,10 +333,15 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, ret2 = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], &rx_pkt_offset, &rx_pkt_len); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq, + rx_pkt_offset, rx_pkt_len, ret2); if (ret2 < 0) { ret = ret2; goto out; } + } else { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq, + rx_pkt_offset, rx_pkt_len, 0); } _debug("recvmsg %x DATA #%u { %d, %d }", sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); @@ -357,6 +368,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, } if (rx_pkt_len > 0) { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq, + rx_pkt_offset, rx_pkt_len, 0); _debug("buffer full"); ASSERTCMP(*_offset, ==, len); ret = 0; @@ -383,6 +396,8 @@ out: call->rx_pkt_len = rx_pkt_len; } done: + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, + rx_pkt_offset, rx_pkt_len, ret); _leave(" = %d [%u/%u]", ret, seq, top); return ret; } @@ -404,7 +419,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, DEFINE_WAIT(wait); - _enter(",,,%zu,%d", len, flags); + trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0, 0, 0, 0); if (flags & (MSG_OOB | MSG_TRUNC)) return -EOPNOTSUPP; @@ -424,8 +439,10 @@ try_again: if (list_empty(&rx->recvmsg_q)) { ret = -EWOULDBLOCK; - if (timeo == 0) + if (timeo == 0) { + call = NULL; goto error_no_call; + } release_sock(&rx->sk); @@ -439,6 +456,8 @@ try_again: if (list_empty(&rx->recvmsg_q)) { if (signal_pending(current)) goto wait_interrupted; + trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait, + 0, 0, 0, 0); timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(&rx->sk), &wait); @@ -457,7 +476,7 @@ try_again: rxrpc_get_call(call, rxrpc_call_got); write_unlock_bh(&rx->recvmsg_lock); - _debug("recvmsg call %p", call); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0); if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); @@ -527,16 +546,15 @@ error: rxrpc_put_call(call, rxrpc_call_put); error_no_call: release_sock(&rx->sk); - _leave(" = %d", ret); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); return ret; wait_interrupted: ret = sock_intr_errno(timeo); wait_error: finish_wait(sk_sleep(&rx->sk), &wait); - release_sock(&rx->sk); - _leave(" = %d [wait]", ret); - return ret; + call = NULL; + goto error_no_call; } /** -- cgit v1.2.3 From 71f3ca408fd43b586c02480768a503af075b247e Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:14 +0100 Subject: rxrpc: Improve skb tracing Improve sk_buff tracing within AF_RXRPC by the following means: (1) Use an enum to note the event type rather than plain integers and use an array of event names rather than a big multi ?: list. (2) Distinguish Rx from Tx packets and account them separately. This requires the call phase to be tracked so that we know what we might find in rxtx_buffer[]. (3) Add a parameter to rxrpc_{new,see,get,free}_skb() to indicate the event type. (4) A pair of 'rotate' events are added to indicate packets that are about to be rotated out of the Rx and Tx windows. (5) A pair of 'lost' events are added, along with rxrpc_lose_skb() for packet loss injection recording. 
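On point (2): splitting the single skb count into separate Tx and Rx counters lets the exit-time leak check name the direction that leaked. A toy userspace model of that accounting; the names are stand-ins for rxrpc_n_tx_skbs and rxrpc_n_rx_skbs:

#include <assert.h>
#include <stdio.h>

static int n_tx_skbs, n_rx_skbs;	/* per-direction outstanding counts */

static void account_new(int *counter, const char *dir)
{
	(*counter)++;
	printf("%s NEW (now %d)\n", dir, *counter);
}

static void account_free(int *counter, const char *dir)
{
	(*counter)--;
	printf("%s FRE (now %d)\n", dir, *counter);
}

int main(void)
{
	account_new(&n_rx_skbs, "Rx");
	account_new(&n_tx_skbs, "Tx");
	account_free(&n_rx_skbs, "Rx");
	account_free(&n_tx_skbs, "Tx");

	/* Module exit asserts both counts are back to zero, so any leak
	 * is already tagged with its direction. */
	assert(n_tx_skbs == 0 && n_rx_skbs == 0);
	return 0;
}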
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 12 ++++------ net/rxrpc/af_rxrpc.c | 5 +++-- net/rxrpc/ar-internal.h | 33 ++++++++++++++++++++++----- net/rxrpc/call_event.c | 8 +++---- net/rxrpc/call_object.c | 11 ++++++--- net/rxrpc/conn_event.c | 6 ++--- net/rxrpc/input.c | 13 ++++++----- net/rxrpc/local_event.c | 4 ++-- net/rxrpc/misc.c | 18 +++++++++++++++ net/rxrpc/output.c | 4 ++-- net/rxrpc/peer_event.c | 10 ++++----- net/rxrpc/recvmsg.c | 7 +++--- net/rxrpc/sendmsg.c | 10 ++++----- net/rxrpc/skbuff.c | 53 +++++++++++++++++++++++++++++++------------- 14 files changed, 131 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 58732202e9f0..75a5d8bf50e1 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -107,14 +107,14 @@ TRACE_EVENT(rxrpc_call, ); TRACE_EVENT(rxrpc_skb, - TP_PROTO(struct sk_buff *skb, int op, int usage, int mod_count, - const void *where), + TP_PROTO(struct sk_buff *skb, enum rxrpc_skb_trace op, + int usage, int mod_count, const void *where), TP_ARGS(skb, op, usage, mod_count, where), TP_STRUCT__entry( __field(struct sk_buff *, skb ) - __field(int, op ) + __field(enum rxrpc_skb_trace, op ) __field(int, usage ) __field(int, mod_count ) __field(const void *, where ) @@ -130,11 +130,7 @@ TRACE_EVENT(rxrpc_skb, TP_printk("s=%p %s u=%d m=%d p=%pSR", __entry->skb, - (__entry->op == 0 ? "NEW" : - __entry->op == 1 ? "SEE" : - __entry->op == 2 ? "GET" : - __entry->op == 3 ? "FRE" : - "PUR"), + rxrpc_skb_traces[__entry->op], __entry->usage, __entry->mod_count, __entry->where) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 09f81befc705..8dbf7bed2cc4 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -45,7 +45,7 @@ u32 rxrpc_epoch; atomic_t rxrpc_debug_id; /* count of skbs currently in use */ -atomic_t rxrpc_n_skbs; +atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; struct workqueue_struct *rxrpc_workqueue; @@ -867,7 +867,8 @@ static void __exit af_rxrpc_exit(void) proto_unregister(&rxrpc_proto); rxrpc_destroy_all_calls(); rxrpc_destroy_all_connections(); - ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); + ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0); + ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0); rxrpc_destroy_all_locals(); remove_proc_entry("rxrpc_conns", init_net.proc_net); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a17341d2df3d..034f525f2235 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -520,6 +520,7 @@ struct rxrpc_call { rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ u8 rx_winsize; /* Size of Rx window */ u8 tx_winsize; /* Maximum size of Tx window */ + bool tx_phase; /* T if transmission phase, F if receive phase */ u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */ /* receive-phase ACK management */ @@ -534,6 +535,27 @@ struct rxrpc_call { rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ }; +enum rxrpc_skb_trace { + rxrpc_skb_rx_cleaned, + rxrpc_skb_rx_freed, + rxrpc_skb_rx_got, + rxrpc_skb_rx_lost, + rxrpc_skb_rx_received, + rxrpc_skb_rx_rotated, + rxrpc_skb_rx_purged, + rxrpc_skb_rx_seen, + rxrpc_skb_tx_cleaned, + rxrpc_skb_tx_freed, + rxrpc_skb_tx_got, + rxrpc_skb_tx_lost, + rxrpc_skb_tx_new, + rxrpc_skb_tx_rotated, + rxrpc_skb_tx_seen, + rxrpc_skb__nr_trace +}; + +extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7]; + enum rxrpc_conn_trace { rxrpc_conn_new_client, rxrpc_conn_new_service, @@ -642,7 +664,7 @@ 
extern const char *rxrpc_acks(u8 reason); /* * af_rxrpc.c */ -extern atomic_t rxrpc_n_skbs; +extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; extern u32 rxrpc_epoch; extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; @@ -1000,10 +1022,11 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); */ void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_packet_destructor(struct sk_buff *); -void rxrpc_new_skb(struct sk_buff *); -void rxrpc_see_skb(struct sk_buff *); -void rxrpc_get_skb(struct sk_buff *); -void rxrpc_free_skb(struct sk_buff *); +void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_purge_queue(struct sk_buff_head *); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index f0cabc48a1b7..7d1b99824ed9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -170,7 +170,7 @@ static void rxrpc_resend(struct rxrpc_call *call) continue; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_tx_seen); sp = rxrpc_skb(skb); if (annotation == RXRPC_TX_ANNO_UNACK) { @@ -199,7 +199,7 @@ static void rxrpc_resend(struct rxrpc_call *call) continue; skb = call->rxtx_buffer[ix]; - rxrpc_get_skb(skb); + rxrpc_get_skb(skb, rxrpc_skb_tx_got); spin_unlock_bh(&call->lock); sp = rxrpc_skb(skb); @@ -211,7 +211,7 @@ static void rxrpc_resend(struct rxrpc_call *call) if (rxrpc_send_data_packet(call->conn, skb) < 0) { call->resend_at = now + 2; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); return; } @@ -219,7 +219,7 @@ static void rxrpc_resend(struct rxrpc_call *call) rxrpc_expose_client_call(call); sp->resend_at = now + rxrpc_resend_timeout; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); spin_lock_bh(&call->lock); /* We need to clear the retransmit state, but there are two diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 54f30482a7fd..f50a6094e198 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -182,6 +182,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; call->service_id = srx->srx_service; + call->tx_phase = true; _leave(" = %p", call); return call; @@ -458,7 +459,9 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_disconnect_call(call); for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { - rxrpc_free_skb(call->rxtx_buffer[i]); + rxrpc_free_skb(call->rxtx_buffer[i], + (call->tx_phase ? rxrpc_skb_tx_cleaned : + rxrpc_skb_rx_cleaned)); call->rxtx_buffer[i] = NULL; } @@ -552,9 +555,11 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) /* Clean up the Rx/Tx buffer */ for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) - rxrpc_free_skb(call->rxtx_buffer[i]); + rxrpc_free_skb(call->rxtx_buffer[i], + (call->tx_phase ? 
rxrpc_skb_tx_cleaned : + rxrpc_skb_rx_cleaned)); - rxrpc_free_skb(call->tx_pending); + rxrpc_free_skb(call->tx_pending, rxrpc_skb_tx_cleaned); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 9b19c51831aa..75a15a4c74c3 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -388,7 +388,7 @@ void rxrpc_process_connection(struct work_struct *work) /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); ret = rxrpc_process_event(conn, skb, &abort_code); switch (ret) { case -EPROTO: @@ -399,7 +399,7 @@ void rxrpc_process_connection(struct work_struct *work) goto requeue_and_leave; case -ECONNABORTED: default: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); break; } } @@ -416,7 +416,7 @@ requeue_and_leave: protocol_error: if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) goto requeue_and_leave; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _leave(" [EPROTO]"); goto out; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index b690220533c6..84bb16d47b85 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -50,7 +50,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) call->tx_hard_ack++; ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_tx_rotated); call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; skb->next = list; @@ -66,7 +66,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) skb = list; list = skb->next; skb->next = NULL; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); } } @@ -99,6 +99,7 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) default: break; case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->tx_phase = false; call->state = RXRPC_CALL_CLIENT_RECV_REPLY; break; case RXRPC_CALL_SERVER_AWAIT_ACK: @@ -278,7 +279,7 @@ next_subpacket: * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() * and also rxrpc_fill_out_ack(). 
*/ - rxrpc_get_skb(skb); + rxrpc_get_skb(skb, rxrpc_skb_rx_got); call->rxtx_annotations[ix] = annotation; smp_wmb(); call->rxtx_buffer[ix] = skb; @@ -691,13 +692,13 @@ void rxrpc_data_ready(struct sock *udp_sk) return; } - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_rx_received); _net("recv skb %p", skb); /* we'll probably need to checksum it (didn't call sock_recvmsg) */ if (skb_checksum_complete(skb)) { - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); _leave(" [CSUM failed]"); return; @@ -821,7 +822,7 @@ void rxrpc_data_ready(struct sock *udp_sk) discard_unlock: rcu_read_unlock(); discard: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); out: trace_rxrpc_rx_done(0, 0); return; diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index f073e932500e..190f68bd9e27 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -90,7 +90,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) if (skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); _debug("{%d},{%u}", local->debug_id, sp->hdr.type); switch (sp->hdr.type) { @@ -107,7 +107,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) break; } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); } _leave(""); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index c7065d893d1e..026e1f2e83ff 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -102,6 +102,24 @@ const char *rxrpc_acks(u8 reason) return str[reason]; } +const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = { + [rxrpc_skb_rx_cleaned] = "Rx CLN", + [rxrpc_skb_rx_freed] = "Rx FRE", + [rxrpc_skb_rx_got] = "Rx GOT", + [rxrpc_skb_rx_lost] = "Rx *L*", + [rxrpc_skb_rx_received] = "Rx RCV", + [rxrpc_skb_rx_purged] = "Rx PUR", + [rxrpc_skb_rx_rotated] = "Rx ROT", + [rxrpc_skb_rx_seen] = "Rx SEE", + [rxrpc_skb_tx_cleaned] = "Tx CLN", + [rxrpc_skb_tx_freed] = "Tx FRE", + [rxrpc_skb_tx_got] = "Tx GOT", + [rxrpc_skb_tx_lost] = "Tx *L*", + [rxrpc_skb_tx_new] = "Tx NEW", + [rxrpc_skb_tx_rotated] = "Tx ROT", + [rxrpc_skb_tx_seen] = "Tx SEE", +}; + const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = { [rxrpc_conn_new_client] = "NWc", [rxrpc_conn_new_service] = "NWs", diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 2c9daeadce87..a2cad5ce7416 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -324,7 +324,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) { @@ -343,7 +343,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) kernel_sendmsg(local->socket, &msg, iov, 2, size); } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); } _leave(""); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 9e0725f5652b..18276e7cb9e0 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -155,11 +155,11 @@ void rxrpc_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_rx_received); serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); return; } @@ -169,7 +169,7 @@ void rxrpc_error_report(struct sock *sk) peer = 
NULL; if (!peer) { rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _leave(" [no peer]"); return; } @@ -179,7 +179,7 @@ void rxrpc_error_report(struct sock *sk) serr->ee.ee_code == ICMP_FRAG_NEEDED)) { rxrpc_adjust_mtu(peer, serr); rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); rxrpc_put_peer(peer); _leave(" [MTU update]"); return; @@ -187,7 +187,7 @@ void rxrpc_error_report(struct sock *sk) rxrpc_store_error(peer, serr); rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); /* The ref we obtained is passed off to the work item */ rxrpc_queue_work(&peer->error_distributor); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 79e65668bc58..6ba4af5a8d95 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -155,6 +155,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) break; case RXRPC_CALL_SERVER_RECV_REQUEST: + call->tx_phase = true; call->state = RXRPC_CALL_SERVER_ACK_REQUEST; break; default: @@ -185,7 +186,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) hard_ack++; ix = hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_rotated); sp = rxrpc_skb(skb); flags = sp->hdr.flags; serial = sp->hdr.serial; @@ -197,7 +198,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) /* Barrier against rxrpc_input_data(). */ smp_store_release(&call->rx_hard_ack, hard_ack); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _debug("%u,%u,%02x", hard_ack, top, flags); trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); @@ -317,7 +318,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, break; } smp_rmb(); - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); if (!(flags & MSG_PEEK)) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 28d8f73cf11d..6a39ee97a0b7 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -100,7 +100,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, ASSERTCMP(seq, ==, call->tx_top + 1); ix = seq & RXRPC_RXTX_BUFF_MASK; - rxrpc_get_skb(skb); + rxrpc_get_skb(skb, rxrpc_skb_tx_got); call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; smp_wmb(); call->rxtx_buffer[ix] = skb; @@ -146,7 +146,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_instant_resend(call, ix); } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); _leave(""); } @@ -201,7 +201,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, skb = call->tx_pending; call->tx_pending = NULL; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_tx_seen); copied = 0; do { @@ -242,7 +242,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, if (!skb) goto maybe_error; - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_tx_new); _debug("ALLOC SEND %p", skb); @@ -352,7 +352,7 @@ out: return ret; call_terminated: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); _leave(" = %d", -call->error); return -call->error; diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 620d9ccaf3c1..5154cbf7e540 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -18,54 +18,76 @@ #include #include "ar-internal.h" +#define select_skb_count(op) (op >= rxrpc_skb_tx_cleaned ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs) + /* - * Note the existence of a new-to-us socket buffer (allocated or dequeued). 
+ * Note the allocation or reception of a socket buffer. */ -void rxrpc_new_skb(struct sk_buff *skb) +void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); - int n = atomic_inc_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 0, atomic_read(&skb->users), n, here); + int n = atomic_inc_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); } /* * Note the re-emergence of a socket buffer from a queue or buffer. */ -void rxrpc_see_skb(struct sk_buff *skb) +void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); if (skb) { - int n = atomic_read(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 1, atomic_read(&skb->users), n, here); + int n = atomic_read(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); } } /* * Note the addition of a ref on a socket buffer. */ -void rxrpc_get_skb(struct sk_buff *skb) +void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); - int n = atomic_inc_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 2, atomic_read(&skb->users), n, here); + int n = atomic_inc_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); skb_get(skb); } /* * Note the destruction of a socket buffer. */ -void rxrpc_free_skb(struct sk_buff *skb) +void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); if (skb) { int n; CHECK_SLAB_OKAY(&skb->users); - n = atomic_dec_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 3, atomic_read(&skb->users), n, here); + n = atomic_dec_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); kfree_skb(skb); } } +/* + * Note the injected loss of a socket buffer. + */ +void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) +{ + const void *here = __builtin_return_address(0); + if (skb) { + int n; + CHECK_SLAB_OKAY(&skb->users); + if (op == rxrpc_skb_tx_lost) { + n = atomic_read(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + } else { + n = atomic_dec_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + kfree_skb(skb); + } + } +} + /* * Clear a queue of socket buffers. */ @@ -74,8 +96,9 @@ void rxrpc_purge_queue(struct sk_buff_head *list) const void *here = __builtin_return_address(0); struct sk_buff *skb; while ((skb = skb_dequeue((list))) != NULL) { - int n = atomic_dec_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 4, atomic_read(&skb->users), n, here); + int n = atomic_dec_return(select_skb_count(rxrpc_skb_rx_purged)); + trace_rxrpc_skb(skb, rxrpc_skb_rx_purged, + atomic_read(&skb->users), n, here); kfree_skb(skb); } } -- cgit v1.2.3 From 20c64d5cd5a2bdcdc8982a06cb05e5e1bd851a3d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Sep 2016 08:48:46 -0700 Subject: net: avoid sk_forward_alloc overflows A malicious TCP receiver, sending SACK, can force the sender to split skbs in write queue and increase its memory usage. 
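(At bottom this is plain-int wraparound on sk_forward_alloc — a rough sketch with invented helper names, not kernel code:

	int forward_alloc = 0;			/* stands in for sk->sk_forward_alloc */
	while (purge_next_skb())		/* no sk_mem_reclaim() in between... */
		forward_alloc += one_skb_truesize;	/* >2 GB total: wraps negative */
)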
Then, when the socket is closed and its write queue purged, we might overflow sk_forward_alloc (it becomes negative). sk_mem_reclaim() does nothing in this case, and more than 2 GB are leaked from the TCP perspective (tcp_memory_allocated is not changed). Warnings then trigger from inet_sock_destruct() and sk_stream_kill_queues(), which see a non-zero sk_forward_alloc. The whole TCP stack can get stuck because TCP is under memory pressure. A simple fix is to preemptively reclaim from sk_mem_uncharge(). This makes sure a socket won't have more than 2 MB forward allocated after a burst and an idle period. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index ff5be7e8ddea..8741988e6880 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1332,6 +1332,16 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) if (!sk_has_account(sk)) return; sk->sk_forward_alloc += size; + + /* Avoid a possible overflow. + * TCP send queues can make this happen, if sk_mem_reclaim() + * is not called and more than 2 GBytes are released at once. + * + * If we reach 2 MBytes, reclaim 1 MBytes right now, there is + * no need to hold that much forward allocation anyway. + */ + if (unlikely(sk->sk_forward_alloc >= 1 << 21)) + __sk_mem_reclaim(sk, 1 << 20); } static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) -- cgit v1.2.3 From 4496195ddd75c4ad57b783739414e69b7d79843e Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 15 Sep 2016 15:02:38 -0300 Subject: sctp: fix SSN comparison This function actually operates on u32 yet its parameters were declared as u16, causing integer truncation upon calling. Note in patch context that ADDIP_SERIAL_SIGN_BIT is already 32 bits. Signed-off-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index efc01743b9d6..bafe2a0ab908 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -382,7 +382,7 @@ enum { ADDIP_SERIAL_SIGN_BIT = (1<<31) }; -static inline int ADDIP_SERIAL_gte(__u16 s, __u16 t) +static inline int ADDIP_SERIAL_gte(__u32 s, __u32 t) { return ((s) == (t)) || (((t) - (s)) & ADDIP_SERIAL_SIGN_BIT); } -- cgit v1.2.3 From 19664c6a000956290cce84c6924b13488ab794d6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 15 Sep 2016 10:18:45 -0700 Subject: net: l3mdev: Remove netif_index_is_l3_master No longer used after e0d56fdd73422 ("net: l3mdev: remove redundant calls") Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/l3mdev.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 3832099289c5..b220dabeab45 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -114,25 +114,6 @@ static inline u32 l3mdev_fib_table(const struct net_device *dev) return tb_id; } -static inline bool netif_index_is_l3_master(struct net *net, int ifindex) -{ - struct net_device *dev; - bool rc = false; - - if (ifindex == 0) - return false; - - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, ifindex); - if (dev) - rc = netif_is_l3_master(dev); - - rcu_read_unlock(); - - return rc; -} - struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); static inline @@ -226,11 +207,6 @@ static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) return 0; } -static inline bool netif_index_is_l3_master(struct net *net, int ifindex) -{ - return false; -} - static inline struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { -- cgit v1.2.3 From cfc7381b3002756b1dcada32979e942aa3126e31 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 15 Sep 2016 13:00:29 -0700 Subject: ip_tunnel: add collect_md mode to IPIP tunnel Similar to gre, vxlan, geneve tunnels allow IPIP tunnels to operate in 'collect metadata' mode. bpf_skb_[gs]et_tunnel_key() helpers can make use of it right away. ovs can use it as well in the future (once appropriate ovs-vport abstractions and user apis are added). Note that just like in other tunnels we cannot cache the dst, since tunnel_info metadata can be different for every packet. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/net/ip_tunnels.h | 2 ++ include/uapi/linux/if_tunnel.h | 1 + net/ipv4/ip_tunnel.c | 76 ++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ipip.c | 35 +++++++++++++++---- 4 files changed, 108 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index e598c639aa6f..59557c07904b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -255,6 +255,8 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); +void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, + const u8 proto); int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 9865c8caedde..18d5dc13985d 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -73,6 +73,7 @@ enum { IFLA_IPTUN_ENCAP_FLAGS, IFLA_IPTUN_ENCAP_SPORT, IFLA_IPTUN_ENCAP_DPORT, + IFLA_IPTUN_COLLECT_METADATA, __IFLA_IPTUN_MAX, }; #define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 95649ebd2874..5719d6ba0824 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -55,6 +55,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -546,6 +547,81 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, return 0; } +void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + u32 headroom = sizeof(struct iphdr); + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + const struct iphdr *inner_iph; + struct rtable *rt; + struct flowi4 fl4; + __be16 df = 0; + u8 tos, ttl; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto tx_error; + key = &tun_info->key; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + tos = key->tos; + if (tos == 1) { + if (skb->protocol == htons(ETH_P_IP)) + tos = inner_iph->tos; + else if (skb->protocol == htons(ETH_P_IPV6)) + tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); + } + init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, + RT_TOS(tos), tunnel->parms.link); + if (tunnel->encap.type != TUNNEL_ENCAP_NONE) + goto tx_error; + rt = ip_route_output_key(tunnel->net, &fl4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; + } + if (rt->dst.dev == dev) { + ip_rt_put(rt); + dev->stats.collisions++; + goto tx_error; + } + tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); + ttl = key->ttl; + if (ttl == 0) { + if (skb->protocol == htons(ETH_P_IP)) + ttl = inner_iph->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; + else + ttl = ip4_dst_hoplimit(&rt->dst); + } + if (key->tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + else if (skb->protocol == htons(ETH_P_IP)) + df = inner_iph->frag_off & htons(IP_DF); + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; + if (headroom > dev->needed_headroom) + dev->needed_headroom = headroom; + + if 
(skb_cow_head(skb, dev->needed_headroom)) { + ip_rt_put(rt); + goto tx_dropped; + } + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, + key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); + return; +tx_error: + dev->stats.tx_errors++; + goto kfree; +tx_dropped: + dev->stats.tx_dropped++; +kfree: + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit); + void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 4ae3f8e6c6cc..c9392589c415 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -115,6 +115,7 @@ #include #include #include +#include static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); @@ -193,6 +194,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); + struct metadata_dst *tun_dst = NULL; struct ip_tunnel *tunnel; const struct iphdr *iph; @@ -216,7 +218,12 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; - return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + if (tunnel->collect_md) { + tun_dst = ip_tun_rx_dst(skb, 0, 0, 0); + if (!tun_dst) + return 0; + } + return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } return -1; @@ -270,7 +277,10 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, skb_set_inner_ipproto(skb, ipproto); - ip_tunnel_xmit(skb, dev, tiph, ipproto); + if (tunnel->collect_md) + ip_md_tunnel_xmit(skb, dev, ipproto); + else + ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: @@ -380,13 +390,14 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) } static void ipip_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm *parms, bool *collect_md) { memset(parms, 0, sizeof(*parms)); parms->iph.version = 4; parms->iph.protocol = IPPROTO_IPIP; parms->iph.ihl = 5; + *collect_md = false; if (!data) return; @@ -414,6 +425,9 @@ static void ipip_netlink_parms(struct nlattr *data[], if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); + + if (data[IFLA_IPTUN_COLLECT_METADATA]) + *collect_md = true; } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -453,18 +467,18 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[], static int ipip_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; if (ipip_netlink_encap_parms(data, &ipencap)) { - struct ip_tunnel *t = netdev_priv(dev); int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipip_netlink_parms(data, &p); + ipip_netlink_parms(data, &p, &t->collect_md); return ip_tunnel_newlink(dev, tb, &p); } @@ -473,6 +487,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + bool collect_md; if (ipip_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); @@ -482,7 +497,9 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], return err; } - ipip_netlink_parms(data, &p); + ipip_netlink_parms(data, &p, &collect_md); + if (collect_md) + return -EINVAL; if (((dev->flags & IFF_POINTOPOINT) && 
!p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) @@ -516,6 +533,8 @@ static size_t ipip_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_IPTUN_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -544,6 +563,9 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) tunnel->encap.flags)) goto nla_put_failure; + if (tunnel->collect_md) + if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) + goto nla_put_failure; return 0; nla_put_failure: @@ -562,6 +584,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { -- cgit v1.2.3 From 8d79266bc48c6ab6477d04e159cabf1e7809cb72 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 15 Sep 2016 13:00:30 -0700 Subject: ip6_tunnel: add collect_md mode to IPv6 tunnels Similar to gre, vxlan, geneve tunnels allow IPIP6 and IP6IP6 tunnels to operate in 'collect metadata' mode. Unlike ipv4 code here it's possible to reuse ip6_tnl_xmit() function for both collect_md and traditional tunnels. bpf_skb_[gs]et_tunnel_key() helpers and ovs (in the future) are the users. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 1 + net/ipv6/ip6_tunnel.c | 178 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 134 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 43a5a0e4524c..20ed9699fcd4 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -23,6 +23,7 @@ struct __ip6_tnl_parm { __u8 proto; /* tunnel protocol */ __u8 encap_limit; /* encapsulation limit for tunnel */ __u8 hop_limit; /* hop limit for tunnel */ + bool collect_md; __be32 flowinfo; /* traffic class and flowlabel for tunnel */ __u32 flags; /* tunnel flags */ struct in6_addr laddr; /* local tunnel end-point address */ diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 5c5779720ef1..6a66adba0c22 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -57,6 +57,7 @@ #include #include #include +#include MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); @@ -90,6 +91,7 @@ struct ip6_tnl_net { struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; + struct ip6_tnl __rcu *collect_md_tun; }; static struct net_device_stats *ip6_get_stats(struct net_device *dev) @@ -166,6 +168,10 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_ return t; } + t = rcu_dereference(ip6n->collect_md_tun); + if (t) + return t; + t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; @@ -209,6 +215,8 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); + if (t->parms.collect_md) + rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } @@ -224,6 +232,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; + if (t->parms.collect_md) + rcu_assign_pointer(ip6n->collect_md_tun, NULL); + for (tp 
= ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { @@ -829,6 +840,9 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); + if (tun_dst) + skb_dst_set(skb, (struct dst_entry *)tun_dst); + gro_cells_receive(&tunnel->gro_cells, skb); return 0; @@ -865,6 +879,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); @@ -881,7 +896,12 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; - ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate, + if (t->parms.collect_md) { + tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0); + if (!tun_dst) + return 0; + } + ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } @@ -1012,8 +1032,16 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, int mtu; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; + u8 hop_limit; int err = -1; + if (t->parms.collect_md) { + hop_limit = skb_tunnel_info(skb)->key.ttl; + goto route_lookup; + } else { + hop_limit = t->parms.hop_limit; + } + /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { struct in6_addr *addr6; @@ -1043,6 +1071,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, goto tx_err_link_failure; if (!dst) { +route_lookup: dst = ip6_route_output(net, NULL, fl6); if (dst->error) @@ -1053,6 +1082,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, dst = NULL; goto tx_err_link_failure; } + if (t->parms.collect_md && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, + &fl6->daddr, 0, &fl6->saddr)) + goto tx_err_link_failure; ndst = dst; } @@ -1071,7 +1104,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, } if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (skb_dst(skb)) + if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { *pmtu = mtu; @@ -1111,8 +1144,13 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, skb = new_skb; } - if (!fl6->flowi6_mark && ndst) - dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); + if (t->parms.collect_md) { + if (t->encap.type != TUNNEL_ENCAP_NONE) + goto tx_err_dst_release; + } else { + if (!fl6->flowi6_mark && ndst) + dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); + } skb_dst_set(skb, dst); if (encap_limit >= 0) { @@ -1137,7 +1175,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); - ipv6h->hop_limit = t->parms.hop_limit; + ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; @@ -1170,19 +1208,34 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (tproto != IPPROTO_IPIP && tproto != 0) return -1; - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; + dsfield = ipv4_get_dsfield(iph); - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; + if (t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; - 
dsfield = ipv4_get_dsfield(iph); + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -1; + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + } else { + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) - & IPV6_TCLASS_MASK; - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) + & IPV6_TCLASS_MASK; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + } if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; @@ -1220,29 +1273,47 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) ip6_tnl_addr_conflict(t, ipv6h)) return -1; - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); + dsfield = ipv6_get_dsfield(ipv6h); + + if (t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + } else { + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + + tel = (void *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { + encap_limit = t->parms.encap_limit; } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; - dsfield = ipv6_get_dsfield(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= ip6_flowlabel(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + } if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; @@ -1741,6 +1812,10 @@ static int ip6_tnl_dev_init(struct net_device *dev) if (err) return err; ip6_tnl_link_config(t); + if (t->parms.collect_md) { + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } return 0; } @@ -1811,6 
+1886,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); + + if (data[IFLA_IPTUN_COLLECT_METADATA]) + parms->collect_md = true; } static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[], @@ -1850,6 +1928,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *nt, *t; struct ip_tunnel_encap ipencap; @@ -1864,9 +1943,14 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, ip6_tnl_netlink_parms(data, &nt->parms); - t = ip6_tnl_locate(net, &nt->parms, 0); - if (!IS_ERR(t)) - return -EEXIST; + if (nt->parms.collect_md) { + if (rtnl_dereference(ip6n->collect_md_tun)) + return -EEXIST; + } else { + t = ip6_tnl_locate(net, &nt->parms, 0); + if (!IS_ERR(t)) + return -EEXIST; + } return ip6_tnl_create2(dev); } @@ -1890,6 +1974,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], return err; } ip6_tnl_netlink_parms(data, &p); + if (p.collect_md) + return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { @@ -1937,6 +2023,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_IPTUN_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -1955,16 +2043,15 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto)) goto nla_put_failure; - if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, - tunnel->encap.type) || - nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, - tunnel->encap.sport) || - nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, - tunnel->encap.dport) || - nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, - tunnel->encap.flags)) + if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || + nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || + nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; + if (parm->collect_md) + if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) + goto nla_put_failure; return 0; nla_put_failure: @@ -1992,6 +2079,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { -- cgit v1.2.3 From d4690f1e1cdabb4d61207b6787b1605a0dc0aeab Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 16 Sep 2016 00:11:45 +0100 Subject: fix iov_iter_fault_in_readable() ... 
by turning it into what used to be multipages counterpart Cc: stable@vger.kernel.org Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/uio.h | 2 +- lib/iov_iter.c | 24 ++---------------------- 2 files changed, 3 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/uio.h b/include/linux/uio.h index 1b5d1cd796e2..75b4aaf31a9d 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -76,7 +76,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes); void iov_iter_advance(struct iov_iter *i, size_t bytes); int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); -int iov_iter_fault_in_multipages_readable(struct iov_iter *i, size_t bytes); +#define iov_iter_fault_in_multipages_readable iov_iter_fault_in_readable size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 9e8c7386b3a0..7e3138cfc8c9 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -290,26 +290,6 @@ done: return wanted - bytes; } -/* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). - * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). - */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) -{ - if (!(i->type & (ITER_BVEC|ITER_KVEC))) { - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); - } - return 0; -} -EXPORT_SYMBOL(iov_iter_fault_in_readable); - /* * Fault in one or more iovecs of the given iov_iter, to a maximum length of * bytes. For each iovec, fault in each page that constitutes the iovec. @@ -317,7 +297,7 @@ EXPORT_SYMBOL(iov_iter_fault_in_readable); * Return 0 on success, or non-zero if the memory could not be accessed (i.e. * because it is an invalid address). */ -int iov_iter_fault_in_multipages_readable(struct iov_iter *i, size_t bytes) +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) { size_t skip = i->iov_offset; const struct iovec *iov; @@ -334,7 +314,7 @@ int iov_iter_fault_in_multipages_readable(struct iov_iter *i, size_t bytes) } return 0; } -EXPORT_SYMBOL(iov_iter_fault_in_multipages_readable); +EXPORT_SYMBOL(iov_iter_fault_in_readable); void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, -- cgit v1.2.3 From b61c654f9b3f1a271217e46c893f80565b1f754d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Sep 2016 02:04:20 +0800 Subject: sctp: free msg->chunks when sctp_primitive_SEND return err Last patch "sctp: do not return the transmit err back to sctp_sendmsg" made sctp_primitive_SEND return err only when asoc state is unavailable. In this case, chunks are not enqueued, they have no chance to be freed if we don't take care of them later. 
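(The ownership pattern this change restores, condensed from the net/sctp/socket.c hunk below:

	list_for_each_entry(chunk, &datamsg->chunks, frag_list)
		sctp_chunk_hold(chunk);		/* one ref per chunk for the outqueue */
	err = sctp_primitive_SEND(net, asoc, datamsg);
	if (err) {
		sctp_datamsg_free(datamsg);	/* frees the never-enqueued chunks too */
		goto out_free;
	}
	sctp_datamsg_put(datamsg);		/* success: only drop the msg ref */
)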
This patch actually reverts commit 1cd4d5c4326a ("sctp: remove the unused sctp_datamsg_free()"), commit 69b5777f2e57 ("sctp: hold the chunks only after the chunk is enqueued in outq") and commit 8b570dc9f7b6 ("sctp: only drop the reference on the datamsg after sending a msg"), so that sctp_datamsg_free can be used to free the chunks of the current msg. Fixes: 8b570dc9f7b6 ("sctp: only drop the reference on the datamsg after sending a msg") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + net/sctp/chunk.c | 13 +++++++++++++ net/sctp/outqueue.c | 1 - net/sctp/socket.c | 8 ++++++-- 4 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index ce93c4b10d26..f61fb7c87e53 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -537,6 +537,7 @@ struct sctp_datamsg { struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *, struct sctp_sndrcvinfo *, struct iov_iter *); +void sctp_datamsg_free(struct sctp_datamsg *); void sctp_datamsg_put(struct sctp_datamsg *); void sctp_chunk_fail(struct sctp_chunk *, int error); int sctp_chunk_abandoned(struct sctp_chunk *); diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index a55e54738b81..af9cc8055465 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -70,6 +70,19 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) return msg; } +void sctp_datamsg_free(struct sctp_datamsg *msg) +{ + struct sctp_chunk *chunk; + + /* This doesn't have to be a _safe variant because + * sctp_chunk_free() only drops the refs. + */ + list_for_each_entry(chunk, &msg->chunks, frag_list) + sctp_chunk_free(chunk); + + sctp_datamsg_put(msg); +} + /* Final destruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index da2418b64c86..6c109b0f8495 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -304,7 +304,6 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); - sctp_chunk_hold(chunk); sctp_outq_tail_data(q, chunk); if (chunk->asoc->prsctp_enable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9fc417a8b476..6cdc61c21438 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1958,6 +1958,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) /* Now send the (possibly) fragmented message. */ list_for_each_entry(chunk, &datamsg->chunks, frag_list) { + sctp_chunk_hold(chunk); + /* Do accounting for the write space. */ sctp_set_owner_w(chunk); @@ -1970,13 +1972,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) * breaks. */ err = sctp_primitive_SEND(net, asoc, datamsg); - sctp_datamsg_put(datamsg); /* Did the lower layer accept the chunk? */ - if (err) + if (err) { + sctp_datamsg_free(datamsg); goto out_free; + } pr_debug("%s: we sent primitively\n", __func__); + sctp_datamsg_put(datamsg); err = msg_len; if (unlikely(wait_connect)) { -- cgit v1.2.3 From 83dbc3d4a38411ef38f680d7045c8478cc9c5a56 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Sep 2016 02:04:22 +0800 Subject: sctp: make sctp_outq_flush/tail/uncork return void sctp_outq_flush's return value is meaningless now; this patch makes sctp_outq_flush return void, as well as sctp_outq_tail and sctp_outq_uncork. 
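(With the return value gone, a transmit failure is surfaced through the socket instead; per the outqueue.c hunk below, the pattern inside sctp_outq_flush() becomes:

	error = sctp_packet_transmit(&singleton, gfp);
	if (error < 0) {
		asoc->base.sk->sk_err = -error;	/* report via the socket */
		return;				/* nothing to propagate */
	}
)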
Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 ++-- net/sctp/outqueue.c | 19 +++++++------------ net/sctp/sm_sideeffect.c | 9 ++++----- 3 files changed, 13 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index f61fb7c87e53..8693dc452a7f 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1077,7 +1077,7 @@ struct sctp_outq { void sctp_outq_init(struct sctp_association *, struct sctp_outq *); void sctp_outq_teardown(struct sctp_outq *); void sctp_outq_free(struct sctp_outq*); -int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t); +void sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t); int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *); int sctp_outq_is_empty(const struct sctp_outq *); void sctp_outq_restart(struct sctp_outq *); @@ -1085,7 +1085,7 @@ void sctp_outq_restart(struct sctp_outq *); void sctp_retransmit(struct sctp_outq *, struct sctp_transport *, sctp_retransmit_reason_t); void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8); -int sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); +void sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); void sctp_prsctp_prune(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len); /* Uncork and flush an outqueue. */ diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 052a4796a457..8c3f446d965c 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -68,7 +68,7 @@ static void sctp_mark_missing(struct sctp_outq *q, static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); +static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, @@ -285,10 +285,9 @@ void sctp_outq_free(struct sctp_outq *q) } /* Put a new chunk in an sctp_outq. */ -int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) +void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { struct net *net = sock_net(q->asoc->base.sk); - int error = 0; pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? @@ -318,9 +317,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) } if (!q->cork) - error = sctp_outq_flush(q, 0, gfp); - - return error; + sctp_outq_flush(q, 0, gfp); } /* Insert a chunk into the sorted list based on the TSNs. The retransmit list @@ -748,12 +745,12 @@ redo: } /* Cork the outqueue so queued chunks are really queued. */ -int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) +void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) { if (q->cork) q->cork = 0; - return sctp_outq_flush(q, 0, gfp); + sctp_outq_flush(q, 0, gfp); } @@ -766,7 +763,7 @@ int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) * locking concerns must be made. Today we use the sock lock to protect * this function. 
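 * (That is, callers are assumed to already hold the sock lock, roughly:
 *
 *	lock_sock(sk);
 *	sctp_outq_flush(q, 0, gfp);
 *	release_sock(sk);
 * )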
*/ -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) +static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) { struct sctp_packet *packet; struct sctp_packet singleton; @@ -891,7 +888,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) error = sctp_packet_transmit(&singleton, gfp); if (error < 0) { asoc->base.sk->sk_err = -error; - return 0; + return; } break; @@ -1175,8 +1172,6 @@ sctp_flush_out: /* Clear the burst limited state, if any */ sctp_transport_burst_reset(t); } - - return 0; } /* Update unack_data based on the incoming SACK chunk */ diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index cf6e4f0de729..c345bf153bed 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1421,8 +1421,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, local_cork = 1; } /* Send a chunk to our peer. */ - error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, - gfp); + sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, gfp); break; case SCTP_CMD_SEND_PKT: @@ -1676,7 +1675,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_FORCE_PRIM_RETRAN: t = asoc->peer.retran_path; asoc->peer.retran_path = asoc->peer.primary_path; - error = sctp_outq_uncork(&asoc->outqueue, gfp); + sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; asoc->peer.retran_path = t; break; @@ -1733,9 +1732,9 @@ out: */ if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) { if (chunk->end_of_packet || chunk->singleton) - error = sctp_outq_uncork(&asoc->outqueue, gfp); + sctp_outq_uncork(&asoc->outqueue, gfp); } else if (local_cork) - error = sctp_outq_uncork(&asoc->outqueue, gfp); + sctp_outq_uncork(&asoc->outqueue, gfp); if (sp->data_ready_signalled) sp->data_ready_signalled = 0; -- cgit v1.2.3 From 2c9d85d4d82d9e0a62aad08bf50650804e68ed30 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Fri, 16 Sep 2016 15:05:36 +0200 Subject: netdevice: Add offload statistics ndo Add a new ndo to return statistics for offloaded operations. Since there can be many different offloaded operations with many stats types, the ndo gets an attribute id by which it knows which stats are wanted. The ndo also gets a void pointer to be cast according to the attribute id. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2095b6ab3661..a10d8d18ce19 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -924,6 +924,14 @@ struct netdev_xdp { * 3. Update dev->stats asynchronously and atomically, and define * neither operation. + * + * bool (*ndo_has_offload_stats)(int attr_id) + * Return true if this device supports offload stats of this attr_id. + * + * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, + * void *attr_data) + * Get statistics for offload operations by attr_id. Write it into the + * attr_data pointer. + * * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid); * If device supports VLAN filtering this function is called when a * VLAN id is registered. 
@@ -1155,6 +1163,10 @@ struct net_device_ops { struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev, struct rtnl_link_stats64 *storage); + bool (*ndo_has_offload_stats)(int attr_id); + int (*ndo_get_offload_stats)(int attr_id, + const struct net_device *dev, + void *attr_data); struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); int (*ndo_vlan_rx_add_vid)(struct net_device *dev, -- cgit v1.2.3 From 69ae6ad2ff37911903a90256e216d7e7ae460002 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Fri, 16 Sep 2016 15:05:37 +0200 Subject: net: core: Add offload stats to if_stats_msg Add a nested attribute of offload stats to if_stats_msg named IFLA_STATS_LINK_OFFLOAD_XSTATS. Under it, add SW stats, meaning stats only per packets that went via slowpath to the cpu, named IFLA_OFFLOAD_XSTATS_CPU_HIT. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 9 ++++ net/core/rtnetlink.c | 111 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 116 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 9bf3aecfe05b..2351776a724f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -826,6 +826,7 @@ enum { IFLA_STATS_LINK_64, IFLA_STATS_LINK_XSTATS, IFLA_STATS_LINK_XSTATS_SLAVE, + IFLA_STATS_LINK_OFFLOAD_XSTATS, __IFLA_STATS_MAX, }; @@ -845,6 +846,14 @@ enum { }; #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) +/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ +enum { + IFLA_OFFLOAD_XSTATS_UNSPEC, + IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + __IFLA_OFFLOAD_XSTATS_MAX +}; +#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) + /* XDP section */ enum { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 937e459bdaa9..0dbae4244a89 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3577,6 +3577,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) (!idxattr || idxattr == attrid); } +#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1) +static int rtnl_get_offload_stats_attr_size(int attr_id) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return sizeof(struct rtnl_link_stats64); + } + + return 0; +} + +static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, + int *prividx) +{ + struct nlattr *attr = NULL; + int attr_id, size; + void *attr_data; + int err; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return -ENODATA; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (attr_id < *prividx) + continue; + + size = rtnl_get_offload_stats_attr_size(attr_id); + if (!size) + continue; + + if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + continue; + + attr = nla_reserve_64bit(skb, attr_id, size, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + goto nla_put_failure; + + attr_data = nla_data(attr); + memset(attr_data, 0, size); + err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, + attr_data); + if (err) + goto get_offload_stats_failure; + } + + if (!attr) + return -ENODATA; + + *prividx = 0; + return 0; + +nla_put_failure: + err = -EMSGSIZE; +get_offload_stats_failure: + *prividx = attr_id; + return err; +} + +static int rtnl_get_offload_stats_size(const struct net_device *dev) +{ + int nla_size = 0; + 
int attr_id; + int size; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return 0; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + continue; + size = rtnl_get_offload_stats_attr_size(attr_id); + nla_size += nla_total_size_64bit(size); + } + + if (nla_size != 0) + nla_size += nla_total_size(0); + + return nla_size; +} + static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, unsigned int filter_mask, @@ -3586,6 +3671,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; + int err; ASSERT_RTNL(); @@ -3614,8 +3700,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS); @@ -3639,8 +3723,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS_SLAVE); @@ -3655,6 +3737,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, + *idxattr)) { + *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; + attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); + if (!attr) + goto nla_put_failure; + + err = rtnl_get_offload_stats(skb, dev, prividx); + if (err == -ENODATA) + nla_nest_cancel(skb, attr); + else + nla_nest_end(skb, attr); + + if (err && err != -ENODATA) + goto nla_put_failure; + *idxattr = 0; + } + nlmsg_end(skb, nlh); return 0; @@ -3708,6 +3808,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) + size += rtnl_get_offload_stats_size(dev); + return size; } -- cgit v1.2.3 From d409b84768037ad03d1d73538d99fb902adf7365 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 16 Sep 2016 12:59:08 -0700 Subject: ipv6: Export ip6_route_input_lookup symbol Make ip6_route_input_lookup available outside of the ipv6 module, similar to ip_route_input_noref in the IPv4 world. Signed-off-by: Mahesh Bandewar Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 3 +++ net/ipv6/route.c | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index d97305d0e71f..e0cd318d5103 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -64,6 +64,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) } void ip6_route_input(struct sk_buff *skb); +struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad4a7ff301fc..4dab585f7642 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table * return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); } -static struct dst_entry *ip6_route_input_lookup(struct net *net, - struct net_device *dev, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); } +EXPORT_SYMBOL_GPL(ip6_route_input_lookup); void ip6_route_input(struct sk_buff *skb) { -- cgit v1.2.3 From e8bffe0cf964f0330595bb376b74921cccdaac88 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 16 Sep 2016 12:59:13 -0700 Subject: net: Add _nf_(un)register_hooks symbols Add _nf_register_hooks() and _nf_unregister_hooks() calls which allow caller to hold RTNL mutex. Signed-off-by: Mahesh Bandewar CC: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netfilter.h | 2 ++ net/netfilter/core.c | 51 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 9230f9aee896..e82b76781bf6 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -133,6 +133,8 @@ int nf_register_hook(struct nf_hook_ops *reg); void nf_unregister_hook(struct nf_hook_ops *reg); int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); +int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); +void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! 
*/ diff --git a/net/netfilter/core.c b/net/netfilter/core.c index f39276d1c2d7..2c5327e43a88 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -188,19 +188,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks); static LIST_HEAD(nf_hook_list); -int nf_register_hook(struct nf_hook_ops *reg) +static int _nf_register_hook(struct nf_hook_ops *reg) { struct net *net, *last; int ret; - rtnl_lock(); for_each_net(net) { ret = nf_register_net_hook(net, reg); if (ret && ret != -ENOENT) goto rollback; } list_add_tail(&reg->list, &nf_hook_list); - rtnl_unlock(); return 0; rollback: @@ -210,19 +208,34 @@ rollback: break; nf_unregister_net_hook(net, reg); } + return ret; +} + +int nf_register_hook(struct nf_hook_ops *reg) +{ + int ret; + + rtnl_lock(); + ret = _nf_register_hook(reg); rtnl_unlock(); + return ret; } EXPORT_SYMBOL(nf_register_hook); -void nf_unregister_hook(struct nf_hook_ops *reg) +static void _nf_unregister_hook(struct nf_hook_ops *reg) { struct net *net; - rtnl_lock(); list_del(&reg->list); for_each_net(net) nf_unregister_net_hook(net, reg); +} + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + rtnl_lock(); + _nf_unregister_hook(reg); rtnl_unlock(); } EXPORT_SYMBOL(nf_unregister_hook); @@ -246,6 +259,26 @@ err: } EXPORT_SYMBOL(nf_register_hooks); +/* Caller MUST take rtnl_lock() */ +int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = _nf_register_hook(&reg[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + _nf_unregister_hooks(reg, i); + return err; +} +EXPORT_SYMBOL(_nf_register_hooks); + void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) { while (n-- > 0) @@ -253,6 +286,14 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) } EXPORT_SYMBOL(nf_unregister_hooks); +/* Caller MUST take rtnl_lock */ +void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + while (n-- > 0) + _nf_unregister_hook(&reg[n]); +} +EXPORT_SYMBOL(_nf_unregister_hooks); + unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, struct nf_hook_state *state, -- cgit v1.2.3 From 4fbae7d83c98c30efcf0a2a2ac55fbb75ef5a1a5 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 16 Sep 2016 12:59:19 -0700 Subject: ipvlan: Introduce l3s mode In a typical IPvlan L3 setup, the master is in the default-ns and each slave is in a different (slave) ns. In this setup, egress packet processing for traffic originating from the slave-ns will hit all NF_HOOKs in the slave-ns as well as the default-ns. However, the same is not true for ingress processing: all these NF_HOOKs are hit only in the slave-ns, skipping them in the default-ns. IPvlan in L3 mode is restrictive and if admins want to deploy iptables rules in the default-ns, this asymmetric data path makes it impossible to do so. This patch makes use of l3_rcv() (added as part of the l3mdev enhancements) to perform an input route lookup on RX packets without changing skb->dev and then uses an nf_hook at NF_INET_LOCAL_IN to change skb->dev just before handing the skb over to L4. Signed-off-by: Mahesh Bandewar CC: David Ahern Reviewed-by: David Ahern Signed-off-by: David S.
Miller --- Documentation/networking/ipvlan.txt | 7 ++- drivers/net/Kconfig | 1 + drivers/net/ipvlan/ipvlan.h | 6 +++ drivers/net/ipvlan/ipvlan_core.c | 94 +++++++++++++++++++++++++++++++++++++ drivers/net/ipvlan/ipvlan_main.c | 87 +++++++++++++++++++++++++++++++--- include/uapi/linux/if_link.h | 1 + 6 files changed, 188 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt index 14422f8fcdc4..24196cef7c91 100644 --- a/Documentation/networking/ipvlan.txt +++ b/Documentation/networking/ipvlan.txt @@ -22,7 +22,7 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module There are no module parameters for this driver and it can be configured using IProute2/ip utility. - ip link add link type ipvlan mode { l2 | L3 } + ip link add link type ipvlan mode { l2 | l3 | l3s } e.g. ip link add link ipvl0 eth0 type ipvlan mode l2 @@ -48,6 +48,11 @@ master device for the L2 processing and routing from that instance will be used before packets are queued on the outbound device. In this mode the slaves will not receive nor can send multicast / broadcast traffic. +4.3 L3S mode: + This is very similar to the L3 mode except that iptables (conn-tracking) +works in this mode and hence it is L3-symmetric (L3s). This will have slightly less +performance but that shouldn't matter since you are choosing this mode over plain-L3 +mode to make conn-tracking work. 5. What to choose (macvlan vs. ipvlan)? These two devices are very similar in many regards and the specific use diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 0c5415b05ea9..8768a625350d 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -149,6 +149,7 @@ config IPVLAN tristate "IP-VLAN support" depends on INET depends on IPV6 + depends on NET_L3_MASTER_DEV ---help--- This allows one to create virtual devices off of a main interface and packets will be delivered based on the dest L3 (IPv6/IPv4 addr) diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 695a5dc9ace3..7e0732f5ea07 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -23,11 +23,13 @@ #include #include #include +#include #include #include #include #include #include +#include #define IPVLAN_DRV "ipvlan" #define IPV_DRV_VER "0.1" @@ -124,4 +126,8 @@ struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); void ipvlan_ht_addr_del(struct ipvl_addr *addr); +struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, + u16 proto); +unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state); #endif /* __IPVLAN_H */ diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index b5f9511d819e..b4e990743e1d 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -560,6 +560,7 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) case IPVLAN_MODE_L2: return ipvlan_xmit_mode_l2(skb, dev); case IPVLAN_MODE_L3: + case IPVLAN_MODE_L3S: return ipvlan_xmit_mode_l3(skb, dev); } @@ -664,6 +665,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) return ipvlan_handle_mode_l2(pskb, port); case IPVLAN_MODE_L3: return ipvlan_handle_mode_l3(pskb, port); + case IPVLAN_MODE_L3S: + return RX_HANDLER_PASS; } /* Should not reach here */ @@ -672,3 +675,94 @@ rx_handler_result_t ipvlan_handle_frame(struct 
sk_buff **pskb) kfree_skb(skb); return RX_HANDLER_CONSUMED; } + +static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb, + struct net_device *dev) +{ + struct ipvl_addr *addr = NULL; + struct ipvl_port *port; + void *lyr3h; + int addr_type; + + if (!dev || !netif_is_ipvlan_port(dev)) + goto out; + + port = ipvlan_port_get_rcu(dev); + if (!port || port->mode != IPVLAN_MODE_L3S) + goto out; + + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (!lyr3h) + goto out; + + addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); +out: + return addr; +} + +struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, + u16 proto) +{ + struct ipvl_addr *addr; + struct net_device *sdev; + + addr = ipvlan_skb_to_addr(skb, dev); + if (!addr) + goto out; + + sdev = addr->master->dev; + switch (proto) { + case AF_INET: + { + int err; + struct iphdr *ip4h = ip_hdr(skb); + + err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr, + ip4h->tos, sdev); + if (unlikely(err)) + goto out; + break; + } + case AF_INET6: + { + struct dst_entry *dst; + struct ipv6hdr *ip6h = ipv6_hdr(skb); + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct flowi6 fl6 = { + .flowi6_iif = sdev->ifindex, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_mark = skb->mark, + .flowi6_proto = ip6h->nexthdr, + }; + + skb_dst_drop(skb); + dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, flags); + skb_dst_set(skb, dst); + break; + } + default: + break; + } + +out: + return skb; +} + +unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct ipvl_addr *addr; + unsigned int len; + + addr = ipvlan_skb_to_addr(skb, skb->dev); + if (!addr) + goto out; + + skb->dev = addr->master->dev; + len = skb->len + ETH_HLEN; + ipvlan_count_rx(addr->master, len, true, false); +out: + return NF_ACCEPT; +} diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 18b4e8c7f68a..f442eb366863 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -9,24 +9,87 @@ #include "ipvlan.h" +static u32 ipvl_nf_hook_refcnt = 0; + +static struct nf_hook_ops ipvl_nfops[] __read_mostly = { + { + .hook = ipvlan_nf_input, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = INT_MAX, + }, + { + .hook = ipvlan_nf_input, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = INT_MAX, + }, +}; + +static struct l3mdev_ops ipvl_l3mdev_ops __read_mostly = { + .l3mdev_l3_rcv = ipvlan_l3_rcv, +}; + static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) { ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj; } -static void ipvlan_set_port_mode(struct ipvl_port *port, u16 nval) +static int ipvlan_register_nf_hook(void) +{ + int err = 0; + + if (!ipvl_nf_hook_refcnt) { + err = _nf_register_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); + if (!err) + ipvl_nf_hook_refcnt = 1; + } else { + ipvl_nf_hook_refcnt++; + } + + return err; +} + +static void ipvlan_unregister_nf_hook(void) +{ + WARN_ON(!ipvl_nf_hook_refcnt); + + ipvl_nf_hook_refcnt--; + if (!ipvl_nf_hook_refcnt) + _nf_unregister_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); +} + +static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval) { struct ipvl_dev *ipvlan; + struct net_device *mdev = port->dev; + int err = 0; + ASSERT_RTNL(); if (port->mode != nval) { + if (nval == IPVLAN_MODE_L3S) { + /* New mode is L3S */ + err = ipvlan_register_nf_hook(); + if (!err) { + mdev->l3mdev_ops = &ipvl_l3mdev_ops; + 
mdev->priv_flags |= IFF_L3MDEV_MASTER; + } else + return err; + } else if (port->mode == IPVLAN_MODE_L3S) { + /* Old mode was L3S */ + mdev->priv_flags &= ~IFF_L3MDEV_MASTER; + ipvlan_unregister_nf_hook(); + mdev->l3mdev_ops = NULL; + } list_for_each_entry(ipvlan, &port->ipvlans, pnode) { - if (nval == IPVLAN_MODE_L3) + if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) ipvlan->dev->flags |= IFF_NOARP; else ipvlan->dev->flags &= ~IFF_NOARP; } port->mode = nval; } + return err; } static int ipvlan_port_create(struct net_device *dev) @@ -74,6 +137,11 @@ static void ipvlan_port_destroy(struct net_device *dev) struct ipvl_port *port = ipvlan_port_get_rtnl(dev); dev->priv_flags &= ~IFF_IPVLAN_MASTER; + if (port->mode == IPVLAN_MODE_L3S) { + dev->priv_flags &= ~IFF_L3MDEV_MASTER; + ipvlan_unregister_nf_hook(); + dev->l3mdev_ops = NULL; + } netdev_rx_handler_unregister(dev); cancel_work_sync(&port->wq); __skb_queue_purge(&port->backlog); @@ -132,7 +200,8 @@ static int ipvlan_open(struct net_device *dev) struct net_device *phy_dev = ipvlan->phy_dev; struct ipvl_addr *addr; - if (ipvlan->port->mode == IPVLAN_MODE_L3) + if (ipvlan->port->mode == IPVLAN_MODE_L3 || + ipvlan->port->mode == IPVLAN_MODE_L3S) dev->flags |= IFF_NOARP; else dev->flags &= ~IFF_NOARP; @@ -372,13 +441,14 @@ static int ipvlan_nl_changelink(struct net_device *dev, { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); + int err = 0; if (data && data[IFLA_IPVLAN_MODE]) { u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); - ipvlan_set_port_mode(port, nmode); + err = ipvlan_set_port_mode(port, nmode); } - return 0; + return err; } static size_t ipvlan_nl_getsize(const struct net_device *dev) @@ -473,10 +543,13 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev, unregister_netdevice(dev); return err; } + err = ipvlan_set_port_mode(port, mode); + if (err) { + unregister_netdevice(dev); + return err; + } list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); - ipvlan_set_port_mode(port, mode); - netif_stacked_transfer_operstate(phy_dev, dev); return 0; } diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2351776a724f..7ec9e99d5491 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -464,6 +464,7 @@ enum { enum ipvlan_mode { IPVLAN_MODE_L2 = 0, IPVLAN_MODE_L3, + IPVLAN_MODE_L3S, IPVLAN_MODE_MAX }; -- cgit v1.2.3 From ec323368793b8570c02e723127611a8d906a9b3f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:32 +0200 Subject: sched: remove qdisc arg from __qdisc_dequeue_head Moves qdisc stat accounting to qdisc_dequeue_head. The only direct caller of the __qdisc_dequeue_head version open-codes this now. This allows us to later use __qdisc_dequeue_head as a replacement for __skb_dequeue() (which operates on a sk_buff_head list). Signed-off-by: Florian Westphal Signed-off-by: David S.
Miller --- include/net/sch_generic.h | 15 ++++++++------- net/sched/sch_generic.c | 7 ++++++- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 52a2015667b4..0741ed41575b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -614,11 +614,17 @@ static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch) return __qdisc_enqueue_tail(skb, sch, &sch->q); } -static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch, - struct sk_buff_head *list) +static inline struct sk_buff *__qdisc_dequeue_head(struct sk_buff_head *list) { struct sk_buff *skb = __skb_dequeue(list); + return skb; +} + +static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) +{ + struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + if (likely(skb != NULL)) { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); @@ -627,11 +633,6 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch, return skb; } -static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) -{ - return __qdisc_dequeue_head(sch, &sch->q); -} - /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5e63bf638350..73877d9c2bcb 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -506,7 +506,12 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) if (likely(band >= 0)) { struct sk_buff_head *list = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); + struct sk_buff *skb = __qdisc_dequeue_head(list); + + if (likely(skb != NULL)) { + qdisc_qstats_backlog_dec(qdisc, skb); + qdisc_bstats_update(qdisc, skb); + } qdisc->q.qlen--; if (skb_queue_empty(list)) -- cgit v1.2.3 From 48da34b7a74201f15315cb1fc40bb9a7bd2b4940 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:34 +0200 Subject: sched: add and use qdisc_skb_head helpers This change replaces the sk_buff_head struct in Qdiscs with the new qdisc_skb_head. It's similar to the sk_buff_head API, but does not use skb->prev pointers. Qdiscs will commonly enqueue at the tail of a list and dequeue at the head. While sk_buff_head works fine for this, enqueue/dequeue needs to also adjust the prev pointer of the next element. The ->prev pointer is not required for qdiscs so we can just leave it undefined and avoid one cacheline write access for en/dequeue. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/sch_generic.h | 63 ++++++++++++++++++++++++++++++++++++++--------- net/sched/sch_generic.c | 21 ++++++++-------- net/sched/sch_htb.c | 24 +++++++++++++++--- net/sched/sch_netem.c | 14 +++++++++-- 4 files changed, 94 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 0741ed41575b..e6aa0a249672 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -36,6 +36,14 @@ struct qdisc_size_table { u16 data[]; }; +/* similar to sk_buff_head, but skb->prev pointer is undefined.
*/ +struct qdisc_skb_head { + struct sk_buff *head; + struct sk_buff *tail; + __u32 qlen; + spinlock_t lock; +}; + struct Qdisc { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, @@ -76,7 +84,7 @@ struct Qdisc { * For performance sake on SMP, we put highly modified fields at the end */ struct sk_buff *gso_skb ____cacheline_aligned_in_smp; - struct sk_buff_head q; + struct qdisc_skb_head q; struct gnet_stats_basic_packed bstats; seqcount_t running; struct gnet_stats_queue qstats; @@ -600,10 +608,27 @@ static inline void qdisc_qstats_overlimit(struct Qdisc *sch) sch->qstats.overlimits++; } +static inline void qdisc_skb_head_init(struct qdisc_skb_head *qh) +{ + qh->head = NULL; + qh->tail = NULL; + qh->qlen = 0; +} + static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff_head *list) + struct qdisc_skb_head *qh) { - __skb_queue_tail(list, skb); + struct sk_buff *last = qh->tail; + + if (last) { + skb->next = NULL; + last->next = skb; + qh->tail = skb; + } else { + qh->tail = skb; + qh->head = skb; + } + qh->qlen++; qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; @@ -614,9 +639,17 @@ static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch) return __qdisc_enqueue_tail(skb, sch, &sch->q); } -static inline struct sk_buff *__qdisc_dequeue_head(struct sk_buff_head *list) +static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) { - struct sk_buff *skb = __skb_dequeue(list); + struct sk_buff *skb = qh->head; + + if (likely(skb != NULL)) { + qh->head = skb->next; + qh->qlen--; + if (qh->head == NULL) + qh->tail = NULL; + skb->next = NULL; + } return skb; } @@ -643,10 +676,10 @@ static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free) } static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, - struct sk_buff_head *list, + struct qdisc_skb_head *qh, struct sk_buff **to_free) { - struct sk_buff *skb = __skb_dequeue(list); + struct sk_buff *skb = __qdisc_dequeue_head(qh); if (likely(skb != NULL)) { unsigned int len = qdisc_pkt_len(skb); @@ -667,7 +700,9 @@ static inline unsigned int qdisc_queue_drop_head(struct Qdisc *sch, static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch) { - return skb_peek(&sch->q); + const struct qdisc_skb_head *qh = &sch->q; + + return qh->head; } /* generic pseudo peek method for non-work-conserving qdisc */ @@ -702,15 +737,19 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) return skb; } -static inline void __qdisc_reset_queue(struct sk_buff_head *list) +static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh) { /* * We do not know the backlog in bytes of this list, it * is up to the caller to correct it */ - if (!skb_queue_empty(list)) { - rtnl_kfree_skbs(list->next, list->prev); - __skb_queue_head_init(list); + ASSERT_RTNL(); + if (qh->qlen) { + rtnl_kfree_skbs(qh->head, qh->tail); + + qh->head = NULL; + qh->tail = NULL; + qh->qlen = 0; } } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 73877d9c2bcb..6cfb6e9038c2 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -466,7 +466,7 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { */ struct pfifo_fast_priv { u32 bitmap; - struct sk_buff_head q[PFIFO_FAST_BANDS]; + struct qdisc_skb_head q[PFIFO_FAST_BANDS]; }; /* @@ -477,7 +477,7 @@ struct pfifo_fast_priv { */ static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; -static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, +static inline struct 
qdisc_skb_head *band2list(struct pfifo_fast_priv *priv, int band) { return priv->q + band; @@ -489,7 +489,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) { int band = prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - struct sk_buff_head *list = band2list(priv, band); + struct qdisc_skb_head *list = band2list(priv, band); priv->bitmap |= (1 << band); qdisc->q.qlen++; @@ -505,8 +505,8 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) int band = bitmap2band[priv->bitmap]; if (likely(band >= 0)) { - struct sk_buff_head *list = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(list); + struct qdisc_skb_head *qh = band2list(priv, band); + struct sk_buff *skb = __qdisc_dequeue_head(qh); if (likely(skb != NULL)) { qdisc_qstats_backlog_dec(qdisc, skb); @@ -514,7 +514,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) } qdisc->q.qlen--; - if (skb_queue_empty(list)) + if (qh->qlen == 0) priv->bitmap &= ~(1 << band); return skb; @@ -529,9 +529,9 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) int band = bitmap2band[priv->bitmap]; if (band >= 0) { - struct sk_buff_head *list = band2list(priv, band); + struct qdisc_skb_head *qh = band2list(priv, band); - return skb_peek(list); + return qh->head; } return NULL; @@ -569,7 +569,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __skb_queue_head_init(band2list(priv, prio)); + qdisc_skb_head_init(band2list(priv, prio)); /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; @@ -617,7 +617,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); sch->padded = (char *) sch - (char *) p; } - skb_queue_head_init(&sch->q); + qdisc_skb_head_init(&sch->q); + spin_lock_init(&sch->q.lock); spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 53dbfa187870..c798d0de8a9d 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -162,7 +162,7 @@ struct htb_sched { struct work_struct work; /* non shaped skbs; let them go directly thru */ - struct sk_buff_head direct_queue; + struct qdisc_skb_head direct_queue; long direct_pkts; struct qdisc_watchdog watchdog; @@ -570,6 +570,22 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) list_del_init(&cl->un.leaf.drop_list); } +static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, + struct qdisc_skb_head *qh) +{ + struct sk_buff *last = qh->tail; + + if (last) { + skb->next = NULL; + last->next = skb; + qh->tail = skb; + } else { + qh->tail = skb; + qh->head = skb; + } + qh->qlen++; +} + static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -580,7 +596,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (cl == HTB_DIRECT) { /* enqueue to helper queue */ if (q->direct_queue.qlen < q->direct_qlen) { - __skb_queue_tail(&q->direct_queue, skb); + htb_enqueue_tail(skb, sch, &q->direct_queue); q->direct_pkts++; } else { return qdisc_drop(skb, sch, to_free); @@ -888,7 +904,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) unsigned long start_at; /* try to dequeue direct packets as high prio (!) 
to minimize cpu work */ - skb = __skb_dequeue(&q->direct_queue); + skb = __qdisc_dequeue_head(&q->direct_queue); if (skb != NULL) { ok: qdisc_bstats_update(sch, skb); @@ -1019,7 +1035,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); INIT_WORK(&q->work, htb_work_func); - __skb_queue_head_init(&q->direct_queue); + qdisc_skb_head_init(&q->direct_queue); if (tb[TCA_HTB_DIRECT_QLEN]) q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 0a964b35f8c7..9f7b380cf0a3 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -413,6 +413,16 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, return segs; } +static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb) +{ + skb->next = qh->head; + + if (!qh->head) + qh->tail = skb; + qh->head = skb; + qh->qlen++; +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. */ @@ -523,7 +533,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *last; if (sch->q.qlen) - last = skb_peek_tail(&sch->q); + last = sch->q.tail; else last = netem_rb_to_skb(rb_last(&q->t_root)); if (last) { @@ -552,7 +562,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, cb->time_to_send = psched_get_time(); q->counter = 0; - __skb_queue_head(&sch->q, skb); + netem_enqueue_skb_head(&sch->q, skb); sch->qstats.requeues++; } -- cgit v1.2.3 From 53f863a66904542b03204f2b115d050b04c11ba5 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 21 Jul 2016 14:12:40 +0200 Subject: Bluetooth: Put led_trigger field behind CONFIG_BT_LEDS The led_trigger field in hci_dev should be conditional on whether CONFIG_BT_LEDS is set or not. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ee7fc47680a1..b8d43bd9c71b 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -399,7 +399,9 @@ struct hci_dev { struct delayed_work rpa_expired; bdaddr_t rpa; +#if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; +#endif int (*open)(struct hci_dev *hdev); int (*close)(struct hci_dev *hdev); -- cgit v1.2.3 From 65010e68efbeda4275845240869138c0c4587422 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 12 Aug 2016 17:01:27 -0700 Subject: Bluetooth: Add HCI device identifier for Qualcomm SMD This patch assigns the next free HCI device identifier to Bluetooth devices based on the Qualcomm Shared Memory channels.
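
For context, a transport driver claims one of these bus identifiers when it registers its controller with the core. Below is a minimal, hypothetical sketch of that pattern (the example_smd_* names are illustrative and not part of this patch; the real Qualcomm SMD driver landed separately):

#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>

static int example_smd_open(struct hci_dev *hdev) { return 0; }
static int example_smd_close(struct hci_dev *hdev) { return 0; }

static int example_smd_send(struct hci_dev *hdev, struct sk_buff *skb)
{
	kfree_skb(skb);		/* a real driver would hand the frame to SMD */
	return 0;
}

static int example_smd_register(void)
{
	struct hci_dev *hdev;
	int err;

	hdev = hci_alloc_dev();
	if (!hdev)
		return -ENOMEM;

	hdev->bus = HCI_SMD;	/* the identifier added by this patch */
	hdev->open = example_smd_open;
	hdev->close = example_smd_close;
	hdev->send = example_smd_send;

	err = hci_register_dev(hdev);
	if (err < 0)
		hci_free_dev(hdev);
	return err;
}
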
Signed-off-by: Bjorn Andersson Signed-off-by: Bjorn Andersson Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 003b25283407..0aac123b5eee 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -63,6 +63,7 @@ #define HCI_SDIO 6 #define HCI_SPI 7 #define HCI_I2C 8 +#define HCI_SMD 9 /* HCI controller types */ #define HCI_PRIMARY 0x00 -- cgit v1.2.3 From 1aabbbcefe8e62fbffaaa01ca8bdd4cd6ed1625b Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Fri, 29 Jul 2016 13:28:25 +0200 Subject: Bluetooth: add printf format attribute to hci_set_[fh]w_info() Commit 5177a83827cd ("Bluetooth: Add debugfs fields for hardware and firmware info") introduced hci_set_hw_info() and hci_set_fw_info(). These functions use kvasprintf_const() but are not marked with a __printf attribute. Adding such an attribute helps detect issues related to printf-formatting at build time. Signed-off-by: Nicolas Iooss Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b8d43bd9c71b..cc349f633570 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1028,8 +1028,8 @@ int hci_resume_dev(struct hci_dev *hdev); int hci_reset_dev(struct hci_dev *hdev); int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb); int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb); -void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...); -void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...); +__printf(2, 3) void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...); +__printf(2, 3) void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...); int hci_dev_open(__u16 dev); int hci_dev_close(__u16 dev); int hci_dev_do_close(struct hci_dev *hdev); -- cgit v1.2.3 From 70ecce91e3a2d7e332fe56fd065c67d404b8fccf Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:38 +0200 Subject: Bluetooth: Store control socket cookie and comm information To further allow unique identification and tracking of control sockets, store cookie and comm information when binding the socket.
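
The cookie comes from an IDA allocator and lives for the lifetime of the bound socket. A condensed sketch of that allocate-on-bind/free-on-release pattern follows (the example_* names are illustrative, not the actual hci_sock.c symbols, and error handling is simplified):

#include <linux/idr.h>
#include <linux/sched.h>

static DEFINE_IDA(example_cookie_ida);

struct example_sock_info {
	u32 cookie;
	char comm[TASK_COMM_LEN];
};

static int example_bind(struct example_sock_info *info)
{
	/* Cookies start at 1 so that 0 can mean "unassigned". */
	int id = ida_simple_get(&example_cookie_ida, 1, 0, GFP_KERNEL);

	if (id < 0)
		return id;

	info->cookie = id;
	get_task_comm(info->comm, current);	/* remember who bound it */
	return 0;
}

static void example_release(struct example_sock_info *info)
{
	ida_simple_remove(&example_cookie_ida, info->cookie);
	info->cookie = 0;
}
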
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 1 + net/bluetooth/hci_sock.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index bfd1590821d6..69b5174168b7 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -371,6 +371,7 @@ void hci_sock_set_flag(struct sock *sk, int nr); void hci_sock_clear_flag(struct sock *sk, int nr); int hci_sock_test_flag(struct sock *sk, int nr); unsigned short hci_sock_get_channel(struct sock *sk); +u32 hci_sock_get_cookie(struct sock *sk); int hci_sock_init(void); void hci_sock_cleanup(void); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 99dd1503ef56..4dce6dfdb0f2 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -38,6 +39,8 @@ static LIST_HEAD(mgmt_chan_list); static DEFINE_MUTEX(mgmt_chan_list_lock); +static DEFINE_IDA(sock_cookie_ida); + static atomic_t monitor_promisc = ATOMIC_INIT(0); /* ----- HCI socket interface ----- */ @@ -52,6 +55,8 @@ struct hci_pinfo { __u32 cmsg_mask; unsigned short channel; unsigned long flags; + __u32 cookie; + char comm[TASK_COMM_LEN]; }; void hci_sock_set_flag(struct sock *sk, int nr) @@ -74,6 +79,11 @@ unsigned short hci_sock_get_channel(struct sock *sk) return hci_pi(sk)->channel; } +u32 hci_sock_get_cookie(struct sock *sk) +{ + return hci_pi(sk)->cookie; +} + static inline int hci_test_bit(int nr, const void *addr) { return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); } @@ -585,6 +595,7 @@ static int hci_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct hci_dev *hdev; + int id; BT_DBG("sock %p sk %p", sock, sk); @@ -593,8 +604,17 @@ static int hci_sock_release(struct socket *sock) hdev = hci_pi(sk)->hdev; - if (hci_pi(sk)->channel == HCI_CHANNEL_MONITOR) + switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); + break; + case HCI_CHANNEL_CONTROL: + id = hci_pi(sk)->cookie; + + hci_pi(sk)->cookie = 0xffffffff; + ida_simple_remove(&sock_cookie_ida, id); + break; + } bt_sock_unlink(&hci_sk_list, sk); @@ -957,6 +977,15 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, * are changes to settings, class of device, name etc. */ if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { + int id; + + id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); + if (id < 0) + id = 0xffffffff; + + hci_pi(sk)->cookie = id; + get_task_comm(hci_pi(sk)->comm, current); + hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); -- cgit v1.2.3 From 03c979c4717c7fa0c058fafe76ac4d6acdd1fb0d Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:39 +0200 Subject: Bluetooth: Introduce helper to pack mgmt version information The mgmt version information will also be needed for the control channel tracing feature. This provides a helper to pack them.
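
On the wire the packed form is one version byte followed by a little-endian 16-bit revision, matching the layout of struct mgmt_rp_read_version. A standalone userspace sketch of the same 3-byte packing (version 1.14 is only an illustrative value):

#include <stdint.h>
#include <stdio.h>

/* Pack a 1-byte version and a little-endian 16-bit revision into a
 * 3-byte buffer, the same wire layout as struct mgmt_rp_read_version. */
static void fill_version_info(uint8_t ver[3], uint8_t version, uint16_t revision)
{
	ver[0] = version;
	ver[1] = revision & 0xff;	/* low byte first: little-endian */
	ver[2] = revision >> 8;
}

int main(void)
{
	uint8_t ver[3];

	fill_version_info(ver, 1, 14);	/* illustrative version/revision */
	printf("mgmt version %u.%u\n", ver[0], ver[1] | (ver[2] << 8));
	return 0;
}
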
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/mgmt.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index cc349f633570..9f181b583b96 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1451,6 +1451,7 @@ void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c); #define DISCOV_BREDR_INQUIRY_LEN 0x08 #define DISCOV_LE_RESTART_DELAY msecs_to_jiffies(200) /* msec */ +void mgmt_fill_version_info(void *ver); int mgmt_new_settings(struct hci_dev *hdev); void mgmt_index_added(struct hci_dev *hdev); void mgmt_index_removed(struct hci_dev *hdev); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7639290b6de3..9071886df194 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -278,6 +278,14 @@ static u8 le_addr_type(u8 mgmt_addr_type) return ADDR_LE_DEV_RANDOM; } +void mgmt_fill_version_info(void *ver) +{ + struct mgmt_rp_read_version *rp = ver; + + rp->version = MGMT_VERSION; + rp->revision = cpu_to_le16(MGMT_REVISION); +} + static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -285,8 +293,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("sock %p", sk); - rp.version = MGMT_VERSION; - rp.revision = cpu_to_le16(MGMT_REVISION); + mgmt_fill_version_info(&rp); return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0, &rp, sizeof(rp)); -- cgit v1.2.3 From 249fa1699f8642c73eb43e61b321969f0549ab2c Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:40 +0200 Subject: Bluetooth: Add support for sending MGMT open and close to monitor This sends new notifications to the monitor support whenever a management channel has been opened or closed. This allows tracing of control channels really easily. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_mon.h | 2 + net/bluetooth/hci_sock.c | 95 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 587d0131b349..9640790cbbcc 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -45,6 +45,8 @@ struct hci_mon_hdr { #define HCI_MON_VENDOR_DIAG 11 #define HCI_MON_SYSTEM_NOTE 12 #define HCI_MON_USER_LOGGING 13 +#define HCI_MON_CTRL_OPEN 14 +#define HCI_MON_CTRL_CLOSE 15 struct hci_mon_new_index { __u8 type; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 4dce6dfdb0f2..2d8725006838 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -394,6 +394,59 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) return skb; } +static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + u16 format = 0x0002; + u8 ver[3]; + u32 flags; + + skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC); + if (!skb) + return NULL; + + mgmt_fill_version_info(ver); + flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 
0x1 : 0x0; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(format, skb_put(skb, 2)); + memcpy(skb_put(skb, sizeof(ver)), ver, sizeof(ver)); + put_unaligned_le32(flags, skb_put(skb, 4)); + *skb_put(skb, 1) = TASK_COMM_LEN; + memcpy(skb_put(skb, TASK_COMM_LEN), hci_pi(sk)->comm, TASK_COMM_LEN); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN); + hdr->index = cpu_to_le16(HCI_DEV_NONE); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + +static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(4, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE); + hdr->index = cpu_to_le16(HCI_DEV_NONE); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + static void __printf(2, 3) send_monitor_note(struct sock *sk, const char *fmt, ...) { @@ -468,6 +521,29 @@ static void send_monitor_replay(struct sock *sk) read_unlock(&hci_dev_list_lock); } +static void send_monitor_control_replay(struct sock *mon_sk) +{ + struct sock *sk; + + read_lock(&hci_sk_list.lock); + + sk_for_each(sk, &hci_sk_list.head) { + struct sk_buff *skb; + + if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) + continue; + + skb = create_monitor_ctrl_open(sk); + if (!skb) + continue; + + if (sock_queue_rcv_skb(mon_sk, skb)) + kfree_skb(skb); + } + + read_unlock(&hci_sk_list.lock); +} + /* Generate internal stack event */ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) { @@ -595,6 +671,7 @@ static int hci_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct hci_dev *hdev; + struct sk_buff *skb; int id; BT_DBG("sock %p sk %p", sock, sk); @@ -611,6 +688,14 @@ static int hci_sock_release(struct socket *sock) case HCI_CHANNEL_CONTROL: id = hci_pi(sk)->cookie; + /* Send event to monitor */ + skb = create_monitor_ctrl_close(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + hci_pi(sk)->cookie = 0xffffffff; ida_simple_remove(&sock_cookie_ida, id); break; @@ -931,6 +1016,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, send_monitor_note(sk, "Bluetooth subsystem version %s", BT_SUBSYS_VERSION); send_monitor_replay(sk); + send_monitor_control_replay(sk); atomic_inc(&monitor_promisc); break; @@ -977,6 +1063,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, * are changes to settings, class of device, name etc. 
*/ if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { + struct sk_buff *skb; int id; id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); @@ -986,6 +1073,14 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, hci_pi(sk)->cookie = id; get_task_comm(hci_pi(sk)->comm, current); + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); -- cgit v1.2.3 From 38ceaa00d02dceb22c6bdd5268f5a44d5c00e123 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:41 +0200 Subject: Bluetooth: Add support for sending MGMT commands and events to monitor This adds support for tracing all management commands and events via the monitor interface. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 3 ++ include/net/bluetooth/hci_mon.h | 2 + net/bluetooth/hci_sock.c | 94 ++++++++++++++++++++++++++++++++++++++++ net/bluetooth/mgmt_util.c | 66 ++++++++++++++++++++++++++-- 4 files changed, 162 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 9f181b583b96..a48f71d73dc8 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1406,6 +1406,9 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb); void hci_send_to_channel(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk); void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb); +void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, + void *data, u16 data_len, ktime_t tstamp, + int flag, struct sock *skip_sk); void hci_sock_dev_event(struct hci_dev *hdev, int event); diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 9640790cbbcc..240786b04a46 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -47,6 +47,8 @@ struct hci_mon_hdr { #define HCI_MON_USER_LOGGING 13 #define HCI_MON_CTRL_OPEN 14 #define HCI_MON_CTRL_CLOSE 15 +#define HCI_MON_CTRL_COMMAND 16 +#define HCI_MON_CTRL_EVENT 17 struct hci_mon_new_index { __u8 type; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 2d8725006838..576ea48631b9 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -315,6 +315,60 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(skb_copy); } +void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, + void *data, u16 data_len, ktime_t tstamp, + int flag, struct sock *skip_sk) +{ + struct sock *sk; + __le16 index; + + if (hdev) + index = cpu_to_le16(hdev->id); + else + index = cpu_to_le16(MGMT_INDEX_NONE); + + read_lock(&hci_sk_list.lock); + + sk_for_each(sk, &hci_sk_list.head) { + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) + continue; + + /* Ignore socket without the flag set */ + if (!hci_sock_test_flag(sk, flag)) + continue; + + /* Skip the original socket */ + if (sk == skip_sk) + continue; + + skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC); + if (!skb) + continue; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(event, skb_put(skb, 2)); + + if (data) + memcpy(skb_put(skb, data_len), data, data_len); + + skb->tstamp = 
tstamp; + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); + hdr->index = index; + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + + read_unlock(&hci_sk_list.lock); +} + static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) { struct hci_mon_hdr *hdr; @@ -447,6 +501,33 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) return skb; } +static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index, + u16 opcode, u16 len, + const void *buf) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(6 + len, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(opcode, skb_put(skb, 2)); + + if (buf) + memcpy(skb_put(skb, len), buf, len); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND); + hdr->index = cpu_to_le16(index); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + static void __printf(2, 3) send_monitor_note(struct sock *sk, const char *fmt, ...) { @@ -1257,6 +1338,19 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, goto done; } + if (chan->channel == HCI_CHANNEL_CONTROL) { + struct sk_buff *skb; + + /* Send event to monitor */ + skb = create_monitor_ctrl_command(sk, index, opcode, len, + buf + sizeof(*hdr)); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + if (opcode >= chan->handler_count || chan->handlers[opcode].func == NULL) { BT_DBG("Unknown op %u", opcode); diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index 8c30c7eb8bef..c933bd08c1fe 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -21,12 +21,41 @@ SOFTWARE IS DISCLAIMED. 
*/ +#include + #include #include +#include #include #include "mgmt_util.h" +static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie, + u16 opcode, u16 len, void *buf) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(6 + len, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(cookie, skb_put(skb, 4)); + put_unaligned_le16(opcode, skb_put(skb, 2)); + + if (buf) + memcpy(skb_put(skb, len), buf, len); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); + hdr->index = index; + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, void *data, u16 data_len, int flag, struct sock *skip_sk) { @@ -52,14 +81,18 @@ int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, __net_timestamp(skb); hci_send_to_channel(channel, skb, flag, skip_sk); - kfree_skb(skb); + if (channel == HCI_CHANNEL_CONTROL) + hci_send_monitor_ctrl_event(hdev, event, data, data_len, + skb_get_ktime(skb), flag, skip_sk); + + kfree_skb(skb); return 0; } int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) { - struct sk_buff *skb; + struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_status *ev; int err; @@ -80,17 +113,30 @@ int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) ev->status = status; ev->opcode = cpu_to_le16(cmd); + mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), + MGMT_EV_CMD_STATUS, sizeof(*ev), ev); + if (mskb) + skb->tstamp = mskb->tstamp; + else + __net_timestamp(skb); + err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); + if (mskb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(mskb); + } + return err; } int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, void *rp, size_t rp_len) { - struct sk_buff *skb; + struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_complete *ev; int err; @@ -114,10 +160,24 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, if (rp) memcpy(ev->data, rp, rp_len); + mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), + MGMT_EV_CMD_COMPLETE, + sizeof(*ev) + rp_len, ev); + if (mskb) + skb->tstamp = mskb->tstamp; + else + __net_timestamp(skb); + err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); + if (mskb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(mskb); + } + return err; } -- cgit v1.2.3 From 5504c3a31061704512707bb23bd7835e8a5281e4 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 29 Aug 2016 06:19:46 +0200 Subject: Bluetooth: Use individual flags for certain management events Instead of hiding everything behind a general management events flag, introduce individual flags that allow fine control over which events are sent to a given management channel.
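
The effect is that each bound socket carries a bitmask of the event classes it wants, and an event is only queued to sockets with the matching bit set. A small userspace model of that dispatch logic (the event names here are illustrative stand-ins for the kernel's HCI_MGMT_*_EVENTS flags):

#include <stdbool.h>
#include <stdio.h>

enum { EV_OPTION, EV_SETTING, EV_DEV_CLASS, EV_LOCAL_NAME };

struct subscriber {
	const char *name;
	unsigned long flags;		/* one bit per event type */
};

static void set_flag(struct subscriber *s, int ev)
{
	s->flags |= 1UL << ev;
}

static bool test_flag(const struct subscriber *s, int ev)
{
	return s->flags & (1UL << ev);
}

static void deliver(struct subscriber *subs, int n, int ev, const char *what)
{
	int i;

	for (i = 0; i < n; i++)
		if (test_flag(&subs[i], ev))	/* skip sockets without the flag */
			printf("%s <- %s\n", subs[i].name, what);
}

int main(void)
{
	struct subscriber subs[2] = { { "mgmt-sock", 0 }, { "other-sock", 0 } };

	set_flag(&subs[0], EV_SETTING);
	set_flag(&subs[0], EV_LOCAL_NAME);
	set_flag(&subs[1], EV_SETTING);

	deliver(subs, 2, EV_SETTING, "New Settings");		/* both receive it */
	deliver(subs, 2, EV_LOCAL_NAME, "Local Name Changed");	/* only the first */
	return 0;
}
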
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 5 ++++- net/bluetooth/hci_sock.c | 5 ++++- net/bluetooth/mgmt.c | 32 +++++++++++++------------------- 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 0aac123b5eee..ddb9accac3a5 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -208,7 +208,10 @@ enum { HCI_MGMT_INDEX_EVENTS, HCI_MGMT_UNCONF_INDEX_EVENTS, HCI_MGMT_EXT_INDEX_EVENTS, - HCI_MGMT_GENERIC_EVENTS, + HCI_MGMT_OPTION_EVENTS, + HCI_MGMT_SETTING_EVENTS, + HCI_MGMT_DEV_CLASS_EVENTS, + HCI_MGMT_LOCAL_NAME_EVENTS, HCI_MGMT_OOB_DATA_EVENTS, }; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 576ea48631b9..d37c2243157b 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1164,7 +1164,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); - hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); } break; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f9af5f7c2ea2..469f5cc3109b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -256,13 +256,6 @@ static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data, flag, skip_sk); } -static int mgmt_generic_event(u16 event, struct hci_dev *hdev, void *data, - u16 len, struct sock *skip_sk) -{ - return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len, - HCI_MGMT_GENERIC_EVENTS, skip_sk); -} - static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len, struct sock *skip_sk) { @@ -579,8 +572,8 @@ static int new_options(struct hci_dev *hdev, struct sock *skip) { __le32 options = get_missing_options(hdev); - return mgmt_generic_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, - sizeof(options), skip); + return mgmt_limited_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, + sizeof(options), HCI_MGMT_OPTION_EVENTS, skip); } static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) @@ -1007,8 +1000,8 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip) { __le32 ev = cpu_to_le32(get_current_settings(hdev)); - return mgmt_generic_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, - sizeof(ev), skip); + return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, + sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip); } int mgmt_new_settings(struct hci_dev *hdev) @@ -3000,8 +2993,8 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, if (err < 0) goto failed; - err = mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, - data, len, sk); + err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data, + len, HCI_MGMT_LOCAL_NAME_EVENTS, sk); goto failed; } @@ -6502,8 +6495,9 @@ void __mgmt_power_off(struct hci_dev *hdev) mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) - mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, - zero_cod, sizeof(zero_cod), NULL); + mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, + zero_cod, sizeof(zero_cod), + HCI_MGMT_DEV_CLASS_EVENTS, NULL); new_settings(hdev, match.sk); @@ -7100,8 +7094,8 @@ void 
mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); if (!status) - mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, - dev_class, 3, NULL); + mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, + 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL); if (match.sk) sock_put(match.sk); @@ -7130,8 +7124,8 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) return; } - mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), - cmd ? cmd->sk : NULL); + mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), + HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL); } static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16]) -- cgit v1.2.3 From 9e8305b39bfa23a83b932007654097f4676c2ba2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:35 +0200 Subject: Bluetooth: Use numbers for subsystem version string Instead of keeping a version string around, use version and revision numbers and then stringify them for use as module parameter. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 3 ++- net/bluetooth/af_bluetooth.c | 10 +++++++--- net/bluetooth/hci_sock.c | 4 ++-- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 69b5174168b7..d705bcf40710 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -29,7 +29,8 @@ #include #include -#define BT_SUBSYS_VERSION "2.21" +#define BT_SUBSYS_VERSION 2 +#define BT_SUBSYS_REVISION 21 #ifndef AF_BLUETOOTH #define AF_BLUETOOTH 31 diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 1d96ff3a8d87..1aff2da9bc74 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -713,13 +714,16 @@ static struct net_proto_family bt_sock_family_ops = { struct dentry *bt_debugfs; EXPORT_SYMBOL_GPL(bt_debugfs); +#define VERSION __stringify(BT_SUBSYS_VERSION) "." 
\ + __stringify(BT_SUBSYS_REVISION) + static int __init bt_init(void) { int err; sock_skb_cb_check_size(sizeof(struct bt_skb_cb)); - BT_INFO("Core ver %s", BT_SUBSYS_VERSION); + BT_INFO("Core ver %s", VERSION); err = bt_selftest(); if (err < 0) @@ -797,7 +801,7 @@ subsys_initcall(bt_init); module_exit(bt_exit); MODULE_AUTHOR("Marcel Holtmann "); -MODULE_DESCRIPTION("Bluetooth Core ver " BT_SUBSYS_VERSION); -MODULE_VERSION(BT_SUBSYS_VERSION); +MODULE_DESCRIPTION("Bluetooth Core ver " VERSION); +MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_BLUETOOTH); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 804208d48368..a4227c777d16 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1117,8 +1117,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, send_monitor_note(sk, "Linux version %s (%s)", init_utsname()->release, init_utsname()->machine); - send_monitor_note(sk, "Bluetooth subsystem version %s", - BT_SUBSYS_VERSION); + send_monitor_note(sk, "Bluetooth subsystem version %u.%u", + BT_SUBSYS_VERSION, BT_SUBSYS_REVISION); send_monitor_replay(sk); send_monitor_control_replay(sk); -- cgit v1.2.3 From 321c6feed2519a2691f65e41c4d62332d6ee3d52 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 1 Sep 2016 16:46:23 +0200 Subject: Bluetooth: Add framework for Extended Controller Information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This command is used to retrieve the current state and basic information of a controller. It is typically used right after getting the response to the Read Controller Index List command or an Index Added event (or its extended counterparts). When any of the values in the EIR_Data field changes, the event Extended Controller Information Changed will be used to inform clients about the updated information. 
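
On the wire the reply is a fixed 19-byte header (bdaddr, HCI version, manufacturer, supported and current settings) followed by eir_len bytes of EIR data, per struct mgmt_rp_read_ext_info. A userspace sketch of parsing that layout (the buffer contents are fabricated for illustration):

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

static uint16_t get_le16(const uint8_t *p)
{
	return p[0] | (p[1] << 8);
}

static uint32_t get_le32(const uint8_t *p)
{
	return p[0] | (p[1] << 8) | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* Walk the fixed 19-byte header of mgmt_rp_read_ext_info, then the EIR blob. */
static void parse_ext_info(const uint8_t *buf, size_t len)
{
	uint8_t version;
	uint16_t manuf, eir_len;
	uint32_t supported, current;

	if (len < 19)
		return;

	/* buf[0..5] holds the bdaddr in little-endian byte order */
	version   = buf[6];
	manuf     = get_le16(buf + 7);
	supported = get_le32(buf + 9);
	current   = get_le32(buf + 13);
	eir_len   = get_le16(buf + 17);

	printf("hci ver %u manufacturer %u settings %08x/%08x eir %u bytes\n",
	       version, manuf, supported, current, eir_len);
	/* buf + 19 .. buf + 19 + eir_len would hold the EIR data */
}

int main(void)
{
	/* A fabricated response: HCI version 8, manufacturer 2, no EIR. */
	uint8_t rsp[19] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
			    8, 0x02, 0x00,
			    0xff, 0xff, 0x00, 0x00,
			    0x41, 0x0a, 0x00, 0x00,
			    0x00, 0x00 };

	parse_ext_info(rsp, sizeof(rsp));
	return 0;
}
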
Signed-off-by: Marcel Holtmann Signed-off-by: Michał Narajowski --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/mgmt.h | 18 +++++++++++++ net/bluetooth/mgmt.c | 62 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index ddb9accac3a5..99aa5e5e3100 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -208,6 +208,7 @@ enum { HCI_MGMT_INDEX_EVENTS, HCI_MGMT_UNCONF_INDEX_EVENTS, HCI_MGMT_EXT_INDEX_EVENTS, + HCI_MGMT_EXT_INFO_EVENTS, HCI_MGMT_OPTION_EVENTS, HCI_MGMT_SETTING_EVENTS, HCI_MGMT_DEV_CLASS_EVENTS, diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 7647964b1efa..611b243713ea 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -586,6 +586,18 @@ struct mgmt_rp_get_adv_size_info { #define MGMT_OP_START_LIMITED_DISCOVERY 0x0041 +#define MGMT_OP_READ_EXT_INFO 0x0042 +#define MGMT_READ_EXT_INFO_SIZE 0 +struct mgmt_rp_read_ext_info { + bdaddr_t bdaddr; + __u8 version; + __le16 manufacturer; + __le32 supported_settings; + __le32 current_settings; + __le16 eir_len; + __u8 eir[0]; +} __packed; + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; @@ -800,3 +812,9 @@ struct mgmt_ev_advertising_added { struct mgmt_ev_advertising_removed { __u8 instance; } __packed; + +#define MGMT_EV_EXT_INFO_CHANGED 0x0025 +struct mgmt_ev_ext_info_changed { + __le16 eir_len; + __u8 eir[0]; +} __packed; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 47efdb4a669a..69001f415efa 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -104,6 +104,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_REMOVE_ADVERTISING, MGMT_OP_GET_ADV_SIZE_INFO, MGMT_OP_START_LIMITED_DISCOVERY, + MGMT_OP_READ_EXT_INFO, }; static const u16 mgmt_events[] = { @@ -141,6 +142,7 @@ static const u16 mgmt_events[] = { MGMT_EV_LOCAL_OOB_DATA_UPDATED, MGMT_EV_ADVERTISING_ADDED, MGMT_EV_ADVERTISING_REMOVED, + MGMT_EV_EXT_INFO_CHANGED, }; static const u16 mgmt_untrusted_commands[] = { @@ -149,6 +151,7 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_UNCONF_INDEX_LIST, MGMT_OP_READ_CONFIG_INFO, MGMT_OP_READ_EXT_INDEX_LIST, + MGMT_OP_READ_EXT_INFO, }; static const u16 mgmt_untrusted_events[] = { @@ -162,6 +165,7 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_NEW_CONFIG_OPTIONS, MGMT_EV_EXT_INDEX_ADDED, MGMT_EV_EXT_INDEX_REMOVED, + MGMT_EV_EXT_INFO_CHANGED, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -862,6 +866,52 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev, sizeof(rp)); } +static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_rp_read_ext_info rp; + + BT_DBG("sock %p %s", sk, hdev->name); + + hci_dev_lock(hdev); + + memset(&rp, 0, sizeof(rp)); + + bacpy(&rp.bdaddr, &hdev->bdaddr); + + rp.version = hdev->hci_ver; + rp.manufacturer = cpu_to_le16(hdev->manufacturer); + + rp.supported_settings = cpu_to_le32(get_supported_settings(hdev)); + rp.current_settings = cpu_to_le32(get_current_settings(hdev)); + + rp.eir_len = cpu_to_le16(0); + + hci_dev_unlock(hdev); + + /* If this command is called at least once, then the events + * for class of device and local name changes are disabled + * and only the new extended controller information event + * is used.
+ */ + hci_sock_set_flag(sk, HCI_MGMT_EXT_INFO_EVENTS); + hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); + hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, &rp, + sizeof(rp)); +} + +static int ext_info_changed(struct hci_dev *hdev, struct sock *skip) +{ + struct mgmt_ev_ext_info_changed ev; + + ev.eir_len = cpu_to_le16(0); + + return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, &ev, + sizeof(ev), HCI_MGMT_EXT_INFO_EVENTS, skip); +} + static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) { __le32 settings = cpu_to_le32(get_current_settings(hdev)); @@ -2995,6 +3045,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data, len, HCI_MGMT_LOCAL_NAME_EVENTS, sk); + ext_info_changed(hdev, sk); goto failed; } @@ -6356,6 +6407,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE }, { get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE }, { start_limited_discovery, MGMT_START_DISCOVERY_SIZE }, + { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, + HCI_MGMT_UNTRUSTED }, }; void mgmt_index_added(struct hci_dev *hdev) @@ -6494,10 +6547,12 @@ void __mgmt_power_off(struct hci_dev *hdev) mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); - if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) + if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, zero_cod, sizeof(zero_cod), HCI_MGMT_DEV_CLASS_EVENTS, NULL); + ext_info_changed(hdev, NULL); + } new_settings(hdev, match.sk); @@ -7093,9 +7148,11 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match); mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); - if (!status) + if (!status) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL); + ext_info_changed(hdev, NULL); + } if (match.sk) sock_put(match.sk); @@ -7126,6 +7183,7 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL); + ext_info_changed(hdev, cmd ? cmd->sk : NULL); } static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16]) -- cgit v1.2.3 From 4037a7747d7b5a3e5bb4d10fb9ea6e2fd8a23c3b Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 8 Sep 2016 18:07:07 +0200 Subject: Bluetooth: Increase the subsystem minor version number While the subsystem version information is purely informational, increase the minor number due to the addition of user channel and management control monitoring support. It is helpful for debugging purposes to see the version numbers change.
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index d705bcf40710..0a1e21d7bce1 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -30,7 +30,7 @@ #include #define BT_SUBSYS_VERSION 2 -#define BT_SUBSYS_REVISION 21 +#define BT_SUBSYS_REVISION 22 #ifndef AF_BLUETOOTH #define AF_BLUETOOTH 31 -- cgit v1.2.3 From c4960ecf2b09210930964ef2c05ce2590802ccf4 Mon Sep 17 00:00:00 2001 From: Michał Narajowski Date: Sun, 18 Sep 2016 12:50:03 +0200 Subject: Bluetooth: Add support for appearance in scan rsp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch enables prepending the appearance value to scan response data. It also adds support for setting the appearance value through a mgmt command. If the currently advertised instance has the appearance flag set, it is expired immediately. Signed-off-by: Michał Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + include/net/bluetooth/mgmt.h | 6 ++++++ net/bluetooth/hci_request.c | 8 ++++++++ net/bluetooth/mgmt.c | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a48f71d73dc8..f00bf667ec33 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -211,6 +211,7 @@ struct hci_dev { __u8 dev_name[HCI_MAX_NAME_LENGTH]; __u8 short_name[HCI_MAX_SHORT_NAME_LENGTH]; __u8 eir[HCI_MAX_EIR_LENGTH]; + __u16 appearance; __u8 dev_class[3]; __u8 major_class; __u8 minor_class; diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 611b243713ea..72a456bbbcd5 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -598,6 +598,12 @@ struct mgmt_rp_read_ext_info { __u8 eir[0]; } __packed; +#define MGMT_OP_SET_APPEARANCE 0x0043 +struct mgmt_cp_set_appearance { + __u16 appearance; +} __packed; +#define MGMT_SET_APPEARANCE_SIZE 2 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 0ce6cdd278b2..c8135680c43e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1015,6 +1015,14 @@ static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, instance_flags = adv_instance->flags; + if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) { + ptr[0] = 3; + ptr[1] = EIR_APPEARANCE; + put_unaligned_le16(hdev->appearance, ptr + 2); + scan_rsp_len += 4; + ptr += 4; + } + memcpy(ptr, adv_instance->scan_rsp_data, adv_instance->scan_rsp_len); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 89954bb19222..78d708851208 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -105,6 +105,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_GET_ADV_SIZE_INFO, MGMT_OP_START_LIMITED_DISCOVERY, MGMT_OP_READ_EXT_INFO, + MGMT_OP_SET_APPEARANCE, }; static const u16 mgmt_events[] = { @@ -3143,6 +3144,34 @@ failed: return err; } +static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, + u16 len) +{ + struct mgmt_cp_set_appearance *cp = data; + u16 appearance; + int err; + + BT_DBG(""); + + appearance = le16_to_cpu(cp->appearance); + + hci_dev_lock(hdev); + + if (hdev->appearance != appearance) { + hdev->appearance = appearance; + + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE); + } + + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL, + 0); + + hci_dev_unlock(hdev); + + return err; +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -5918,6 +5947,7 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) flags |= MGMT_ADV_FLAG_DISCOV; flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; flags |= MGMT_ADV_FLAG_MANAGED_FLAGS; + flags |= MGMT_ADV_FLAG_APPEARANCE; flags |= MGMT_ADV_FLAG_LOCAL_NAME; if (hdev->adv_tx_power != HCI_TX_POWER_INVALID) @@ -5999,6 +6029,9 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, /* at least 1 byte of name should fit in */ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) max_len -= 3; + + if (adv_flags & MGMT_ADV_FLAG_APPEARANCE) + max_len -= 4; } if (len > max_len) @@ -6335,6 +6368,9 @@ static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) /* at least 1 byte of name should fit in */ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) max_len -= 3; + + if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE)) + max_len -= 4; } return max_len; @@ -6470,6 +6506,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { start_limited_discovery, MGMT_START_DISCOVERY_SIZE }, { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, HCI_MGMT_UNTRUSTED }, + { set_appearance, MGMT_SET_APPEARANCE_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3
From 12703dbfeb15402260e7554d32a34ac40c233990 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 19 Sep 2016 14:44:27 -0700 Subject: fsnotify: add a way to stop queueing events on group shutdown Implement a function that can be called when a group is being shut down to stop queueing new events to the group. Fanotify will use this. Fixes: 5838d4442bd5 ("fanotify: fix double free of pending permission events") Link: http://lkml.kernel.org/r/1473797711-14111-2-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Miklos Szeredi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/group.c | 19 +++++++++++++++++++ fs/notify/notification.c | 8 +++++++- include/linux/fsnotify_backend.h | 3 +++ 3 files changed, 29 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/notify/group.c b/fs/notify/group.c index 3e2dd85be5dd..b47f7cfdcaa4 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -39,6 +39,17 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) kfree(group); } +/* + * Stop queueing new events for this group. Once this function returns, + * fsnotify_add_event() will not add any new events to the group's queue. + */ +void fsnotify_group_stop_queueing(struct fsnotify_group *group) +{ + mutex_lock(&group->notification_mutex); + group->shutdown = true; + mutex_unlock(&group->notification_mutex); +} + /* * Trying to get rid of a group. Remove all marks, flush all events and release * the group reference. @@ -47,6 +58,14 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_destroy_group(struct fsnotify_group *group) { + /* + * Stop queueing new events. The code below is careful enough to not + * require this but fanotify needs to stop queuing events even before + * fsnotify_destroy_group() is called and this makes the other callers + * of fsnotify_destroy_group() see the same behavior.
+ */ + fsnotify_group_stop_queueing(group); + /* clear all inode marks for this group, attach them to destroy_list */ fsnotify_detach_group_marks(group); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index a95d8e037aeb..3d76e65ff84f 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -82,7 +82,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * Add an event to the group notification queue. The group can later pull this * event off the queue to deal with. The function returns 0 if the event was * added to the queue, 1 if the event was merged with some other queued event, - * 2 if the queue of events has overflown. + * 2 if the event was not queued - either the queue of events has overflown + * or the group is shutting down. */ int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, @@ -96,6 +97,11 @@ int fsnotify_add_event(struct fsnotify_group *group, mutex_lock(&group->notification_mutex); + if (group->shutdown) { + mutex_unlock(&group->notification_mutex); + return 2; + } + if (group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 58205f33af02..40a9e99de703 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -148,6 +148,7 @@ struct fsnotify_group { #define FS_PRIO_1 1 /* fanotify content based access control */ #define FS_PRIO_2 2 /* fanotify pre-content access */ unsigned int priority; + bool shutdown; /* group is being shut down, don't queue more events */ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ @@ -292,6 +293,8 @@ extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *op extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ extern void fsnotify_put_group(struct fsnotify_group *group); +/* group destruction begins, stop queuing new events */ +extern void fsnotify_group_stop_queueing(struct fsnotify_group *group); /* destroy group */ extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ -- cgit v1.2.3 From 96d41019e3ac55f6f0115b0ce97e4f24a3d636d2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 19 Sep 2016 14:44:30 -0700 Subject: fanotify: fix list corruption in fanotify_get_response() fanotify_get_response() calls fsnotify_remove_event() when it finds that group is being released from fanotify_release() (bypass_perm is set). However the event it removes need not be only in the group's notification queue but it can have already moved to access_list (userspace read the event before closing the fanotify instance fd) which is protected by a different lock. Thus when fsnotify_remove_event() races with fanotify_release() operating on access_list, the list can get corrupted. Fix the problem by moving all the logic removing permission events from the lists to one place - fanotify_release(). 
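To picture where access_list entries come from, here is a minimal sketch of the userspace half of a fanotify permission-event loop; the mount point is a placeholder, error handling is elided, and events are read one at a time for simplicity.

#include <fcntl.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	struct fanotify_event_metadata ev;
	struct fanotify_response resp;
	int fd;

	/* FAN_CLASS_CONTENT is what permission events require */
	fd = fanotify_init(FAN_CLASS_CONTENT, O_RDONLY);
	fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT, FAN_OPEN_PERM,
		      AT_FDCWD, "/mnt");

	/* Between the read() and the write() below, the event sits on
	 * the group's access_list, guarded by access_lock. */
	while (read(fd, &ev, sizeof(ev)) > 0) {
		resp.fd = ev.fd;
		resp.response = FAN_ALLOW;
		write(fd, &resp, sizeof(resp));
		close(ev.fd);
	}
	return 0;
}

If such a program exits while events are still pending, fanotify_release() now dequeues everything itself and fakes a FAN_ALLOW response, instead of racing with fsnotify_remove_event() across two differently locked lists.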
Fixes: 5838d4442bd5 ("fanotify: fix double free of pending permission events") Link: http://lkml.kernel.org/r/1473797711-14111-3-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reported-by: Miklos Szeredi Tested-by: Miklos Szeredi Reviewed-by: Miklos Szeredi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify.c | 13 +------------ fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++------------ fs/notify/notification.c | 15 --------------- include/linux/fsnotify_backend.h | 3 --- 4 files changed, 25 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d2f97ecca6a5..e0e5f7c3c99f 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - wait_event(group->fanotify_data.access_waitq, event->response || - atomic_read(&group->fanotify_data.bypass_perm)); - - if (!event->response) { /* bypass_perm set */ - /* - * Event was canceled because group is being destroyed. Remove - * it from group's event list because we are responsible for - * freeing the permission event. - */ - fsnotify_remove_event(group, &event->fae.fse); - return 0; - } + wait_event(group->fanotify_data.access_waitq, event->response); /* userspace responded, convert to something usable */ switch (event->response) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8e8e6bcd1d43..a64313868d3a 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; + struct fsnotify_event *fsn_event; /* - * There may be still new events arriving in the notification queue - * but since userspace cannot use fanotify fd anymore, no event can - * enter or leave access_list by now. + * Stop new events from arriving in the notification queue. Since + * userspace cannot use fanotify fd anymore, no event can enter or + * leave access_list by now either. */ - spin_lock(&group->fanotify_data.access_lock); - - atomic_inc(&group->fanotify_data.bypass_perm); + fsnotify_group_stop_queueing(group); + /* + * Process all permission events on access_list and notification queue + * and simulate reply from userspace. + */ + spin_lock(&group->fanotify_data.access_lock); list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, fae.fse.list) { pr_debug("%s: found group=%p event=%p\n", __func__, group, @@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file) spin_unlock(&group->fanotify_data.access_lock); /* - * Since bypass_perm is set, newly queued events will not wait for - * access response. Wake up the already sleeping ones now. - * synchronize_srcu() in fsnotify_destroy_group() will wait for all - * processes sleeping in fanotify_handle_event() waiting for access - * response and thus also for all permission events to be freed. + * Destroy all non-permission events. For permission events just + * dequeue them and set the response. They will be freed once the + * response is consumed and fanotify_get_response() returns.
*/ + mutex_lock(&group->notification_mutex); + while (!fsnotify_notify_queue_is_empty(group)) { + fsn_event = fsnotify_remove_first_event(group); + if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) + fsnotify_destroy_event(group, fsn_event); + else + FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; + } + mutex_unlock(&group->notification_mutex); + + /* Response for all permission events is set, wake up waiters */ wake_up(&group->fanotify_data.access_waitq); #endif @@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - atomic_set(&group->fanotify_data.bypass_perm, 0); #endif switch (flags & FAN_ALL_CLASS_BITS) { case FAN_CLASS_NOTIF: diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 3d76e65ff84f..e455e83ceeeb 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -131,21 +131,6 @@ queue: return ret; } -/* - * Remove @event from group's notification queue. It is the responsibility of - * the caller to destroy the event. - */ -void fsnotify_remove_event(struct fsnotify_group *group, - struct fsnotify_event *event) -{ - mutex_lock(&group->notification_mutex); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - group->q_len--; - } - mutex_unlock(&group->notification_mutex); -} - /* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 40a9e99de703..7268ed076be8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -180,7 +180,6 @@ struct fsnotify_group { spinlock_t access_lock; struct list_head access_list; wait_queue_head_t access_waitq; - atomic_t bypass_perm; #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ int f_flags; unsigned int max_marks; @@ -307,8 +306,6 @@ extern int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct list_head *, struct fsnotify_event *)); -/* Remove passed event from groups notification queue */ -extern void fsnotify_remove_event(struct fsnotify_group *group, struct fsnotify_event *event); /* true if the group notification queue is empty */ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ -- cgit v1.2.3 From 6a5d58b67e205f2ffc62d0a9ee4ef7d237e9a7fb Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 18 Sep 2016 07:31:42 -0400 Subject: net sched ife action: add 16 bit helpers Encoder and checker for 16-bit metadata. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S.
Miller --- include/net/tc_act/tc_ife.h | 2 ++ net/sched/act_ife.c | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 5164bd7a38fb..9fd2bea0a6e0 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -50,9 +50,11 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp); int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp); int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi); +int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi); int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi); int ife_validate_meta_u32(void *val, int len); int ife_validate_meta_u16(void *val, int len); +int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi); void ife_release_meta_gen(struct tcf_meta_info *mi); int register_ife_op(struct tcf_meta_ops *mops); int unregister_ife_op(struct tcf_meta_ops *mops); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index e87cd81315e1..ccf7b4b655fe 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -63,6 +63,23 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval) } EXPORT_SYMBOL_GPL(ife_tlv_meta_encode); +int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi) +{ + u16 edata = 0; + + if (mi->metaval) + edata = *(u16 *)mi->metaval; + else if (metaval) + edata = metaval; + + if (!edata) /* will not encode */ + return 0; + + edata = htons(edata); + return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata); +} +EXPORT_SYMBOL_GPL(ife_encode_meta_u16); + int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi) { if (mi->metaval) @@ -81,6 +98,15 @@ int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi) } EXPORT_SYMBOL_GPL(ife_check_meta_u32); +int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi) +{ + if (metaval || mi->metaval) + return 8; /* T+L+(V) == 2+2+(2+2bytepad) */ + + return 0; +} +EXPORT_SYMBOL_GPL(ife_check_meta_u16); + int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi) { u32 edata = metaval; -- cgit v1.2.3 From 408fbc22ef1efb00dd896acd00e9f7d9b641e047 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 18 Sep 2016 07:31:43 -0400 Subject: net sched ife action: Introduce skb tcindex metadata encap decap Sample use case of how this is encoded: user space via tuntap (or a connected VM/Machine/container) encodes the tcindex TLV. Sample use case of decoding: IFE action decodes it and the skb->tc_index is then used to classify. So something like this for encoded ICMP packets: .. first decode then reclassify... skb->tcindex will be set sudo $TC filter add dev $ETH parent ffff: prio 2 protocol 0xbeef \ u32 match u32 0 0 flowid 1:1 \ action ife decode reclassify ...next match the decode icmp packet... sudo $TC filter add dev $ETH parent ffff: prio 4 protocol ip \ u32 match ip protocol 1 0xff flowid 1:1 \ action continue ... last classify it using the tcindex classifier and do someaction.. sudo $TC filter add dev $ETH parent ffff: prio 5 protocol ip \ handle 0x11 tcindex classid 1:1 \ action blah.. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- include/uapi/linux/tc_act/tc_ife.h | 3 +- net/sched/Kconfig | 5 +++ net/sched/Makefile | 1 + net/sched/act_meta_skbtcindex.c | 79 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 net/sched/act_meta_skbtcindex.c (limited to 'include') diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h index 4ece02a77b9a..cd18360eca24 100644 --- a/include/uapi/linux/tc_act/tc_ife.h +++ b/include/uapi/linux/tc_act/tc_ife.h @@ -32,8 +32,9 @@ enum { #define IFE_META_HASHID 2 #define IFE_META_PRIO 3 #define IFE_META_QMAP 4 +#define IFE_META_TCINDEX 5 /*Can be overridden at runtime by module option*/ -#define __IFE_META_MAX 5 +#define __IFE_META_MAX 6 #define IFE_META_MAX (__IFE_META_MAX - 1) #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 7795d5a3f79a..87956a768d1b 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -793,6 +793,11 @@ config NET_IFE_SKBPRIO depends on NET_ACT_IFE ---help--- +config NET_IFE_SKBTCINDEX + tristate "Support to encoding decoding skb tcindex on IFE action" + depends on NET_ACT_IFE + ---help--- + config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index 148ae0d5ac2c..4bdda3634e0b 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o +obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c new file mode 100644 index 000000000000..3b35774ce890 --- /dev/null +++ b/net/sched/act_meta_skbtcindex.c @@ -0,0 +1,79 @@ +/* + * net/sched/act_meta_tc_index.c IFE skb->tc_index metadata module + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * copyright Jamal Hadi Salim (2016) + * +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int skbtcindex_encode(struct sk_buff *skb, void *skbdata, + struct tcf_meta_info *e) +{ + u32 ifetc_index = skb->tc_index; + + return ife_encode_meta_u16(ifetc_index, skbdata, e); +} + +static int skbtcindex_decode(struct sk_buff *skb, void *data, u16 len) +{ + u16 ifetc_index = *(u16 *)data; + + skb->tc_index = ntohs(ifetc_index); + return 0; +} + +static int skbtcindex_check(struct sk_buff *skb, struct tcf_meta_info *e) +{ + return ife_check_meta_u16(skb->tc_index, e); +} + +static struct tcf_meta_ops ife_skbtcindex_ops = { + .metaid = IFE_META_TCINDEX, + .metatype = NLA_U16, + .name = "tc_index", + .synopsis = "skb tc_index 16 bit metadata", + .check_presence = skbtcindex_check, + .encode = skbtcindex_encode, + .decode = skbtcindex_decode, + .get = ife_get_meta_u16, + .alloc = ife_alloc_meta_u16, + .release = ife_release_meta_gen, + .validate = ife_validate_meta_u16, + .owner = THIS_MODULE, +}; + +static int __init ifetc_index_init_module(void) +{ + return register_ife_op(&ife_skbtcindex_ops); +} + +static void __exit ifetc_index_cleanup_module(void) +{ + unregister_ife_op(&ife_skbtcindex_ops); +} + +module_init(ifetc_index_init_module); +module_exit(ifetc_index_cleanup_module); + +MODULE_AUTHOR("Jamal Hadi Salim(2016)"); +MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX); -- cgit v1.2.3 From ca26893f05e86497a86732768ec53cd38c0819ca Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 19 Sep 2016 19:00:09 +0800 Subject: rhashtable: Add rhlist interface The insecure_elasticity setting is an ugly wart brought out by users who need to insert duplicate objects (that is, distinct objects with identical keys) into the same table. In fact, those users have a much bigger problem. Once those duplicate objects are inserted, they don't have an interface to find them (unless you count the walker interface which walks over the entire table). Some users have resorted to doing a manual walk over the hash table which is of course broken because they don't handle the potential existence of multiple hash tables. The result is that they will break sporadically when they encounter a hash table resize/rehash. This patch provides a way out for those users, at the expense of an extra pointer per object. Essentially each object is now a list of objects carrying the same key. The hash table will only see the lists so nothing changes as far as rhashtable is concerned. To use this new interface, you need to insert a struct rhlist_head into your objects instead of struct rhash_head. While the hash table is unchanged, for type-safety you'll need to use struct rhltable instead of struct rhashtable. All the existing interfaces have been duplicated for rhlist, including the hash table walker. One missing feature is nulls marking because AFAIK the only potential user of it does not need duplicate objects. Should anyone need this it shouldn't be too hard to add. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/linux/rhashtable.h | 491 ++++++++++++++++++++++++++++++++++----------- lib/rhashtable.c | 258 +++++++++++++++++++----- 2 files changed, 583 insertions(+), 166 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index fd82584acd48..5c132d3188be 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1,7 +1,7 @@ /* * Resizable, Scalable, Concurrent Hash Table * - * Copyright (c) 2015 Herbert Xu + * Copyright (c) 2015-2016 Herbert Xu * Copyright (c) 2014-2015 Thomas Graf * Copyright (c) 2008-2014 Patrick McHardy * @@ -53,6 +53,11 @@ struct rhash_head { struct rhash_head __rcu *next; }; +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets @@ -137,6 +142,7 @@ struct rhashtable_params { * @key_len: Key length for hashfn * @elasticity: Maximum chain length before rehash * @p: Configuration parameters + * @rhlist: True if this is an rhltable * @run_work: Deferred worker to expand/shrink asynchronously * @mutex: Mutex to protect current/future table swapping * @lock: Spin lock to protect walker list @@ -147,11 +153,20 @@ struct rhashtable { unsigned int key_len; unsigned int elasticity; struct rhashtable_params p; + bool rhlist; struct work_struct run_work; struct mutex mutex; spinlock_t lock; }; +/** + * struct rhltable - Hash table with duplicate objects in a list + * @ht: Underlying rhtable + */ +struct rhltable { + struct rhashtable ht; +}; + /** * struct rhashtable_walker - Hash table walker * @list: List entry on list of walkers @@ -163,9 +178,10 @@ struct rhashtable_walker { }; /** - * struct rhashtable_iter - Hash table iterator, fits into netlink cb + * struct rhashtable_iter - Hash table iterator * @ht: Table to iterate through * @p: Current pointer + * @list: Current hash list pointer * @walker: Associated rhashtable walker * @slot: Current slot * @skip: Number of entries to skip in slot @@ -173,6 +189,7 @@ struct rhashtable_walker { struct rhashtable_iter { struct rhashtable *ht; struct rhash_head *p; + struct rhlist_head *list; struct rhashtable_walker walker; unsigned int slot; unsigned int skip; @@ -339,13 +356,11 @@ static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, int rhashtable_init(struct rhashtable *ht, const struct rhashtable_params *params); +int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params); -struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, - const void *key, - struct rhash_head *obj, - struct bucket_table *old_tbl, - void **data); -int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl); +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj); void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter); @@ -507,6 +522,31 @@ void rhashtable_destroy(struct rhashtable *ht); rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\ tbl, hash, member) +/** + * rhl_for_each_rcu - iterate over rcu hash table list + * @pos: the &struct rlist_head to use as a loop cursor. + * @list: the head of the list + * + * This hash chain list-traversal primitive should be used on the + * list returned by rhltable_lookup. 
+ */ +#define rhl_for_each_rcu(pos, list) \ + for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) + +/** + * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rlist_head to use as a loop cursor. + * @list: the head of the list + * @member: name of the &struct rlist_head within the hashable struct. + * + * This hash chain list-traversal primitive should be used on the + * list returned by rhltable_lookup. + */ +#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ + for (pos = list; pos && rht_entry(tpos, pos, member); \ + pos = rcu_dereference_raw(pos->next)) + static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) { @@ -516,18 +556,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); } -/** - * rhashtable_lookup_fast - search hash table, inlined version - * @ht: hash table - * @key: the pointer to the key - * @params: hash table parameters - * - * Computes the hash value for the key and traverses the bucket chain looking - * for a entry with an identical key. The first matching entry is returned. - * - * Returns the first entry on which the compare function returned true. - */ -static inline void *rhashtable_lookup_fast( +/* Internal function, do not use. */ +static inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { @@ -539,8 +569,6 @@ static inline void *rhashtable_lookup_fast( struct rhash_head *he; unsigned int hash; - rcu_read_lock(); - tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); @@ -549,8 +577,7 @@ restart: params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) continue; - rcu_read_unlock(); - return rht_obj(ht, he); + return he; } /* Ensure we see any new tables. */ @@ -559,96 +586,165 @@ restart: tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(tbl)) goto restart; - rcu_read_unlock(); return NULL; } +/** + * rhashtable_lookup - search hash table + * @ht: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. The first matching entry is returned. + * + * This must only be called under the RCU read lock. + * + * Returns the first entry on which the compare function returned true. + */ +static inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(ht, key, params); + + return he ? rht_obj(ht, he) : NULL; +} + +/** + * rhashtable_lookup_fast - search hash table, without RCU read lock + * @ht: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. The first matching entry is returned. + * + * Only use this function when you have other mechanisms guaranteeing + * that the object won't go away after the RCU read lock is released. + * + * Returns the first entry on which the compare function returned true. 
+ */ +static inline void *rhashtable_lookup_fast( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + void *obj; + + rcu_read_lock(); + obj = rhashtable_lookup(ht, key, params); + rcu_read_unlock(); + + return obj; +} + +/** + * rhltable_lookup - search hash list table + * @hlt: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. All matching entries are returned + * in a list. + * + * This must only be called under the RCU read lock. + * + * Returns the list of entries that match the given key. + */ +static inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); + + return he ? container_of(he, struct rhlist_head, rhead) : NULL; +} + /* Internal function, please use rhashtable_insert_fast() instead. This * function returns the existing element already in hashes in there is a clash, * otherwise it returns an error via ERR_PTR(). */ static inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, - const struct rhashtable_params params) + const struct rhashtable_params params, bool rhlist) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; - struct bucket_table *tbl, *new_tbl; + struct rhash_head __rcu **pprev; + struct bucket_table *tbl; struct rhash_head *head; spinlock_t *lock; - unsigned int elasticity; unsigned int hash; - void *data = NULL; - int err; + int elasticity; + void *data; -restart: rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); + hash = rht_head_hashfn(ht, tbl, obj, params); + lock = rht_bucket_lock(tbl, hash); + spin_lock_bh(lock); - /* All insertions must grab the oldest table containing - * the hashed bucket that is yet to be rehashed. - */ - for (;;) { - hash = rht_head_hashfn(ht, tbl, obj, params); - lock = rht_bucket_lock(tbl, hash); - spin_lock_bh(lock); - - if (tbl->rehash <= hash) - break; - + if (unlikely(rht_dereference_bucket(tbl->future_tbl, tbl, hash))) { +slow_path: spin_unlock_bh(lock); - tbl = rht_dereference_rcu(tbl->future_tbl, ht); + rcu_read_unlock(); + return rhashtable_insert_slow(ht, key, obj); } - new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (unlikely(new_tbl)) { - tbl = rhashtable_insert_slow(ht, key, obj, new_tbl, &data); - if (!IS_ERR_OR_NULL(tbl)) - goto slow_path; + elasticity = ht->elasticity; + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { + struct rhlist_head *plist; + struct rhlist_head *list; + + elasticity--; + if (!key || + (params.obj_cmpfn ? 
+ params.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) + continue; + + data = rht_obj(ht, head); - err = PTR_ERR(tbl); - if (err == -EEXIST) - err = 0; + if (!rhlist) + goto out; - goto out; - } - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto out; + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); - if (unlikely(rht_grow_above_100(ht, tbl))) { -slow_path: - spin_unlock_bh(lock); - err = rhashtable_insert_rehash(ht, tbl); - rcu_read_unlock(); - if (err) - return ERR_PTR(err); + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + rcu_assign_pointer(*pprev, obj); - goto restart; + goto good; } - err = 0; - elasticity = ht->elasticity; - rht_for_each(head, tbl, hash) { - if (key && - unlikely(!(params.obj_cmpfn ? - params.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head))))) { - data = rht_obj(ht, head); - goto out; - } - if (!--elasticity) - goto slow_path; - } + if (elasticity <= 0) + goto slow_path; + + data = ERR_PTR(-E2BIG); + if (unlikely(rht_grow_above_max(ht, tbl))) + goto out; + + if (unlikely(rht_grow_above_100(ht, tbl))) + goto slow_path; head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } rcu_assign_pointer(tbl->buckets[hash], obj); @@ -656,11 +752,14 @@ slow_path: if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); +good: + data = NULL; + out: spin_unlock_bh(lock); rcu_read_unlock(); - return err ? ERR_PTR(err) : data; + return data; } /** @@ -685,13 +784,65 @@ static inline int rhashtable_insert_fast( { void *ret; - ret = __rhashtable_insert_fast(ht, NULL, obj, params); + ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } +/** + * rhltable_insert_key - insert object into hash list table + * @hlt: hash list table + * @key: the pointer to the key + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Will take a per bucket spinlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket lock. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if the size grows + * beyond the watermark indicated by grow_decision() which can be passed + * to rhashtable_init(). + */ +static inline int rhltable_insert_key( + struct rhltable *hlt, const void *key, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params, true)); +} + +/** + * rhltable_insert - insert object into hash list table + * @hlt: hash list table + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Will take a per bucket spinlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket lock. + * + * It is safe to call this function from atomic context. 
+ * + * Will trigger an automatic deferred table resizing if the size grows + * beyond the watermark indicated by grow_decision() which can be passed + * to rhashtable_init(). + */ +static inline int rhltable_insert( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + const char *key = rht_obj(&hlt->ht, &list->rhead); + + key += params.key_offset; + + return rhltable_insert_key(hlt, key, list, params); +} + /** * rhashtable_lookup_insert_fast - lookup and insert object into hash table * @ht: hash table @@ -722,7 +873,8 @@ static inline int rhashtable_lookup_insert_fast( BUG_ON(ht->p.obj_hashfn); - ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params); + ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, + false); if (IS_ERR(ret)) return PTR_ERR(ret); @@ -759,7 +911,7 @@ static inline int rhashtable_lookup_insert_key( BUG_ON(!ht->p.obj_hashfn || !key); - ret = __rhashtable_insert_fast(ht, key, obj, params); + ret = __rhashtable_insert_fast(ht, key, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); @@ -783,13 +935,14 @@ static inline void *rhashtable_lookup_get_insert_key( { BUG_ON(!ht->p.obj_hashfn || !key); - return __rhashtable_insert_fast(ht, key, obj, params); + return __rhashtable_insert_fast(ht, key, obj, params, false); } /* Internal function, please use rhashtable_remove_fast() instead */ -static inline int __rhashtable_remove_fast( +static inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, - struct rhash_head *obj, const struct rhashtable_params params) + struct rhash_head *obj, const struct rhashtable_params params, + bool rhlist) { struct rhash_head __rcu **pprev; struct rhash_head *he; @@ -804,39 +957,66 @@ static inline int __rhashtable_remove_fast( pprev = &tbl->buckets[hash]; rht_for_each(he, tbl, hash) { + struct rhlist_head *list; + + list = container_of(he, struct rhlist_head, rhead); + if (he != obj) { + struct rhlist_head __rcu **lpprev; + pprev = &he->next; - continue; + + if (!rhlist) + continue; + + do { + lpprev = &list->next; + list = rht_dereference_bucket(list->next, + tbl, hash); + } while (list && obj != &list->rhead); + + if (!list) + continue; + + list = rht_dereference_bucket(list->next, tbl, hash); + RCU_INIT_POINTER(*lpprev, list); + err = 0; + break; } - rcu_assign_pointer(*pprev, obj->next); - err = 0; + obj = rht_dereference_bucket(obj->next, tbl, hash); + err = 1; + + if (rhlist) { + list = rht_dereference_bucket(list->next, tbl, hash); + if (list) { + RCU_INIT_POINTER(list->rhead.next, obj); + obj = &list->rhead; + err = 0; + } + } + + rcu_assign_pointer(*pprev, obj); break; } spin_unlock_bh(lock); + if (err > 0) { + atomic_dec(&ht->nelems); + if (unlikely(ht->p.automatic_shrinking && + rht_shrink_below_30(ht, tbl))) + schedule_work(&ht->run_work); + err = 0; + } + return err; } -/** - * rhashtable_remove_fast - remove object from hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * @params: hash table parameters - * - * Since the hash chain is single linked, the removal operation needs to - * walk the bucket chain upon removal. The removal operation is thus - * considerable slow if the hash table is not correctly sized. - * - * Will automatically shrink the table via rhashtable_expand() if the - * shrink_decision function specified at rhashtable_init() returns true. - * - * Returns zero on success, -ENOENT if the entry could not be found. 
- */ -static inline int rhashtable_remove_fast( +/* Internal function, please use rhashtable_remove_fast() instead */ +static inline int __rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, - const struct rhashtable_params params) + const struct rhashtable_params params, bool rhlist) { struct bucket_table *tbl; int err; @@ -850,24 +1030,60 @@ static inline int rhashtable_remove_fast( * visible then that guarantees the entry to still be in * the old tbl if it exists. */ - while ((err = __rhashtable_remove_fast(ht, tbl, obj, params)) && + while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, + rhlist)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; - if (err) - goto out; - - atomic_dec(&ht->nelems); - if (unlikely(ht->p.automatic_shrinking && - rht_shrink_below_30(ht, tbl))) - schedule_work(&ht->run_work); - -out: rcu_read_unlock(); return err; } +/** + * rhashtable_remove_fast - remove object from hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Since the hash chain is single linked, the removal operation needs to + * walk the bucket chain upon removal. The removal operation is thus + * considerable slow if the hash table is not correctly sized. + * + * Will automatically shrink the table via rhashtable_expand() if the + * shrink_decision function specified at rhashtable_init() returns true. + * + * Returns zero on success, -ENOENT if the entry could not be found. + */ +static inline int rhashtable_remove_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + return __rhashtable_remove_fast(ht, obj, params, false); +} + +/** + * rhltable_remove - remove object from hash list table + * @hlt: hash list table + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Since the hash chain is single linked, the removal operation needs to + * walk the bucket chain upon removal. The removal operation is thus + * considerable slow if the hash table is not correctly sized. + * + * Will automatically shrink the table via rhashtable_expand() if the + * shrink_decision function specified at rhashtable_init() returns true. + * + * Returns zero on success, -ENOENT if the entry could not be found. + */ +static inline int rhltable_remove( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); +} + /* Internal function, please use rhashtable_replace_fast() instead */ static inline int __rhashtable_replace_fast( struct rhashtable *ht, struct bucket_table *tbl, @@ -958,4 +1174,51 @@ static inline int rhashtable_walk_init(struct rhashtable *ht, return 0; } +/** + * rhltable_walk_enter - Initialise an iterator + * @hlt: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may sleep so you must not call it from interrupt + * context or with spin locks held. + * + * You must call rhashtable_walk_exit after this function returns. 
+ */ +static inline void rhltable_walk_enter(struct rhltable *hlt, + struct rhashtable_iter *iter) +{ + return rhashtable_walk_enter(&hlt->ht, iter); +} + +/** + * rhltable_free_and_destroy - free elements and destroy hash list table + * @hlt: the hash list table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * See documentation for rhashtable_free_and_destroy. + */ +static inline void rhltable_free_and_destroy(struct rhltable *hlt, + void (*free_fn)(void *ptr, + void *arg), + void *arg) +{ + return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); +} + +static inline void rhltable_destroy(struct rhltable *hlt) +{ + return rhltable_free_and_destroy(hlt, NULL, NULL); +} + #endif /* _LINUX_RHASHTABLE_H */ diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 06c28728bb53..32d0ad058380 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -378,22 +378,8 @@ static void rht_deferred_worker(struct work_struct *work) schedule_work(&ht->run_work); } -static bool rhashtable_check_elasticity(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash) -{ - unsigned int elasticity = ht->elasticity; - struct rhash_head *head; - - rht_for_each(head, tbl, hash) - if (!--elasticity) - return true; - - return false; -} - -int rhashtable_insert_rehash(struct rhashtable *ht, - struct bucket_table *tbl) +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; @@ -439,57 +425,165 @@ fail: return err; } -EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); -struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, - const void *key, - struct rhash_head *obj, - struct bucket_table *tbl, - void **data) +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) { + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev; struct rhash_head *head; - unsigned int hash; - int err; + int elasticity; - tbl = rhashtable_last_table(ht, tbl); - hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); - - err = -EEXIST; - if (key) { - *data = rhashtable_lookup_fast(ht, key, ht->p); - if (*data) - goto exit; + elasticity = ht->elasticity; + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; + + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? 
+ ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) + continue; + + if (!ht->rhlist) + return rht_obj(ht, head); + + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); + + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + rcu_assign_pointer(*pprev, obj); + + return NULL; } - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto exit; + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash, + struct rhash_head *obj, + void *data) +{ + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); - err = -EAGAIN; - if (rhashtable_check_elasticity(ht, tbl, hash) || - rht_grow_above_100(ht, tbl)) - goto exit; + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); - err = 0; + new_tbl = rcu_dereference(tbl->future_tbl); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + if (unlikely(rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); + + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); -exit: - spin_unlock(rht_bucket_lock(tbl, hash)); + return NULL; +} - if (err == 0) - return NULL; - else if (err == -EAGAIN) - return tbl; - else - return ERR_PTR(err); +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; + struct bucket_table *tbl; + unsigned int hash; + spinlock_t *lock; + void *data; + + tbl = rcu_dereference(ht->tbl); + + /* All insertions must grab the oldest table containing + * the hashed bucket that is yet to be rehashed. 
+ */ + for (;;) { + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + lock = rht_bucket_lock(tbl, hash); + spin_lock_bh(lock); + + if (tbl->rehash <= hash) + break; + + spin_unlock_bh(lock); + tbl = rcu_dereference(tbl->future_tbl); + } + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + while (!IS_ERR_OR_NULL(new_tbl)) { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + spin_lock_nested(rht_bucket_lock(tbl, hash), + SINGLE_DEPTH_NESTING); + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + spin_unlock(rht_bucket_lock(tbl, hash)); + } + + spin_unlock_bh(lock); + + if (PTR_ERR(data) == -EAGAIN) + data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); + + return data; } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); @@ -593,11 +687,16 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start); void *rhashtable_walk_next(struct rhashtable_iter *iter) { struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; if (p) { - p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } goto next; } @@ -605,6 +704,18 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter) int skip = iter->skip; rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } if (!skip) break; skip--; @@ -614,7 +725,8 @@ next: if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; - return rht_obj(ht, p); + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); } iter->skip = 0; @@ -802,6 +914,48 @@ int rhashtable_init(struct rhashtable *ht, } EXPORT_SYMBOL_GPL(rhashtable_init); +/** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. + */ +int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) +{ + int err; + + /* No rhlist NULLs marking for now. 
*/ + if (params->nulls_base) + return -EINVAL; + + err = rhashtable_init(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; +} +EXPORT_SYMBOL_GPL(rhltable_init); + +static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct rhlist_head *list; + + if (!ht->rhlist) { + free_fn(rht_obj(ht, obj), arg); + return; + } + + list = container_of(obj, struct rhlist_head, rhead); + do { + obj = &list->rhead; + list = rht_dereference(list->next, ht); + free_fn(rht_obj(ht, obj), arg); + } while (list); +} + /** * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy @@ -839,7 +993,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, pos = next, next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL) - free_fn(rht_obj(ht, pos), arg); + rhashtable_free_one(ht, pos, free_fn, arg); } } -- cgit v1.2.3 From e23d4159b109167126e5bcd7f3775c95de7fee47 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 20 Sep 2016 20:07:42 +0100 Subject: fix fault_in_multipages_...() on architectures with no-op access_ok() Switching iov_iter fault-in to multipages variants has exposed an old bug in underlying fault_in_multipages_...(); they break if the range passed to them wraps around. Normally access_ok() done by callers will prevent such (and it's a guaranteed EFAULT - ERR_PTR() values fall into such a range and they should not point to any valid objects). However, on architectures where userland and kernel live in different MMU contexts (e.g. s390) access_ok() is a no-op and on those a range with a wraparound can reach fault_in_multipages_...(). Since any wraparound means EFAULT there, the fix is trivial - turn those while (uaddr <= end) ... into if (unlikely(uaddr > end)) return -EFAULT; do ... while (uaddr <= end); Reported-by: Jan Stancek Tested-by: Jan Stancek Cc: stable@vger.kernel.org # v3.5+ Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 66a1260b33de..7e3d53753612 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -571,56 +571,56 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size) */ static inline int fault_in_multipages_writeable(char __user *uaddr, int size) { - int ret = 0; char __user *end = uaddr + size - 1; if (unlikely(size == 0)) - return ret; + return 0; + if (unlikely(uaddr > end)) + return -EFAULT; /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. */ - while (uaddr <= end) { - ret = __put_user(0, uaddr); - if (ret != 0) - return ret; + do { + if (unlikely(__put_user(0, uaddr) != 0)) + return -EFAULT; uaddr += PAGE_SIZE; - } + } while (uaddr <= end); /* Check whether the range spilled into the next page. 
*/ + if (((unsigned long)uaddr & PAGE_MASK) == ((unsigned long)end & PAGE_MASK)) - ret = __put_user(0, end); + return __put_user(0, end); - return ret; + return 0; } static inline int fault_in_multipages_readable(const char __user *uaddr, int size) { volatile char c; - int ret = 0; const char __user *end = uaddr + size - 1; if (unlikely(size == 0)) - return ret; + return 0; - while (uaddr <= end) { - ret = __get_user(c, uaddr); - if (ret != 0) - return ret; + if (unlikely(uaddr > end)) + return -EFAULT; + + do { + if (unlikely(__get_user(c, uaddr) != 0)) + return -EFAULT; uaddr += PAGE_SIZE; - } + } while (uaddr <= end); /* Check whether the range spilled into the next page. */ if (((unsigned long)uaddr & PAGE_MASK) == ((unsigned long)end & PAGE_MASK)) { - ret = __get_user(c, end); - (void)c; + return __get_user(c, end); } - return ret; + return 0; } int add_to_page_cache_locked(struct page *page, struct address_space *mapping, -- cgit v1.2.3

From 36bbef52c7eb646ed6247055a2acd3851e317857 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Sep 2016 00:26:13 +0200 Subject: bpf: direct packet write and access for helpers for clsact progs

This work implements direct packet access for helpers and direct packet write in a similar fashion to what is already available for XDP types via commits 4acf6c0b84c9 ("bpf: enable direct packet data write for xdp progs") and 6841de8b0d03 ("bpf: allow helpers access the packet directly"), and as a complementary feature to the already available direct packet read for tc (cls/act) programs.

To enable this, we need to introduce two helpers, bpf_skb_pull_data() and bpf_csum_update(). The first is generally needed for both read and write, because programs would otherwise be limited to the current linear skb head. Usually, when the data_end test fails, programs just bail out, or, in the direct read case, use bpf_skb_load_bytes() as an alternative to overcome this limitation. If such data sits in non-linear parts, we can just pull it in once with the new helper, retest and eventually access it. At the same time, this also makes sure the skb is uncloned, which is, of course, a necessary condition for direct write. As this needs to be an invariant for the write part only, the verifier detects writes and adds a prologue that calls bpf_skb_pull_data() to effectively unclone the skb from the very beginning in case it is indeed cloned. The heuristic makes use of a similar trick that was done in 233577a22089 ("net: filter: constify detection of pkt_type_offset"). This comes at zero cost for other programs that do not use the direct write feature. Should a program use this feature only sparsely, and mostly perform reads with, for example, drop return codes, then such write actions can be delegated to a tail-called program, deferring the cost of potential uncloning to a late point in time where it would have been paid similarly with bpf_skb_store_bytes() anyway.

The advantage of direct write is that the writes are inlined, whereas the helper cannot make any length assumptions and thus must generate a call to memcpy() even for small sizes; on top of that, the cost of the helper call itself, including its sanity checks, is avoided. Plus, when direct read is already used, we don't need to cache or perform rechecks on the data boundaries (since the verifier invalidates previous checks for helpers that change skb->data), so more complex programs using rewrites can benefit from switching to direct read plus write.
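A minimal sketch of that bail-out/pull/retest pattern for a cls_bpf program (hypothetical code; the iproute2-style helper declaration and the 14-byte Ethernet header length are assumptions of this sketch, not part of the patch):

/* Sketch: direct packet read/write in a tc classifier, using the
 * new bpf_skb_pull_data() helper when the linear section is short.
 */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>

static int (*bpf_skb_pull_data)(void *skb, unsigned int len) =
	(void *)BPF_FUNC_skb_pull_data;

__attribute__((section("classifier"), used))
int pull_then_write(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	unsigned char *eth;

	if (data + 14 > data_end) {
		/* Header not fully in the linear section: pull it in
		 * once, then redo the test with fresh pointers, since
		 * the verifier invalidates them across the call.
		 */
		if (bpf_skb_pull_data(skb, 14))
			return TC_ACT_SHOT;
		data = (void *)(long)skb->data;
		data_end = (void *)(long)skb->data_end;
		if (data + 14 > data_end)
			return TC_ACT_SHOT;
	}

	/* Direct write; the verifier-generated prologue has already
	 * uncloned the skb if that was needed.
	 */
	eth = data;
	eth[0] = 0x02;
	return TC_ACT_OK;
}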
For direct packet access to helpers, we save the otherwise needed copy into a temp struct sitting on stack memory when use-case allows. Both facilities are enabled via may_access_direct_pkt_data() in verifier. For now, we limit this to map helpers and csum_diff, and can successively enable other helpers where we find it makes sense. Helpers that definitely cannot be allowed for this are those part of bpf_helper_changes_skb_data() since they can change underlying data, and those that write into memory as this could happen for packet typed args when still cloned. bpf_csum_update() helper accommodates for the fact that we need to fixup checksum_complete when using direct write instead of bpf_skb_store_bytes(), meaning the programs can use available helpers like bpf_csum_diff(), and implement csum_add(), csum_sub(), csum_block_add(), csum_block_sub() equivalents in eBPF together with the new helper. A usage example will be provided for iproute2's examples/bpf/ directory. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 +- include/linux/skbuff.h | 14 ++++- include/uapi/linux/bpf.h | 21 ++++++++ kernel/bpf/helpers.c | 3 ++ kernel/bpf/verifier.c | 54 ++++++++++++++----- net/core/filter.c | 134 +++++++++++++++++++++++++++++++++++++++++------ 6 files changed, 196 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9a904f63f8c1..5691fdc83819 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -96,6 +96,7 @@ enum bpf_return_type { struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; + bool pkt_access; enum bpf_return_type ret_type; enum bpf_arg_type arg1_type; enum bpf_arg_type arg2_type; @@ -151,7 +152,8 @@ struct bpf_verifier_ops { */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type); - + int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, + const struct bpf_prog *prog); u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn, struct bpf_prog *prog); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4c5662f05bda..c6dab3f7457c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -676,13 +676,23 @@ struct sk_buff { */ kmemcheck_bitfield_begin(flags1); __u16 queue_mapping; + +/* if you move cloned around you also must adapt those constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define CLONED_MASK (1 << 7) +#else +#define CLONED_MASK 1 +#endif +#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset) + + __u8 __cloned_offset[0]; __u8 cloned:1, nohdr:1, fclone:2, peeked:1, head_frag:1, - xmit_more:1; - /* one bit hole */ + xmit_more:1, + __unused:1; /* one bit hole */ kmemcheck_bitfield_end(flags1); /* fields enclosed in headers_start/headers_end are copied diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f896dfac4ac0..e07432b9f8b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -398,6 +398,27 @@ enum bpf_func_id { */ BPF_FUNC_skb_change_tail, + /** + * bpf_skb_pull_data(skb, len) + * The helper will pull in non-linear data in case the + * skb is non-linear and not all of len are part of the + * linear section. Only needed for read/write with direct + * packet access. 
+ * @skb: pointer to skb + * @len: len to make read/writeable + * Return: 0 on success or negative error + */ + BPF_FUNC_skb_pull_data, + + /** + * bpf_csum_update(skb, csum) + * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. + * @skb: pointer to skb + * @csum: csum to add + * Return: csum on success or negative error + */ + BPF_FUNC_csum_update, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a5b8bf8cfcfd..39918402e6e9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -36,6 +36,7 @@ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -51,6 +52,7 @@ BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -67,6 +69,7 @@ BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bc138f34e38c..3a75ee3bdcd1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -196,6 +196,7 @@ struct verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool seen_direct_write; }; #define BPF_COMPLEXITY_LIMIT_INSNS 65536 @@ -204,6 +205,7 @@ struct verifier_env { struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; + bool pkt_access; int regno; int access_size; }; @@ -654,10 +656,17 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff -static bool may_write_pkt_data(enum bpf_prog_type type) +static bool may_access_direct_pkt_data(struct verifier_env *env, + const struct bpf_call_arg_meta *meta) { - switch (type) { + switch (env->prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + if (meta) + return meta->pkt_access; + + env->seen_direct_write = true; return true; default: return false; @@ -817,7 +826,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -950,8 +959,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && !may_write_pkt_data(env->prog->type)) { - verbose("helper access to the packet is not allowed for clsact\n"); + if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + verbose("helper access to the packet is not allowed\n"); return -EACCES; } @@ -1191,6 +1200,7 @@ static int check_call(struct verifier_env *env, int func_id) changes_data = bpf_helper_changes_skb_data(fn->func); memset(&meta, 0, sizeof(meta)); + meta.pkt_access = fn->pkt_access; /* We only support one arg 
being in raw mode at the moment, which * is sufficient for the helper functions we have right now. @@ -2675,18 +2685,35 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) */ static int convert_ctx_accesses(struct verifier_env *env) { - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - struct bpf_insn insn_buf[16]; + const struct bpf_verifier_ops *ops = env->prog->aux->ops; + struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i; + int i, insn_cnt, cnt; - if (!env->prog->aux->ops->convert_ctx_access) + if (ops->gen_prologue) { + cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, + env->prog); + if (cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } else if (cnt) { + new_prog = bpf_patch_insn_single(env->prog, 0, + insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + } + } + + if (!ops->convert_ctx_access) return 0; + insn_cnt = env->prog->len; + insn = env->prog->insnsi; + for (i = 0; i < insn_cnt; i++, insn++) { - u32 insn_delta, cnt; + u32 insn_delta; if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) @@ -2703,9 +2730,8 @@ static int convert_ctx_accesses(struct verifier_env *env) continue; } - cnt = env->prog->aux->ops-> - convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, + insn->off, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; diff --git a/net/core/filter.c b/net/core/filter.c index 298b146b47e7..0920c2ac1d00 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1362,6 +1362,11 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, return err; } +static int bpf_try_make_head_writable(struct sk_buff *skb) +{ + return bpf_try_make_writable(skb, skb_headlen(skb)); +} + static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) @@ -1441,6 +1446,28 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; +BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return bpf_try_make_writable(skb, len ? 
: skb_headlen(skb)); +} + +static const struct bpf_func_proto bpf_skb_pull_data_proto = { + .func = bpf_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -1567,6 +1594,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_STACK, .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, @@ -1575,6 +1603,26 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) +{ + /* The interface is to be used in combination with bpf_csum_diff() + * for direct packet writes. csum rotation for alignment as well + * as emulating csum_sub() can be done from the eBPF program. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + return (skb->csum = csum_add(skb->csum, csum)); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_csum_update_proto = { + .func = bpf_csum_update, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -1602,6 +1650,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; + struct sk_buff *clone; + int ret; if (unlikely(flags & ~(BPF_F_INGRESS))) return -EINVAL; @@ -1610,14 +1660,25 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) if (unlikely(!dev)) return -EINVAL; - skb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb)) + clone = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!clone)) return -ENOMEM; - bpf_push_mac_rcsum(skb); + /* For direct write, we need to keep the invariant that the skbs + * we're dealing with need to be uncloned. Should uncloning fail + * here, we need to free the just generated clone to unclone once + * again. + */ + ret = bpf_try_make_head_writable(skb); + if (unlikely(ret)) { + kfree_skb(clone); + return -ENOMEM; + } + + bpf_push_mac_rcsum(clone); return flags & BPF_F_INGRESS ? 
- __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); + __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -2063,19 +2124,14 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { bool bpf_helper_changes_skb_data(void *func) { - if (func == bpf_skb_vlan_push) - return true; - if (func == bpf_skb_vlan_pop) - return true; - if (func == bpf_skb_store_bytes) - return true; - if (func == bpf_skb_change_proto) - return true; - if (func == bpf_skb_change_tail) - return true; - if (func == bpf_l3_csum_replace) - return true; - if (func == bpf_l4_csum_replace) + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_tail || + func == bpf_skb_pull_data || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace) return true; return false; @@ -2440,8 +2496,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -2533,6 +2593,45 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) + */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return TC_ACT_SHOT; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); + *insn++ = BPF_EXIT_INSN(); + + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) @@ -2810,6 +2909,7 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, }; static const struct bpf_verifier_ops xdp_ops = { -- cgit v1.2.3 From a4f1f9ac8153e22869b6408832b5a9bb9c762bf6 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:09 -0400 Subject: lib/win_minmax: windowed min or max estimator This commit introduces a generic library to estimate either the min or max value of a time-varying variable over a recent time window. This is code originally from Kathleen Nichols. The current form of the code is from Van Jacobson. 
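As a usage sketch (a hypothetical caller with a made-up 10-second window; only the minmax_reset(), minmax_running_min() and minmax_get() calls are from the new API shown below), tracking a windowed minimum over jiffies could look like:

#include <linux/jiffies.h>
#include <linux/win_minmax.h>

/* Hypothetical caller: track the minimum of a noisy u32 series
 * (e.g. an RTT in usec) seen over the last 10 seconds. Time is
 * jiffies truncated to u32, as the TCP users of this library do.
 */
static struct minmax example_min;

static void example_init(void)
{
	minmax_reset(&example_min, (u32)jiffies, ~0U); /* ~0U: no sample yet */
}

static void example_sample(u32 val)
{
	minmax_running_min(&example_min, 10 * HZ, (u32)jiffies, val);
}

static u32 example_current_min(void)
{
	return minmax_get(&example_min);
}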
A single struct minmax will track the estimated windowed-max value of the series if you call minmax_running_max() or the estimated windowed-min value of the series if you call minmax_running_min().

Nearly equivalent code is already in place for minimum RTT estimation in the TCP stack. This commit extracts that code and generalizes it to handle both min and max. Moving the code here reduces the footprint and complexity of the TCP code base and makes the filter generally available for other parts of the codebase, including an upcoming TCP congestion control module.

This library works well for time series where the measurements are smoothly increasing or decreasing.

Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/win_minmax.h | 37 +++++++++++++++++ lib/Makefile | 2 +- lib/win_minmax.c | 98 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 include/linux/win_minmax.h create mode 100644 lib/win_minmax.c (limited to 'include') diff --git a/include/linux/win_minmax.h b/include/linux/win_minmax.h new file mode 100644 index 000000000000..56569604278f --- /dev/null +++ b/include/linux/win_minmax.h @@ -0,0 +1,37 @@ +/** + * include/linux/win_minmax.h: windowed min/max tracker by Kathleen Nichols. + * + */ +#ifndef MINMAX_H +#define MINMAX_H + +#include <linux/types.h> + +/* A single data point for our parameterized min-max tracker */ +struct minmax_sample { + u32 t; /* time measurement was taken */ + u32 v; /* value measured */ +}; + +/* State for the parameterized min-max tracker */ +struct minmax { + struct minmax_sample s[3]; +}; + +static inline u32 minmax_get(const struct minmax *m) +{ + return m->s[0].v; +} + +static inline u32 minmax_reset(struct minmax *m, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + m->s[2] = m->s[1] = m->s[0] = val; + return m->s[0].v; +} + +u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas); +u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas); + +#endif diff --git a/lib/Makefile b/lib/Makefile index 5dc77a8ec297..df747e5eeb7a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -22,7 +22,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o + earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/win_minmax.c b/lib/win_minmax.c new file mode 100644 index 000000000000..c8420d404926 --- /dev/null +++ b/lib/win_minmax.c @@ -0,0 +1,98 @@ +/** + * lib/win_minmax.c: windowed min/max tracker + * + * Kathleen Nichols' algorithm for tracking the minimum (or maximum) + * value of a data stream over some fixed time interval. (E.g., + * the minimum RTT over the past five minutes.) It uses constant + * space and constant time per update yet almost always delivers + * the same minimum as an implementation that has to keep all the + * data in the window. + * + * The algorithm keeps track of the best, 2nd best & 3rd best min + * values, maintaining an invariant that the measurement time of + * the n'th best >= n-1'th best.
It also makes sure that the three + * values are widely separated in the time window since that bounds + * the worst case error when the data is monotonically increasing + * over the window. + * + * Upon getting a new min, we can forget everything earlier because + * it has no value - the new min is <= everything else in the window + * by definition and it's the most recent. So we restart fresh on + * every new min and overwrite 2nd & 3rd choices. The same property + * holds for 2nd & 3rd best. + */ +#include <linux/module.h> +#include <linux/win_minmax.h> + +/* As time advances, update the 1st, 2nd, and 3rd choices. */ +static u32 minmax_subwin_update(struct minmax *m, u32 win, + const struct minmax_sample *val) +{ + u32 dt = val->t - m->s[0].t; + + if (unlikely(dt > win)) { + /* + * Passed entire window without a new val so make 2nd + * choice the new val & 3rd choice the new 2nd choice. + * We may have to iterate this since our 2nd choice + * may also be outside the window (we checked on entry + * that the third choice was in the window). + */ + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + if (unlikely(val->t - m->s[0].t > win)) { + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + } + } else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) { + /* + * We've passed a quarter of the window without a new val + * so take a 2nd choice from the 2nd quarter of the window. + */ + m->s[2] = m->s[1] = *val; + } else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) { + /* + * We've passed half the window without finding a new val + * so take a 3rd choice from the last half of the window + */ + m->s[2] = *val; + } + return m->s[0].v; +} + +/* Check if new measurement updates the 1st, 2nd or 3rd choice max. */ +u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v >= m->s[0].v) || /* found new max? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v >= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v >= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} +EXPORT_SYMBOL(minmax_running_max); + +/* Check if new measurement updates the 1st, 2nd or 3rd choice min. */ +u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v <= m->s[0].v) || /* found new min? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v <= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v <= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} -- cgit v1.2.3

From 6403389211e1f4d40ed963fe47a96fce1a3ba7a9 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:10 -0400 Subject: tcp: use windowed min filter library for TCP min_rtt estimation

Refactor the TCP min_rtt code to reuse the new win_minmax library in lib/win_minmax.c to simplify the TCP code. This is a pure refactor: the functionality is exactly the same. We just moved the windowed min code to make TCP easier to read and maintain, and to allow other parts of the kernel to use the windowed min/max filter code.
Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 5 ++-- include/net/tcp.h | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 64 ++++-------------------------------------------- net/ipv4/tcp_minisocks.c | 2 +- 5 files changed, 10 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c723a465125d..6433cc8b4667 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -19,6 +19,7 @@ #include <linux/skbuff.h> +#include <linux/win_minmax.h> #include <net/sock.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> @@ -234,9 +235,7 @@ struct tcp_sock { u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ u32 rtt_seq; /* sequence number to update rttvar */ - struct rtt_meas { - u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */ - } rtt_min[3]; + struct minmax rtt_min; u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ diff --git a/include/net/tcp.h b/include/net/tcp.h index fdfbedd61c67..2f1648af4d12 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -671,7 +671,7 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) /* Minimum RTT in usec. ~0 means not available. */ static inline u32 tcp_min_rtt(const struct tcp_sock *tp) { - return tp->rtt_min[0].rtt; + return minmax_get(&tp->rtt_min); } /* Compute the actual receive window we are currently advertising. diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7dae800092e6..e79ed17ccfd6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - tp->rtt_min[0].rtt = ~0U; + minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dad3e7eeed94..6886f386464f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2879,67 +2879,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, *rexmit = REXMIT_LOST; } -/* Kathleen Nichols' algorithm for tracking the minimum value of - * a data stream over some fixed time interval. (E.g., the minimum - * RTT over the past five minutes.) It uses constant space and constant - * time per update yet almost always delivers the same minimum as an - * implementation that has to keep all the data in the window. - * - * The algorithm keeps track of the best, 2nd best & 3rd best min - * values, maintaining an invariant that the measurement time of the - * n'th best >= n-1'th best. It also makes sure that the three values - * are widely separated in the time window since that bounds the worse - * case error when that data is monotonically increasing over the window. - * - * Upon getting a new min, we can forget everything earlier because it - * has no value - the new min is <= everything else in the window by - * definition and it's the most recent. So we restart fresh on every new min - * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd - * best.
- */ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { - const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; - struct rtt_meas *m = tcp_sk(sk)->rtt_min; - struct rtt_meas rttm = { - .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1), - .ts = now, - }; - u32 elapsed; - - /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ - if (unlikely(rttm.rtt <= m[0].rtt)) - m[0] = m[1] = m[2] = rttm; - else if (rttm.rtt <= m[1].rtt) - m[1] = m[2] = rttm; - else if (rttm.rtt <= m[2].rtt) - m[2] = rttm; - - elapsed = now - m[0].ts; - if (unlikely(elapsed > wlen)) { - /* Passed entire window without a new min so make 2nd choice - * the new min & 3rd choice the new 2nd. So forth and so on. - */ - m[0] = m[1]; - m[1] = m[2]; - m[2] = rttm; - if (now - m[0].ts > wlen) { - m[0] = m[1]; - m[1] = rttm; - if (now - m[0].ts > wlen) - m[0] = rttm; - } - } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) { - /* Passed a quarter of the window without a new min so - * take 2nd choice from the 2nd quarter of the window. - */ - m[2] = m[1] = rttm; - } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) { - /* Passed half the window without a new min so take the 3rd - * choice from the last half of the window. - */ - m[2] = rttm; - } + struct tcp_sock *tp = tcp_sk(sk); + u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; + + minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp, + rtt_us ? : jiffies_to_usecs(1)); } static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f63c73dc0acb..568947110b60 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -464,7 +464,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->srtt_us = 0; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - newtp->rtt_min[0].rtt = ~0U; + minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; -- cgit v1.2.3 From 77879147a3481babffd7e368d977ab682545a6bd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Sep 2016 23:39:11 -0400 Subject: net_sched: sch_fq: add low_rate_threshold parameter This commit adds to the fq module a low_rate_threshold parameter to insert a delay after all packets if the socket requests a pacing rate below the threshold. This helps achieve more precise control of the sending rate with low-rate paths, especially policers. The basic issue is that if a congestion control module detects a policer at a certain rate, it may want fq to be able to shape to that policed rate. That way the sender can avoid policer drops by having the packets arrive at the policer at or just under the policed rate. The default threshold of 550Kbps was chosen analytically so that for policers or links at 500Kbps or 512Kbps fq would very likely invoke this mechanism, even if the pacing rate was briefly slightly above the available bandwidth. This value was then empirically validated with two years of production testing on YouTube video servers. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 2 ++ net/sched/sch_fq.c | 22 +++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 2382eed50278..f8e39dbaa781 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -792,6 +792,8 @@ enum { TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ + TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + __TCA_FQ_MAX }; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index dc52cc10d6ed..5dd929cc1423 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -94,6 +94,7 @@ struct fq_sched_data { u32 flow_max_rate; /* optional max rate per flow */ u32 flow_plimit; /* max packets per flow */ u32 orphan_mask; /* mask for orphaned skb */ + u32 low_rate_threshold; struct rb_root *fq_root; u8 rate_enable; u8 fq_trees_log; @@ -433,7 +434,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; - u32 rate; + u32 rate, plen; skb = fq_dequeue_head(sch, &q->internal); if (skb) @@ -482,7 +483,7 @@ begin: prefetch(&skb->end); f->credit -= qdisc_pkt_len(skb); - if (f->credit > 0 || !q->rate_enable) + if (!q->rate_enable) goto out; /* Do not pace locally generated ack packets */ @@ -493,8 +494,15 @@ begin: if (skb->sk) rate = min(skb->sk->sk_pacing_rate, rate); + if (rate <= q->low_rate_threshold) { + f->credit = 0; + plen = qdisc_pkt_len(skb); + } else { + plen = max(qdisc_pkt_len(skb), q->quantum); + if (f->credit > 0) + goto out; + } if (rate != ~0U) { - u32 plen = max(qdisc_pkt_len(skb), q->quantum); u64 len = (u64)plen * NSEC_PER_SEC; if (likely(rate)) @@ -662,6 +670,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, + [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt) @@ -716,6 +725,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_FQ_FLOW_MAX_RATE]) q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) + q->low_rate_threshold = + nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); + if (tb[TCA_FQ_RATE_ENABLE]) { u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); @@ -781,6 +794,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->fq_root = NULL; q->fq_trees_log = ilog2(1024); q->orphan_mask = 1024 - 1; + q->low_rate_threshold = 550000 / 8; qdisc_watchdog_init(&q->watchdog, sch); if (opt) @@ -811,6 +825,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, jiffies_to_usecs(q->flow_refill_delay)) || nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || + nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, + q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) goto nla_put_failure; -- cgit v1.2.3 From 0682e6902a52aca7caf6ad42551b16ea0f87bc31 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:13 -0400 Subject: tcp: count packets marked lost for a TCP connection Count the number of packets that a TCP connection marks lost. Congestion control modules can use this loss rate information for more intelligent decisions about how fast to send. Specifically, this is used in TCP BBR policer detection. 
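To make the arithmetic concrete, a hypothetical consumer (the helper name and the snapshotting policy here are made up for illustration) could turn these two counters into an interval loss rate:

/* Hypothetical helper: per-interval loss rate from snapshots of the
 * new tp->lost counter and the existing tp->delivered counter.
 * Assumes the interval deltas stay well below 2^22 packets so the
 * u32 multiplication cannot overflow.
 */
static u32 interval_loss_permille(const struct tcp_sock *tp,
				  u32 prior_lost, u32 prior_delivered)
{
	u32 lost = tp->lost - prior_lost;
	u32 delivered = tp->delivered - prior_delivered;

	if (!lost)
		return 0;
	return lost * 1000 / (lost + delivered); /* lost fraction, 1/1000 units */
}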
BBR uses a high packet loss rate as one signal in its policer detection and policer bandwidth estimation algorithm. The BBR policer detection algorithm cannot simply track retransmits, because a retransmit can be (and often is) an indicator of packets lost long, long ago. This is particularly true in a long CA_Loss period that repairs the initial massive losses when a policer kicks in. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6433cc8b4667..38590fbc0ac5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -267,6 +267,7 @@ struct tcp_sock { * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 lost; /* Total data packets lost incl. rexmits */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6886f386464f..9413288c2778 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -899,12 +899,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } +/* Sum the number of packets on the wire we have marked as lost. + * There are two cases we care about here: + * a) Packet hasn't been marked lost (nor retransmitted), + * and this is the first loss. + * b) Packet has been marked both lost and retransmitted, + * and this means we think it was lost again. + */ +static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb) +{ + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + if (!(sacked & TCPCB_LOST) || + ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS))) + tp->lost += tcp_skb_pcount(skb); +} + static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) { if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tcp_verify_retransmit_hint(tp, skb); tp->lost_out += tcp_skb_pcount(skb); + tcp_sum_lost(tp, skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; } } @@ -913,6 +930,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) { tcp_verify_retransmit_hint(tp, skb); + tcp_sum_lost(tp, skb); if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1890,6 +1908,7 @@ void tcp_enter_loss(struct sock *sk) struct sk_buff *skb; bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ + bool mark_lost; /* Reduce ssthresh if it has not yet been made inside this window. 
*/ if (icsk->icsk_ca_state <= TCP_CA_Disorder || @@ -1923,8 +1942,12 @@ void tcp_enter_loss(struct sock *sk) if (skb == tcp_send_head(sk)) break; + mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || + is_reneg); + if (mark_lost) + tcp_sum_lost(tp, skb); TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { + if (mark_lost) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); -- cgit v1.2.3

From b9f64820fb226a4e8ab10591f46cecd91ca56b30 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:14 -0400 Subject: tcp: track data delivery rate for a TCP connection

This patch generates data delivery rate (throughput) samples on a per-ACK basis. These rate samples can be used by congestion control modules, and specifically will be used by TCP BBR in later patches in this series.

Key state:

tp->delivered: Tracks the total number of data packets (original or not) delivered so far. This is an already-existing field.

tp->delivered_mstamp: the last time tp->delivered was updated.

Algorithm: A rate sample is calculated as (d1 - d0)/(t1 - t0) on a per-ACK basis:

d1: the current tp->delivered after processing the ACK
t1: the current time after processing the ACK
d0: the prior tp->delivered when the acked skb was transmitted
t0: the prior tp->delivered_mstamp when the acked skb was transmitted

When an skb is transmitted, we snapshot d0 and t0 in its control block in tcp_rate_skb_sent(). When an ACK arrives, it may SACK and ACK some skbs. For each SACKed or ACKed skb, tcp_rate_skb_delivered() updates the rate_sample struct to reflect the latest (d0, t0). Finally, tcp_rate_gen() generates a rate sample by storing (d1 - d0) in rs->delivered and (t1 - t0) in rs->interval_us.

One caveat: if an skb was sent with no packets in flight, then tp->delivered_mstamp may be either invalid (if the connection is starting) or outdated (if the connection was idle). In that case, we'll re-stamp tp->delivered_mstamp.

At first glance it seems t0 should always be the time when an skb was transmitted, but actually this could over-estimate the rate due to phase mismatch between transmit and ACK events. To track the delivery rate, we ensure that if packets are in flight then t0 and t1 are times at which packets were marked delivered.

If the initial and final RTTs are different then one may be corrupted by some sort of noise. The noise we see most often is sending gaps caused by delayed, compressed, or stretched acks. This either affects both RTTs equally or artificially reduces the final RTT. We approach this by recording the info we need to compute the initial RTT (duration of the "send phase" of the window) when we recorded the associated inflight. Then, for a filter to avoid bandwidth overestimates, we generalize the per-sample bandwidth computation from: bw = delivered / ack_phase_rtt to the following: bw = delivered / max(send_phase_rtt, ack_phase_rtt)

In large-scale experiments, this filtering approach incorporating send_phase_rtt is effective at avoiding bandwidth overestimates due to ACK compression or stretched ACKs.

Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S.
Miller --- include/linux/tcp.h | 2 + include/net/tcp.h | 35 +++++++++++- net/ipv4/Makefile | 2 +- net/ipv4/tcp_input.c | 46 +++++++++++----- net/ipv4/tcp_output.c | 4 ++ net/ipv4/tcp_rate.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 222 insertions(+), 16 deletions(-) create mode 100644 net/ipv4/tcp_rate.c (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 38590fbc0ac5..c50e6aec005a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -268,6 +268,8 @@ struct tcp_sock { u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 lost; /* Total data packets lost incl. rexmits */ + struct skb_mstamp first_tx_mstamp; /* start of window send phase */ + struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 2f1648af4d12..b261c892605a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -763,8 +763,14 @@ struct tcp_skb_cb { __u32 ack_seq; /* Sequence number ACK'd */ union { struct { - /* There is space for up to 20 bytes */ + /* There is space for up to 24 bytes */ __u32 in_flight;/* Bytes in flight when packet sent */ + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ + struct skb_mstamp first_tx_mstamp; + /* when we reached the "delivered" count */ + struct skb_mstamp delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -860,6 +866,26 @@ struct ack_sample { u32 in_flight; }; +/* A rate sample measures the number of (original/retransmitted) data + * packets delivered "delivered" over an interval of time "interval_us". + * The tcp_rate.c code fills in the rate sample, and congestion + * control modules that define a cong_control function to run at the end + * of ACK processing can optionally choose to consult this sample when + * setting cwnd and pacing rate. + * A sample is invalid if "delivered" or "interval_us" is negative. + */ +struct rate_sample { + struct skb_mstamp prior_mstamp; /* starting timestamp for interval */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + s32 delivered; /* number of packets delivered over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ + int losses; /* number of packets marked lost upon ACK */ + u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ + u32 prior_in_flight; /* in flight before this ACK */ + bool is_retrans; /* is sample from retransmission? */ +}; + struct tcp_congestion_ops { struct list_head list; u32 key; @@ -946,6 +972,13 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk->icsk_ca_ops->cwnd_event(sk, event); } +/* From tcp_rate.c */ +void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); +void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + struct skb_mstamp *now, struct rate_sample *rs); + /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 24629b6f57cc..9cfff1a0bf71 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ - tcp_recovery.o \ + tcp_rate.o tcp_recovery.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9413288c2778..d9ed4bb96f74 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1112,6 +1112,7 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; + struct rate_sample *rate; int flag; }; @@ -1279,6 +1280,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, &skb->skb_mstamp); + tcp_rate_skb_delivered(sk, skb, state->rate); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1329,6 +1331,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_advance_highest_sack(sk, skb); tcp_skb_collapse_tstamp(prev, skb); + if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64)) + TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0; + tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); @@ -1558,6 +1563,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, dup_sack, tcp_skb_pcount(skb), &skb->skb_mstamp); + tcp_rate_skb_delivered(sk, skb, state->rate); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1640,8 +1646,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); - if (found_dup_sack) + if (found_dup_sack) { state->flag |= FLAG_DSACKING_ACK; + tp->delivered++; /* A spurious retransmission is delivered */ + } /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -3071,10 +3079,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_snd_una, int *acked, - struct tcp_sacktag_state *sack) + struct tcp_sacktag_state *sack, + struct skb_mstamp *now) { const struct inet_connection_sock *icsk = inet_csk(sk); - struct skb_mstamp first_ackt, last_ackt, now; + struct skb_mstamp first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; @@ -3106,7 +3115,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, acked_pcount = tcp_tso_acked(sk, skb); if (!acked_pcount) break; - fully_acked = false; } else { /* Speedup tcp_unlink_write_queue() and next loop */ @@ -3142,6 +3150,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->packets_out -= acked_pcount; pkts_acked += acked_pcount; + tcp_rate_skb_delivered(sk, skb, sack->rate); /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. 
It is not @@ -3174,16 +3183,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - skb_mstamp_get(&now); if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { - seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); - ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt); + ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt); } if (sack->first_sackt.v64) { - sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt); - ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); + sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt); + ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt); } - + sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, ca_rtt_us); @@ -3211,7 +3219,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); } else if (skb && rtt_update && sack_rtt_us >= 0 && - sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { + sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3548,17 +3556,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_sacktag_state sack_state; + struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; u32 prior_fackets; int prior_packets = tp->packets_out; - u32 prior_delivered = tp->delivered; + u32 delivered = tp->delivered; + u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + struct skb_mstamp now; sack_state.first_sackt.v64 = 0; + sack_state.rate = &rs; /* We very likely will need to access write queue head. */ prefetchw(sk->sk_write_queue.next); @@ -3581,6 +3593,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; + skb_mstamp_get(&now); + if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -3591,6 +3605,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) } prior_fackets = tp->fackets_out; + rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet * is in window. @@ -3646,7 +3661,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. 
*/ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state); + &sack_state, &now); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); @@ -3663,7 +3678,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag); + delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ + lost = tp->lost - lost; /* freshly marked lost */ + tcp_rate_gen(sk, delivered, lost, &now, &rs); + tcp_cong_control(sk, ack, delivered, flag); tcp_xmit_recovery(sk, rexmit); return 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8b45794eb6b2..e02c8ebf3ed4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -918,6 +918,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_mstamp_get(&skb->skb_mstamp); TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; + tcp_rate_skb_sent(sk, skb); if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -1213,6 +1214,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); + /* Update delivered info for the new segment */ + TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx; + /* If this packet has been sent out already, we must * adjust the various packet counters. */ diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c new file mode 100644 index 000000000000..1daed6af6e80 --- /dev/null +++ b/net/ipv4/tcp_rate.c @@ -0,0 +1,149 @@ +#include <net/tcp.h> + +/* The bandwidth estimator estimates the rate at which the network + * can currently deliver outbound data packets for this flow. At a high + * level, it operates by taking a delivery rate sample for each ACK. + * + * A rate sample records the rate at which the network delivered packets + * for this flow, calculated over the time interval between the transmission + * of a data packet and the acknowledgment of that packet. + * + * Specifically, over the interval between each transmit and corresponding ACK, + * the estimator generates a delivery rate sample. Typically it uses the rate + * at which packets were acknowledged. However, the approach of using only the + * acknowledgment rate faces a challenge under the prevalent ACK decimation or + * compression: packets can temporarily appear to be delivered much quicker + * than the bottleneck rate. Since it is physically impossible to do that in a + * sustained fashion, when the estimator notices that the ACK rate is faster + * than the transmit rate, it uses the latter: + * + * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) + * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) + * bw = min(send_rate, ack_rate) + * + * Notice the estimator essentially estimates the goodput, not always the + * network bottleneck link rate when the sending or receiving is limited by + * other factors like applications or receiver window limits. The estimator + * deliberately avoids using the inter-packet spacing approach because that + * approach requires a large number of samples and sophisticated filtering. + */ + + +/* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
+ */ +void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* In general we need to start delivery rate samples from the + * time we received the most recent ACK, to ensure we include + * the full time the network needs to deliver all in-flight + * packets. If there are no packets in flight yet, then we + * know that any ACKs after now indicate that the network was + * able to deliver those packets completely in the sampling + * interval between now and the next ACK. + * + * Note that we use packets_out instead of tcp_packets_in_flight(tp) + * because the latter is a guess based on RTO and loss-marking + * heuristics. We don't want spurious RTOs or loss markings to cause + * a spuriously small time interval, causing a spuriously high + * bandwidth estimate. + */ + if (!tp->packets_out) { + tp->first_tx_mstamp = skb->skb_mstamp; + tp->delivered_mstamp = skb->skb_mstamp; + } + + TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; +} + +/* When an skb is sacked or acked, we fill in the rate sample with the (prior) + * delivery information when the skb was last transmitted. + * + * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is + * called multiple times. We favor the information from the most recently + * sent skb, i.e., the skb with the highest prior_delivered count. + */ +void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + + if (!scb->tx.delivered_mstamp.v64) + return; + + if (!rs->prior_delivered || + after(scb->tx.delivered, rs->prior_delivered)) { + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; + + /* Find the duration of the "send phase" of this window: */ + rs->interval_us = skb_mstamp_us_delta( + &skb->skb_mstamp, + &scb->tx.first_tx_mstamp); + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = skb->skb_mstamp; + } + /* Mark off the skb delivered once it's sacked to avoid being + * used again when it's cumulatively acked. For acked packets + * we don't need to reset since it'll be freed soon. + */ + if (scb->sacked & TCPCB_SACKED_ACKED) + scb->tx.delivered_mstamp.v64 = 0; +} + +/* Update the connection delivery information and generate a rate sample. */ +void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + struct skb_mstamp *now, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 snd_us, ack_us; + + /* TODO: there are multiple places throughout tcp_ack() to get + * current time. Refactor the code using a new "tcp_acktag_state" + * to carry current time, flags, stats like "tcp_sacktag_state". + */ + if (delivered) + tp->delivered_mstamp = *now; + + rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ + rs->losses = lost; /* freshly marked lost */ + /* Return an invalid sample if no timing information is available. */ + if (!rs->prior_mstamp.v64) { + rs->delivered = -1; + rs->interval_us = -1; + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; + + /* Model sending data and receiving ACKs as separate pipeline phases + * for a window. Usually the ACK phase is longer, but with ACK + * compression the send phase can be longer. To be safe we use the + * longer phase. 
*/ + snd_us = rs->interval_us; /* send phase */ + ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + + /* Normally we expect interval_us >= min-rtt. + * Note that rate may still be over-estimated when a spuriously + * retransmitted skb was first (s)acked because "interval_us" + * is under-estimated (up to an RTT). However, continuously + * measuring the delivery rate during loss recovery is crucial + * for connections that suffer heavy or prolonged losses. + */ + if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { + rs->interval_us = -1; + if (!rs->is_retrans) + pr_debug("tcp rate: %ld %d %u %u %u\n", + rs->interval_us, rs->delivered, + inet_csk(sk)->icsk_ca_state, + tp->rx_opt.sack_ok, tcp_min_rtt(tp)); + } +} -- cgit v1.2.3 From d7722e8570fc0f1e003cee7cf37694041828918b Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Mon, 19 Sep 2016 23:39:15 -0400 Subject: tcp: track application-limited rate samples This commit adds code to track whether the delivery rate represented by each rate_sample was limited by the application. Upon each transmit, we store in the is_app_limited field in the skb a boolean bit indicating whether there is a known "bubble in the pipe": a point in the rate sample interval where the sender was application-limited, and did not transmit even though the cwnd and pacing rate allowed it. This logic marks the flow app-limited on a write if *all* of the following are true: 1) There is less than 1 MSS of unsent data in the write queue available to transmit. 2) There is no packet in the sender's queues (e.g. in fq or the NIC tx queue). 3) The connection is not limited by cwnd. 4) There are no lost packets to retransmit. The tcp_rate_check_app_limited() code in tcp_rate.c determines whether the connection is application-limited at the moment. If the flow is application-limited, it sets the tp->app_limited field. If the flow is application-limited then that means there is effectively a "bubble" of silence in the pipe now, and this silence will be reflected in a lower bandwidth sample for any rate samples from now until we get an ACK indicating this bubble has exited the pipe: specifically, until we get an ACK for the next packet we transmit. Whenever we send an skb, we record in scb->tx.is_app_limited whether the resulting rate sample will be application-limited. The code in tcp_rate_gen() checks to see when it is safe to mark all known application-limited bubbles of silence as having exited the pipe. It does this by checking to see when the delivered count moves past the tp->app_limited marker. At this point it zeroes the tp->app_limited marker, as all known bubbles are out of the pipe. We make room for the tx.is_app_limited bit in the skb by borrowing a bit from the in_flight field used by NV to record the number of bytes in flight. The receive window in the TCP header is 16 bits, and the max receive window scaling shift factor is 14 (RFC 1323). So the max receive window offered by the TCP protocol is 2^(16+14) = 2^30. So we only need 30 bits for the tx.in_flight used by NV. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S.
Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 6 +++++- net/ipv4/tcp.c | 8 ++++++++ net/ipv4/tcp_minisocks.c | 3 +++ net/ipv4/tcp_rate.c | 29 ++++++++++++++++++++++++++++- 5 files changed, 45 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c50e6aec005a..fdcd00ffcb66 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -268,6 +268,7 @@ struct tcp_sock { u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 lost; /* Total data packets lost incl. rexmits */ + u32 app_limited; /* limited until "delivered" reaches this val */ struct skb_mstamp first_tx_mstamp; /* start of window send phase */ struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ diff --git a/include/net/tcp.h b/include/net/tcp.h index b261c892605a..a69ed7f0030c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -764,7 +764,9 @@ struct tcp_skb_cb { union { struct { /* There is space for up to 24 bytes */ - __u32 in_flight;/* Bytes in flight when packet sent */ + __u32 in_flight:30,/* Bytes in flight at transmit */ + is_app_limited:1, /* cwnd not fully used? */ + unused:1; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -883,6 +885,7 @@ struct rate_sample { int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ }; @@ -978,6 +981,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, struct skb_mstamp *now, struct rate_sample *rs); +void tcp_rate_check_app_limited(struct sock *sk); /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index de02fb4b1349..2250f891f931 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -396,6 +396,9 @@ void tcp_init_sock(struct sock *sk) */ tp->snd_cwnd = TCP_INIT_CWND; + /* There's a bubble in the pipe until at least the first ACK. */ + tp->app_limited = ~0U; + /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ @@ -1014,6 +1017,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, flags); lock_sock(sk); + + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ + res = do_tcp_sendpages(sk, page, offset, size, flags); release_sock(sk); return res; @@ -1115,6 +1121,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ + /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 568947110b60..6234ebaa7db1 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -487,6 +487,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; + /* There's a bubble in the pipe until at least the first ACK. 
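+ * (Illustration, not part of the patch: tp->app_limited is compared
+ * with after(), so ~0U effectively acts as sequence number -1; the
+ * first tcp_rate_gen() call after an ACK finds tp->delivered "after"
+ * it and clears the marker, and until then every skb transmitted is
+ * tagged is_app_limited.)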
*/ + newtp->app_limited = ~0U; + tcp_init_xmit_timers(newsk); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 1daed6af6e80..52ff84be59ab 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -26,9 +26,13 @@ * other factors like applications or receiver window limits. The estimator * deliberately avoids using the inter-packet spacing approach because that * approach requires a large number of samples and sophisticated filtering. + * + * TCP flows can often be application-limited in request/response workloads. + * The estimator marks a bandwidth sample as application-limited if there + * was some moment during the sampled window of packets when there was no data + * ready to send in the write queue. */ - /* Snapshot the current delivery information in the skb, to generate * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). */ @@ -58,6 +62,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; } /* When an skb is sacked or acked, we fill in the rate sample with the (prior) @@ -80,6 +85,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, after(scb->tx.delivered, rs->prior_delivered)) { rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; /* Find the duration of the "send phase" of this window: */ @@ -105,6 +111,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; + /* Clear app limited if bubble is acked and gone. */ + if (tp->app_limited && after(tp->delivered, tp->app_limited)) + tp->app_limited = 0; + /* TODO: there are multiple places throughout tcp_ack() to get * current time. Refactor the code using a new "tcp_acktag_state" * to carry current time, flags, stats like "tcp_sacktag_state". @@ -147,3 +157,20 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, tp->rx_opt.sack_ok, tcp_min_rtt(tp)); } } + +/* If a gap is detected between sends, mark the socket application-limited. */ +void tcp_rate_check_app_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (/* We have less than one packet to send. */ + tp->write_seq - tp->snd_nxt < tp->mss_cache && + /* Nothing in sending host's qdisc queues or NIC tx queue. */ + sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && + /* We are not limited by CWND. */ + tcp_packets_in_flight(tp) < tp->snd_cwnd && + /* All lost packets have been retransmitted. */ + tp->lost_out <= tp->retrans_out) + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; +} -- cgit v1.2.3 From eb8329e0a04db0061f714f033b4454326ba147f4 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:16 -0400 Subject: tcp: export data delivery rate This commit exports two new fields in struct tcp_info: tcpi_delivery_rate: The most recent goodput, as measured by tcp_rate_gen(). If the socket is limited by the sending application (e.g., no data to send), it reports the highest measurement instead of the most recent. The unit is bytes per second (like other rate fields in tcp_info).
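A minimal usage sketch (hypothetical, not part of this commit; it assumes uapi headers that already carry this patch, a connected TCP socket fd, and it also reads the app-limited flag described next):

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Print the delivery rate that this patch exports for a connected fd. */
static void print_delivery_rate(int fd)
{
	struct tcp_info ti;
	socklen_t len = sizeof(ti);

	memset(&ti, 0, sizeof(ti));
	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) != 0)
		return;
	if (len >= sizeof(ti))	/* older kernels return a shorter struct */
		printf("delivery rate: %llu B/s (app limited: %u)\n",
		       (unsigned long long)ti.tcpi_delivery_rate,
		       (unsigned)ti.tcpi_delivery_rate_app_limited);
}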
tcpi_delivery_rate_app_limited: A boolean indicating if the goodput was measured when the socket's throughput was limited by the sending application. This delivery rate information can be useful for applications that want to know the current throughput the TCP connection is seeing, e.g. adaptive bitrate video streaming. It can also be very useful for debugging or troubleshooting. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 5 ++++- include/uapi/linux/tcp.h | 3 +++ net/ipv4/tcp.c | 11 ++++++++++- net/ipv4/tcp_rate.c | 12 +++++++++++- 4 files changed, 28 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fdcd00ffcb66..a17ae7b85218 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -213,7 +213,8 @@ struct tcp_sock { u8 reord; /* reordering detected */ } rack; u16 advmss; /* Advertised MSS */ - u8 unused; + u8 rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ + unused:7; u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ thin_dupack : 1,/* Fast retransmit on first dupack */ @@ -271,6 +272,8 @@ struct tcp_sock { u32 app_limited; /* limited until "delivered" reaches this val */ struct skb_mstamp first_tx_mstamp; /* start of window send phase */ struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ + u32 rate_delivered; /* saved rate sample: packets delivered */ + u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 482898fc433a..73ac0db487f8 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -167,6 +167,7 @@ struct tcp_info { __u8 tcpi_backoff; __u8 tcpi_options; __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; + __u8 tcpi_delivery_rate_app_limited:1; __u32 tcpi_rto; __u32 tcpi_ato; @@ -211,6 +212,8 @@ struct tcp_info { __u32 tcpi_min_rtt; __u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */ __u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */ + + __u64 tcpi_delivery_rate; }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2250f891f931..f253e5019d22 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2712,7 +2712,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); - u32 now = tcp_time_stamp; + u32 now = tcp_time_stamp, intv; unsigned int start; int notsent_bytes; u64 rate64; @@ -2802,6 +2802,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_in = tp->data_segs_in; info->tcpi_data_segs_out = tp->data_segs_out; + + info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 
1 : 0; + rate = READ_ONCE(tp->rate_delivered); + intv = READ_ONCE(tp->rate_interval_us); + if (rate && intv) { + rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; + do_div(rate64, intv); + put_unaligned(rate64, &info->tcpi_delivery_rate); + } } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 52ff84be59ab..9be1581a5a08 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -149,12 +149,22 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * for connections suffer heavy or prolonged losses. */ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { - rs->interval_us = -1; if (!rs->is_retrans) pr_debug("tcp rate: %ld %d %u %u %u\n", rs->interval_us, rs->delivered, inet_csk(sk)->icsk_ca_state, tp->rx_opt.sack_ok, tcp_min_rtt(tp)); + rs->interval_us = -1; + return; + } + + /* Record the last non-app-limited or the highest app-limited bw */ + if (!rs->is_app_limited || + ((u64)rs->delivered * tp->rate_interval_us >= + (u64)tp->rate_delivered * rs->interval_us)) { + tp->rate_delivered = rs->delivered; + tp->rate_interval_us = rs->interval_us; + tp->rate_app_limited = rs->is_app_limited; } } -- cgit v1.2.3 From ed6e7268b930e0a9a65d895d368eac79a438d992 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:17 -0400 Subject: tcp: allow congestion control module to request TSO skb segment count Add the tso_segs_goal() function in tcp_congestion_ops to allow the congestion control module to specify the number of segments that should be in a TSO skb sent by tcp_write_xmit() and tcp_xmit_retransmit_queue(). The congestion control module can either request a particular number of segments in TSO skb that we transmit, or return 0 if it doesn't care. This allows the upcoming BBR congestion control module to select small TSO skb sizes if the module detects that the bottleneck bandwidth is very low, or that the connection is policed to a low rate. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_output.c | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index a69ed7f0030c..f8f581fd05f5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -913,6 +913,8 @@ struct tcp_congestion_ops { u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + /* suggest number of segments for each skb to transmit (optional) */ + u32 (*tso_segs_goal)(struct sock *sk); /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e02c8ebf3ed4..01379567a732 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1566,6 +1566,17 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) return min_t(u32, segs, sk->sk_gso_max_segs); } +/* Return the number of segments we want in the skb we are transmitting. + * See if congestion control module wants to decide; otherwise, autosize. + */ +static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) +{ + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + u32 tso_segs = ca_ops->tso_segs_goal ? 
ca_ops->tso_segs_goal(sk) : 0; + + return tso_segs ? : tcp_tso_autosize(sk, mss_now); +} + /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, @@ -2061,7 +2072,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } } - max_segs = tcp_tso_autosize(sk, mss_now); + max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -2778,7 +2789,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) last_lost = tp->snd_una; } - max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk)); + max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); tcp_for_write_queue_from(skb, sk) { __u8 sacked; int segs; -- cgit v1.2.3 From 1b3878ca1551f3baab2c408d1e703b5ef785a1b2 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:18 -0400 Subject: tcp: export tcp_tso_autosize() and parameterize minimum number of TSO segments To allow congestion control modules to use the default TSO auto-sizing algorithm as one of the ingredients in their own decision about TSO sizing: 1) Export tcp_tso_autosize() so that CC modules can use it. 2) Change tcp_tso_autosize() to allow callers to specify a minimum number of segments per TSO skb, in case the congestion control module has a different notion of the best floor for TSO skbs for the connection right now. For very low-rate paths or policed connections it can be appropriate to use smaller TSO skbs. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_output.c | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index f8f581fd05f5..349204130d84 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -533,6 +533,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ +u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + int min_tso_segs); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); bool tcp_may_send_now(struct sock *sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 01379567a732..0bf3d481fa85 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1549,7 +1549,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, /* Return how many segs we'd like on a TSO packet, * to send one TSO packet per ms */ -static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) +u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + int min_tso_segs) { u32 bytes, segs; @@ -1561,10 +1562,11 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) * This preserves ACK clocking and is consistent * with tcp_tso_should_defer() heuristic. */ - segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs); + segs = max_t(u32, bytes / mss_now, min_tso_segs); return min_t(u32, segs, sk->sk_gso_max_segs); } +EXPORT_SYMBOL(tcp_tso_autosize); /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. 
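To make the new hook concrete, a hypothetical module-side implementation might look as follows (an illustrative sketch, not code from this series; the example_ name and the rate threshold are invented):

/* Ask for 2-segment skbs when pacing below ~1 Mbit/s; returning 0
 * means "no preference", so tcp_tso_segs() falls back to autosizing. */
static u32 example_tso_segs_goal(struct sock *sk)
{
	return sk->sk_pacing_rate < (1000000 / 8) ? 2 : 0;
}

/* Wired up in the module's ops table (required hooks not shown):
 *	.tso_segs_goal	= example_tso_segs_goal,
 */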
@@ -1574,7 +1576,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; - return tso_segs ? : tcp_tso_autosize(sk, mss_now); + return tso_segs ? : + tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); } /* Returns the portion of skb which can be sent right away */ -- cgit v1.2.3 From 77bfc174c38e558a3425d3b069aa2762b2fedfdd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:20 -0400 Subject: tcp: allow congestion control to expand send buffer differently Currently the TCP send buffer expands to twice cwnd, in order to allow limited transmits in the CA_Recovery state. This assumes that cwnd does not increase in CA_Recovery. For some congestion control algorithms, like the upcoming BBR module, if the losses in recovery do not indicate congestion then we may continue to raise cwnd multiplicatively in recovery. In such cases the current multiplier will falsely limit the sending rate, much as if it were limited by the application. This commit adds an optional congestion control callback to use a different multiplier to expand the TCP send buffer. For congestion control modules that do not specify this callback, TCP continues to use the previous default of 2. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_input.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 349204130d84..1aa9628ae608 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -917,6 +917,8 @@ struct tcp_congestion_ops { void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); /* suggest number of segments for each skb to transmit (optional) */ u32 (*tso_segs_goal)(struct sock *sk); + /* returns the multiplier used in tcp_sndbuf_expand (optional) */ + u32 (*sndbuf_expand)(struct sock *sk); /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d9ed4bb96f74..13a2e70141f5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr static void tcp_sndbuf_expand(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; int sndmem, per_mss; u32 nr_segs; @@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk) * Cubic needs 1.7 factor, rounded to 2 to include * extra cushion (application might react slowly to POLLOUT) */ - sndmem = 2 * nr_segs * per_mss; + sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2; + sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); -- cgit v1.2.3 From c0402760f565ae066621ebf8720a32fba074d538 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:21 -0400 Subject: tcp: new CC hook to set sending rate with rate_sample in any CA state This commit introduces an optional new "omnipotent" hook, cong_control(), for congestion control modules.
The cong_control() function is called at the end of processing an ACK (i.e., after updating sequence numbers, the SACK scoreboard, and loss detection). At that moment we have precise delivery rate information the congestion control module can use to control the sending behavior (using cwnd, TSO skb size, and pacing rate) in any CA state. This function can also be used by a congestion control that prefers not to use the default cwnd reduction approach (i.e., the PRR algorithm) during CA_Recovery to control the cwnd and sending rate during loss recovery. We take advantage of the fact that recent changes defer the retransmission or transmission of new data (e.g. by F-RTO) in recovery until the new tcp_cong_control() function is run. With this commit, we only run tcp_update_pacing_rate() if the congestion control is not using this new API. New congestion controls which use the new API do not want the TCP stack to run the default pacing rate calculation and overwrite whatever pacing rate they have chosen at initialization time. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 4 ++++ net/ipv4/tcp_cong.c | 2 +- net/ipv4/tcp_input.c | 17 ++++++++++++++--- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1aa9628ae608..f83b7f220a65 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -919,6 +919,10 @@ struct tcp_congestion_ops { u32 (*tso_segs_goal)(struct sock *sk); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ u32 (*sndbuf_expand)(struct sock *sk); + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) + */ + void (*cong_control)(struct sock *sk, const struct rate_sample *rs); /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 882caa4e72bc..1294af4e0127 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -69,7 +69,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) int ret = 0; /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !ca->cong_avoid) { + if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 13a2e70141f5..980a83edfa63 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2536,6 +2536,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + if (inet_csk(sk)->icsk_ca_ops->cong_control) + return; + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { @@ -3312,8 +3315,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) * information. All transmissions or retransmissions are delayed afterwards.
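* (Illustration, not part of the patch: a module adopting cong_control()
* owns cwnd and pacing entirely; a toy implementation could, after
* checking rs->delivered > 0 && rs->interval_us > 0, pace at the raw
* sample rate:
*	bw = (u64)rs->delivered * tp->mss_cache * USEC_PER_SEC;
*	do_div(bw, rs->interval_us);
*	sk->sk_pacing_rate = min_t(u64, bw, sk->sk_max_pacing_rate);
* which is essentially what the BBR module added later in this series
* does, with filtering on top.)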
*/ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, - int flag) + int flag, const struct rate_sample *rs) { + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->cong_control) { + icsk->icsk_ca_ops->cong_control(sk, rs); + return; + } + if (tcp_in_cwnd_reduction(sk)) { /* Reduce cwnd if state mandates */ tcp_cwnd_reduction(sk, acked_sacked, flag); @@ -3683,7 +3693,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ tcp_rate_gen(sk, delivered, lost, &now, &rs); - tcp_cong_control(sk, ack, delivered, flag); + tcp_cong_control(sk, ack, delivered, flag, &rs); tcp_xmit_recovery(sk, rexmit); return 1; @@ -5982,7 +5992,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } else tcp_init_metrics(sk); - tcp_update_pacing_rate(sk); + if (!inet_csk(sk)->icsk_ca_ops->cong_control) + tcp_update_pacing_rate(sk); /* Prevent spurious tcp_cwnd_restart() on first data packet */ tp->lsndtime = tcp_time_stamp; -- cgit v1.2.3 From 7e744171386ae6da1248d3d27d10b6dbdc54f0ff Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:22 -0400 Subject: tcp: increase ICSK_CA_PRIV_SIZE from 64 bytes to 88 The TCP CUBIC module already uses 64 bytes. The upcoming TCP BBR module uses 88 bytes. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 49dcad4fe99e..197a30d221e9 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -134,8 +134,8 @@ struct inet_connection_sock { } icsk_mtup; u32 icsk_user_timeout; - u64 icsk_ca_priv[64 / sizeof(u64)]; -#define ICSK_CA_PRIV_SIZE (8 * sizeof(u64)) + u64 icsk_ca_priv[88 / sizeof(u64)]; +#define ICSK_CA_PRIV_SIZE (11 * sizeof(u64)) }; #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ -- cgit v1.2.3 From 0f8782ea14974ce992618b55f0c041ef43ed0b78 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:23 -0400 Subject: tcp_bbr: add BBR congestion control This commit implements a new TCP congestion control algorithm: BBR (Bottleneck Bandwidth and RTT). A detailed description of BBR will be published in ACM Queue, Vol. 14 No. 5, September-October 2016, as "BBR: Congestion-Based Congestion Control". BBR has significantly increased throughput and reduced latency for connections on Google's internal backbone networks and google.com and YouTube Web servers. BBR requires only changes on the sender side, not in the network or the receiver side. Thus it can be incrementally deployed on today's Internet, or in datacenters. The Internet has predominantly used loss-based congestion control (largely Reno or CUBIC) since the 1980s, relying on packet loss as the signal to slow down. While this worked well for many years, loss-based congestion control is unfortunately out-dated in today's networks. On today's Internet, loss-based congestion control causes the infamous bufferbloat problem, often causing seconds of needless queuing delay, since it fills the bloated buffers in many last-mile links. 
On today's high-speed long-haul links using commodity switches with shallow buffers, loss-based congestion control has abysmal throughput because it over-reacts to losses caused by transient traffic bursts. In 1981 Kleinrock and Gale showed that the optimal operating point for a network maximizes delivered bandwidth while minimizing delay and loss, not only for single connections but for the network as a whole. Finding that optimal operating point has been elusive, since any single network measurement is ambiguous: network measurements are the result of both bandwidth and propagation delay, and those two cannot be measured simultaneously. While it is impossible to disambiguate any single bandwidth or RTT measurement, a connection's behavior over time tells a clearer story. BBR uses a measurement strategy designed to resolve this ambiguity. It combines these measurements with a robust servo loop using recent control systems advances to implement a distributed congestion control algorithm that reacts to actual congestion, not packet loss or transient queue delay, and is designed to converge with high probability to a point near the optimal operating point. In a nutshell, BBR creates an explicit model of the network pipe by sequentially probing the bottleneck bandwidth and RTT. On the arrival of each ACK, BBR derives the current delivery rate of the last round trip, and feeds it through a windowed max-filter to estimate the bottleneck bandwidth. Conversely it uses a windowed min-filter to estimate the round trip propagation delay. The max-filtered bandwidth and min-filtered RTT estimates form BBR's model of the network pipe. Using its model, BBR sets control parameters to govern sending behavior. The primary control is the pacing rate: BBR applies a gain multiplier to transmit faster or slower than the observed bottleneck bandwidth. The conventional congestion window (cwnd) is now the secondary control; the cwnd is set to a small multiple of the estimated BDP (bandwidth-delay product) in order to allow full utilization and bandwidth probing while bounding the potential amount of queue at the bottleneck. When a BBR connection starts, it enters STARTUP mode and applies a high gain to perform an exponential search to quickly probe the bottleneck bandwidth (doubling its sending rate each round trip, like slow start). However, instead of continuing until it fills up the buffer (i.e. a loss), or until delay or ACK spacing reaches some threshold (like Hystart), it uses its model of the pipe to estimate when that pipe is full: it estimates the pipe is full when it notices the estimated bandwidth has stopped growing. At that point it exits STARTUP and enters DRAIN mode, where it reduces its pacing rate to drain the queue it estimates it has created. Then BBR enters steady state. In steady state, PROBE_BW mode cycles between first pacing faster to probe for more bandwidth, then pacing slower to drain any queue it created if no more bandwidth was available, and then cruising at the estimated bandwidth to utilize the pipe without creating excess queue. Occasionally, on an as-needed basis, it sends significantly slower to probe for RTT (PROBE_RTT mode). BBR has been fully deployed on Google's wide-area backbone networks and we're experimenting with BBR on Google.com and YouTube on a global scale. Replacing CUBIC with BBR has resulted in significant improvements in network latency and application (RPC, browser, and video) metrics. For more details please refer to our upcoming ACM Queue publication.
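(A worked example with assumed numbers, not from the commit message: if the max-filter holds a bandwidth estimate of 10 Mbit/s and the min-filter holds an RTT of 40 ms, the estimated BDP is 1.25 MB/s * 0.040 s = 50 kB, about 33 full-sized 1500-byte packets. With the steady-state cwnd gain of 2 used by the module below, cwnd is capped near 66 packets, and in PROBE_BW the pacing rate cycles through gains of 5/4, 3/4, and six rounds of 1.0 applied to the 10 Mbit/s estimate.)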
Example performance results, to illustrate the difference between BBR and CUBIC: Resilience to random loss (e.g. from shallow buffers): Consider a netperf TCP_STREAM test lasting 30 secs on an emulated path with a 10Gbps bottleneck, 100ms RTT, and 1% packet loss rate. CUBIC gets 3.27 Mbps, and BBR gets 9150 Mbps (2798x higher). Low latency with the bloated buffers common in today's last-mile links: Consider a netperf TCP_STREAM test lasting 120 secs on an emulated path with a 10Mbps bottleneck, 40ms RTT, and 1000-packet bottleneck buffer. Both fully utilize the bottleneck bandwidth, but BBR achieves this with a median RTT 25x lower (43 ms instead of 1.09 secs). Our long-term goal is to improve the congestion control algorithms used on the Internet. We are hopeful that BBR can help advance the efforts toward this goal, and motivate the community to do further research. Test results, performance evaluations, feedback, and BBR-related discussions are very welcome in the public e-mail list for BBR: https://groups.google.com/forum/#!forum/bbr-dev NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, since pacing is integral to the BBR design and implementation. BBR without pacing would not function properly, and may incur unnecessary high packet loss rates. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 13 + net/ipv4/Kconfig | 18 + net/ipv4/Makefile | 1 + net/ipv4/tcp_bbr.c | 896 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 928 insertions(+) create mode 100644 net/ipv4/tcp_bbr.c (limited to 'include') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index b5c366f87b3e..509cd961068d 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -124,6 +124,7 @@ enum { INET_DIAG_PEERS, INET_DIAG_PAD, INET_DIAG_MARK, + INET_DIAG_BBRINFO, __INET_DIAG_MAX, }; @@ -157,8 +158,20 @@ struct tcp_dctcp_info { __u32 dctcp_ab_tot; }; +/* INET_DIAG_BBRINFO */ + +struct tcp_bbr_info { + /* u64 bw: max-filtered BW (app throughput) estimate in Byte per sec: */ + __u32 bbr_bw_lo; /* lower 32 bits of bw */ + __u32 bbr_bw_hi; /* upper 32 bits of bw */ + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ +}; + union tcp_cc_info { struct tcpvegas_info vegas; struct tcp_dctcp_info dctcp; + struct tcp_bbr_info bbr; }; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 50d6a9b49f6c..300b06888fdf 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -640,6 +640,21 @@ config TCP_CONG_CDG D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg +config TCP_CONG_BBR + tristate "BBR TCP" + default n + ---help--- + + BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to + maximize network utilization and minimize queues. It builds an explicit + model of the bottleneck delivery rate and path round-trip + propagation delay. It tolerates packet loss and delay unrelated to + congestion. It can operate over LAN, WAN, cellular, wifi, or cable + modem links.
It can coexist with flows that use loss-based congestion + control, and can operate with shallow buffers, deep buffers, + bufferbloat, policers, or AQM schemes that do not provide a delay + signal. It requires the fq ("Fair Queue") pacing packet scheduler. + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -674,6 +689,9 @@ choice config DEFAULT_CDG bool "CDG" if TCP_CONG_CDG=y + config DEFAULT_BBR + bool "BBR" if TCP_CONG_BBR=y + config DEFAULT_RENO bool "Reno" endchoice diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9cfff1a0bf71..bc6a6c8b9bcd 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o +obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c new file mode 100644 index 000000000000..0ea66c2c9344 --- /dev/null +++ b/net/ipv4/tcp_bbr.c @@ -0,0 +1,896 @@ +/* Bottleneck Bandwidth and RTT (BBR) congestion control + * + * BBR congestion control computes the sending rate based on the delivery + * rate (throughput) estimated from ACKs. In a nutshell: + * + * On each ACK, update our model of the network path: + * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) + * min_rtt = windowed_min(rtt, 10 seconds) + * pacing_rate = pacing_gain * bottleneck_bandwidth + * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) + * + * The core algorithm does not react directly to packet losses or delays, + * although BBR may adjust the size of next send per ACK when loss is + * observed, or adjust the sending rate if it estimates there is a + * traffic policer, in order to keep the drop rate reasonable. + * + * BBR is described in detail in: + * "BBR: Congestion-Based Congestion Control", + * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, + * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. + * + * There is a public e-mail list for discussing BBR development and testing: + * https://groups.google.com/forum/#!forum/bbr-dev + * + * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, + * since pacing is integral to the BBR design and implementation. + * BBR without pacing would not function properly, and may incur unnecessary + * high packet loss rates. + */ +#include <linux/module.h> +#include <net/tcp.h> +#include <linux/inet_diag.h> +#include <linux/inet.h> +#include <linux/random.h> +#include <linux/win_minmax.h> + +/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. + * Since the minimum window is >=4 packets, the lower bound isn't + * an issue. The upper bound isn't an issue with existing technologies. + */ +#define BW_SCALE 24 +#define BW_UNIT (1 << BW_SCALE) + +#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g.
gains) */ +#define BBR_UNIT (1 << BBR_SCALE) + +/* BBR has the following modes for deciding how fast to send: */ +enum bbr_mode { + BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ + BBR_DRAIN, /* drain any queue created during startup */ + BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ + BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */ +}; + +/* BBR congestion control block */ +struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ + struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ + u32 rtt_cnt; /* count of packet-timed rounds elapsed */ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ + u32 mode:3, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ + packet_conservation:1, /* use packet conservation? */ + restore_cwnd:1, /* decided to revert cwnd to old value */ + round_start:1, /* start of packet-timed tx->ack round? */ + tso_segs_goal:7, /* segments we want in each skb we send */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ + unused:5, + lt_is_sampling:1, /* taking long-term ("LT") samples now? */ + lt_rtt_cnt:7, /* round trips in long-term interval */ + lt_use_bw:1; /* use lt_bw as our bw estimate? */ + u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ + u32 lt_last_delivered; /* LT intvl start: tp->delivered */ + u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ + u32 lt_last_lost; /* LT intvl start: tp->lost */ + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_cnt:3, /* number of rounds without large bw gains */ + cycle_idx:3, /* current index in pacing_gain cycle array */ + unused_b:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ +}; + +#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ + +/* Window length of bw filter (in rounds): */ +static const int bbr_bw_rtts = CYCLE_LEN + 2; +/* Window length of min_rtt filter (in sec): */ +static const u32 bbr_min_rtt_win_sec = 10; +/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ +static const u32 bbr_probe_rtt_mode_ms = 200; +/* Skip TSO below the following bandwidth (bits/sec): */ +static const int bbr_min_tso_rate = 1200000; + +/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ +static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ +static const int bbr_cwnd_gain = BBR_UNIT * 2; +/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ +static const int bbr_pacing_gain[] = { + BBR_UNIT * 5 / 4, /* probe for more available bw */ + BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ + BBR_UNIT, BBR_UNIT, BBR_UNIT, /* 
cruise at 1.0*bw to utilize pipe, */ + BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ +}; +/* Randomize the starting gain cycling phase over N phases: */ +static const u32 bbr_cycle_rand = 7; + +/* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet + * needs at least 4 packets in flight: + */ +static const u32 bbr_cwnd_min_target = 4; + +/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ +/* If bw has increased significantly (1.25x), there may be more bw available: */ +static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; +/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ +static const u32 bbr_full_bw_cnt = 3; + +/* "long-term" ("LT") bandwidth estimator parameters... */ +/* The minimum number of rounds in an LT bw sampling interval: */ +static const u32 bbr_lt_intvl_min_rtts = 4; +/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +static const u32 bbr_lt_loss_thresh = 50; +/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +static const u32 bbr_lt_bw_diff = 4000 / 8; +/* If we estimate we're policed, use lt_bw for this many round trips: */ +static const u32 bbr_lt_bw_max_rtts = 48; + +/* Do we estimate that STARTUP filled the pipe? */ +static bool bbr_full_bw_reached(const struct sock *sk) +{ + const struct bbr *bbr = inet_csk_ca(sk); + + return bbr->full_bw_cnt >= bbr_full_bw_cnt; +} + +/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ +static u32 bbr_max_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return minmax_get(&bbr->bw); +} + +/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ +static u32 bbr_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); +} + +/* Return rate in bytes per second, optionally with a gain. + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) +{ + rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); + rate *= gain; + rate >>= BBR_SCALE; + rate *= USEC_PER_SEC; + return rate >> BW_SCALE; +} + +/* Pace using current bw estimate and a gain factor. In order to help drive the + * network toward lower queues while maintaining high utilization and low + * latency, the average pacing rate aims to be slightly (~1%) lower than the + * estimated bandwidth. This is an important aspect of the design. In this + * implementation this slightly lower pacing rate is achieved implicitly by not + * including link-layer headers in the packet size used for the pacing rate. + */ +static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u64 rate = bw; + + rate = bbr_rate_bytes_per_sec(sk, rate, gain); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate) + sk->sk_pacing_rate = rate; +} + +/* Return count of segments we want in the skbs we send, or 0 for default. 
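+ * (Note, not part of the patch: this implements the tso_segs_goal
+ * hook added earlier in this series; tcp_tso_segs() treats a return
+ * value of 0 as "no preference" and falls back to tcp_tso_autosize().)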
*/ +static u32 bbr_tso_segs_goal(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return bbr->tso_segs_goal; +} + +static void bbr_set_tso_segs_goal(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 min_segs; + + min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; + bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), + 0x7FU); +} + +/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +static void bbr_save_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) + bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ + else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ + bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); +} + +static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (event == CA_EVENT_TX_START && tp->app_limited) { + bbr->idle_restart = 1; + /* Avoid pointless buffer overflows: pace at est. bw if we don't + * need more speed (we're restarting from idle and app-limited). + */ + if (bbr->mode == BBR_PROBE_BW) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + } +} + +/* Find target cwnd. Right-size the cwnd based on min RTT and the + * estimated bottleneck bandwidth: + * + * cwnd = bw * min_rtt * gain = BDP * gain + * + * The key factor, gain, controls the amount of queue. While a small gain + * builds a smaller queue, it becomes more vulnerable to noise in RTT + * measurements (e.g., delayed ACKs or other ACK compression effects). This + * noise may cause BBR to under-estimate the rate. + * + * To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 cwnd; + u64 w; + + /* If we've never had a valid RTT sample, cap cwnd at the initial + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which + * case we need to slow-start up toward something safe: TCP_INIT_CWND. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ + return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ + + w = (u64)bw * bbr->min_rtt_us; + + /* Apply a gain to the given value, then remove the BW_SCALE shift. */ + cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + /* Allow enough full-sized skbs in flight to utilize end systems. */ + cwnd += 3 * bbr->tso_segs_goal; + + /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
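+ * (Illustration, not part of the patch: (cwnd + 1) & ~1U maps 11 to 12
+ * and leaves 12 unchanged; the idea is that with an even cwnd a
+ * receiver ACKing every other packet is not left holding a final odd
+ * segment for the delayed-ACK timer.)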
*/ + cwnd = (cwnd + 1) & ~1U; + + return cwnd; +} + +/* An optimization in BBR to reduce losses: On the first round of recovery, we + * follow the packet conservation principle: send P packets per P packets acked. + * After that, we slow-start and send at most 2*P packets per P packets acked. + * After recovery finishes, or upon undo, we restore the cwnd we had when + * recovery started (capped by the target cwnd based on estimated BDP). + * + * TODO(ycheng/ncardwell): implement a rate-based approach. + */ +static bool bbr_set_cwnd_to_recover_or_restore( + struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; + u32 cwnd = tp->snd_cwnd; + + /* An ACK for P pkts should release at most 2*P packets. We do this + * in two steps. First, here we deduct the number of lost packets. + * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. + */ + if (rs->losses > 0) + cwnd = max_t(s32, cwnd - rs->losses, 1); + + if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { + /* Starting 1st round of Recovery, so do packet conservation. */ + bbr->packet_conservation = 1; + bbr->next_rtt_delivered = tp->delivered; /* start round now */ + /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ + cwnd = tcp_packets_in_flight(tp) + acked; + } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { + /* Exiting loss recovery; restore cwnd saved before recovery. */ + bbr->restore_cwnd = 1; + bbr->packet_conservation = 0; + } + bbr->prev_ca_state = state; + + if (bbr->restore_cwnd) { + /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ + cwnd = max(cwnd, bbr->prior_cwnd); + bbr->restore_cwnd = 0; + } + + if (bbr->packet_conservation) { + *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); + return true; /* yes, using packet conservation */ + } + *new_cwnd = cwnd; + return false; +} + +/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. + */ +static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + u32 acked, u32 bw, int gain) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 cwnd = 0, target_cwnd = 0; + + if (!acked) + return; + + if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) + goto done; + + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ + target_cwnd = bbr_target_cwnd(sk, bw, gain); + if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ + cwnd = min(cwnd + acked, target_cwnd); + else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) + cwnd = cwnd + acked; + cwnd = max(cwnd, bbr_cwnd_min_target); + +done: + tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ + tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); +} + +/* End cycle phase if it's time and/or we hit the phase's in-flight target. 
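+ * (Summary of the checks below, not part of the patch: at pacing_gain
+ * 1.0 a phase simply lasts one min_rtt of wall-clock time; at 5/4 it
+ * must last that long and additionally see inflight reach the
+ * gain-scaled BDP target (or a loss); at 3/4 it ends as soon as either
+ * a min_rtt elapses or inflight drains to the unscaled BDP.)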
*/ +static bool bbr_is_next_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool is_full_length = + skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > + bbr->min_rtt_us; + u32 inflight, bw; + + /* The pacing_gain of 1.0 paces at the estimated bw to try to fully + * use the pipe without increasing the queue. + */ + if (bbr->pacing_gain == BBR_UNIT) + return is_full_length; /* just use wall clock time */ + + inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ + bw = bbr_max_bw(sk); + + /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at + * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is + * small (e.g. on a LAN). We do not persist if packets are lost, since + * a path with small buffers may not hold that much. + */ + if (bbr->pacing_gain > BBR_UNIT) + return is_full_length && + (rs->losses || /* perhaps pacing_gain*BDP won't fit */ + inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); + + /* A pacing_gain < 1.0 tries to drain extra queue we added if bw + * probing didn't find more bw. If inflight falls to match BDP then we + * estimate queue is drained; persisting would underutilize the pipe. + */ + return is_full_length || + inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); +} + +static void bbr_advance_cycle_phase(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); + bbr->cycle_mstamp = tp->delivered_mstamp; + bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; +} + +/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +static void bbr_update_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && + bbr_is_next_cycle_phase(sk, rs)) + bbr_advance_cycle_phase(sk); +} + +static void bbr_reset_startup_mode(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->mode = BBR_STARTUP; + bbr->pacing_gain = bbr_high_gain; + bbr->cwnd_gain = bbr_high_gain; +} + +static void bbr_reset_probe_bw_mode(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->mode = BBR_PROBE_BW; + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = bbr_cwnd_gain; + bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); + bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +} + +static void bbr_reset_mode(struct sock *sk) +{ + if (!bbr_full_bw_reached(sk)) + bbr_reset_startup_mode(sk); + else + bbr_reset_probe_bw_mode(sk); +} + +/* Start a new long-term sampling interval. */ +static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; + bbr->lt_last_delivered = tp->delivered; + bbr->lt_last_lost = tp->lost; + bbr->lt_rtt_cnt = 0; +} + +/* Completely reset long-term bandwidth sampling. */ +static void bbr_reset_lt_bw_sampling(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->lt_bw = 0; + bbr->lt_use_bw = 0; + bbr->lt_is_sampling = false; + bbr_reset_lt_bw_sampling_interval(sk); +} + +/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 diff; + + if (bbr->lt_bw) { /* do we have bw from a previous interval? 
*/ + /* Is new bw close to the lt_bw from the previous interval? */ + diff = abs(bw - bbr->lt_bw); + if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || + (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= + bbr_lt_bw_diff)) { + /* All criteria are met; estimate we're policed. */ + bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ + bbr->lt_use_bw = 1; + bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ + bbr->lt_rtt_cnt = 0; + return; + } + } + bbr->lt_bw = bw; + bbr_reset_lt_bw_sampling_interval(sk); +} + +/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of + * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and + * explicitly models their policed rate, to reduce unnecessary losses. We + * estimate that we're policed if we see 2 consecutive sampling intervals with + * consistent throughput and high packet loss. If we think we're being policed, + * set lt_bw to the "long-term" average delivery rate from those 2 intervals. + */ +static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 lost, delivered; + u64 bw; + s32 t; + + if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ + if (bbr->mode == BBR_PROBE_BW && bbr->round_start && + ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { + bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ + bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ + } + return; + } + + /* Wait for the first loss before sampling, to let the policer exhaust + * its tokens and estimate the steady-state rate allowed by the policer. + * Starting samples earlier includes bursts that over-estimate the bw. + */ + if (!bbr->lt_is_sampling) { + if (!rs->losses) + return; + bbr_reset_lt_bw_sampling_interval(sk); + bbr->lt_is_sampling = true; + } + + /* To avoid underestimates, reset sampling if we run out of data. */ + if (rs->is_app_limited) { + bbr_reset_lt_bw_sampling(sk); + return; + } + + if (bbr->round_start) + bbr->lt_rtt_cnt++; /* count round trips in this interval */ + if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) + return; /* sampling interval needs to be longer */ + if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { + bbr_reset_lt_bw_sampling(sk); /* interval is too long */ + return; + } + + /* End sampling interval when a packet is lost, so we estimate the + * policer tokens were exhausted. Stopping the sampling before the + * tokens are exhausted under-estimates the policed rate. + */ + if (!rs->losses) + return; + + /* Calculate packets lost and delivered in sampling interval. */ + lost = tp->lost - bbr->lt_last_lost; + delivered = tp->delivered - bbr->lt_last_delivered; + /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ + if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) + return; + + /* Find average delivery rate in this sampling interval. */ + t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); + if (t < 1) + return; /* interval is less than one jiffy, so wait */ + t = jiffies_to_usecs(t); + /* Interval long enough for jiffies_to_usecs() to return a bogus 0? 
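+ * (t is a 32-bit value at this point; at HZ=1000 an interval
+ * beyond roughly 4295 seconds overflows the microsecond
+ * conversion, so a wrapped result can land below 1 and is
+ * treated as "interval too long" rather than used as a divisor.)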
*/ + if (t < 1) { + bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ + return; + } + bw = (u64)delivered * BW_UNIT; + do_div(bw, t); + bbr_lt_bw_interval_done(sk, bw); +} + +/* Estimate the bandwidth based on how fast packets are delivered */ +static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->round_start = 0; + if (rs->delivered < 0 || rs->interval_us <= 0) + return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ + if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { + bbr->next_rtt_delivered = tp->delivered; + bbr->rtt_cnt++; + bbr->round_start = 1; + bbr->packet_conservation = 0; + } + + bbr_lt_bw_sampling(sk, rs); + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. + */ + bw = (u64)rs->delivered * BW_UNIT; + do_div(bw, rs->interval_us); + + /* If this sample is application-limited, it is likely to have a very + * low delivered count that represents application behavior rather than + * the available network rate. Such a sample could drag down estimated + * bw, causing needless slow-down. Thus, to continue to send at the + * last measured network rate, we filter out app-limited samples unless + * they describe the path bw at least as well as our bw model. + * + * So the goal during app-limited phase is to proceed with the best + * network rate no matter how long. We automatically leave this + * phase when app writes faster than the network can deliver :) + */ + if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { + /* Incorporate new sample into our max bw filter. */ + minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); + } +} + +/* Estimate when the pipe is full, using the change in delivery rate: BBR + * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by + * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited + * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the + * higher rwin, 3: we get higher delivery rate samples. Or transient + * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar + * design goal, but uses delay and inter-ACK spacing instead of bandwidth. + */ +static void bbr_check_full_bw_reached(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw_thresh; + + if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) + return; + + bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; + if (bbr_max_bw(sk) >= bw_thresh) { + bbr->full_bw = bbr_max_bw(sk); + bbr->full_bw_cnt = 0; + return; + } + ++bbr->full_bw_cnt; +} + +/* If pipe is probably full, drain the queue and then enter steady-state. 
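+ *
+ * The drain gain is chosen as roughly the inverse of the startup gain,
+ * so the queue built while probing empties in about one round trip;
+ * DRAIN ends once inflight is at or below the estimated BDP, i.e.
+ * bbr_target_cwnd() computed with a gain of 1.0.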
*/ +static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { + bbr->mode = BBR_DRAIN; /* drain queue we created */ + bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ + bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ + } /* fall through to check if in-flight is already small: */ + if (bbr->mode == BBR_DRAIN && + tcp_packets_in_flight(tcp_sk(sk)) <= + bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) + bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +} + +/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and + * periodically drain the bottleneck queue, to converge to measure the true + * min_rtt (unloaded propagation delay). This allows the flows to keep queues + * small (reducing queuing delay and packet loss) and achieve fairness among + * BBR flows. + * + * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, + * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. + * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed + * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and + * re-enter the previous mode. BBR uses 200ms to approximately bound the + * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). + * + * Note that flows need only pay 2% if they are busy sending over the last 10 + * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have + * natural silences or low-rate periods within 10 seconds where the rate is low + * enough for long enough to drain its queue in the bottleneck. We pick up + * these min RTT measurements opportunistically with our min_rtt filter. :-) + */ +static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool filter_expired; + + /* Track min RTT seen in the min_rtt_win_sec filter window: */ + filter_expired = after(tcp_time_stamp, + bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); + if (rs->rtt_us >= 0 && + (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { + bbr->min_rtt_us = rs->rtt_us; + bbr->min_rtt_stamp = tcp_time_stamp; + } + + if (bbr_probe_rtt_mode_ms > 0 && filter_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = BBR_UNIT; + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; + } + + if (bbr->mode == BBR_PROBE_RTT) { + /* Ignore low rate samples during this mode. */ + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
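+ *
+ * Sequencing below: once inflight first falls to bbr_cwnd_min_target
+ * we arm probe_rtt_done_stamp 200ms in the future; a full packet-timed
+ * round must then complete (probe_rtt_round_done) and the stamp must
+ * expire before bbr_reset_mode() returns the flow to STARTUP or
+ * PROBE_BW.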
*/ + if (!bbr->probe_rtt_done_stamp && + tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { + bbr->probe_rtt_done_stamp = tcp_time_stamp + + msecs_to_jiffies(bbr_probe_rtt_mode_ms); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { + if (bbr->round_start) + bbr->probe_rtt_round_done = 1; + if (bbr->probe_rtt_round_done && + after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { + bbr->min_rtt_stamp = tcp_time_stamp; + bbr->restore_cwnd = 1; /* snap to prior_cwnd */ + bbr_reset_mode(sk); + } + } + } + bbr->idle_restart = 0; +} + +static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) +{ + bbr_update_bw(sk, rs); + bbr_update_cycle_phase(sk, rs); + bbr_check_full_bw_reached(sk, rs); + bbr_check_drain(sk, rs); + bbr_update_min_rtt(sk, rs); +} + +static void bbr_main(struct sock *sk, const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw; + + bbr_update_model(sk, rs); + + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); + bbr_set_tso_segs_goal(sk); + bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); +} + +static void bbr_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->prior_cwnd = 0; + bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ + bbr->rtt_cnt = 0; + bbr->next_rtt_delivered = 0; + bbr->prev_ca_state = TCP_CA_Open; + bbr->packet_conservation = 0; + + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; + bbr->min_rtt_us = tcp_min_rtt(tp); + bbr->min_rtt_stamp = tcp_time_stamp; + + minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ + + /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ + bw = (u64)tp->snd_cwnd * BW_UNIT; + do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC); + sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ + bbr_set_pacing_rate(sk, bw, bbr_high_gain); + + bbr->restore_cwnd = 0; + bbr->round_start = 0; + bbr->idle_restart = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; + bbr->cycle_mstamp.v64 = 0; + bbr->cycle_idx = 0; + bbr_reset_lt_bw_sampling(sk); + bbr_reset_startup_mode(sk); +} + +static u32 bbr_sndbuf_expand(struct sock *sk) +{ + /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ + return 3; +} + +/* In theory BBR does not need to undo the cwnd since it does not + * always reduce cwnd on losses (see bbr_main()). Keep it for now. + */ +static u32 bbr_undo_cwnd(struct sock *sk) +{ + return tcp_sk(sk)->snd_cwnd; +} + +/* Entering loss recovery, so save cwnd for when we exit or undo recovery. 
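+ *
+ * The value saved by bbr_save_cwnd() is the prior_cwnd that
+ * bbr_set_cwnd_to_recover_or_restore() snaps back to when recovery
+ * ends or is undone.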
*/ +static u32 bbr_ssthresh(struct sock *sk) +{ + bbr_save_cwnd(sk); + return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ +} + +static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = bbr_bw(sk); + + bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; + memset(&info->bbr, 0, sizeof(info->bbr)); + info->bbr.bbr_bw_lo = (u32)bw; + info->bbr.bbr_bw_hi = (u32)(bw >> 32); + info->bbr.bbr_min_rtt = bbr->min_rtt_us; + info->bbr.bbr_pacing_gain = bbr->pacing_gain; + info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; + *attr = INET_DIAG_BBRINFO; + return sizeof(info->bbr); + } + return 0; +} + +static void bbr_set_state(struct sock *sk, u8 new_state) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { + struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; + bbr->full_bw = 0; + bbr->round_start = 1; /* treat RTO like end of a round */ + bbr_lt_bw_sampling(sk, &rs); + } +} + +static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { + .flags = TCP_CONG_NON_RESTRICTED, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, + .tso_segs_goal = bbr_tso_segs_goal, + .get_info = bbr_get_info, + .set_state = bbr_set_state, +}; + +static int __init bbr_register(void) +{ + BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_bbr_cong_ops); +} + +static void __exit bbr_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_bbr_cong_ops); +} + +module_init(bbr_register); +module_exit(bbr_unregister); + +MODULE_AUTHOR("Van Jacobson "); +MODULE_AUTHOR("Neal Cardwell "); +MODULE_AUTHOR("Yuchung Cheng "); +MODULE_AUTHOR("Soheil Hassas Yeganeh "); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); -- cgit v1.2.3 From 63c43787d35e45562a6b5927e2edc8f4783d95b8 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 19 Sep 2016 16:17:57 +0200 Subject: vti6: fix input path Since commit 1625f4529957, vti6 is broken: all input packets are dropped (LINUX_MIB_XFRMINNOSTATES is incremented). XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 is set by vti6_rcv() before calling xfrm6_rcv()/xfrm6_rcv_spi(), thus we cannot set that value to NULL in xfrm6_rcv_spi(). A new function, xfrm6_rcv_tnl(), is added so that a tunnel pointer can be passed down to xfrm6_rcv_spi(); xfrm6_rcv() itself is left untouched (that function is used in several handlers).
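The shape of the fix, as a minimal standalone sketch (the stub types below stand in for the real sk_buff and ip6_tnl, and the nexthdr/spi arguments are simplified; only the flow of the tunnel pointer mirrors the patch):

/* Sketch only: models how the tunnel pointer now reaches xfrm6_rcv_spi(). */
#include <stdio.h>

struct ip6_tnl { int id; };            /* stand-in for the real tunnel struct */
struct skb { struct ip6_tnl *tnl; };   /* stand-in for XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 */

static int xfrm6_rcv_spi(struct skb *skb, int nexthdr, unsigned int spi,
                         struct ip6_tnl *t)
{
	(void)nexthdr; (void)spi;
	skb->tnl = t;                      /* before the patch this was always set to NULL */
	return 0;
}

static int xfrm6_rcv_tnl(struct skb *skb, struct ip6_tnl *t)
{
	return xfrm6_rcv_spi(skb, 0, 0, t);
}

static int xfrm6_rcv(struct skb *skb)
{
	return xfrm6_rcv_tnl(skb, NULL);   /* existing callers keep NULL semantics */
}

int main(void)
{
	struct ip6_tnl t = { 1 };
	struct skb skb = { 0 };

	xfrm6_rcv_tnl(&skb, &t);           /* the vti6_rcv() path: tunnel preserved */
	printf("tunnel %s\n", skb.tnl ? "preserved" : "lost");
	return 0;
}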
CC: Alexey Kodanev Fixes: 1625f4529957 ("net/xfrm_input: fix possible NULL deref of tunnel.ip6->parms.i_key") Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 4 +++- net/ipv6/ip6_vti.c | 4 +--- net/ipv6/xfrm6_input.c | 16 +++++++++++----- net/ipv6/xfrm6_tunnel.c | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index adfebd6f243c..17934312eecb 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1540,8 +1540,10 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); void xfrm4_local_error(struct sk_buff *skb, u32 mtu); int xfrm6_extract_header(struct sk_buff *skb); int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi); +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t); int xfrm6_transport_finish(struct sk_buff *skb, int async); +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t); int xfrm6_rcv(struct sk_buff *skb); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 52a2f735881f..5bd3afdcc771 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -321,11 +321,9 @@ static int vti6_rcv(struct sk_buff *skb) goto discard; } - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; - rcu_read_unlock(); - return xfrm6_rcv(skb); + return xfrm6_rcv_tnl(skb, t); } rcu_read_unlock(); return -EINVAL; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 00a2d40677d6..b5789562aded 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -21,9 +21,10 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm6_extract_header(skb); } -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t) { - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); @@ -49,13 +50,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) return -1; } -int xfrm6_rcv(struct sk_buff *skb) +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], - 0); + 0, t); } -EXPORT_SYMBOL(xfrm6_rcv); +EXPORT_SYMBOL(xfrm6_rcv_tnl); +int xfrm6_rcv(struct sk_buff *skb) +{ + return xfrm6_rcv_tnl(skb, NULL); +} +EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 5743044cd660..e1c0bbe7996c 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -236,7 +236,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb) __be32 spi; spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi); + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, -- cgit v1.2.3 From 332ae8e2f6ecda5e50c5c62ed62894963e3a83f5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:53 +0100 Subject: net: cls_bpf: add hardware offload This patch adds hardware offload capability to cls_bpf classifier, 
similar to what has been done with U32 and flower. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ include/net/pkt_cls.h | 14 ++++++++++ net/sched/cls_bpf.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a10d8d18ce19..69f242c71865 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -789,6 +789,7 @@ enum { TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, TC_SETUP_MATCHALL, + TC_SETUP_CLSBPF, }; struct tc_cls_u32_offload; @@ -800,6 +801,7 @@ struct tc_to_netdev { struct tc_cls_u32_offload *cls_u32; struct tc_cls_flower_offload *cls_flower; struct tc_cls_matchall_offload *cls_mall; + struct tc_cls_bpf_offload *cls_bpf; }; }; diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a459be5fe1c2..41e8071dff87 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -486,4 +486,18 @@ struct tc_cls_matchall_offload { unsigned long cookie; }; +enum tc_clsbpf_command { + TC_CLSBPF_ADD, + TC_CLSBPF_REPLACE, + TC_CLSBPF_DESTROY, +}; + +struct tc_cls_bpf_offload { + enum tc_clsbpf_command command; + struct tcf_exts *exts; + struct bpf_prog *prog; + const char *name; + bool exts_integrated; +}; + #endif diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index c6f7a47541eb..6523c5b4c0a5 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -39,6 +39,7 @@ struct cls_bpf_prog { struct list_head link; struct tcf_result res; bool exts_integrated; + bool offloaded; struct tcf_exts exts; u32 handle; union { @@ -138,6 +139,71 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) return !prog->bpf_ops; } +static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, + enum tc_clsbpf_command cmd) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_bpf_offload bpf_offload = {}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSBPF; + offload.cls_bpf = &bpf_offload; + + bpf_offload.command = cmd; + bpf_offload.exts = &prog->exts; + bpf_offload.prog = prog->filter; + bpf_offload.name = prog->bpf_name; + bpf_offload.exts_integrated = prog->exts_integrated; + + return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); +} + +static void cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, + struct cls_bpf_prog *oldprog) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct cls_bpf_prog *obj = prog; + enum tc_clsbpf_command cmd; + + if (oldprog && oldprog->offloaded) { + if (tc_should_offload(dev, tp, 0)) { + cmd = TC_CLSBPF_REPLACE; + } else { + obj = oldprog; + cmd = TC_CLSBPF_DESTROY; + } + } else { + if (!tc_should_offload(dev, tp, 0)) + return; + cmd = TC_CLSBPF_ADD; + } + + if (cls_bpf_offload_cmd(tp, obj, cmd)) + return; + + obj->offloaded = true; + if (oldprog) + oldprog->offloaded = false; +} + +static void cls_bpf_stop_offload(struct tcf_proto *tp, + struct cls_bpf_prog *prog) +{ + int err; + + if (!prog->offloaded) + return; + + err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); + if (err) { + pr_err("Stopping hardware offload failed: %d\n", err); + return; + } + + prog->offloaded = false; +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -177,6 +243,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg; + cls_bpf_stop_offload(tp, prog);
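+ /* Offload is torn down before the filter is unlinked and freed,
+ * so the device is never left acting on a stale program.
+ */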
list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); @@ -193,6 +260,7 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force) return false; list_for_each_entry_safe(prog, tmp, &head->plist, link) { + cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); @@ -415,6 +483,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (ret < 0) goto errout; + cls_bpf_offload(tp, prog, oldprog); + if (oldprog) { list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); -- cgit v1.2.3 From 0d01d45f1b251448590c710baa32f722e43c62c7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:54 +0100 Subject: net: cls_bpf: limit hardware offload by software-only flag Add cls_bpf support for the TCA_CLS_FLAGS_SKIP_HW flag. Unlike U32 and flower cls_bpf already has some netlink flags defined. Create a new attribute to be able to use the same flag values as the above. Unlike U32 and flower reject unknown flags. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + include/uapi/linux/pkt_cls.h | 1 + net/sched/cls_bpf.c | 22 ++++++++++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 41e8071dff87..57af9f3032ff 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -498,6 +498,7 @@ struct tc_cls_bpf_offload { struct bpf_prog *prog; const char *name; bool exts_integrated; + u32 gen_flags; }; #endif diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 8915b61bbf83..8fd715f806a2 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -396,6 +396,7 @@ enum { TCA_BPF_FD, TCA_BPF_NAME, TCA_BPF_FLAGS, + TCA_BPF_FLAGS_GEN, __TCA_BPF_MAX, }; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6523c5b4c0a5..ebf01f7c1470 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -27,6 +27,8 @@ MODULE_AUTHOR("Daniel Borkmann "); MODULE_DESCRIPTION("TC BPF based classifier"); #define CLS_BPF_NAME_LEN 256 +#define CLS_BPF_SUPPORTED_GEN_FLAGS \ + TCA_CLS_FLAGS_SKIP_HW struct cls_bpf_head { struct list_head plist; @@ -40,6 +42,7 @@ struct cls_bpf_prog { struct tcf_result res; bool exts_integrated; bool offloaded; + u32 gen_flags; struct tcf_exts exts; u32 handle; union { @@ -55,6 +58,7 @@ struct cls_bpf_prog { static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, [TCA_BPF_FLAGS] = { .type = NLA_U32 }, + [TCA_BPF_FLAGS_GEN] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 }, [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, @@ -154,6 +158,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, bpf_offload.prog = prog->filter; bpf_offload.name = prog->bpf_name; bpf_offload.exts_integrated = prog->exts_integrated; + bpf_offload.gen_flags = prog->gen_flags; return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &offload); @@ -167,14 +172,14 @@ static void cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, enum tc_clsbpf_command cmd; if (oldprog && oldprog->offloaded) { - if (tc_should_offload(dev, tp, 0)) { + if (tc_should_offload(dev, tp, prog->gen_flags)) { cmd = TC_CLSBPF_REPLACE; } else { obj = oldprog; cmd = TC_CLSBPF_DESTROY; } } else { - if 
(!tc_should_offload(dev, tp, 0)) + if (!tc_should_offload(dev, tp, prog->gen_flags)) return; cmd = TC_CLSBPF_ADD; } @@ -370,6 +375,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, { bool is_bpf, is_ebpf, have_exts = false; struct tcf_exts exts; + u32 gen_flags = 0; int ret; is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; @@ -394,8 +400,17 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; } + if (tb[TCA_BPF_FLAGS_GEN]) { + gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]); + if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS || + !tc_flags_valid(gen_flags)) { + ret = -EINVAL; + goto errout; + } + } prog->exts_integrated = have_exts; + prog->gen_flags = gen_flags; ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : cls_bpf_prog_from_efd(tb, prog, tp); @@ -568,6 +583,9 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags)) goto nla_put_failure; + if (prog->gen_flags && + nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags)) + goto nla_put_failure; nla_nest_end(skb, nest); -- cgit v1.2.3 From 58e2af8b3a6b587e4ac8414343581da4349d3c0f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:57 +0100 Subject: bpf: expose internal verifier structures Move verifier's internal structures to a header file and prefix their names with bpf_ to avoid potential namespace conflicts. Those structures will soon be used by external analyzers. Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 79 +++++++++++++ kernel/bpf/verifier.c | 266 +++++++++++++++++-------------------------- 2 files changed, 182 insertions(+), 163 deletions(-) create mode 100644 include/linux/bpf_verifier.h (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h new file mode 100644 index 000000000000..9457a22fc6e0 --- /dev/null +++ b/include/linux/bpf_verifier.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation.
+ */ +#ifndef _LINUX_BPF_VERIFIER_H +#define _LINUX_BPF_VERIFIER_H 1 + +#include /* for enum bpf_reg_type */ +#include /* for MAX_BPF_STACK */ + +struct bpf_reg_state { + enum bpf_reg_type type; + union { + /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ + s64 imm; + + /* valid when type == PTR_TO_PACKET* */ + struct { + u32 id; + u16 off; + u16 range; + }; + + /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | + * PTR_TO_MAP_VALUE_OR_NULL + */ + struct bpf_map *map_ptr; + }; +}; + +enum bpf_stack_slot_type { + STACK_INVALID, /* nothing was stored in this stack slot */ + STACK_SPILL, /* register spilled into stack */ + STACK_MISC /* BPF program wrote some data into this slot */ +}; + +#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ + +/* state of the program: + * type of all registers and stack info + */ +struct bpf_verifier_state { + struct bpf_reg_state regs[MAX_BPF_REG]; + u8 stack_slot_type[MAX_BPF_STACK]; + struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; +}; + +/* linked list of verifier states used to prune search */ +struct bpf_verifier_state_list { + struct bpf_verifier_state state; + struct bpf_verifier_state_list *next; +}; + +struct bpf_insn_aux_data { + enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ +}; + +#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ + +/* single container for all structs + * one verifier_env per bpf_check() call + */ +struct bpf_verifier_env { + struct bpf_prog *prog; /* eBPF program being verified */ + struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ + int stack_size; /* number of states to be processed */ + struct bpf_verifier_state cur_state; /* current verifier state */ + struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ + struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + u32 used_map_cnt; /* number of used maps */ + u32 id_gen; /* used to generate unique reg IDs */ + bool allow_ptr_leaks; + bool seen_direct_write; + struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ +}; + +#endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a9542d89f293..dca2b9b1d02e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -126,82 +127,16 @@ * are set to NOT_INIT to indicate that they are no longer readable. 
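 * (For example: after a call to the map lookup helper, R0 holds
 * PTR_TO_MAP_VALUE_OR_NULL, caller-saved R1-R5 become NOT_INIT, and
 * callee-saved R6-R9 keep whatever types they held before the call.)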
*/ -struct reg_state { - enum bpf_reg_type type; - union { - /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ - s64 imm; - - /* valid when type == PTR_TO_PACKET* */ - struct { - u32 id; - u16 off; - u16 range; - }; - - /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | - * PTR_TO_MAP_VALUE_OR_NULL - */ - struct bpf_map *map_ptr; - }; -}; - -enum bpf_stack_slot_type { - STACK_INVALID, /* nothing was stored in this stack slot */ - STACK_SPILL, /* register spilled into stack */ - STACK_MISC /* BPF program wrote some data into this slot */ -}; - -#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ - -/* state of the program: - * type of all registers and stack info - */ -struct verifier_state { - struct reg_state regs[MAX_BPF_REG]; - u8 stack_slot_type[MAX_BPF_STACK]; - struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; -}; - -/* linked list of verifier states used to prune search */ -struct verifier_state_list { - struct verifier_state state; - struct verifier_state_list *next; -}; - /* verifier_state + insn_idx are pushed to stack when branch is encountered */ -struct verifier_stack_elem { +struct bpf_verifier_stack_elem { /* verifer state is 'st' * before processing instruction 'insn_idx' * and after processing instruction 'prev_insn_idx' */ - struct verifier_state st; + struct bpf_verifier_state st; int insn_idx; int prev_insn_idx; - struct verifier_stack_elem *next; -}; - -struct bpf_insn_aux_data { - enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ -}; - -#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ - -/* single container for all structs - * one verifier_env per bpf_check() call - */ -struct verifier_env { - struct bpf_prog *prog; /* eBPF program being verified */ - struct verifier_stack_elem *head; /* stack of verifier states to be processed */ - int stack_size; /* number of states to be processed */ - struct verifier_state cur_state; /* current verifier state */ - struct verifier_state_list **explored_states; /* search pruning optimization */ - struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ - u32 used_map_cnt; /* number of used maps */ - u32 id_gen; /* used to generate unique reg IDs */ - bool allow_ptr_leaks; - bool seen_direct_write; - struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ + struct bpf_verifier_stack_elem *next; }; #define BPF_COMPLEXITY_LIMIT_INSNS 65536 @@ -254,9 +189,9 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; -static void print_verifier_state(struct verifier_state *state) +static void print_verifier_state(struct bpf_verifier_state *state) { - struct reg_state *reg; + struct bpf_reg_state *reg; enum bpf_reg_type t; int i; @@ -432,9 +367,9 @@ static void print_bpf_insn(struct bpf_insn *insn) } } -static int pop_stack(struct verifier_env *env, int *prev_insn_idx) +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) { - struct verifier_stack_elem *elem; + struct bpf_verifier_stack_elem *elem; int insn_idx; if (env->head == NULL) @@ -451,12 +386,12 @@ static int pop_stack(struct verifier_env *env, int *prev_insn_idx) return insn_idx; } -static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, - int prev_insn_idx) +static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) { - struct verifier_stack_elem *elem; + struct bpf_verifier_stack_elem *elem; - elem = kmalloc(sizeof(struct 
verifier_stack_elem), GFP_KERNEL); + elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; @@ -482,7 +417,7 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void init_reg_state(struct reg_state *regs) +static void init_reg_state(struct bpf_reg_state *regs) { int i; @@ -498,7 +433,7 @@ static void init_reg_state(struct reg_state *regs) regs[BPF_REG_1].type = PTR_TO_CTX; } -static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) +static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; @@ -511,7 +446,7 @@ enum reg_arg_type { DST_OP_NO_MARK /* same as above, check only, don't mark */ }; -static int check_reg_arg(struct reg_state *regs, u32 regno, +static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, enum reg_arg_type t) { if (regno >= MAX_BPF_REG) { @@ -571,8 +506,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct verifier_state *state, int off, int size, - int value_regno) +static int check_stack_write(struct bpf_verifier_state *state, int off, + int size, int value_regno) { int i; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -597,7 +532,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, } else { /* regular write of data into stack */ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = - (struct reg_state) {}; + (struct bpf_reg_state) {}; for (i = 0; i < size; i++) state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; @@ -605,7 +540,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, return 0; } -static int check_stack_read(struct verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; @@ -646,7 +581,7 @@ static int check_stack_read(struct verifier_state *state, int off, int size, } /* check read/write into map element returned by bpf_map_lookup_elem() */ -static int check_map_access(struct verifier_env *env, u32 regno, int off, +static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; @@ -661,7 +596,7 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff -static bool may_access_direct_pkt_data(struct verifier_env *env, +static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta) { switch (env->prog->type) { @@ -678,11 +613,11 @@ static bool may_access_direct_pkt_data(struct verifier_env *env, } } -static int check_packet_access(struct verifier_env *env, u32 regno, int off, +static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *reg = ®s[regno]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *reg = ®s[regno]; off += reg->off; if (off < 0 || size <= 0 || off + size > reg->range) { @@ -694,7 +629,7 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, } /* check access to 'struct bpf_context' fields */ -static int check_ctx_access(struct 
verifier_env *env, int off, int size, +static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { if (env->prog->aux->ops->is_valid_access && @@ -709,7 +644,7 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, return -EACCES; } -static bool is_pointer_value(struct verifier_env *env, int regno) +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { if (env->allow_ptr_leaks) return false; @@ -723,12 +658,13 @@ static bool is_pointer_value(struct verifier_env *env, int regno) } } -static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, - int off, int size) +static int check_ptr_alignment(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int off, int size) { if (reg->type != PTR_TO_PACKET) { if (off % size != 0) { - verbose("misaligned access off %d size %d\n", off, size); + verbose("misaligned access off %d size %d\n", + off, size); return -EACCES; } else { return 0; @@ -769,12 +705,12 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct verifier_env *env, u32 regno, int off, +static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno) { - struct verifier_state *state = &env->cur_state; - struct reg_state *reg = &state->regs[regno]; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *reg = &state->regs[regno]; int size, err = 0; if (reg->type == PTR_TO_STACK) @@ -860,9 +796,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, return err; } -static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) +static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; int err; if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || @@ -896,12 +832,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized */ -static int check_stack_boundary(struct verifier_env *env, int regno, +static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct verifier_state *state = &env->cur_state; - struct reg_state *regs = state->regs; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *regs = state->regs; int off, i; if (regs[regno].type != PTR_TO_STACK) { @@ -940,11 +876,11 @@ static int check_stack_boundary(struct verifier_env *env, int regno, return 0; } -static int check_func_arg(struct verifier_env *env, u32 regno, +static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; int err = 0; @@ -1149,10 +1085,10 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? 
-EINVAL : 0; } -static void clear_all_pkt_pointers(struct verifier_env *env) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct verifier_state *state = &env->cur_state; - struct reg_state *regs = state->regs, *reg; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *regs = state->regs, *reg; int i; for (i = 0; i < MAX_BPF_REG; i++) @@ -1172,12 +1108,12 @@ static void clear_all_pkt_pointers(struct verifier_env *env) } } -static int check_call(struct verifier_env *env, int func_id) +static int check_call(struct bpf_verifier_env *env, int func_id) { - struct verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; - struct reg_state *regs = state->regs; - struct reg_state *reg; + struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1280,12 +1216,13 @@ static int check_call(struct verifier_env *env, int func_id) return 0; } -static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) +static int check_packet_ptr_add(struct bpf_verifier_env *env, + struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; - struct reg_state *src_reg = ®s[insn->src_reg]; - struct reg_state tmp_reg; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state tmp_reg; s32 imm; if (BPF_SRC(insn->code) == BPF_K) { @@ -1353,10 +1290,10 @@ add_imm: return 0; } -static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) +static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; u8 opcode = BPF_OP(insn->code); s64 imm_log2; @@ -1366,7 +1303,7 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) */ if (BPF_SRC(insn->code) == BPF_X) { - struct reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && dst_reg->imm && opcode == BPF_ADD) { @@ -1455,11 +1392,12 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) return 0; } -static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) +static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; - struct reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; u8 opcode = BPF_OP(insn->code); /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. 
@@ -1476,9 +1414,9 @@ static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) } /* check validity of 32-bit and 64-bit arithmetic operations */ -static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) +static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1652,10 +1590,10 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return 0; } -static void find_good_pkt_pointers(struct verifier_state *state, - const struct reg_state *dst_reg) +static void find_good_pkt_pointers(struct bpf_verifier_state *state, + struct bpf_reg_state *dst_reg) { - struct reg_state *regs = state->regs, *reg; + struct bpf_reg_state *regs = state->regs, *reg; int i; /* LLVM can generate two kind of checks: @@ -1701,11 +1639,11 @@ static void find_good_pkt_pointers(struct verifier_state *state, } } -static int check_cond_jmp_op(struct verifier_env *env, +static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct verifier_state *other_branch, *this_branch = &env->cur_state; - struct reg_state *regs = this_branch->regs, *dst_reg; + struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; + struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1767,7 +1705,7 @@ static int check_cond_jmp_op(struct verifier_env *env, if (!other_branch) return -EFAULT; - /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ + /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { @@ -1809,9 +1747,9 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) } /* verify BPF_LD_IMM64 instruction */ -static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) +static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -1866,11 +1804,11 @@ static bool may_access_skb(enum bpf_prog_type type) * Output: * R0 - 8/16/32-bit skb data converted to cpu endianness */ -static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) +static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; u8 mode = BPF_MODE(insn->code); - struct reg_state *reg; + struct bpf_reg_state *reg; int i, err; if (!may_access_skb(env->prog->type)) { @@ -1956,7 +1894,7 @@ enum { BRANCH = 2, }; -#define STATE_LIST_MARK ((struct verifier_state_list *) -1L) +#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) static int *insn_stack; /* stack of insns to process */ static int cur_stack; /* current stack index */ @@ -1967,7 +1905,7 @@ static int *insn_state; * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct verifier_env *env) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return 0; @@ -2008,7 +1946,7 @@ static int push_insn(int t, int w, int e, struct verifier_env *env) /* non-recursive 
depth-first-search to detect loops in BPF program * loop == back-edge in directed graph */ -static int check_cfg(struct verifier_env *env) +static int check_cfg(struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2117,7 +2055,8 @@ err_free: /* the following conditions reduce the number of explored insns * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet */ -static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) +static bool compare_ptrs_to_packet(struct bpf_reg_state *old, + struct bpf_reg_state *cur) { if (old->id != cur->id) return false; @@ -2192,9 +2131,10 @@ static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool states_equal(struct verifier_state *old, struct verifier_state *cur) +static bool states_equal(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) { - struct reg_state *rold, *rcur; + struct bpf_reg_state *rold, *rcur; int i; for (i = 0; i < MAX_BPF_REG; i++) { @@ -2234,9 +2174,9 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) * the same, check that stored pointers types * are the same as well. * Ex: explored safe path could have stored - * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} + * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8} * but current path has stored: - * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} + * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16} * such verifier states are not equivalent. * return false to continue verification of this path */ @@ -2247,10 +2187,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) return true; } -static int is_state_visited(struct verifier_env *env, int insn_idx) +static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { - struct verifier_state_list *new_sl; - struct verifier_state_list *sl; + struct bpf_verifier_state_list *new_sl; + struct bpf_verifier_state_list *sl; sl = env->explored_states[insn_idx]; if (!sl) @@ -2274,7 +2214,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) * it will be rejected. 
Since there are no loops, we won't be * seeing this 'insn_idx' instruction again on the way to bpf_exit */ - new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); + new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); if (!new_sl) return -ENOMEM; @@ -2285,11 +2225,11 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) return 0; } -static int do_check(struct verifier_env *env) +static int do_check(struct bpf_verifier_env *env) { - struct verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = &env->cur_state; struct bpf_insn *insns = env->prog->insnsi; - struct reg_state *regs = state->regs; + struct bpf_reg_state *regs = state->regs; int insn_cnt = env->prog->len; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; @@ -2572,7 +2512,7 @@ static int check_map_prog_compatibility(struct bpf_map *map, /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ -static int replace_map_fd_with_map_ptr(struct verifier_env *env) +static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2669,7 +2609,7 @@ next_insn: } /* drop refcnt of maps used by the rejected program */ -static void release_maps(struct verifier_env *env) +static void release_maps(struct bpf_verifier_env *env) { int i; @@ -2678,7 +2618,7 @@ static void release_maps(struct verifier_env *env) } /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ -static void convert_pseudo_ld_imm64(struct verifier_env *env) +static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2692,7 +2632,7 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ -static int convert_ctx_accesses(struct verifier_env *env) +static int convert_ctx_accesses(struct bpf_verifier_env *env) { const struct bpf_verifier_ops *ops = env->prog->aux->ops; const int insn_cnt = env->prog->len; @@ -2757,9 +2697,9 @@ static int convert_ctx_accesses(struct verifier_env *env) return 0; } -static void free_states(struct verifier_env *env) +static void free_states(struct bpf_verifier_env *env) { - struct verifier_state_list *sl, *sln; + struct bpf_verifier_state_list *sl, *sln; int i; if (!env->explored_states) @@ -2782,16 +2722,16 @@ static void free_states(struct verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; - struct verifier_env *env; + struct bpf_verifier_env *env; int ret = -EINVAL; if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; - /* 'struct verifier_env' can be global, but since it's not small, + /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -2833,7 +2773,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) goto skip_full_check; env->explored_states = kcalloc(env->prog->len, - sizeof(struct verifier_state_list *), + sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; if (!env->explored_states) -- cgit v1.2.3 From 13a27dfc669724564aafa2699976ee756029fed2 Mon Sep 17 00:00:00 2001 From: Jakub 
Kicinski Date: Wed, 21 Sep 2016 11:43:58 +0100 Subject: bpf: enable non-core use of the verifier Advanced JIT compilers and translators may want to use the eBPF verifier as a base for parsers or to perform custom checks and validations. Add the ability for external users to invoke the verifier and provide callbacks to be invoked for every instruction checked. For now, only the most basic callback, for per-instruction pre-interpretation checks, is added. More advanced users may also like to have a per-instruction post callback and a state comparison callback. Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 11 +++++++ kernel/bpf/verifier.c | 68 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9457a22fc6e0..c5cb661712c9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -59,6 +59,12 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +struct bpf_verifier_env; +struct bpf_ext_analyzer_ops { + int (*insn_hook)(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx); +}; + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -68,6 +74,8 @@ struct bpf_verifier_env { int stack_size; /* number of states to be processed */ struct bpf_verifier_state cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ + const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ + void *analyzer_priv; /* pointer to external analyzer's private data */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ @@ -76,4 +84,7 @@ struct bpf_verifier_env { struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ }; +int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, + void *priv); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dca2b9b1d02e..ee86a77dc40b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -632,6 +632,10 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { + /* for analyzer ctx accesses are already validated and converted */ + if (env->analyzer_ops) + return 0; + if (env->prog->aux->ops->is_valid_access && env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { /* remember the offset of last byte accessed in ctx */ @@ -2225,6 +2229,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; } +static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) +{ + if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) + return 0; + + return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); +} + static int do_check(struct bpf_verifier_env *env) { struct bpf_verifier_state *state = &env->cur_state; @@ -2283,6 +2296,10 @@ static int do_check(struct bpf_verifier_env *env) print_bpf_insn(insn); } + err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); + if (err) + return err; + if (class == BPF_ALU || class == BPF_ALU64) { err =
check_alu_op(env, insn); if (err) @@ -2845,3 +2862,54 @@ err_free_env: kfree(env); return ret; } + +int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, + void *priv) +{ + struct bpf_verifier_env *env; + int ret; + + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * + prog->len); + ret = -ENOMEM; + if (!env->insn_aux_data) + goto err_free_env; + env->prog = prog; + env->analyzer_ops = ops; + env->analyzer_priv = priv; + + /* grab the mutex to protect few globals used by verifier */ + mutex_lock(&bpf_verifier_lock); + + log_level = 0; + + env->explored_states = kcalloc(env->prog->len, + sizeof(struct bpf_verifier_state_list *), + GFP_KERNEL); + ret = -ENOMEM; + if (!env->explored_states) + goto skip_full_check; + + ret = check_cfg(env); + if (ret < 0) + goto skip_full_check; + + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + + ret = do_check(env); + +skip_full_check: + while (pop_stack(env, NULL) >= 0); + free_states(env); + + mutex_unlock(&bpf_verifier_lock); + vfree(env->insn_aux_data); +err_free_env: + kfree(env); + return ret; +} +EXPORT_SYMBOL_GPL(bpf_analyzer); -- cgit v1.2.3 From 68d640630d4ef2a4bf3f68b5073dec5e4c4f878b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:44:02 +0100 Subject: net: cls_bpf: allow offloaded filters to update stats Call into offloaded filters to update stats. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + net/sched/cls_bpf.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 57af9f3032ff..5ccaa4be7d96 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -490,6 +490,7 @@ enum tc_clsbpf_command { TC_CLSBPF_ADD, TC_CLSBPF_REPLACE, TC_CLSBPF_DESTROY, + TC_CLSBPF_STATS, }; struct tc_cls_bpf_offload { diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1becc2fe1bc5..bb1d5a487081 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -221,6 +221,15 @@ static void cls_bpf_stop_offload(struct tcf_proto *tp, prog->offloaded = false; } +static void cls_bpf_offload_update_stats(struct tcf_proto *tp, + struct cls_bpf_prog *prog) +{ + if (!prog->offloaded) + return; + + cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS); +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -577,6 +586,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, tm->tcm_handle = prog->handle; + cls_bpf_offload_update_stats(tp, prog); + nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; -- cgit v1.2.3 From cf1a6474f80735ff4a5d99f3dd68a94dbec8455f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:41:53 +0100 Subject: rxrpc: Add per-peer RTT tracker Add a function to track the average RTT for a peer. Sources of RTT data will be added in subsequent patches. The RTT data will be useful in the future for determining resend timeouts and for handling the slow-start part of the Rx protocol. Also add a pair of tracepoints, one to log transmissions to elicit a response for RTT purposes and one to log responses that contribute RTT data. 
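The cache update itself is O(1): a fixed ring of RXRPC_RTT_CACHE_SIZE samples plus a running sum, so no rescan is needed to recompute the average. A standalone model of the arithmetic in rxrpc_peer_add_rtt() (field and constant names follow the patch; the surrounding scaffolding is illustrative, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define RXRPC_RTT_CACHE_SIZE 32	/* power of two, so the cursor wraps with a mask */

struct peer_rtt {
	uint64_t rtt;				/* current estimate (ns) */
	uint64_t rtt_sum;			/* sum of cached samples */
	uint64_t rtt_cache[RXRPC_RTT_CACHE_SIZE];
	uint8_t rtt_cursor;			/* next slot to overwrite */
	uint8_t rtt_usage;			/* slots filled so far */
};

static void peer_add_rtt(struct peer_rtt *p, int64_t rtt_ns)
{
	if (rtt_ns < 0)
		return;				/* response timestamped before request: discard */

	/* Replace the oldest sample, keeping the sum incremental. */
	p->rtt_sum -= p->rtt_cache[p->rtt_cursor];
	p->rtt_sum += (uint64_t)rtt_ns;
	p->rtt_cache[p->rtt_cursor] = (uint64_t)rtt_ns;
	p->rtt_cursor = (p->rtt_cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1);
	if (p->rtt_usage < RXRPC_RTT_CACHE_SIZE)
		p->rtt_usage++;

	p->rtt = p->rtt_sum / p->rtt_usage;	/* average over however many samples exist */
}

int main(void)
{
	struct peer_rtt p = { 0 };
	const int64_t samples[] = { 1000000, 1200000, 800000, 1100000 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		peer_add_rtt(&p, samples[i]);
	printf("avg rtt = %llu ns\n", (unsigned long long)p.rtt);	/* 1025000 */
	return 0;
}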
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 61 ++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 25 +++++++++++++++--- net/rxrpc/misc.c | 8 ++++++ net/rxrpc/peer_event.c | 41 +++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 75a5d8bf50e1..e8f2afbbe0bf 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -353,6 +353,67 @@ TRACE_EVENT(rxrpc_recvmsg, __entry->ret) ); +TRACE_EVENT(rxrpc_rtt_tx, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_tx_trace why, + rxrpc_serial_t send_serial), + + TP_ARGS(call, why, send_serial), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_rtt_tx_trace, why ) + __field(rxrpc_serial_t, send_serial ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->send_serial = send_serial; + ), + + TP_printk("c=%p %s sr=%08x", + __entry->call, + rxrpc_rtt_tx_traces[__entry->why], + __entry->send_serial) + ); + +TRACE_EVENT(rxrpc_rtt_rx, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + s64 rtt, u8 nr, s64 avg), + + TP_ARGS(call, why, send_serial, resp_serial, rtt, nr, avg), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_rtt_rx_trace, why ) + __field(u8, nr ) + __field(rxrpc_serial_t, send_serial ) + __field(rxrpc_serial_t, resp_serial ) + __field(s64, rtt ) + __field(u64, avg ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->send_serial = send_serial; + __entry->resp_serial = resp_serial; + __entry->rtt = rtt; + __entry->nr = nr; + __entry->avg = avg; + ), + + TP_printk("c=%p %s sr=%08x rr=%08x rtt=%lld nr=%u avg=%lld", + __entry->call, + rxrpc_rtt_rx_traces[__entry->why], + __entry->send_serial, + __entry->resp_serial, + __entry->rtt, + __entry->nr, + __entry->avg) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index dcf54e3fb478..79c671e552c3 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -258,10 +258,11 @@ struct rxrpc_peer { /* calculated RTT cache */ #define RXRPC_RTT_CACHE_SIZE 32 - suseconds_t rtt; /* current RTT estimate (in uS) */ - unsigned int rtt_point; /* next entry at which to insert */ - unsigned int rtt_usage; /* amount of cache actually used */ - suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */ + u64 rtt; /* Current RTT estimate (in nS) */ + u64 rtt_sum; /* Sum of cache contents */ + u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */ + u8 rtt_cursor; /* next entry at which to insert */ + u8 rtt_usage; /* amount of cache actually used */ }; /* @@ -657,6 +658,20 @@ enum rxrpc_recvmsg_trace { extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; +enum rxrpc_rtt_tx_trace { + rxrpc_rtt_tx_ping, + rxrpc_rtt_tx__nr_trace +}; + +extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5]; + +enum rxrpc_rtt_rx_trace { + rxrpc_rtt_rx_ping_response, + rxrpc_rtt_rx__nr_trace +}; + +extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); @@ -955,6 +970,8 @@ void rxrpc_reject_packets(struct rxrpc_local *); */ void rxrpc_error_report(struct sock *); void rxrpc_peer_error_distributor(struct work_struct *); +void 
rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, + rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); /* * peer_object.c diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 026e1f2e83ff..6321c23f9a6e 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -182,3 +182,11 @@ const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = { [rxrpc_recvmsg_to_be_accepted] = "TBAC", [rxrpc_recvmsg_return] = "RETN", }; + +const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = { + [rxrpc_rtt_tx_ping] = "PING", +}; + +const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { + [rxrpc_rtt_rx_ping_response] = "PONG", +}; diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 18276e7cb9e0..bf13b8470c9a 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -305,3 +305,44 @@ void rxrpc_peer_error_distributor(struct work_struct *work) rxrpc_put_peer(peer); _leave(""); } + +/* + * Add RTT information to cache. This is called in softirq mode and has + * exclusive access to the peer RTT data. + */ +void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + ktime_t send_time, ktime_t resp_time) +{ + struct rxrpc_peer *peer = call->peer; + s64 rtt; + u64 sum = peer->rtt_sum, avg; + u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage; + + rtt = ktime_to_ns(ktime_sub(resp_time, send_time)); + if (rtt < 0) + return; + + /* Replace the oldest datum in the RTT buffer */ + sum -= peer->rtt_cache[cursor]; + sum += rtt; + peer->rtt_cache[cursor] = rtt; + peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1); + peer->rtt_sum = sum; + if (usage < RXRPC_RTT_CACHE_SIZE) { + usage++; + peer->rtt_usage = usage; + } + + /* Now recalculate the average */ + if (usage == RXRPC_RTT_CACHE_SIZE) { + avg = sum / RXRPC_RTT_CACHE_SIZE; + } else { + avg = sum; + do_div(avg, usage); + } + + peer->rtt = avg; + trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt, + usage, avg); +} -- cgit v1.2.3 From bfca4c520f7ea78138ddccea2de18dc062b0fefd Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:09 +0300 Subject: net: skbuff: Export __skb_vlan_pop This exports the functionality of extracting the tag from the payload, without moving next vlan tag into hw accel tag. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c6dab3f7457c..9bf60b556bd2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3085,6 +3085,7 @@ bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, int write_len); +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7bf82a28e10a..6c22351bd519 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4522,8 +4522,10 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len) } EXPORT_SYMBOL(skb_ensure_writable); -/* remove VLAN header from packet and update csum accordingly. 
*/ -static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +/* remove VLAN header from packet and update csum accordingly. + * expects a non skb_vlan_tag_present skb with a vlan tag payload + */ +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr; unsigned int offset = skb->data - skb_mac_header(skb); @@ -4554,6 +4556,7 @@ pull: return err; } +EXPORT_SYMBOL(__skb_vlan_pop); int skb_vlan_pop(struct sk_buff *skb) { -- cgit v1.2.3 From 45a497f2d149a4a8061c61518a79d59f1f3034b2 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:10 +0300 Subject: net/sched: act_vlan: Introduce TCA_VLAN_ACT_MODIFY vlan action TCA_VLAN_ACT_MODIFY allows one to change an existing tag. It accepts same attributes as TCA_VLAN_ACT_PUSH (protocol, id, priority). If packet is vlan tagged, then the tag gets overwritten according to user specified attributes. For example, this allows user to replace a tag's vid while preserving its priority bits (as opposed to "action vlan pop pipe action vlan push"). Signed-off-by: Shmulik Ladkani Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_vlan.h | 1 + net/sched/act_vlan.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h index be72b6e3843b..bddb272b843f 100644 --- a/include/uapi/linux/tc_act/tc_vlan.h +++ b/include/uapi/linux/tc_act/tc_vlan.h @@ -16,6 +16,7 @@ #define TCA_VLAN_ACT_POP 1 #define TCA_VLAN_ACT_PUSH 2 +#define TCA_VLAN_ACT_MODIFY 3 struct tc_vlan { tc_gen; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 59a8d3150ae2..a95c00b119da 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -30,6 +30,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_vlan *v = to_vlan(a); int action; int err; + u16 tci; spin_lock(&v->tcf_lock); tcf_lastuse_update(&v->tcf_tm); @@ -48,6 +49,30 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, if (err) goto drop; break; + case TCA_VLAN_ACT_MODIFY: + /* No-op if no vlan tag (either hw-accel or in-payload) */ + if (!skb_vlan_tagged(skb)) + goto unlock; + /* extract existing tag (and guarantee no hw-accel tag) */ + if (skb_vlan_tag_present(skb)) { + tci = skb_vlan_tag_get(skb); + skb->vlan_tci = 0; + } else { + /* in-payload vlan tag, pop it */ + err = __skb_vlan_pop(skb, &tci); + if (err) + goto drop; + } + /* replace the vid */ + tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid; + /* replace prio bits, if tcfv_push_prio specified */ + if (v->tcfv_push_prio) { + tci &= ~VLAN_PRIO_MASK; + tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT; + } + /* put updated tci as hwaccel tag */ + __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci); + break; default: BUG(); } @@ -102,6 +127,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, case TCA_VLAN_ACT_POP: break; case TCA_VLAN_ACT_PUSH: + case TCA_VLAN_ACT_MODIFY: if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) tcf_hash_release(*a, bind); @@ -185,7 +211,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if (v->tcfv_action == TCA_VLAN_ACT_PUSH && + if ((v->tcfv_action == TCA_VLAN_ACT_PUSH || + v->tcfv_action == TCA_VLAN_ACT_MODIFY) && (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto) || -- cgit v1.2.3 From 
efee95f42b5dddedcaff0a0eaa44e170fc7522e8 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 20 Sep 2016 19:25:58 -0400 Subject: ptp_clock: future-proofing drivers against PTP subsystem becoming optional Drivers must be ready to accept NULL from ptp_clock_register() if the PTP clock subsystem is configured out. This patch documents that and ensures that all drivers cope well with a NULL return. Signed-off-by: Nicolas Pitre Reviewed-by: Eugenia Emantayev Acked-by: Richard Cochran Acked-by: Edward Cree Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/e1000e/ptp.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_ptp.c | 2 +- drivers/net/ethernet/intel/igb/igb_ptp.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_clock.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_clock.c | 2 +- drivers/net/ethernet/sfc/ptp.c | 14 +++++++------- drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c | 2 +- include/linux/ptp_clock_kernel.h | 5 +++++ 9 files changed, 19 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/e1000e/ptp.c b/drivers/net/ethernet/intel/e1000e/ptp.c index 2e1b17ad52a3..ad03763e009a 100644 --- a/drivers/net/ethernet/intel/e1000e/ptp.c +++ b/drivers/net/ethernet/intel/e1000e/ptp.c @@ -334,7 +334,7 @@ void e1000e_ptp_init(struct e1000_adapter *adapter) if (IS_ERR(adapter->ptp_clock)) { adapter->ptp_clock = NULL; e_err("ptp_clock_register failed\n"); - } else { + } else if (adapter->ptp_clock) { e_info("registered PHC clock\n"); } } diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c index ed39cbad24bd..f1feceab758a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c @@ -669,7 +669,7 @@ void i40e_ptp_init(struct i40e_pf *pf) pf->ptp_clock = NULL; dev_err(&pf->pdev->dev, "%s: ptp_clock_register failed\n", __func__); - } else { + } else if (pf->ptp_clock) { struct timespec64 ts; u32 regval; diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index 66dfa2085cc7..1dd14e166dc8 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -1159,7 +1159,7 @@ void igb_ptp_init(struct igb_adapter *adapter) if (IS_ERR(adapter->ptp_clock)) { adapter->ptp_clock = NULL; dev_err(&adapter->pdev->dev, "ptp_clock_register failed\n"); - } else { + } else if (adapter->ptp_clock) { dev_info(&adapter->pdev->dev, "added PHC on %s\n", adapter->netdev->name); adapter->ptp_flags |= IGB_PTP_ENABLED; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c index e5431bfe3339..a92277683a64 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c @@ -1254,7 +1254,7 @@ static long ixgbe_ptp_create_clock(struct ixgbe_adapter *adapter) adapter->ptp_clock = NULL; e_dev_err("ptp_clock_register failed\n"); return err; - } else + } else if (adapter->ptp_clock) e_dev_info("registered PHC device on %s\n", netdev->name); /* set default timestamp mode to disabled here. 
We do this in diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c index 1494997c4f7e..08fc5fc56d43 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c @@ -298,7 +298,7 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) if (IS_ERR(mdev->ptp_clock)) { mdev->ptp_clock = NULL; mlx4_err(mdev, "ptp_clock_register failed\n"); - } else { + } else if (mdev->ptp_clock) { mlx4_info(mdev, "registered PHC clock\n"); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c index 847a8f3ac2b2..13dc388667b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c @@ -273,7 +273,7 @@ void mlx5e_timestamp_init(struct mlx5e_priv *priv) tstamp->ptp = ptp_clock_register(&tstamp->ptp_info, &priv->mdev->pdev->dev); - if (IS_ERR_OR_NULL(tstamp->ptp)) { + if (IS_ERR(tstamp->ptp)) { mlx5_core_warn(priv->mdev, "ptp_clock_register failed %ld\n", PTR_ERR(tstamp->ptp)); tstamp->ptp = NULL; diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index dd204d9704c6..77a5364f7a10 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -1269,13 +1269,13 @@ int efx_ptp_probe(struct efx_nic *efx, struct efx_channel *channel) if (IS_ERR(ptp->phc_clock)) { rc = PTR_ERR(ptp->phc_clock); goto fail3; - } - - INIT_WORK(&ptp->pps_work, efx_ptp_pps_worker); - ptp->pps_workwq = create_singlethread_workqueue("sfc_pps"); - if (!ptp->pps_workwq) { - rc = -ENOMEM; - goto fail4; + } else if (ptp->phc_clock) { + INIT_WORK(&ptp->pps_work, efx_ptp_pps_worker); + ptp->pps_workwq = create_singlethread_workqueue("sfc_pps"); + if (!ptp->pps_workwq) { + rc = -ENOMEM; + goto fail4; + } } } ptp->nic_ts_enabled = false; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c index 170a18b61281..6e3b82972ce8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c @@ -187,7 +187,7 @@ int stmmac_ptp_register(struct stmmac_priv *priv) if (IS_ERR(priv->ptp_clock)) { priv->ptp_clock = NULL; pr_err("ptp_clock_register() failed on %s\n", priv->dev->name); - } else + } else if (priv->ptp_clock) pr_debug("Added PTP HW clock successfully on %s\n", priv->dev->name); diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 6b15e168148a..5ad54fc66cf0 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -127,6 +127,11 @@ struct ptp_clock; * * @info: Structure describing the new clock. * @parent: Pointer to the parent device of the new clock. + * + * Returns a valid pointer on success or PTR_ERR on failure. If PHC + * support is missing at the configuration level, this function + * returns NULL, and drivers are expected to gracefully handle that + * case separately. */ extern struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, -- cgit v1.2.3 From e2f036a97271cf5811ee754bf321a29a814577f9 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 21 Sep 2016 08:45:55 -0300 Subject: sctp: rename WORD_TRUNC/ROUND macros To something more meaningful these days, especially because this is working on packet headers or lengths, which are not tied to any CPU arch but to the protocol itself. So, WORD_TRUNC becomes SCTP_TRUNC4 and WORD_ROUND becomes SCTP_PAD4.
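A few hand-worked values for reference (the definitions are exactly those from the patch):

/* SCTP_PAD4(s) rounds up, SCTP_TRUNC4(s) rounds down, both to a multiple of 4 */
#define SCTP_PAD4(s) (((s)+3)&~3)
#define SCTP_TRUNC4(s) ((s)&~3)

/* SCTP_PAD4(17)   == 20, SCTP_PAD4(20)  == 20, SCTP_PAD4(0) == 0
 * SCTP_TRUNC4(17) == 16, SCTP_TRUNC4(3) == 0
 */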
Reported-by: David Laight Reported-by: David Miller Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 10 +++++----- net/netfilter/xt_sctp.c | 2 +- net/sctp/associola.c | 2 +- net/sctp/chunk.c | 6 +++--- net/sctp/input.c | 8 ++++---- net/sctp/inqueue.c | 2 +- net/sctp/output.c | 12 ++++++------ net/sctp/sm_make_chunk.c | 28 ++++++++++++++-------------- net/sctp/sm_statefuns.c | 6 +++--- net/sctp/transport.c | 4 ++-- net/sctp/ulpevent.c | 4 ++-- 11 files changed, 42 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 632e205ca54b..87a7f42e7639 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -83,9 +83,9 @@ #endif /* Round an int up to the next multiple of 4. */ -#define WORD_ROUND(s) (((s)+3)&~3) +#define SCTP_PAD4(s) (((s)+3)&~3) /* Truncate to the previous multiple of 4. */ -#define WORD_TRUNC(s) ((s)&~3) +#define SCTP_TRUNC4(s) ((s)&~3) /* * Function declarations. @@ -433,7 +433,7 @@ static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu) if (asoc->user_frag) frag = min_t(int, frag, asoc->user_frag); - frag = WORD_TRUNC(min_t(int, frag, SCTP_MAX_CHUNK_LEN)); + frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN)); return frag; } @@ -462,7 +462,7 @@ _sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member) for (pos.v = chunk->member;\ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ ntohs(pos.p->length) >= sizeof(sctp_paramhdr_t);\ - pos.v += WORD_ROUND(ntohs(pos.p->length))) + pos.v += SCTP_PAD4(ntohs(pos.p->length))) #define sctp_walk_errors(err, chunk_hdr)\ _sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length)) @@ -472,7 +472,7 @@ for (err = (sctp_errhdr_t *)((void *)chunk_hdr + \ sizeof(sctp_chunkhdr_t));\ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ ntohs(err->length) >= sizeof(sctp_errhdr_t); \ - err = (sctp_errhdr_t *)((void *)err + WORD_ROUND(ntohs(err->length)))) + err = (sctp_errhdr_t *)((void *)err + SCTP_PAD4(ntohs(err->length)))) #define sctp_walk_fwdtsn(pos, chunk)\ _sctp_walk_fwdtsn((pos), (chunk), ntohs((chunk)->chunk_hdr->length) - sizeof(struct sctp_fwdtsn_chunk)) diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index ef36a56a02c6..4dedb96d1a06 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -68,7 +68,7 @@ match_packet(const struct sk_buff *skb, ++i, offset, sch->type, htons(sch->length), sch->flags); #endif - offset += WORD_ROUND(ntohs(sch->length)); + offset += SCTP_PAD4(ntohs(sch->length)); pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 1c23060c41a6..f10d3397f917 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1408,7 +1408,7 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc) transports) { if (t->pmtu_pending && t->dst) { sctp_transport_update_pmtu(sk, t, - WORD_TRUNC(dst_mtu(t->dst))); + SCTP_TRUNC4(dst_mtu(t->dst))); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index af9cc8055465..76eae828ec89 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -208,8 +208,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) - max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) + - hmac_desc->hmac_len); + max_data -= SCTP_PAD4(sizeof(sctp_auth_chunk_t) 
+ + hmac_desc->hmac_len); } /* Now, check if we need to reduce our max */ @@ -229,7 +229,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max) - max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t)); + max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) diff --git a/net/sctp/input.c b/net/sctp/input.c index 69444d32ecda..a1d85065bfc0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -605,7 +605,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) /* PMTU discovery (RFC1191) */ if (ICMP_FRAG_NEEDED == code) { sctp_icmp_frag_needed(sk, asoc, transport, - WORD_TRUNC(info)); + SCTP_TRUNC4(info)); goto out_unlock; } else { if (ICMP_PROT_UNREACH == code) { @@ -673,7 +673,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb) if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = offset + WORD_ROUND(ntohs(ch->length)); + ch_end = offset + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb->len) break; @@ -1121,7 +1121,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) break; @@ -1190,7 +1190,7 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, * that the chunk length doesn't cause overflow. Otherwise, we'll * walk off the end. */ - if (WORD_ROUND(ntohs(ch->length)) > skb->len) + if (SCTP_PAD4(ntohs(ch->length)) > skb->len) return NULL; /* If this is INIT/INIT-ACK look inside the chunk too. */ diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 6437aa97cfd7..f731de3e8428 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -213,7 +213,7 @@ new_skb: } chunk->chunk_hdr = ch; - chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ diff --git a/net/sctp/output.c b/net/sctp/output.c index 0c605ec74dc4..2a5c1896d18f 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -297,7 +297,7 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { sctp_xmit_t retval = SCTP_XMIT_OK; - __u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length)); + __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length)); /* Check to see if this chunk will fit into the packet */ retval = sctp_packet_will_fit(packet, chunk, chunk_len); @@ -508,7 +508,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) if (gso) { pkt_size = packet->overhead; list_for_each_entry(chunk, &packet->chunk_list, list) { - int padded = WORD_ROUND(chunk->skb->len); + int padded = SCTP_PAD4(chunk->skb->len); if (pkt_size + padded > tp->pathmtu) break; @@ -538,7 +538,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * included in the chunk length field. The sender should * never pad with more than 3 bytes. * - * [This whole comment explains WORD_ROUND() below.] + * [This whole comment explains SCTP_PAD4() below.] 
*/ pkt_size -= packet->overhead; @@ -560,7 +560,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) has_data = 1; } - padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; + padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len; if (padding) memset(skb_put(chunk->skb, padding), 0, padding); @@ -587,7 +587,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * acknowledged or have failed. * Re-queue auth chunks if needed. */ - pkt_size -= WORD_ROUND(chunk->skb->len); + pkt_size -= SCTP_PAD4(chunk->skb->len); if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) sctp_chunk_free(chunk); @@ -911,7 +911,7 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, */ maxsize = pmtu - packet->overhead; if (packet->auth) - maxsize -= WORD_ROUND(packet->auth->skb->len); + maxsize -= SCTP_PAD4(packet->auth->skb->len); if (chunk_len > maxsize) retval = SCTP_XMIT_PMTU_FULL; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 8c77b87a8565..79dd66079dd7 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -253,7 +253,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, num_types = sp->pf->supported_addrs(sp, types); chunksize = sizeof(init) + addrs_len; - chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types)); + chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types)); chunksize += sizeof(ecap_param); if (asoc->prsctp_enable) @@ -283,14 +283,14 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* Add HMACS parameter length if any were defined */ auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; if (auth_hmacs->length) - chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); + chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; /* Add CHUNKS parameter length */ auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; if (auth_chunks->length) - chunksize += WORD_ROUND(ntohs(auth_chunks->length)); + chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; @@ -300,8 +300,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* If we have any extensions to report, account for that */ if (num_ext) - chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * @@ -443,13 +443,13 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; if (auth_hmacs->length) - chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); + chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; if (auth_chunks->length) - chunksize += WORD_ROUND(ntohs(auth_chunks->length)); + chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; @@ -458,8 +458,8 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, } if (num_ext) - chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + + num_ext); /* Now allocate and fill out the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); @@ -1390,7 +1390,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, struct sock *sk; /* No need to allocate LL here, as this is only a chunk. 
*/ - skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp); + skb = alloc_skb(SCTP_PAD4(sizeof(sctp_chunkhdr_t) + paylen), gfp); if (!skb) goto nodata; @@ -1482,7 +1482,7 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) void *target; void *padding; int chunklen = ntohs(chunk->chunk_hdr->length); - int padlen = WORD_ROUND(chunklen) - chunklen; + int padlen = SCTP_PAD4(chunklen) - chunklen; padding = skb_put(chunk->skb, padlen); target = skb_put(chunk->skb, len); @@ -1900,7 +1900,7 @@ static int sctp_process_missing_param(const struct sctp_association *asoc, struct __sctp_missing report; __u16 len; - len = WORD_ROUND(sizeof(report)); + len = SCTP_PAD4(sizeof(report)); /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. @@ -2098,9 +2098,9 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc, if (*errp) { if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM, - WORD_ROUND(ntohs(param.p->length)))) + SCTP_PAD4(ntohs(param.p->length)))) sctp_addto_chunk_fixed(*errp, - WORD_ROUND(ntohs(param.p->length)), + SCTP_PAD4(ntohs(param.p->length)), param.v); } else { /* If there is no memory for generating the ERROR diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d88bb2b0b699..026e3bca4a94 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3454,7 +3454,7 @@ sctp_disposition_t sctp_sf_ootb(struct net *net, } /* Report violation if chunk len overflows */ - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -4185,7 +4185,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, hdr = unk_chunk->chunk_hdr; err_chunk = sctp_make_op_error(asoc, unk_chunk, SCTP_ERROR_UNKNOWN_CHUNK, hdr, - WORD_ROUND(ntohs(hdr->length)), + SCTP_PAD4(ntohs(hdr->length)), 0); if (err_chunk) { sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, @@ -4203,7 +4203,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, hdr = unk_chunk->chunk_hdr; err_chunk = sctp_make_op_error(asoc, unk_chunk, SCTP_ERROR_UNKNOWN_CHUNK, hdr, - WORD_ROUND(ntohs(hdr->length)), + SCTP_PAD4(ntohs(hdr->length)), 0); if (err_chunk) { sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 81b86678be4d..ce54dce13ddb 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -233,7 +233,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) } if (transport->dst) { - transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); + transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); } else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } @@ -287,7 +287,7 @@ void sctp_transport_route(struct sctp_transport *transport, return; } if (transport->dst) { - transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); + transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). 
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index d85b803da11d..bea00058ce35 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -383,7 +383,7 @@ sctp_ulpevent_make_remote_error(const struct sctp_association *asoc, ch = (sctp_errhdr_t *)(chunk->skb->data); cause = ch->cause; - elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t); + elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(sctp_errhdr_t); /* Pull off the ERROR header. */ skb_pull(chunk->skb, sizeof(sctp_errhdr_t)); @@ -688,7 +688,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, * MUST ignore the padding bytes. */ len = ntohs(chunk->chunk_hdr->length); - padding = WORD_ROUND(len) - len; + padding = SCTP_PAD4(len) - len; /* Fixup cloned skb with just this chunks data. */ skb_trim(skb, chunk->chunk_end - padding - skb->data); -- cgit v1.2.3 From 77f2efcbdd7133466060198e02c6e8a170c3cd14 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:01 +0100 Subject: rxrpc: Add ktime_sub_ms() Add a ktime_sub_ms() to go with ktime_add_ms() and co. for use in AF_RXRPC RTT determination. Signed-off-by: David Howells --- include/linux/ktime.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 2b6a204bd8d4..aa118bad1407 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -231,6 +231,11 @@ static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec) return ktime_sub_ns(kt, usec * NSEC_PER_USEC); } +static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec) +{ + return ktime_sub_ns(kt, msec * NSEC_PER_MSEC); +} + extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs); /** -- cgit v1.2.3 From 572de608e36279f249c9a6350f142e69f23dacab Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Thu, 22 Sep 2016 10:33:54 +0800 Subject: net: ethernet: mediatek: add extension of phy-mode for TRGMII Adds PHY-mode "trgmii" as an extension for the operation mode of the PHY interface for PHY_INTERFACE_MODE_TRGMII. Also adds a variable trgmii inside mtk_mac to indicate whether the MAC is connected to the internal switch or to an external PHY, according to the configuration given on the board, and to perform the corresponding setup of the TRGMII hardware module. Signed-off-by: Sean Wang Cc: Florian Fainelli Signed-off-by: David S.
Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 2 ++ drivers/net/ethernet/mediatek/mtk_eth_soc.h | 3 +++ include/linux/phy.h | 3 +++ 3 files changed, 8 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 2909372c4da0..e873e21fd20e 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -244,6 +244,8 @@ static int mtk_phy_connect(struct mtk_mac *mac) return -ENODEV; switch (of_get_phy_mode(np)) { + case PHY_INTERFACE_MODE_TRGMII: + mac->trgmii = true; case PHY_INTERFACE_MODE_RGMII_TXID: case PHY_INTERFACE_MODE_RGMII_RXID: case PHY_INTERFACE_MODE_RGMII_ID: diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h index 7c5e534d2120..e3b9525f94c6 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h @@ -529,6 +529,8 @@ struct mtk_eth { * @hw: Backpointer to our main datastruture * @hw_stats: Packet statistics counter * @phy_dev: The attached PHY if available + * @trgmii Indicate if the MAC uses TRGMII connected to internal + switch */ struct mtk_mac { int id; @@ -539,6 +541,7 @@ struct mtk_mac { struct phy_device *phy_dev; __be32 hwlro_ip[MTK_MAX_LRO_IP_CNT]; int hwlro_ip_cnt; + bool trgmii; }; /* the struct describing the SoC. these are declared in the soc_xyz.c files */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 2d24b283aa2d..e25f1830fbcf 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -80,6 +80,7 @@ typedef enum { PHY_INTERFACE_MODE_XGMII, PHY_INTERFACE_MODE_MOCA, PHY_INTERFACE_MODE_QSGMII, + PHY_INTERFACE_MODE_TRGMII, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -123,6 +124,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "moca"; case PHY_INTERFACE_MODE_QSGMII: return "qsgmii"; + case PHY_INTERFACE_MODE_TRGMII: + return "trgmii"; default: return "unknown"; } -- cgit v1.2.3 From 182691d0998400f35ad304718024e60feaa864aa Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 20 Sep 2016 18:19:14 -0300 Subject: sctp: improve how SSN, TSN and ASCONF serial are compared Make it similar to time_before() macros: - easier to understand - make use of typecheck() to avoid working on unexpected variable types (this made the issue in the previous patch visible) - for _[lg]te versions, slightly faster, as the compiler used to generate a sequence of cmp/je/cmp/js instructions and now it's sub/test/jle (for _lte): Before, for sctp_outq_sack: if (primary->cacc.changeover_active) { 1f01: 80 b9 84 02 00 00 00 cmpb $0x0,0x284(%rcx) 1f08: 74 6e je 1f78 u8 clear_cycling = 0; if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) { 1f0a: 8b 81 80 02 00 00 mov 0x280(%rcx),%eax return ((s) - (t)) & TSN_SIGN_BIT; } static inline int TSN_lte(__u32 s, __u32 t) { return ((s) == (t)) || (((s) - (t)) & TSN_SIGN_BIT); 1f10: 8b 7d bc mov -0x44(%rbp),%edi 1f13: 39 c7 cmp %eax,%edi 1f15: 74 25 je 1f3c 1f17: 39 f8 cmp %edi,%eax 1f19: 78 21 js 1f3c primary->cacc.changeover_active = 0; After: if (primary->cacc.changeover_active) { 1ee7: 80 b9 84 02 00 00 00 cmpb $0x0,0x284(%rcx) 1eee: 74 73 je 1f63 u8 clear_cycling = 0; if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) { 1ef0: 8b 81 80 02 00 00 mov 0x280(%rcx),%eax 1ef6: 2b 45 b4 sub -0x4c(%rbp),%eax 1ef9: 85 c0 test %eax,%eax 1efb: 7e 26 jle 1f23 primary->cacc.changeover_active = 0; *_lt() generated pretty much the same code.
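For intuition, the signed-subtraction form also gets serial-number wraparound right; hand-checked against the new TSN_lt() definition:

/* TSN_lt(a, b) reduces to ((__s32)((a) - (b)) < 0) once the typechecks pass.
 *
 * TSN_lt(5, 10)                  -> (__s32)0xfffffffb = -5 < 0 -> true
 * TSN_lt(0xffffffff, 0x00000001) -> (__s32)0xfffffffe = -2 < 0 -> true,
 *                                   i.e. a just-wrapped TSN of 1 sorts after 2^32 - 1
 * TSN_lt(10, 5)                  -> (__s32)5 is not < 0        -> false
 */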
Tested with gcc (GCC) 6.1.1 20160621. This patch also removes SSN_lte as it is not used and cleans up some comments. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 94 ++++++++++----------------------------------------- 1 file changed, 18 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index bafe2a0ab908..ca6c971dd74a 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -307,85 +307,27 @@ static inline __u16 sctp_data_size(struct sctp_chunk *chunk) } /* Compare two TSNs */ +#define TSN_lt(a,b) \ + (typecheck(__u32, a) && \ + typecheck(__u32, b) && \ + ((__s32)((a) - (b)) < 0)) -/* RFC 1982 - Serial Number Arithmetic * - * 2. Comparison * Then, s1 is said to be equal to s2 if and only if i1 is equal to i2, - * in all other cases, s1 is not equal to s2. - * - * s1 is said to be less than s2 if, and only if, s1 is not equal to s2, - * and - * - * (i1 < i2 and i2 - i1 < 2^(SERIAL_BITS - 1)) or - * (i1 > i2 and i1 - i2 > 2^(SERIAL_BITS - 1)) - * - * s1 is said to be greater than s2 if, and only if, s1 is not equal to - * s2, and - * - * (i1 < i2 and i2 - i1 > 2^(SERIAL_BITS - 1)) or - * (i1 > i2 and i1 - i2 < 2^(SERIAL_BITS - 1)) - */ - -/* - * RFC 2960 - * 1.6 Serial Number Arithmetic - * - * Comparisons and arithmetic on TSNs in this document SHOULD use Serial - * Number Arithmetic as defined in [RFC1982] where SERIAL_BITS = 32. - */ - -enum { - TSN_SIGN_BIT = (1<<31) }; - -static inline int TSN_lt(__u32 s, __u32 t) -{ - return ((s) - (t)) & TSN_SIGN_BIT; -} - -static inline int TSN_lte(__u32 s, __u32 t) -{ - return ((s) == (t)) || (((s) - (t)) & TSN_SIGN_BIT); -} +#define TSN_lte(a,b) \ + (typecheck(__u32, a) && \ + typecheck(__u32, b) && \ + ((__s32)((a) - (b)) <= 0)) /* Compare two SSNs */ - -/* - * RFC 2960 - * 1.6 Serial Number Arithmetic - * - * Comparisons and arithmetic on Stream Sequence Numbers in this document - * SHOULD use Serial Number Arithmetic as defined in [RFC1982] where - * SERIAL_BITS = 16. - */ -enum { - SSN_SIGN_BIT = (1<<15) -}; - -static inline int SSN_lt(__u16 s, __u16 t) -{ - return ((s) - (t)) & SSN_SIGN_BIT; -} - -static inline int SSN_lte(__u16 s, __u16 t) -{ - return ((s) == (t)) || (((s) - (t)) & SSN_SIGN_BIT); -} - -/* - * ADDIP 3.1.1 - * The valid range of Serial Number is from 0 to 4294967295 (2**32 - 1). Serial - * Numbers wrap back to 0 after reaching 4294967295. - */ -enum { - ADDIP_SERIAL_SIGN_BIT = (1<<31) -}; - -static inline int ADDIP_SERIAL_gte(__u32 s, __u32 t) -{ - return ((s) == (t)) || (((t) - (s)) & ADDIP_SERIAL_SIGN_BIT); -} +#define SSN_lt(a,b) \ + (typecheck(__u16, a) && \ + typecheck(__u16, b) && \ + ((__s16)((a) - (b)) < 0)) + +/* ADDIP 3.1.1 */ +#define ADDIP_SERIAL_gte(a,b) \ + (typecheck(__u32, a) && \ + typecheck(__u32, b) && \ + ((__s32)((b) - (a)) <= 0)) /* Check VTAG of the packet matches the sender's own tag. */ static inline int -- cgit v1.2.3 From fefa569a9d4bc4b7758c0fddd75bb0382c95da77 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Sep 2016 08:58:55 -0700 Subject: net_sched: sch_fq: account for schedule/timers drifts It looks like the following patch can make FQ very precise, even in VM or stressed hosts. It matters at high pacing rates. We take into account the difference between the time that was programmed when the last packet was sent, and the current time (a drift of tens of usecs is often observed). Add an EWMA of the unthrottle latency to help diagnostics.
This latency is the difference between the current time and the oldest packet in the delayed RB-tree. This accounts for the high resolution timer latency, but can be different under stress, as fq_check_throttled() can opportunistically be called from a dequeue() called after an enqueue() for a different flow. Tested: // Start a 10Gbit flow $ netperf --google-pacing-rate 1250000000 -H lpaa24 -l 10000 -- -K bbr & Before patch : $ sar -n DEV 10 5 | grep eth0 | grep Average Average: eth0 17106.04 756876.84 1102.75 1119049.02 0.00 0.00 0.52 After patch : $ sar -n DEV 10 5 | grep eth0 | grep Average Average: eth0 17867.00 800245.90 1151.77 1183172.12 0.00 0.00 0.52 A new iproute2 tc can output the 'unthrottle latency' : $ tc -s qd sh dev eth0 | grep latency 0 gc, 0 highprio, 32490767 throttled, 2382 ns latency Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 2 +- net/sched/sch_fq.c | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index f8e39dbaa781..df7451d35131 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -811,7 +811,7 @@ struct tc_fq_qd_stats { __u32 flows; __u32 inactive_flows; __u32 throttled_flows; - __u32 pad; + __u32 unthrottle_latency_ns; }; /* Heavy-Hitter Filter */ diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 5dd929cc1423..18e752439f6f 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -86,6 +86,7 @@ struct fq_sched_data { struct rb_root delayed; /* for rate limited flows */ u64 time_next_delayed_flow; + unsigned long unthrottle_latency_ns; struct fq_flow internal; /* for non classified or high prio packets */ u32 quantum; @@ -408,11 +409,19 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, static void fq_check_throttled(struct fq_sched_data *q, u64 now) { + unsigned long sample; struct rb_node *p; if (q->time_next_delayed_flow > now) return; + /* Update unthrottle latency EWMA. + * This is cheap and can help diagnosing timer/latency problems. + */ + sample = (unsigned long)(now - q->time_next_delayed_flow); + q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; + q->unthrottle_latency_ns += sample >> 3; + q->time_next_delayed_flow = ~0ULL; while ((p = rb_first(&q->delayed)) != NULL) { struct fq_flow *f = container_of(p, struct fq_flow, rate_node); @@ -515,7 +524,12 @@ begin: len = NSEC_PER_SEC; q->stat_pkts_too_long++; } - + /* Account for schedule/timers drifts. + * f->time_next_packet was set when prior packet was sent, + * and current time (@now) can be too late by tens of us.
+ */ + if (f->time_next_packet) + len -= min(len/2, now - f->time_next_packet); f->time_next_packet = now + len; } out: @@ -787,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); q->flow_refill_delay = msecs_to_jiffies(40); q->flow_max_rate = ~0U; + q->time_next_delayed_flow = ~0ULL; q->rate_enable = 1; q->new_flows.first = NULL; q->old_flows.first = NULL; @@ -854,8 +869,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.flows = q->flows; st.inactive_flows = q->inactive_flows; st.throttled_flows = q->throttled_flows; - st.pad = 0; - + st.unthrottle_latency_ns = min_t(unsigned long, + q->unthrottle_latency_ns, ~0U); sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); -- cgit v1.2.3 From 53e89941ba2a969c483aa29b907de9a823179297 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 22 Sep 2016 20:01:41 +0300 Subject: net_sched: act_vlan: add helper inlines to access tcf_vlan info Needed e.g for offloading drivers to pick the relevant attributes. Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/tc_act/tc_vlan.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 6b835889ea30..48cca321ee6c 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -11,6 +11,7 @@ #define __NET_TC_VLAN_H #include +#include #define VLAN_F_POP 0x1 #define VLAN_F_PUSH 0x2 @@ -24,4 +25,28 @@ struct tcf_vlan { }; #define to_vlan(a) ((struct tcf_vlan *)a) +static inline bool is_tcf_vlan(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + if (a->ops && a->ops->type == TCA_ACT_VLAN) + return true; +#endif + return false; +} + +static inline u32 tcf_vlan_action(const struct tc_action *a) +{ + return to_vlan(a)->tcfv_action; +} + +static inline u16 tcf_vlan_push_vid(const struct tc_action *a) +{ + return to_vlan(a)->tcfv_push_vid; +} + +static inline __be16 tcf_vlan_push_proto(const struct tc_action *a) +{ + return to_vlan(a)->tcfv_push_proto; +} + #endif /* __NET_TC_VLAN_H */ -- cgit v1.2.3 From 732f794c1baf58e1eb2be4431635829c3da655bd Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 22 Sep 2016 16:49:22 -0400 Subject: net: dsa: add port fast ageing Today the DSA drivers are in charge of flushing the MAC addresses associated to a port when its STP state changes from Learning or Forwarding, to Disabled or Blocking or Listening. This makes the drivers more complex and hides the generic switch logic. Introduce a new optional port_fast_age operation to dsa_switch_ops, to move this logic to the DSA layer and keep drivers simple. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- include/net/dsa.h | 2 ++ net/dsa/slave.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 7556646db2d3..b122196d5a1f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -143,6 +143,7 @@ struct dsa_port { struct net_device *netdev; struct device_node *dn; unsigned int ageing_time; + u8 stp_state; }; struct dsa_switch { @@ -339,6 +340,7 @@ struct dsa_switch_ops { void (*port_bridge_leave)(struct dsa_switch *ds, int port); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); + void (*port_fast_age)(struct dsa_switch *ds, int port); /* * VLAN support diff --git a/net/dsa/slave.c b/net/dsa/slave.c index fd78d4c9b197..6b1282c006b1 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -71,8 +71,26 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p) static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state) { + struct dsa_port *dp = &ds->ports[port]; + if (ds->ops->port_stp_state_set) ds->ops->port_stp_state_set(ds, port, state); + + if (ds->ops->port_fast_age) { + /* Fast age FDB entries or flush appropriate forwarding database + * for the given port, if we are moving it from Learning or + * Forwarding state, to Disabled or Blocking or Listening state. + */ + + if ((dp->stp_state == BR_STATE_LEARNING || + dp->stp_state == BR_STATE_FORWARDING) && + (state == BR_STATE_DISABLED || + state == BR_STATE_BLOCKING || + state == BR_STATE_LISTENING)) + ds->ops->port_fast_age(ds, port); + } + + dp->stp_state = state; } static int dsa_slave_open(struct net_device *dev) -- cgit v1.2.3 From 7a4b28c6cc9ffac50f791b99cc7e46106436e5d8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Sep 2016 01:28:37 +0200 Subject: bpf: add helper to invalidate hash Add a small helper that complements 36bbef52c7eb ("bpf: direct packet write and access for helpers for clsact progs") for invalidating the current skb->hash after mangling on headers via direct packet write. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ net/core/filter.c | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e07432b9f8b8..f09c70b97eca 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -419,6 +419,13 @@ enum bpf_func_id { */ BPF_FUNC_csum_update, + /** + * bpf_set_hash_invalid(skb) + * Invalidate current skb->hash. + * @skb: pointer to skb + */ + BPF_FUNC_set_hash_invalid, + __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index acf84fbfb043..00351cdf7d0c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1777,6 +1777,22 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) +{ + /* After all direct packet write, this can be used once for + * triggering a lazy recalc on next skb_get_hash() invocation.
+ */ + skb_clear_hash(skb); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_invalid_proto = { + .func = bpf_set_hash_invalid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { @@ -2534,6 +2550,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: -- cgit v1.2.3 From fc7ab6d29a3af0b7f6df7c095509378c8caf85b5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 15:22:36 +0100 Subject: rxrpc: Add a tracepoint for the call timer Add a tracepoint to log call timer initiation, setting and expiry. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 36 ++++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 13 ++++++++++++- net/rxrpc/call_event.c | 7 ++++--- net/rxrpc/call_object.c | 6 ++++-- net/rxrpc/misc.c | 8 ++++++++ net/rxrpc/sendmsg.c | 2 +- 6 files changed, 65 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index e8f2afbbe0bf..57322897d745 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -414,6 +414,42 @@ TRACE_EVENT(rxrpc_rtt_rx, __entry->avg) ); +TRACE_EVENT(rxrpc_timer, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why, + unsigned long now), + + TP_ARGS(call, why, now), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_timer_trace, why ) + __field(unsigned long, now ) + __field(unsigned long, expire_at ) + __field(unsigned long, ack_at ) + __field(unsigned long, resend_at ) + __field(unsigned long, timer ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->now = now; + __entry->expire_at = call->expire_at; + __entry->ack_at = call->ack_at; + __entry->resend_at = call->resend_at; + __entry->timer = call->timer.expires; + ), + + TP_printk("c=%p %s now=%lx x=%ld a=%ld r=%ld t=%ld", + __entry->call, + rxrpc_timer_traces[__entry->why], + __entry->now, + __entry->expire_at - __entry->now, + __entry->ack_at - __entry->now, + __entry->resend_at - __entry->now, + __entry->timer - __entry->now) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a494d56eb236..e564eca75985 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -678,6 +678,17 @@ enum rxrpc_rtt_rx_trace { extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; +enum rxrpc_timer_trace { + rxrpc_timer_begin, + rxrpc_timer_expired, + rxrpc_timer_set_for_ack, + rxrpc_timer_set_for_resend, + rxrpc_timer_set_for_send, + rxrpc_timer__nr_trace +}; + +extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); @@ -707,7 +718,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void rxrpc_set_timer(struct rxrpc_call *); +void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); void rxrpc_process_call(struct work_struct *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 8bc5c8e37ab4..90e970ba048a 
100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -24,7 +24,7 @@ /* * Set the timer */ -void rxrpc_set_timer(struct rxrpc_call *call) +void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why) { unsigned long t, now = jiffies; @@ -45,6 +45,7 @@ void rxrpc_set_timer(struct rxrpc_call *call) if (call->timer.expires != t || !timer_pending(&call->timer)) { mod_timer(&call->timer, t); + trace_rxrpc_timer(call, why, now); } } @@ -120,7 +121,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, _debug("deferred ACK %ld < %ld", expiry, call->ack_at - now); if (time_before(ack_at, call->ack_at)) { call->ack_at = ack_at; - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_set_for_ack); } } } @@ -293,7 +294,7 @@ recheck_state: goto recheck_state; } - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_set_for_resend); /* other events may have been raised since we started checking */ if (call->events && call->state < RXRPC_CALL_COMPLETE) { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f2fadf667e19..a53f4c2c0025 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -76,8 +76,10 @@ static void rxrpc_call_timer_expired(unsigned long _call) _enter("%d", call->debug_id); - if (call->state < RXRPC_CALL_COMPLETE) + if (call->state < RXRPC_CALL_COMPLETE) { + trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies); rxrpc_queue_call(call); + } } /* @@ -200,7 +202,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) call->ack_at = expire_at; call->resend_at = expire_at; call->timer.expires = expire_at + 1; - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_begin); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index fe648711c2f7..fa9942fabdf2 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -194,3 +194,11 @@ const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { [rxrpc_rtt_rx_ping_response] = "PONG", [rxrpc_rtt_rx_requested_ack] = "RACK", }; + +const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { + [rxrpc_timer_begin] = "Begin ", + [rxrpc_timer_expired] = "*EXPR*", + [rxrpc_timer_set_for_ack] = "SetAck", + [rxrpc_timer_set_for_send] = "SetTx ", + [rxrpc_timer_set_for_resend] = "SetRTx", +}; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 93e6584cd751..99939372b5a4 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -153,7 +153,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, if (time_before(resend_at, call->resend_at)) { call->resend_at = resend_at; - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_set_for_send); } } -- cgit v1.2.3 From be832aecc5ba811728e15a10f675f4a2187f25dd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:22 +0100 Subject: rxrpc: Add data Tx tracepoint and adjust Tx ACK tracepoint Add a tracepoint to log transmission of DATA packets (including loss injection). Adjust the ACK transmission tracepoint to include the packet serial number and to line this up with the DATA transmission display. 
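Going by the TP_printk() format strings in the diff below, the rendered trace lines come out along these lines (all field values here, including the ACK reason string, are invented for illustration):

/* "c=%p DATA %08x q=%08x fl=%02x%s" — serial, then sequence, then flags */
c=ffff88003a151b00 DATA 0000002a q=00000007 fl=02
c=ffff88003a151b00 DATA 0000002b q=00000008 fl=22 *LOSE*
/* " c=%p ACK %08x %s f=%08x r=%08x n=%u" — the leading space lines it up with DATA */
 c=ffff88003a151b00 ACK 0000002c PNG f=00000003 r=00000029 n=4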
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 50 +++++++++++++++++++++++++++++++++++++------- net/rxrpc/conn_event.c | 5 ++--- net/rxrpc/output.c | 5 ++++- 3 files changed, 48 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 57322897d745..6001bf93dc79 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -256,33 +256,67 @@ TRACE_EVENT(rxrpc_rx_ack, __entry->n_acks) ); +TRACE_EVENT(rxrpc_tx_data, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, + rxrpc_serial_t serial, u8 flags, bool lose), + + TP_ARGS(call, seq, serial, flags, lose), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_seq_t, seq ) + __field(rxrpc_serial_t, serial ) + __field(u8, flags ) + __field(bool, lose ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->seq = seq; + __entry->serial = serial; + __entry->flags = flags; + __entry->lose = lose; + ), + + TP_printk("c=%p DATA %08x q=%08x fl=%02x%s", + __entry->call, + __entry->serial, + __entry->seq, + __entry->flags, + __entry->lose ? " *LOSE*" : "") + ); + TRACE_EVENT(rxrpc_tx_ack, - TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t first, - rxrpc_serial_t serial, u8 reason, u8 n_acks), + TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, + rxrpc_seq_t ack_first, rxrpc_serial_t ack_serial, + u8 reason, u8 n_acks), - TP_ARGS(call, first, serial, reason, n_acks), + TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks), TP_STRUCT__entry( __field(struct rxrpc_call *, call ) - __field(rxrpc_seq_t, first ) __field(rxrpc_serial_t, serial ) + __field(rxrpc_seq_t, ack_first ) + __field(rxrpc_serial_t, ack_serial ) __field(u8, reason ) __field(u8, n_acks ) ), TP_fast_assign( __entry->call = call; - __entry->first = first; __entry->serial = serial; + __entry->ack_first = ack_first; + __entry->ack_serial = ack_serial; __entry->reason = reason; __entry->n_acks = n_acks; ), - TP_printk("c=%p %s f=%08x r=%08x n=%u", + TP_printk(" c=%p ACK %08x %s f=%08x r=%08x n=%u", __entry->call, - rxrpc_acks(__entry->reason), - __entry->first, __entry->serial, + rxrpc_acks(__entry->reason), + __entry->ack_first, + __entry->ack_serial, __entry->n_acks) ); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 75a15a4c74c3..a1cf1ec5f29e 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -98,9 +98,6 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.info.rwind = htonl(rxrpc_rx_window_size); pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); len += sizeof(pkt.ack) + sizeof(pkt.info); - - trace_rxrpc_tx_ack(NULL, chan->last_seq, 0, - RXRPC_ACK_DUPLICATE, 0); break; } @@ -122,6 +119,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort); break; case RXRPC_PACKET_TYPE_ACK: + trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0, + RXRPC_ACK_DUPLICATE, 0); _proto("Tx ACK %%%u [re]", serial); break; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5c1e008a5323..e47fbd1c836d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -177,7 +177,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) pkt->whdr.serial = htonl(serial); switch (type) { case RXRPC_PACKET_TYPE_ACK: - trace_rxrpc_tx_ack(call, + trace_rxrpc_tx_ack(call, serial, ntohl(pkt->ack.firstPacket), ntohl(pkt->ack.serial), pkt->ack.reason, pkt->ack.nAcks); @@ -275,6 +275,8 @@ int rxrpc_send_data_packet(struct rxrpc_call 
*call, struct sk_buff *skb) if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, + whdr.flags, true); rxrpc_lose_skb(skb, rxrpc_skb_tx_lost); _leave(" = 0 [lose]"); return 0; @@ -302,6 +304,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) goto send_fragmentable; done: + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, false); if (ret >= 0) { ktime_t now = ktime_get_real(); skb->tstamp = now; -- cgit v1.2.3 From 89b475abdb107a74f57497b65becaf837a0e5b6b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:22 +0100 Subject: rxrpc: Add a tracepoint to log injected Rx packet loss Add a tracepoint to log received packets that get discarded due to injected Rx packet loss. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 21 +++++++++++++++++++++ net/rxrpc/input.c | 11 +++++------ 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 6001bf93dc79..9413b17ba04b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -484,6 +484,27 @@ TRACE_EVENT(rxrpc_timer, __entry->timer - __entry->now) ); +TRACE_EVENT(rxrpc_rx_lose, + TP_PROTO(struct rxrpc_skb_priv *sp), + + TP_ARGS(sp), + + TP_STRUCT__entry( + __field_struct(struct rxrpc_host_header, hdr ) + ), + + TP_fast_assign( + memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); + ), + + TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s *LOSE*", + __entry->hdr.epoch, __entry->hdr.cid, + __entry->hdr.callNumber, __entry->hdr.serviceId, + __entry->hdr.serial, __entry->hdr.seq, + __entry->hdr.type, __entry->hdr.flags, + __entry->hdr.type <= 15 ? rxrpc_pkts[__entry->hdr.type] : "?UNK") + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index fb3e2f6afa3b..19b1e189f5dc 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -837,20 +837,19 @@ void rxrpc_data_ready(struct sock *udp_sk) skb_orphan(skb); sp = rxrpc_skb(skb); + /* dig out the RxRPC connection details */ + if (rxrpc_extract_header(sp, skb) < 0) + goto bad_message; + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { + trace_rxrpc_rx_lose(sp); rxrpc_lose_skb(skb, rxrpc_skb_rx_lost); return; } } - _net("Rx UDP packet from %08x:%04hu", - ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); - - /* dig out the RxRPC connection details */ - if (rxrpc_extract_header(sp, skb) < 0) - goto bad_message; trace_rxrpc_rx_packet(sp); _net("Rx RxRPC %s ep=%x call=%x:%x", -- cgit v1.2.3 From 9c7ad434441da6b5d4ac878cac368fbdaec99b56 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 13:50:40 +0100 Subject: rxrpc: Add tracepoint for ACK proposal Add a tracepoint to log proposed ACKs, including whether the proposal is used to update a pending ACK or is discarded in favour of an earlier, higher-priority ACK. Whilst we're at it, get rid of the rxrpc_acks() function and access the name array directly. We do, however, need to validate the ACK reason number given to trace_rxrpc_rx_ack() to make sure we don't overrun the array.
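The validation amounts to clamping the wire reason code to a sentinel slot before indexing the fixed-size name table. As a minimal, self-contained sketch of the same pattern (user-space C; the ack_name() helper and main() below are illustrative assumptions, not kernel code -- only the string table mirrors the shape of rxrpc_ack_names[][4]):

	#include <stdio.h>

	/* Reasons 0..9 have names; slot 10 holds the "-?-" sentinel that any
	 * out-of-range value is clamped to, so the lookup cannot overrun. */
	static const char ack_names[10 + 1][4] = {
		"---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR",
		"DLY", "IDL", "-?-"
	};

	static const char *ack_name(unsigned int reason)
	{
		return ack_names[reason <= 10 ? reason : 10]; /* clamp */
	}

	int main(void)
	{
		/* 200 is out of range and maps to the sentinel. */
		printf("%s %s\n", ack_name(1), ack_name(200)); /* "REQ -?-" */
		return 0;
	}

Keeping the sentinel inside the table is what makes the clamp sufficient; RXRPC_ACK__INVALID plays that role on the kernel side.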
Signed-off-by: David Howells --- include/rxrpc/packet.h | 1 + include/trace/events/rxrpc.h | 42 ++++++++++++++++++++++++++++++++++++++++-- net/rxrpc/ar-internal.h | 25 +++++++++++++++++++++++-- net/rxrpc/call_event.c | 21 ++++++++++++++------- net/rxrpc/input.c | 19 +++++++++++++------ net/rxrpc/misc.c | 30 +++++++++++++++++++----------- net/rxrpc/output.c | 3 ++- net/rxrpc/recvmsg.c | 3 ++- 8 files changed, 114 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index fd6eb3a60a8c..703a64b4681a 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -123,6 +123,7 @@ struct rxrpc_ackpacket { #define RXRPC_ACK_PING_RESPONSE 7 /* response to RXRPC_ACK_PING */ #define RXRPC_ACK_DELAY 8 /* nothing happened since received packet */ #define RXRPC_ACK_IDLE 9 /* ACK due to fully received ACK window */ +#define RXRPC_ACK__INVALID 10 /* Representation of invalid ACK reason */ uint8_t nAcks; /* number of ACKs */ #define RXRPC_MAXACKS 255 diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 9413b17ba04b..d67a8c6b085a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -251,7 +251,7 @@ TRACE_EVENT(rxrpc_rx_ack, TP_printk("c=%p %s f=%08x n=%u", __entry->call, - rxrpc_acks(__entry->reason), + rxrpc_ack_names[__entry->reason], __entry->first, __entry->n_acks) ); @@ -314,7 +314,7 @@ TRACE_EVENT(rxrpc_tx_ack, TP_printk(" c=%p ACK %08x %s f=%08x r=%08x n=%u", __entry->call, __entry->serial, - rxrpc_acks(__entry->reason), + rxrpc_ack_names[__entry->reason], __entry->ack_first, __entry->ack_serial, __entry->n_acks) @@ -505,6 +505,44 @@ TRACE_EVENT(rxrpc_rx_lose, __entry->hdr.type <= 15 ? rxrpc_pkts[__entry->hdr.type] : "?UNK") ); +TRACE_EVENT(rxrpc_propose_ack, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, + u8 ack_reason, rxrpc_serial_t serial, bool immediate, + bool background, enum rxrpc_propose_ack_outcome outcome), + + TP_ARGS(call, why, ack_reason, serial, immediate, background, + outcome), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_propose_ack_trace, why ) + __field(rxrpc_serial_t, serial ) + __field(u8, ack_reason ) + __field(bool, immediate ) + __field(bool, background ) + __field(enum rxrpc_propose_ack_outcome, outcome ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->serial = serial; + __entry->ack_reason = ack_reason; + __entry->immediate = immediate; + __entry->background = background; + __entry->outcome = outcome; + ), + + TP_printk("c=%p %s %s r=%08x i=%u b=%u%s", + __entry->call, + rxrpc_propose_ack_traces[__entry->why], + rxrpc_ack_names[__entry->ack_reason], + __entry->serial, + __entry->immediate, + __entry->background, + rxrpc_propose_ack_outcomes[__entry->outcome]) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e564eca75985..042dbcc52654 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -689,8 +689,28 @@ enum rxrpc_timer_trace { extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; +enum rxrpc_propose_ack_trace { + rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_params, + rxrpc_propose_ack_respond_to_ack, + rxrpc_propose_ack_respond_to_ping, + rxrpc_propose_ack_retry_tx, + rxrpc_propose_ack_terminal_ack, + rxrpc_propose_ack__nr_trace +}; + +enum rxrpc_propose_ack_outcome { + rxrpc_propose_ack_use, + rxrpc_propose_ack_update, + 
rxrpc_propose_ack_subsume, + rxrpc_propose_ack__nr_outcomes +}; + +extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8]; +extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes]; + extern const char *const rxrpc_pkts[]; -extern const char *rxrpc_acks(u8 reason); +extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; #include @@ -719,7 +739,8 @@ int rxrpc_reject_call(struct rxrpc_sock *); * call_event.c */ void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace); -void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool, + enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 90e970ba048a..fd5b11339ffb 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -58,14 +58,13 @@ out: */ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, u16 skew, u32 serial, bool immediate, - bool background) + bool background, + enum rxrpc_propose_ack_trace why) { + enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay; s8 prior = rxrpc_ack_priority[ack_reason]; - _enter("{%d},%s,%%%x,%u", - call->debug_id, rxrpc_acks(ack_reason), serial, immediate); - /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial * numbers, but we don't alter the timeout. */ @@ -74,15 +73,18 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]); if (ack_reason == call->ackr_reason) { if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) { + outcome = rxrpc_propose_ack_update; call->ackr_serial = serial; call->ackr_skew = skew; } if (!immediate) - return; + goto trace; } else if (prior > rxrpc_ack_priority[call->ackr_reason]) { call->ackr_reason = ack_reason; call->ackr_serial = serial; call->ackr_skew = skew; + } else { + outcome = rxrpc_propose_ack_subsume; } switch (ack_reason) { @@ -124,17 +126,22 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_set_timer(call, rxrpc_timer_set_for_ack); } } + +trace: + trace_rxrpc_propose_ack(call, why, ack_reason, serial, immediate, + background, outcome); } /* * propose an ACK be sent, locking the call structure */ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u16 skew, u32 serial, bool immediate, bool background) + u16 skew, u32 serial, bool immediate, bool background, + enum rxrpc_propose_ack_trace why) { spin_lock_bh(&call->lock); __rxrpc_propose_ACK(call, ack_reason, skew, serial, - immediate, background); + immediate, background, why); spin_unlock_bh(&call->lock); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 19b1e189f5dc..349698d87ad1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -49,7 +49,8 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, if (call->peer->rtt_usage < 3 || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, - true, true, + rxrpc_propose_ack_ping_for_params); } /* @@ -382,7 +383,8 @@ skip: ack: if (ack) rxrpc_propose_ACK(call, ack, skew, ack_serial, - immediate_ack, true); + immediate_ack, true, + rxrpc_propose_ack_input_data); if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) rxrpc_notify_socket(call); @@ -539,6 +541,7 @@ static void
rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, u16 skew) { + u8 ack_reason; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); union { struct rxrpc_ackpacket ack; @@ -561,8 +564,10 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, first_soft_ack = ntohl(buf.ack.firstPacket); hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; + ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? + buf.ack.reason : RXRPC_ACK__INVALID); - trace_rxrpc_rx_ack(call, first_soft_ack, buf.ack.reason, nr_acks); + trace_rxrpc_rx_ack(call, first_soft_ack, ack_reason, nr_acks); _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", sp->hdr.serial, @@ -570,7 +575,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, first_soft_ack, ntohl(buf.ack.previousPacket), acked_serial, - rxrpc_acks(buf.ack.reason), + rxrpc_ack_names[ack_reason], buf.ack.nAcks); if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) @@ -583,10 +588,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, if (buf.ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", sp->hdr.serial); rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - skew, sp->hdr.serial, true, true); + skew, sp->hdr.serial, true, true, + rxrpc_propose_ack_respond_to_ping); } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, - skew, sp->hdr.serial, true, true); + skew, sp->hdr.serial, true, true, + rxrpc_propose_ack_respond_to_ack); } offset = sp->offset + nr_acks + 3; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index fa9942fabdf2..1ca14835d87f 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -91,17 +91,10 @@ const s8 rxrpc_ack_priority[] = { [RXRPC_ACK_PING] = 9, }; -const char *rxrpc_acks(u8 reason) -{ - static const char *const str[] = { - "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", - "IDL", "-?-" - }; - - if (reason >= ARRAY_SIZE(str)) - reason = ARRAY_SIZE(str) - 1; - return str[reason]; -} +const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = { + "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", + "IDL", "-?-" +}; const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = { [rxrpc_skb_rx_cleaned] = "Rx CLN", @@ -202,3 +195,18 @@ const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { [rxrpc_timer_set_for_send] = "SetTx ", [rxrpc_timer_set_for_resend] = "SetRTx", }; + +const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { + [rxrpc_propose_ack_input_data] = "DataIn ", + [rxrpc_propose_ack_ping_for_params] = "Params ", + [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", + [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png", + [rxrpc_propose_ack_retry_tx] = "RetryTx", + [rxrpc_propose_ack_terminal_ack] = "ClTerm ", +}; + +const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = { + [rxrpc_propose_ack_use] = "", + [rxrpc_propose_ack_update] = " Update", + [rxrpc_propose_ack_subsume] = " Subsume", +}; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index e47fbd1c836d..0c563e325c9d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -210,7 +210,8 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), - true, true); + true, true, + rxrpc_propose_ack_retry_tx); break; case RXRPC_PACKET_TYPE_ABORT: break; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index
99e4c0ae30f1..8c7f3de45bac 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -141,7 +141,8 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false); + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false, + rxrpc_propose_ack_terminal_ack); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); } -- cgit v1.2.3 From c6672e3fe4a641bf302d6309ab4d5ee55648e758 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 13:58:55 +0100 Subject: rxrpc: Add a tracepoint to log which packets will be retransmitted Add a tracepoint to log in rxrpc_resend() which packets will be retransmitted. Note that if a positive ACK comes in whilst we have dropped the lock to retransmit another packet, the actual retransmission may not happen, though some of the effects will (such as altering the congestion management). Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 27 +++++++++++++++++++++++++++ net/rxrpc/call_event.c | 2 ++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d67a8c6b085a..56475497043d 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -543,6 +543,33 @@ TRACE_EVENT(rxrpc_propose_ack, rxrpc_propose_ack_outcomes[__entry->outcome]) ); +TRACE_EVENT(rxrpc_retransmit, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, u8 annotation, + s64 expiry), + + TP_ARGS(call, seq, annotation, expiry), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_seq_t, seq ) + __field(u8, annotation ) + __field(s64, expiry ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->seq = seq; + __entry->annotation = annotation; + __entry->expiry = expiry; + ), + + TP_printk("c=%p q=%x a=%02x xp=%lld", + __entry->call, + __entry->seq, + __entry->annotation, + __entry->expiry) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index fd5b11339ffb..a78a92fe5d77 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -196,6 +196,8 @@ static void rxrpc_resend(struct rxrpc_call *call) /* Okay, we need to retransmit a packet. */ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; + trace_rxrpc_retransmit(call, seq, annotation | anno_type, + ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } resend_at = ktime_sub(ktime_add_ms(oldest, rxrpc_resend_timeout), now); -- cgit v1.2.3 From 7c3d21c8153c6bfb5690e35e086b0522c42442d9 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:13 +0300 Subject: net/mlx4_core: Preparation for VF vlan protocol 802.1ad Check device capability to support VF vlan protocol 802.1ad mode. Add vport attribute vlan protocol. Init vport vlan protocol by default to 802.1Q. Add update QP support for VF vlan protocol 802.1ad. Add func capability vlan_offload_disable to disable all vlan HW acceleration on VF while the VF is set to VF vlan protocol 802.1ad mode. No change in VF vlan protocol 802.1Q (VST) mode. Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 7 ++++ drivers/net/ethernet/mellanox/mlx4/fw.c | 37 ++++++++++++++++---- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 2 ++ .../net/ethernet/mellanox/mlx4/resource_tracker.c | 40 ++++++++++++++++++---- include/linux/mlx4/device.h | 3 ++ include/linux/mlx4/qp.h | 2 ++ 6 files changed, 78 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index a58d96cf1ed1..09c969420eac 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1851,6 +1851,7 @@ static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, if (vp_oper->state.default_vlan == vp_admin->default_vlan && vp_oper->state.default_qos == vp_admin->default_qos && + vp_oper->state.vlan_proto == vp_admin->vlan_proto && vp_oper->state.link_state == vp_admin->link_state && vp_oper->state.qos_vport == vp_admin->qos_vport) return 0; @@ -1909,6 +1910,7 @@ static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, vp_oper->state.default_vlan = vp_admin->default_vlan; vp_oper->state.default_qos = vp_admin->default_qos; + vp_oper->state.vlan_proto = vp_admin->vlan_proto; vp_oper->state.link_state = vp_admin->link_state; vp_oper->state.qos_vport = vp_admin->qos_vport; @@ -1922,6 +1924,7 @@ static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, work->qos_vport = vp_oper->state.qos_vport; work->vlan_id = vp_oper->state.default_vlan; work->vlan_ix = vp_oper->vlan_idx; + work->vlan_proto = vp_oper->state.vlan_proto; work->priv = priv; INIT_WORK(&work->work, mlx4_vf_immed_vlan_work_handler); queue_work(priv->mfunc.master.comm_wq, &work->work); @@ -2012,6 +2015,8 @@ static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) vp_admin->default_vlan, &(vp_oper->vlan_idx)); if (err) { vp_oper->vlan_idx = NO_INDX; + vp_oper->state.default_vlan = MLX4_VGT; + vp_oper->state.vlan_proto = htons(ETH_P_8021Q); mlx4_warn(&priv->dev, "No vlan resources slave %d, port %d\n", slave, port); @@ -2388,6 +2393,8 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) admin_vport->qos_vport = MLX4_VPP_DEFAULT_VPORT; oper_vport->qos_vport = MLX4_VPP_DEFAULT_VPORT; + admin_vport->vlan_proto = htons(ETH_P_8021Q); + oper_vport->vlan_proto = htons(ETH_P_8021Q); vf_oper->vport[port].vlan_idx = NO_INDX; vf_oper->vport[port].mac_idx = NO_INDX; mlx4_set_random_admin_guid(dev, i, port); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index c7523307b41a..7dc9d38a51f8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -158,7 +158,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [31] = "Modifying loopback source checks using UPDATE_QP support", [32] = "Loopback source checks support", [33] = "RoCEv2 support", - [34] = "DMFS Sniffer support (UC & MC)" + [34] = "DMFS Sniffer support (UC & MC)", + [35] = "QinQ VST mode support", }; int i; @@ -313,12 +314,15 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, #define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80 #define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS (1 << 31) #define QUERY_FUNC_CAP_PHV_BIT 0x40 +#define QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE 0x20 if (vhcr->op_modifier == 1) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); int converted_port = mlx4_slave_convert_port( dev, slave, vhcr->in_modifier); + struct 
mlx4_vport_oper_state *vp_oper = + &priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier]; if (converted_port < 0) return -EINVAL; @@ -357,11 +361,12 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier], QUERY_FUNC_CAP_PHYS_PORT_ID); - if (dev->caps.phv_bit[port]) { - field = QUERY_FUNC_CAP_PHV_BIT; - MLX4_PUT(outbox->buf, field, - QUERY_FUNC_CAP_FLAGS0_OFFSET); - } + field = 0; + if (dev->caps.phv_bit[port]) + field |= QUERY_FUNC_CAP_PHV_BIT; + if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) + field |= QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE; + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS0_OFFSET); } else if (vhcr->op_modifier == 0) { struct mlx4_active_ports actv_ports = @@ -689,6 +694,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET 0x52 #define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET 0x55 #define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET 0x56 +#define QUERY_DEV_CAP_SVLAN_BY_QP_OFFSET 0x5D #define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET 0x61 #define QUERY_DEV_CAP_RSVD_MCG_OFFSET 0x62 #define QUERY_DEV_CAP_MAX_MCG_OFFSET 0x63 @@ -856,6 +862,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET); dev_cap->max_sq_desc_sz = size; + MLX4_GET(field, outbox, QUERY_DEV_CAP_SVLAN_BY_QP_OFFSET); + if (field & 0x1) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_MCG_OFFSET); dev_cap->max_qp_per_mcg = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MCG_OFFSET); @@ -2937,6 +2946,22 @@ int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val) } EXPORT_SYMBOL(set_phv_bit); +int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port, + bool *vlan_offload_disabled) +{ + struct mlx4_func_cap func_cap; + int err; + + memset(&func_cap, 0, sizeof(func_cap)); + err = mlx4_QUERY_FUNC_CAP(dev, port, &func_cap); + if (!err) + *vlan_offload_disabled = + !!(func_cap.flags0 & + QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE); + return err; +} +EXPORT_SYMBOL(mlx4_get_is_vlan_offload_disabled); + void mlx4_replace_zero_macs(struct mlx4_dev *dev) { int i; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index c128ba3ef014..fdfe1ace07d7 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -508,6 +508,7 @@ struct mlx4_vport_state { u64 mac; u16 default_vlan; u8 default_qos; + __be16 vlan_proto; u32 tx_rate; bool spoofchk; u32 link_state; @@ -657,6 +658,7 @@ struct mlx4_vf_immed_vlan_work { u8 qos_vport; u16 vlan_id; u16 orig_vlan_id; + __be16 vlan_proto; }; diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 8b81114bdc72..84d7857ccc27 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -790,10 +790,22 @@ static int update_vport_qp_param(struct mlx4_dev *dev, MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; } else if (0 != vp_oper->state.default_vlan) { - qpc->pri_path.vlan_control |= - MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | - MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | - MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) { + /* vst QinQ should block untagged on TX, + * but cvlan is in payload and phv is set so + * hw see 
it as untagged. Block tagged instead. + */ + qpc->pri_path.vlan_control |= + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + } else { /* vst 802.1Q */ + qpc->pri_path.vlan_control |= + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + } } else { /* priority tagged */ qpc->pri_path.vlan_control |= MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | @@ -802,7 +814,11 @@ static int update_vport_qp_param(struct mlx4_dev *dev, qpc->pri_path.fvl_rx |= MLX4_FVL_RX_FORCE_ETH_VLAN; qpc->pri_path.vlan_index = vp_oper->vlan_idx; - qpc->pri_path.fl |= MLX4_FL_CV | MLX4_FL_ETH_HIDE_CQE_VLAN; + qpc->pri_path.fl |= MLX4_FL_ETH_HIDE_CQE_VLAN; + if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) + qpc->pri_path.fl |= MLX4_FL_SV; + else + qpc->pri_path.fl |= MLX4_FL_CV; qpc->pri_path.feup |= MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN; qpc->pri_path.sched_queue &= 0xC7; qpc->pri_path.sched_queue |= (vp_oper->state.default_qos) << 3; @@ -5238,6 +5254,7 @@ void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work) u64 qp_path_mask = ((1ULL << MLX4_UPD_QP_PATH_MASK_VLAN_INDEX) | (1ULL << MLX4_UPD_QP_PATH_MASK_FVL) | (1ULL << MLX4_UPD_QP_PATH_MASK_CV) | + (1ULL << MLX4_UPD_QP_PATH_MASK_SV) | (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_HIDE_CQE_VLAN) | (1ULL << MLX4_UPD_QP_PATH_MASK_FEUP) | (1ULL << MLX4_UPD_QP_PATH_MASK_FVL_RX) | @@ -5266,7 +5283,12 @@ void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work) else if (!work->vlan_id) vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; - else + else if (work->vlan_proto == htons(ETH_P_8021AD)) + vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + else /* vst 802.1Q */ vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; @@ -5311,7 +5333,11 @@ void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work) upd_context->qp_context.pri_path.fvl_rx = qp->fvl_rx | MLX4_FVL_RX_FORCE_ETH_VLAN; upd_context->qp_context.pri_path.fl = - qp->pri_path_fl | MLX4_FL_CV | MLX4_FL_ETH_HIDE_CQE_VLAN; + qp->pri_path_fl | MLX4_FL_ETH_HIDE_CQE_VLAN; + if (work->vlan_proto == htons(ETH_P_8021AD)) + upd_context->qp_context.pri_path.fl |= MLX4_FL_SV; + else + upd_context->qp_context.pri_path.fl |= MLX4_FL_CV; upd_context->qp_context.pri_path.feup = qp->feup | MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN; upd_context->qp_context.pri_path.sched_queue = diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 42da3552f7cb..59b50d3eedb4 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -221,6 +221,7 @@ enum { MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33, MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER = 1ULL << 34, MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT = 1ULL << 35, + MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP = 1ULL << 36, }; enum { @@ -1371,6 +1372,8 @@ int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port, int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable); int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val); int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv); +int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port, + bool *vlan_offload_disabled); int mlx4_find_cached_mac(struct 
mlx4_dev *dev, u8 port, u64 mac, int *idx); int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index deaa2217214d..b4ee8f62ce8d 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -160,6 +160,7 @@ struct mlx4_qp_path { enum { /* fl */ MLX4_FL_CV = 1 << 6, + MLX4_FL_SV = 1 << 5, MLX4_FL_ETH_HIDE_CQE_VLAN = 1 << 2, MLX4_FL_ETH_SRC_CHECK_MC_LB = 1 << 1, MLX4_FL_ETH_SRC_CHECK_UC_LB = 1 << 0, @@ -267,6 +268,7 @@ enum { MLX4_UPD_QP_PATH_MASK_FVL_RX = 16 + 32, MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_UC_LB = 18 + 32, MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB = 19 + 32, + MLX4_UPD_QP_PATH_MASK_SV = 22 + 32, }; enum { /* param3 */ -- cgit v1.2.3 From 79aab093a0b5370d7fc4e99df75996f4744dc03f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:15 +0300 Subject: net: Update API for VF vlan protocol 802.1ad support Introduce new rtnl UAPI that exposes a list of vlans per VF, giving user-space applications the ability to specify it for the VF, as an option to support 802.1ad. We adjusted the IP link tool to support this option. For future use cases, the new UAPI supports multiple vlans. For now we limit the list size to a single vlan in the kernel. Add IFLA_VF_VLAN_LIST in addition to IFLA_VF_VLAN to keep backward compatibility with older versions of the IP link tool. Add a vlan protocol parameter to the ndo_set_vf_vlan callback. We kept 802.1Q as the drivers' default vlan protocol. Suitable ip link tool command examples: Set vf vlan protocol 802.1ad: ip link set eth0 vf 1 vlan 100 proto 802.1ad Set vf to VST (802.1Q) mode: ip link set eth0 vf 1 vlan 100 proto 802.1Q Or by omitting the new parameter: ip link set eth0 vf 1 vlan 100 Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S.
Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 3 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 9 ++- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 6 +- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h | 2 +- drivers/net/ethernet/emulex/benet/be_main.c | 6 +- drivers/net/ethernet/intel/fm10k/fm10k.h | 2 +- drivers/net/ethernet/intel/fm10k/fm10k_iov.c | 6 +- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 11 ++- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 4 +- drivers/net/ethernet/intel/igb/igb_main.c | 9 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 5 +- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 6 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 6 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h | 2 +- .../net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c | 5 +- drivers/net/ethernet/sfc/sriov.c | 5 +- drivers/net/ethernet/sfc/sriov.h | 2 +- include/linux/if_link.h | 1 + include/linux/netdevice.h | 6 +- include/uapi/linux/if_link.h | 19 ++++- net/core/rtnetlink.c | 80 ++++++++++++++++++---- 23 files changed, 161 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index 0e68fadecfdb..243cb9748d35 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -492,7 +492,8 @@ int __bnx2x_setup_tc(struct net_device *dev, u32 handle, __be16 proto, int bnx2x_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi); int bnx2x_set_vf_mac(struct net_device *dev, int queue, u8 *mac); -int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos); +int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto); /* select_queue callback */ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 6c586b045d1d..3f77d0863543 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -2521,7 +2521,8 @@ void bnx2x_pf_set_vfs_vlan(struct bnx2x *bp) for_each_vf(bp, vfidx) { bulletin = BP_VF_BULLETIN(bp, vfidx); if (bulletin->valid_bitmap & (1 << VLAN_VALID)) - bnx2x_set_vf_vlan(bp->dev, vfidx, bulletin->vlan, 0); + bnx2x_set_vf_vlan(bp->dev, vfidx, bulletin->vlan, 0, + htons(ETH_P_8021Q)); } } @@ -2781,7 +2782,8 @@ static int bnx2x_set_vf_vlan_filter(struct bnx2x *bp, struct bnx2x_virtf *vf, return 0; } -int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos) +int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos, + __be16 vlan_proto) { struct pf_vf_bulletin_content *bulletin = NULL; struct bnx2x *bp = netdev_priv(dev); @@ -2796,6 +2798,9 @@ int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos) return -EINVAL; } + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + DP(BNX2X_MSG_IOV, "configuring VF %d with VLAN %d qos %d\n", vfidx, vlan, 0); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index 8be718508600..ec6cd18842c3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -174,7 +174,8 @@ int bnxt_set_vf_mac(struct net_device *dev, 
int vf_id, u8 *mac) return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); } -int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos) +int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos, + __be16 vlan_proto) { struct hwrm_func_cfg_input req = {0}; struct bnxt *bp = netdev_priv(dev); @@ -185,6 +186,9 @@ int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos) if (bp->hwrm_spec_code < 0x10201) return -ENOTSUPP; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + rc = bnxt_vf_ndo_prep(bp, vf_id); if (rc) return rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h index 0392670ab49c..1ab72e4820af 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h @@ -12,7 +12,7 @@ int bnxt_get_vf_config(struct net_device *, int, struct ifla_vf_info *); int bnxt_set_vf_mac(struct net_device *, int, u8 *); -int bnxt_set_vf_vlan(struct net_device *, int, u16, u8); +int bnxt_set_vf_vlan(struct net_device *, int, u16, u8, __be16); int bnxt_set_vf_bw(struct net_device *, int, int, int); int bnxt_set_vf_link_state(struct net_device *, int, int); int bnxt_set_vf_spoofchk(struct net_device *, int, bool); diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 9a94840c5757..ac513e6627d1 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1895,7 +1895,8 @@ static int be_clear_vf_tvt(struct be_adapter *adapter, int vf) return 0; } -static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) +static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct be_adapter *adapter = netdev_priv(netdev); struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; @@ -1907,6 +1908,9 @@ static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) if (vf >= adapter->num_vfs || vlan > 4095 || qos > 7) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + if (vlan || qos) { vlan |= qos << VLAN_PRIO_SHIFT; status = be_set_vf_tvt(adapter, vf, vlan); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h index 67ff01aeb11a..4d19e46f7c55 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h @@ -507,7 +507,7 @@ int fm10k_iov_configure(struct pci_dev *pdev, int num_vfs); s32 fm10k_iov_update_pvid(struct fm10k_intfc *interface, u16 glort, u16 pvid); int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac); int fm10k_ndo_set_vf_vlan(struct net_device *netdev, - int vf_idx, u16 vid, u8 qos); + int vf_idx, u16 vid, u8 qos, __be16 vlan_proto); int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, int rate, int unused); int fm10k_ndo_get_vf_config(struct net_device *netdev, diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c index d9dec81f6b6d..5f4dac0d36ef 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c @@ -445,7 +445,7 @@ int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac) } int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid, - u8 qos) + u8 qos, __be16 vlan_proto) { struct fm10k_intfc *interface = netdev_priv(netdev); struct fm10k_iov_data *iov_data = 
interface->iov_data; @@ -460,6 +460,10 @@ int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid, if (qos || (vid > (VLAN_VID_MASK - 1))) return -EINVAL; + /* VF VLAN Protocol part to default is unsupported */ + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + vf_info = &iov_data->vf_info[vf_idx]; /* exit if there is nothing to do */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index da3423561b3a..724d8740d4cc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2747,11 +2747,12 @@ error_param: * @vf_id: VF identifier * @vlan_id: mac address * @qos: priority setting + * @vlan_proto: vlan protocol * * program VF vlan id and/or qos **/ -int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, - int vf_id, u16 vlan_id, u8 qos) +int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, + u16 vlan_id, u8 qos, __be16 vlan_proto) { u16 vlanprio = vlan_id | (qos << I40E_VLAN_PRIORITY_SHIFT); struct i40e_netdev_priv *np = netdev_priv(netdev); @@ -2774,6 +2775,12 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, goto error_pvid; } + if (vlan_proto != htons(ETH_P_8021Q)) { + dev_err(&pf->pdev->dev, "VF VLAN protocol is not supported\n"); + ret = -EPROTONOSUPPORT; + goto error_pvid; + } + vf = &(pf->vf[vf_id]); vsi = pf->vsi[vf->lan_vsi_idx]; if (!test_bit(I40E_VF_STAT_INIT, &vf->vf_states)) { diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 875174141451..4012d069939a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -129,8 +129,8 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf); /* VF configuration related iplink handlers */ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac); -int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, - int vf_id, u16 vlan_id, u8 qos); +int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, + u16 vlan_id, u8 qos, __be16 vlan_proto); int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate, int max_tx_rate); int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index af75eac5fa16..a83aa13a5bf4 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -169,7 +169,7 @@ static int igb_set_vf_mac(struct igb_adapter *, int, unsigned char *); static void igb_restore_vf_multicasts(struct igb_adapter *adapter); static int igb_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac); static int igb_ndo_set_vf_vlan(struct net_device *netdev, - int vf, u16 vlan, u8 qos); + int vf, u16 vlan, u8 qos, __be16 vlan_proto); static int igb_ndo_set_vf_bw(struct net_device *, int, int, int); static int igb_ndo_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting); @@ -6222,14 +6222,17 @@ static int igb_disable_port_vlan(struct igb_adapter *adapter, int vf) return 0; } -static int igb_ndo_set_vf_vlan(struct net_device *netdev, - int vf, u16 vlan, u8 qos) +static int igb_ndo_set_vf_vlan(struct net_device *netdev, int vf, + u16 vlan, u8 qos, __be16 vlan_proto) { struct igb_adapter *adapter = netdev_priv(netdev); if ((vf >= adapter->vfs_allocated_count) || (vlan > 4095) || (qos > 7)) return -EINVAL; + if (vlan_proto 
!= htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return (vlan || qos) ? igb_enable_port_vlan(adapter, vf, vlan, qos) : igb_disable_port_vlan(adapter, vf); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index 8618599dfd6f..b18590a995db 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -1354,13 +1354,16 @@ static int ixgbe_disable_port_vlan(struct ixgbe_adapter *adapter, int vf) return err; } -int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) +int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, + u8 qos, __be16 vlan_proto) { int err = 0; struct ixgbe_adapter *adapter = netdev_priv(netdev); if ((vf >= adapter->num_vfs) || (vlan > 4095) || (qos > 7)) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; if (vlan || qos) { /* Check if there is already a port VLAN set, if so * we have to delete the old one first before we diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h index 47e65e2f886a..0c7977d27b71 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h @@ -43,7 +43,7 @@ void ixgbe_disable_tx_rx(struct ixgbe_adapter *adapter); void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter); int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int queue, u8 *mac); int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int queue, u16 vlan, - u8 qos); + u8 qos, __be16 vlan_proto); int ixgbe_link_mbps(struct ixgbe_adapter *adapter); int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int min_tx_rate, int max_tx_rate); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index a94f8a3f026c..132eeeafcdc4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2400,11 +2400,15 @@ static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac) return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac_u64); } -static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos) +static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct mlx4_en_priv *en_priv = netdev_priv(dev); struct mlx4_en_dev *mdev = en_priv->mdev; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c12792314be7..b58cfe37dead 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2917,11 +2917,15 @@ static int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac) return mlx5_eswitch_set_vport_mac(mdev->priv.eswitch, vf + 1, mac); } -static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos) +static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return mlx5_eswitch_set_vport_vlan(mdev->priv.eswitch, vf + 1, vlan, qos); } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 
cd23a2946db7..0e198fe89d1a 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -100,7 +100,8 @@ static int qede_alloc_rx_buffer(struct qede_dev *edev, static void qede_link_update(void *dev, struct qed_link_output *link); #ifdef CONFIG_QED_SRIOV -static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos) +static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct qede_dev *edev = netdev_priv(ndev); @@ -109,6 +110,9 @@ static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos) return -EINVAL; } + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + DP_VERBOSE(edev, QED_MSG_IOV, "Setting Vlan 0x%04x to VF [%d]\n", vlan, vf); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h index 24061b9b92e8..5f327659efa7 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h @@ -238,7 +238,7 @@ int qlcnic_sriov_set_vf_mac(struct net_device *, int, u8 *); int qlcnic_sriov_set_vf_tx_rate(struct net_device *, int, int, int); int qlcnic_sriov_get_vf_config(struct net_device *, int , struct ifla_vf_info *); -int qlcnic_sriov_set_vf_vlan(struct net_device *, int, u16, u8); +int qlcnic_sriov_set_vf_vlan(struct net_device *, int, u16, u8, __be16); int qlcnic_sriov_set_vf_spoofchk(struct net_device *, int, bool); #else static inline void qlcnic_sriov_pf_disable(struct qlcnic_adapter *adapter) {} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c index afd687e5e779..50eaafa3eaba 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c @@ -1915,7 +1915,7 @@ int qlcnic_sriov_set_vf_tx_rate(struct net_device *netdev, int vf, } int qlcnic_sriov_set_vf_vlan(struct net_device *netdev, int vf, - u16 vlan, u8 qos) + u16 vlan, u8 qos, __be16 vlan_proto) { struct qlcnic_adapter *adapter = netdev_priv(netdev); struct qlcnic_sriov *sriov = adapter->ahw->sriov; @@ -1928,6 +1928,9 @@ int qlcnic_sriov_set_vf_vlan(struct net_device *netdev, int vf, if (vf >= sriov->num_vfs || qos > 7) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + if (vlan > MAX_VLAN_ID) { netdev_err(netdev, "Invalid VLAN ID, allowed range is [0 - %d]\n", diff --git a/drivers/net/ethernet/sfc/sriov.c b/drivers/net/ethernet/sfc/sriov.c index 816c44689e67..9abcf4aded30 100644 --- a/drivers/net/ethernet/sfc/sriov.c +++ b/drivers/net/ethernet/sfc/sriov.c @@ -22,7 +22,7 @@ int efx_sriov_set_vf_mac(struct net_device *net_dev, int vf_i, u8 *mac) } int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan, - u8 qos) + u8 qos, __be16 vlan_proto) { struct efx_nic *efx = netdev_priv(net_dev); @@ -31,6 +31,9 @@ int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan, (qos & ~(VLAN_PRIO_MASK >> VLAN_PRIO_SHIFT))) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return efx->type->sriov_set_vf_vlan(efx, vf_i, vlan, qos); } else { return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/sfc/sriov.h b/drivers/net/ethernet/sfc/sriov.h index 400df526586d..ba1762e7f216 100644 --- a/drivers/net/ethernet/sfc/sriov.h +++ b/drivers/net/ethernet/sfc/sriov.h @@ -16,7 +16,7 @@ int efx_sriov_set_vf_mac(struct net_device *net_dev, int vf_i, u8 *mac); int 
efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan, - u8 qos); + u8 qos, __be16 vlan_proto); int efx_sriov_set_vf_spoofchk(struct net_device *net_dev, int vf_i, bool spoofchk); int efx_sriov_get_vf_config(struct net_device *net_dev, int vf_i, diff --git a/include/linux/if_link.h b/include/linux/if_link.h index f923d15b432c..0b17c585b5cd 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -25,5 +25,6 @@ struct ifla_vf_info { __u32 max_tx_rate; __u32 rss_query_en; __u32 trusted; + __be16 vlan_proto; }; #endif /* _LINUX_IF_LINK_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69f242c71865..1e8a5c734d72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -946,7 +946,8 @@ struct netdev_xdp { * * SR-IOV management functions. * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac); - * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos); + * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, + * u8 qos, __be16 proto); * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, * int max_tx_rate); * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); @@ -1187,7 +1188,8 @@ struct net_device_ops { int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); int (*ndo_set_vf_vlan)(struct net_device *dev, - int queue, u16 vlan, u8 qos); + int queue, u16 vlan, + u8 qos, __be16 proto); int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7ec9e99d5491..b4fba662cd32 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -619,7 +619,7 @@ enum { enum { IFLA_VF_UNSPEC, IFLA_VF_MAC, /* Hardware queue specific attributes */ - IFLA_VF_VLAN, + IFLA_VF_VLAN, /* VLAN ID and QoS */ IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ @@ -631,6 +631,7 @@ enum { IFLA_VF_TRUST, /* Trust VF */ IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ + IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ __IFLA_VF_MAX, }; @@ -647,6 +648,22 @@ struct ifla_vf_vlan { __u32 qos; }; +enum { + IFLA_VF_VLAN_INFO_UNSPEC, + IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ + __IFLA_VF_VLAN_INFO_MAX, +}; + +#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) +#define MAX_VLAN_LIST_LEN 1 + +struct ifla_vf_vlan_info { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; + __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ +}; + struct ifla_vf_tx_rate { __u32 vf; __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0dbae4244a89..3ac8946bf244 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -843,7 +843,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, size += nla_total_size(num_vfs * sizeof(struct nlattr)); size += num_vfs * (nla_total_size(sizeof(struct ifla_vf_mac)) + - nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct nlattr)) + + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) 
+ @@ -1111,14 +1114,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct nlattr *vfinfo) { struct ifla_vf_rss_query_en vf_rss_query_en; + struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; - struct nlattr *vf, *vfstats; struct ifla_vf_mac vf_mac; struct ifla_vf_info ivi; @@ -1135,11 +1139,14 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; + /* VLAN Protocol by default is 802.1Q */ + ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; vf_mac.vf = vf_vlan.vf = + vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = @@ -1150,6 +1157,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; + vf_vlan_info.vlan = ivi.vlan; + vf_vlan_info.qos = ivi.qos; + vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; @@ -1158,10 +1168,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start(skb, IFLA_VF_INFO); - if (!vf) { - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vf) + goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), @@ -1177,17 +1185,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) - return -EMSGSIZE; + goto nla_put_vf_failure; + vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST); + if (!vfvlanlist) + goto nla_put_vf_failure; + if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), + &vf_vlan_info)) { + nla_nest_cancel(skb, vfvlanlist); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfvlanlist); memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start(skb, IFLA_VF_STATS); - if (!vfstats) { - nla_nest_cancel(skb, vf); - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vfstats) + goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, @@ -1199,11 +1213,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) - return -EMSGSIZE; + vf_stats.multicast, IFLA_VF_STATS_PAD)) { + nla_nest_cancel(skb, vfstats); + goto nla_put_vf_failure; + } nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); return 0; + +nla_put_vf_failure: + nla_nest_cancel(skb, vf); +nla_put_vfinfo_failure: + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) @@ -1448,6 +1470,7 @@ static const struct nla_policy 
ifla_info_policy[IFLA_INFO_MAX+1] = { static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, @@ -1704,7 +1727,34 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, - ivv->qos); + ivv->qos, + htons(ETH_P_8021Q)); if (err < 0) return err; } + + if (tb[IFLA_VF_VLAN_LIST]) { + struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; + struct nlattr *attr; + int rem, len = 0; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_vlan) + return err; + + nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { + if (nla_type(attr) != IFLA_VF_VLAN_INFO || + nla_len(attr) < NLA_HDRLEN) { + return -EINVAL; + } + if (len >= MAX_VLAN_LIST_LEN) + return -EOPNOTSUPP; + ivvl[len] = nla_data(attr); + + len++; + } + err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, + ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } -- cgit v1.2.3 From b42959dc35a533a531dd698b581193a65a5da831 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:16 +0300 Subject: net/mlx4: Add VF vlan protocol 802.1ad support Move the vf to VST 802.1ad mode (mlx4 VST QinQ mode) by setting vf vlan protocol to 802.1ad. VST 802.1ad mode in mlx4 is used for STAG strip/insertion by PF, while the CTAG is set by the VF. Read current vlan protocol as part of the vf configuration state. Upon setting vf vlan protocol to 802.1ad, we use a handshake mechanism to verify that both the vf and the pf driver version support it. The handshake uses the command QUERY_FUNC_CAP: - The vf sets a pre-defined support bit in the input modifier. - A pf that supports the feature sends the request to the vf through a pre-defined field in the output mailbox. - In case the vf does not support the feature, the pf will fail the control command (in this case, the IP link tool command to set the vf vlan protocol to 802.1ad). No change in VST 802.1Q mode. Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 51 ++++++++++++++- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 6 +- drivers/net/ethernet/mellanox/mlx4/fw.c | 89 ++++++++++++++++++++++++-- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 1 + include/linux/mlx4/cmd.h | 3 +- 5 files changed, 138 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 09c969420eac..b1cef7a0f7ca 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1995,6 +1995,8 @@ static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) int port, err; struct mlx4_vport_state *vp_admin; struct mlx4_vport_oper_state *vp_oper; + struct mlx4_slave_state *slave_state = + &priv->mfunc.master.slave_state[slave]; struct mlx4_active_ports actv_ports = mlx4_get_active_ports( &priv->dev, slave); int min_port = find_first_bit(actv_ports.ports, @@ -2009,7 +2011,19 @@ static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) priv->mfunc.master.vf_admin[slave].enable_smi[port]; vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; - vp_oper->state = *vp_admin; + if (vp_admin->vlan_proto != htons(ETH_P_8021AD) || + slave_state->vst_qinq_supported) { + vp_oper->state.vlan_proto = vp_admin->vlan_proto; + vp_oper->state.default_vlan = vp_admin->default_vlan; + vp_oper->state.default_qos = vp_admin->default_qos; + } + vp_oper->state.link_state = vp_admin->link_state; + vp_oper->state.mac = vp_admin->mac; + vp_oper->state.spoofchk = vp_admin->spoofchk; + vp_oper->state.tx_rate = vp_admin->tx_rate; + vp_oper->state.qos_vport = vp_admin->qos_vport; + vp_oper->state.guid = vp_admin->guid; + if (MLX4_VGT != vp_admin->default_vlan) { err = __mlx4_register_vlan(&priv->dev, port, vp_admin->default_vlan, &(vp_oper->vlan_idx)); @@ -2097,6 +2111,7 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, mlx4_warn(dev, "Received reset from slave:%d\n", slave); slave_state[slave].active = false; slave_state[slave].old_vlan_api = false; + slave_state[slave].vst_qinq_supported = false; mlx4_master_deactivate_admin_state(priv, slave); for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i) { slave_state[slave].event_eq[i].eqn = -1; @@ -2364,6 +2379,7 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) vf_oper = &priv->mfunc.master.vf_oper[i]; s_state = &priv->mfunc.master.slave_state[i]; s_state->last_cmd = MLX4_COMM_CMD_RESET; + s_state->vst_qinq_supported = false; mutex_init(&priv->mfunc.master.gen_eqe_mutex[i]); for (j = 0; j < MLX4_EVENT_TYPES_NUM; ++j) s_state->event_eq[j].eqn = -1; @@ -2955,10 +2971,13 @@ int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac) EXPORT_SYMBOL_GPL(mlx4_set_vf_mac); -int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos, + __be16 proto) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_vport_state *vf_admin; + struct mlx4_slave_state *slave_state; + struct mlx4_vport_oper_state *vf_oper; int slave; if ((!mlx4_is_master(dev)) || @@ -2968,12 +2987,31 @@ int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) if ((vlan > 4095) || (qos > 7)) return -EINVAL; + if (proto == htons(ETH_P_8021AD) && + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP)) + return -EPROTONOSUPPORT; + + if (proto != htons(ETH_P_8021Q) && + proto != 
htons(ETH_P_8021AD)) + return -EINVAL; + + if ((proto == htons(ETH_P_8021AD)) && + ((vlan == 0) || (vlan == MLX4_VGT))) + return -EINVAL; + slave = mlx4_get_slave_indx(dev, vf); if (slave < 0) return -EINVAL; + slave_state = &priv->mfunc.master.slave_state[slave]; + if ((proto == htons(ETH_P_8021AD)) && (slave_state->active) && + (!slave_state->vst_qinq_supported)) { + mlx4_err(dev, "vf %d does not support VST QinQ mode\n", vf); + return -EPROTONOSUPPORT; + } port = mlx4_slaves_closest_port(dev, slave, port); vf_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + vf_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; if (!mlx4_valid_vf_state_change(dev, port, vf_admin, vlan, qos)) return -EPERM; @@ -2983,6 +3021,7 @@ int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) else vf_admin->default_vlan = vlan; vf_admin->default_qos = qos; + vf_admin->vlan_proto = proto; /* If rate was configured prior to VST, we saved the configured rate * in vf_admin->rate and now, if priority supported we enforce the QoS @@ -2991,7 +3030,12 @@ int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) vf_admin->tx_rate) vf_admin->qos_vport = slave; - if (mlx4_master_immediate_activate_vlan_qos(priv, slave, port)) + /* Try to activate new vf state without restart, + * this option is not supported while moving to VST QinQ mode. + */ + if ((proto == htons(ETH_P_8021AD) && + vf_oper->state.vlan_proto != proto) || + mlx4_master_immediate_activate_vlan_qos(priv, slave, port)) mlx4_info(dev, "updating vf %d port %d config will take effect on next VF restart\n", vf, port); @@ -3135,6 +3179,7 @@ int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_in ivf->vlan = s_info->default_vlan; ivf->qos = s_info->default_qos; + ivf->vlan_proto = s_info->vlan_proto; if (mlx4_is_vf_vst_and_prio_qos(dev, port, s_info)) ivf->max_tx_rate = s_info->tx_rate; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 132eeeafcdc4..7e703bed7b82 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2406,10 +2406,8 @@ static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, struct mlx4_en_priv *en_priv = netdev_priv(dev); struct mlx4_en_dev *mdev = en_priv->mdev; - if (vlan_proto != htons(ETH_P_8021Q)) - return -EPROTONOSUPPORT; - - return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos); + return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos, + vlan_proto); } static int mlx4_en_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 7dc9d38a51f8..090bf81076e8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -249,6 +249,72 @@ out: return err; } +static int mlx4_activate_vst_qinq(struct mlx4_priv *priv, int slave, int port) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_vport_state *vp_admin; + int err; + + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + + if (vp_admin->default_vlan != vp_oper->state.default_vlan) { + err = __mlx4_register_vlan(&priv->dev, port, + vp_admin->default_vlan, + &vp_oper->vlan_idx); + if (err) { + vp_oper->vlan_idx = NO_INDX; + mlx4_warn(&priv->dev, + "No vlan resources slave %d, port %d\n", + slave, port); + return err; + } + 
mlx4_dbg(&priv->dev, "alloc vlan %d idx %d slave %d port %d\n", + (int)(vp_oper->state.default_vlan), + vp_oper->vlan_idx, slave, port); + } + vp_oper->state.vlan_proto = vp_admin->vlan_proto; + vp_oper->state.default_vlan = vp_admin->default_vlan; + vp_oper->state.default_qos = vp_admin->default_qos; + + return 0; +} + +static int mlx4_handle_vst_qinq(struct mlx4_priv *priv, int slave, int port) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_slave_state *slave_state; + struct mlx4_vport_state *vp_admin; + int err; + + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + slave_state = &priv->mfunc.master.slave_state[slave]; + + if ((vp_admin->vlan_proto != htons(ETH_P_8021AD)) || + (!slave_state->active)) + return 0; + + if (vp_oper->state.vlan_proto == vp_admin->vlan_proto && + vp_oper->state.default_vlan == vp_admin->default_vlan && + vp_oper->state.default_qos == vp_admin->default_qos) + return 0; + + if (!slave_state->vst_qinq_supported) { + /* Warn and revert the request to set vst QinQ mode */ + vp_admin->vlan_proto = vp_oper->state.vlan_proto; + vp_admin->default_vlan = vp_oper->state.default_vlan; + vp_admin->default_qos = vp_oper->state.default_qos; + + mlx4_warn(&priv->dev, + "Slave %d does not support VST QinQ mode\n", slave); + return 0; + } + + err = mlx4_activate_vst_qinq(priv, slave, port); + return err; +} + int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -312,17 +378,18 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, #define QUERY_FUNC_CAP_VF_ENABLE_QP0 0x08 #define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80 -#define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS (1 << 31) #define QUERY_FUNC_CAP_PHV_BIT 0x40 #define QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE 0x20 +#define QUERY_FUNC_CAP_SUPPORTS_VST_QINQ BIT(30) +#define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS BIT(31) + if (vhcr->op_modifier == 1) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); int converted_port = mlx4_slave_convert_port( dev, slave, vhcr->in_modifier); - struct mlx4_vport_oper_state *vp_oper = - &priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier]; + struct mlx4_vport_oper_state *vp_oper; if (converted_port < 0) return -EINVAL; @@ -361,6 +428,11 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier], QUERY_FUNC_CAP_PHYS_PORT_ID); + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + err = mlx4_handle_vst_qinq(priv, slave, port); + if (err) + return err; + field = 0; if (dev->caps.phv_bit[port]) field |= QUERY_FUNC_CAP_PHV_BIT; @@ -371,6 +443,9 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, } else if (vhcr->op_modifier == 0) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + struct mlx4_slave_state *slave_state = + &priv->mfunc.master.slave_state[slave]; + /* enable rdma and ethernet interfaces, new quota locations, * and reserved lkey */ @@ -444,6 +519,10 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, size = dev->caps.reserved_lkey + ((slave << 8) & 0xFF00); MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_RESD_LKEY_OFFSET); + + if (vhcr->in_modifier & QUERY_FUNC_CAP_SUPPORTS_VST_QINQ) + slave_state->vst_qinq_supported = true; + } else err = -EINVAL; @@ -459,10 +538,12 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port, u32 size, qkey; 
int err = 0, quotas = 0; u32 in_modifier; + u32 slave_caps; op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */ - in_modifier = op_modifier ? gen_or_port : + slave_caps = QUERY_FUNC_CAP_SUPPORTS_VST_QINQ | QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS; + in_modifier = op_modifier ? gen_or_port : slave_caps; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index fdfe1ace07d7..e4878f31e45d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -483,6 +483,7 @@ struct mlx4_slave_state { u8 init_port_mask; bool active; bool old_vlan_api; + bool vst_qinq_supported; u8 function; dma_addr_t vhcr_dma; u16 mtu[MLX4_MAX_PORTS + 1]; diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 116b284bc4ce..1f3568694a57 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -309,7 +309,8 @@ int mlx4_get_vf_stats(struct mlx4_dev *dev, int port, int vf_idx, struct ifla_vf_stats *vf_stats); u32 mlx4_comm_get_version(void); int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac); -int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos); +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, + u8 qos, __be16 proto); int mlx4_set_vf_rate(struct mlx4_dev *dev, int port, int vf, int min_tx_rate, int max_tx_rate); int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting); -- cgit v1.2.3 From 57494343cb5d66962bb197878fb1cc576177db31 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:27 +0100 Subject: rxrpc: Implement slow-start Implement RxRPC slow-start, which is similar to RFC 5681 for TCP. A tracepoint is added to log the state of the congestion management algorithm and the decisions it makes. Notes: (1) Since we send fixed-size DATA packets (apart from the final packet in each phase), counters and calculations are in terms of packets rather than bytes. (2) The ACK packet carries the equivalent of TCP SACK. (3) The FLIGHT_SIZE calculation in RFC 5681 doesn't seem particularly suited to SACK of a small number of packets. It seems that, almost inevitably, by the time three 'duplicate' ACKs have been seen, we have narrowed the loss down to one or two missing packets, and the FLIGHT_SIZE calculation ends up as 2. (4) In rxrpc_resend(), if there was no data that apparently needed retransmission, we transmit a PING ACK to ask the peer to tell us what its Rx window state is. 
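As an illustration of point (1): because the sender's segment size (RXRPC_TX_SMSS) is fixed at the jumbo DATA payload size, the RFC 5681 section 3.1 initial-window rule reduces to a compile-time constant. A minimal sketch of that rule in isolation; the helper name is hypothetical, the patch open-codes the same test in rxrpc_alloc_call():

    /* RFC 5681 section 3.1 initial window, counted in segments rather
     * than bytes since rxrpc DATA packets are fixed-size.  Illustrative
     * helper only; rxrpc_alloc_call() below open-codes this test.
     */
    static unsigned int rxrpc_initial_cwnd(unsigned int smss)
    {
            if (smss > 2190)
                    return 2;       /* IW = 2 segments */
            else if (smss > 1095)
                    return 3;       /* IW = 3 segments */
            else
                    return 4;       /* IW = 4 segments */
    }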
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 45 ++++++++++++ net/rxrpc/ar-internal.h | 53 +++++++++++++- net/rxrpc/call_event.c | 36 ++++++++- net/rxrpc/call_object.c | 13 ++++ net/rxrpc/conn_event.c | 1 + net/rxrpc/input.c | 169 +++++++++++++++++++++++++++++++++++++++++-- net/rxrpc/misc.c | 19 +++++ net/rxrpc/output.c | 9 ++- net/rxrpc/sendmsg.c | 7 +- 9 files changed, 339 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 56475497043d..ada12d00118c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -570,6 +570,51 @@ TRACE_EVENT(rxrpc_retransmit, __entry->expiry) ); +TRACE_EVENT(rxrpc_congest, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, + rxrpc_serial_t ack_serial, enum rxrpc_congest_change change), + + TP_ARGS(call, summary, ack_serial, change), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_congest_change, change ) + __field(rxrpc_seq_t, hard_ack ) + __field(rxrpc_seq_t, top ) + __field(rxrpc_seq_t, lowest_nak ) + __field(rxrpc_serial_t, ack_serial ) + __field_struct(struct rxrpc_ack_summary, sum ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->change = change; + __entry->hard_ack = call->tx_hard_ack; + __entry->top = call->tx_top; + __entry->lowest_nak = call->acks_lowest_nak; + __entry->ack_serial = ack_serial; + memcpy(&__entry->sum, summary, sizeof(__entry->sum)); + ), + + TP_printk("c=%p %08x %s %08x %s cw=%u ss=%u nr=%u,%u nw=%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s", + __entry->call, + __entry->ack_serial, + rxrpc_ack_names[__entry->sum.ack_reason], + __entry->hard_ack, + rxrpc_congest_modes[__entry->sum.mode], + __entry->sum.cwnd, + __entry->sum.ssthresh, + __entry->sum.nr_acks, __entry->sum.nr_nacks, + __entry->sum.nr_new_acks, __entry->sum.nr_new_nacks, + __entry->sum.nr_rot_new_acks, + __entry->top - __entry->hard_ack, + __entry->sum.cumulative_acks, + __entry->sum.dup_acks, + __entry->lowest_nak, __entry->sum.new_low_nack ? "!" : "", + rxrpc_congest_changes[__entry->change], + __entry->sum.retrans_timeo ? " rTxTo" : "") + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b1e697fc9ffb..ca96e547cb9a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -402,6 +402,7 @@ enum rxrpc_call_flag { RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_PINGING, /* Ping in process */ + RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ }; /* @@ -446,6 +447,17 @@ enum rxrpc_call_completion { NR__RXRPC_CALL_COMPLETIONS }; +/* + * Call Tx congestion management modes. + */ +enum rxrpc_congest_mode { + RXRPC_CALL_SLOW_START, + RXRPC_CALL_CONGEST_AVOIDANCE, + RXRPC_CALL_PACKET_LOSS, + RXRPC_CALL_FAST_RETRANSMIT, + NR__RXRPC_CONGEST_MODES +}; + /* * RxRPC call definition * - matched by { connection, call_id } @@ -518,6 +530,20 @@ struct rxrpc_call { * not hard-ACK'd packet follows this. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + + /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS + * is fixed, we keep these numbers in terms of segments (ie. DATA + * packets) rather than bytes. 
+ */ +#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN + u8 cong_cwnd; /* Congestion window size */ + u8 cong_extra; /* Extra to send for congestion management */ + u8 cong_ssthresh; /* Slow-start threshold */ + enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */ + u8 cong_dup_acks; /* Count of ACKs showing missing packets */ + u8 cong_cumul_acks; /* Cumulative ACK count */ + ktime_t cong_tstamp; /* Last time cwnd was changed */ + rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not * consumed packet follows this. */ @@ -539,12 +565,13 @@ struct rxrpc_call { ktime_t ackr_ping_time; /* Time last ping sent */ /* transmission-phase ACK management */ + ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ }; /* - * Summary of a new ACK and the changes it made. + * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { u8 ack_reason; @@ -554,6 +581,14 @@ struct rxrpc_ack_summary { u8 nr_new_nacks; /* Number of new NACKs in packet */ u8 nr_rot_new_acks; /* Number of rotated new ACKs */ bool new_low_nack; /* T if new low NACK found */ + bool retrans_timeo; /* T if reTx due to timeout happened */ + u8 flight_size; /* Number of unreceived transmissions */ + /* Place to stash values for tracing */ + enum rxrpc_congest_mode mode:8; + u8 cwnd; + u8 ssthresh; + u8 dup_acks; + u8 cumulative_acks; }; enum rxrpc_skb_trace { @@ -709,6 +744,7 @@ extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; enum rxrpc_propose_ack_trace { rxrpc_propose_ack_client_tx_end, rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_lost_ack, rxrpc_propose_ack_ping_for_lost_reply, rxrpc_propose_ack_ping_for_params, rxrpc_propose_ack_respond_to_ack, @@ -729,6 +765,21 @@ enum rxrpc_propose_ack_outcome { extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8]; extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes]; +enum rxrpc_congest_change { + rxrpc_cong_begin_retransmission, + rxrpc_cong_cleared_nacks, + rxrpc_cong_new_low_nack, + rxrpc_cong_no_change, + rxrpc_cong_progress, + rxrpc_cong_retransmit_again, + rxrpc_cong_rtt_window_end, + rxrpc_cong_saw_nack, + rxrpc_congest__nr_change +}; + +extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10]; +extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9]; + extern const char *const rxrpc_pkts[]; extern const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 05b94d1acf52..0e8478012212 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -146,6 +146,14 @@ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, spin_unlock_bh(&call->lock); } +/* + * Handle congestion being detected by the retransmit timeout. + */ +static void rxrpc_congestion_timeout(struct rxrpc_call *call) +{ + set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags); +} + /* * Perform retransmission of NAK'd and unack'd packets. 
*/ @@ -154,9 +162,9 @@ static void rxrpc_resend(struct rxrpc_call *call) struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t cursor, seq, top; - ktime_t now = ktime_get_real(), max_age, oldest, resend_at; + ktime_t now = ktime_get_real(), max_age, oldest, resend_at, ack_ts; int ix; - u8 annotation, anno_type; + u8 annotation, anno_type, retrans = 0, unacked = 0; _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); @@ -193,10 +201,13 @@ static void rxrpc_resend(struct rxrpc_call *call) oldest = skb->tstamp; continue; } + if (!(annotation & RXRPC_TX_ANNO_RESENT)) + unacked++; } /* Okay, we need to retransmit a packet. */ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; + retrans++; trace_rxrpc_retransmit(call, seq, annotation | anno_type, ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } @@ -210,6 +221,25 @@ static void rxrpc_resend(struct rxrpc_call *call) * reached the nsec timeout yet. */ + if (unacked) + rxrpc_congestion_timeout(call); + + /* If there was nothing that needed retransmission then it's likely + * that an ACK got lost somewhere. Send a ping to find out instead of + * retransmitting data. + */ + if (!retrans) { + rxrpc_set_timer(call, rxrpc_timer_set_for_resend); + spin_unlock_bh(&call->lock); + ack_ts = ktime_sub(now, call->acks_latest_ts); + if (ktime_to_ns(ack_ts) < call->peer->rtt) + goto out; + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ack_ping_for_lost_ack); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + goto out; + } + /* Now go through the Tx window and perform the retransmissions. We * have to drop the lock for each send. If an ACK comes in whilst the * lock is dropped, it may clear some of the retransmission markers for @@ -260,6 +290,7 @@ static void rxrpc_resend(struct rxrpc_call *call) out_unlock: spin_unlock_bh(&call->lock); +out: _leave(""); } @@ -293,6 +324,7 @@ recheck_state: if (time_after_eq(now, call->expire_at)) { rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); set_bit(RXRPC_CALL_EV_ABORT, &call->events); + goto recheck_state; } if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index a53f4c2c0025..d4b3293b78fa 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -160,6 +160,14 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; call->rx_expect_next = 1; + + if (RXRPC_TX_SMSS > 2190) + call->cong_cwnd = 2; + else if (RXRPC_TX_SMSS > 1095) + call->cong_cwnd = 3; + else + call->cong_cwnd = 4; + call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1; return call; nomem_2: @@ -176,6 +184,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; + ktime_t now; _enter(""); @@ -185,6 +194,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; call->service_id = srx->srx_service; call->tx_phase = true; + now = ktime_get_real(); + call->acks_latest_ts = now; + call->cong_tstamp = now; _leave(" = %p", call); return call; @@ -325,6 +337,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->state = RXRPC_CALL_SERVER_ACCEPTING; if (sp->hdr.securityIndex > 0) call->state = RXRPC_CALL_SERVER_SECURING; + call->cong_tstamp = skb->tstamp; /* Set the channel for this call. 
We don't get channel_lock as we're * only defending against the data_ready handler (which we're called diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a1cf1ec5f29e..37609ce89f52 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -97,6 +97,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.info.maxMTU = htonl(mtu); pkt.info.rwind = htonl(rxrpc_rx_window_size); pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); + pkt.whdr.flags |= RXRPC_SLOW_START_OK; len += sizeof(pkt.ack) + sizeof(pkt.info); break; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 0344f4494eb7..094720dd1eaf 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -36,6 +36,166 @@ static void rxrpc_proto_abort(const char *why, } } +/* + * Do TCP-style congestion management [RFC 5681]. + */ +static void rxrpc_congestion_management(struct rxrpc_call *call, + struct sk_buff *skb, + struct rxrpc_ack_summary *summary) +{ + enum rxrpc_congest_change change = rxrpc_cong_no_change; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int cumulative_acks = call->cong_cumul_acks; + unsigned int cwnd = call->cong_cwnd; + bool resend = false; + + summary->flight_size = + (call->tx_top - call->tx_hard_ack) - summary->nr_acks; + + if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { + summary->retrans_timeo = true; + call->cong_ssthresh = max_t(unsigned int, + summary->flight_size / 2, 2); + cwnd = 1; + if (cwnd > call->cong_ssthresh && + call->cong_mode == RXRPC_CALL_SLOW_START) { + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_tstamp = skb->tstamp; + cumulative_acks = 0; + } + } + + cumulative_acks += summary->nr_new_acks; + cumulative_acks += summary->nr_rot_new_acks; + if (cumulative_acks > 255) + cumulative_acks = 255; + + summary->mode = call->cong_mode; + summary->cwnd = call->cong_cwnd; + summary->ssthresh = call->cong_ssthresh; + summary->cumulative_acks = cumulative_acks; + summary->dup_acks = call->cong_dup_acks; + + switch (call->cong_mode) { + case RXRPC_CALL_SLOW_START: + if (summary->nr_nacks > 0) + goto packet_loss_detected; + if (summary->cumulative_acks > 0) + cwnd += 1; + if (cwnd > call->cong_ssthresh) { + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_tstamp = skb->tstamp; + } + goto out; + + case RXRPC_CALL_CONGEST_AVOIDANCE: + if (summary->nr_nacks > 0) + goto packet_loss_detected; + + /* We analyse the number of packets that get ACK'd per RTT + * period and increase the window if we managed to fill it. 
+ */ + if (call->peer->rtt_usage == 0) + goto out; + if (ktime_before(skb->tstamp, + ktime_add_ns(call->cong_tstamp, + call->peer->rtt))) + goto out_no_clear_ca; + change = rxrpc_cong_rtt_window_end; + call->cong_tstamp = skb->tstamp; + if (cumulative_acks >= cwnd) + cwnd++; + goto out; + + case RXRPC_CALL_PACKET_LOSS: + if (summary->nr_nacks == 0) + goto resume_normality; + + if (summary->new_low_nack) { + change = rxrpc_cong_new_low_nack; + call->cong_dup_acks = 1; + if (call->cong_extra > 1) + call->cong_extra = 1; + goto send_extra_data; + } + + call->cong_dup_acks++; + if (call->cong_dup_acks < 3) + goto send_extra_data; + + change = rxrpc_cong_begin_retransmission; + call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT; + call->cong_ssthresh = max_t(unsigned int, + summary->flight_size / 2, 2); + cwnd = call->cong_ssthresh + 3; + call->cong_extra = 0; + call->cong_dup_acks = 0; + resend = true; + goto out; + + case RXRPC_CALL_FAST_RETRANSMIT: + if (!summary->new_low_nack) { + if (summary->nr_new_acks == 0) + cwnd += 1; + call->cong_dup_acks++; + if (call->cong_dup_acks == 2) { + change = rxrpc_cong_retransmit_again; + call->cong_dup_acks = 0; + resend = true; + } + } else { + change = rxrpc_cong_progress; + cwnd = call->cong_ssthresh; + if (summary->nr_nacks == 0) + goto resume_normality; + } + goto out; + + default: + BUG(); + goto out; + } + +resume_normality: + change = rxrpc_cong_cleared_nacks; + call->cong_dup_acks = 0; + call->cong_extra = 0; + call->cong_tstamp = skb->tstamp; + if (cwnd <= call->cong_ssthresh) + call->cong_mode = RXRPC_CALL_SLOW_START; + else + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; +out: + cumulative_acks = 0; +out_no_clear_ca: + if (cwnd >= RXRPC_RXTX_BUFF_SIZE - 1) + cwnd = RXRPC_RXTX_BUFF_SIZE - 1; + call->cong_cwnd = cwnd; + call->cong_cumul_acks = cumulative_acks; + trace_rxrpc_congest(call, summary, sp->hdr.serial, change); + if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); + return; + +packet_loss_detected: + change = rxrpc_cong_saw_nack; + call->cong_mode = RXRPC_CALL_PACKET_LOSS; + call->cong_dup_acks = 0; + goto send_extra_data; + +send_extra_data: + /* Send some previously unsent DATA if we have some to advance the ACK + * state. + */ + if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & + RXRPC_TX_ANNO_LAST || + summary->nr_acks != call->tx_top - call->tx_hard_ack) { + call->cong_extra++; + wake_up(&call->waitq); + } + goto out_no_clear_ca; +} + /* * Ping the other end to fill our RTT cache and to retrieve the rwind * and MTU parameters. 
@@ -524,7 +684,6 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, rxrpc_seq_t seq, int nr_acks, struct rxrpc_ack_summary *summary) { - bool resend = false; int ix; u8 annotation, anno_type; @@ -556,16 +715,11 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, continue; call->rxtx_annotations[ix] = RXRPC_TX_ANNO_NAK | annotation; - resend = true; break; default: return rxrpc_proto_abort("SFT", call, 0); } } - - if (resend && - !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) - rxrpc_queue_call(call); } /* @@ -663,6 +817,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, sp->hdr.serial, call->acks_latest); return; } + call->acks_latest_ts = skb->tstamp; call->acks_latest = sp->hdr.serial; if (before(hard_ack, call->tx_hard_ack) || @@ -692,6 +847,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, false, true, rxrpc_propose_ack_ping_for_lost_reply); + + return rxrpc_congestion_management(call, skb, &summary); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index a608769343e6..aedb8978226d 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -200,6 +200,7 @@ const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { [rxrpc_propose_ack_client_tx_end] = "ClTxEnd", [rxrpc_propose_ack_input_data] = "DataIn ", + [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck", [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl", [rxrpc_propose_ack_ping_for_params] = "Params ", [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", @@ -214,3 +215,21 @@ const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = { [rxrpc_propose_ack_update] = " Update", [rxrpc_propose_ack_subsume] = " Subsume", }; + +const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = { + [RXRPC_CALL_SLOW_START] = "SlowStart", + [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid", + [RXRPC_CALL_PACKET_LOSS] = "PktLoss ", + [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ", +}; + +const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = { + [rxrpc_cong_begin_retransmission] = " Retrans", + [rxrpc_cong_cleared_nacks] = " Cleared", + [rxrpc_cong_new_low_nack] = " NewLowN", + [rxrpc_cong_no_change] = "", + [rxrpc_cong_progress] = " Progres", + [rxrpc_cong_retransmit_again] = " ReTxAgn", + [rxrpc_cong_rtt_window_end] = " RttWinE", + [rxrpc_cong_saw_nack] = " SawNack", +}; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 3eb01445e814..cf43a715685e 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -157,6 +157,8 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) spin_unlock_bh(&call->lock); + pkt->whdr.flags |= RXRPC_SLOW_START_OK; + iov[0].iov_len += sizeof(pkt->ack) + n; iov[1].iov_base = &pkt->ackinfo; iov[1].iov_len = sizeof(pkt->ackinfo); @@ -276,8 +278,11 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) msg.msg_controllen = 0; msg.msg_flags = 0; - /* If our RTT cache needs working on, request an ACK. */ - if ((call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || + /* If our RTT cache needs working on, request an ACK. Also request + * ACKs if a DATA packet appears to have been lost. 
+ */ + if (call->cong_mode == RXRPC_CALL_FAST_RETRANSMIT || + (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) whdr.flags |= RXRPC_REQUEST_ACK; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 99939372b5a4..1f8040d82395 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -45,7 +45,9 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, for (;;) { set_current_state(TASK_INTERRUPTIBLE); ret = 0; - if (call->tx_top - call->tx_hard_ack < call->tx_winsize) + if (call->tx_top - call->tx_hard_ack < + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra)) break; if (call->state >= RXRPC_CALL_COMPLETE) { ret = -call->error; @@ -203,7 +205,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); if (call->tx_top - call->tx_hard_ack >= - call->tx_winsize) { + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra)) { ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; -- cgit v1.2.3
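A footnote on the sendmsg changes in the slow-start patch above: the window test added to rxrpc_wait_for_tx_window() and rxrpc_send_data() can be read as a single computation, sketched below with a hypothetical helper name (it assumes the struct rxrpc_call fields introduced by the patch):

    /* Usable Tx window: the lesser of the receiver-advertised window
     * and the congestion window plus any "extra" probe packets.  The
     * patch open-codes this in rxrpc_wait_for_tx_window() and
     * rxrpc_send_data(); this helper is illustrative only.
     */
    static unsigned int rxrpc_tx_window(const struct rxrpc_call *call)
    {
            return min_t(unsigned int, call->tx_winsize,
                         call->cong_cwnd + call->cong_extra);
    }

A DATA packet may be queued while tx_top - tx_hard_ack stays below this value; cong_extra lets the sender transmit a little past a stalled window to provoke ACKs and advance the ACK state.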