From cae9910e73446cac68a54e3a7b02aaa12b689026 Mon Sep 17 00:00:00 2001
From: Felipe Gasper <felipe@felipegasper.com>
Date: Mon, 20 May 2019 19:43:51 -0500
Subject: net: Add UNIX_DIAG_UID to Netlink UNIX socket diagnostics.

This adds the ability for Netlink to report a socket's UID along with the
other UNIX diagnostic information that is already available. This will
allow diagnostic tools greater insight into which users control which
socket.

To test this, do the following as a non-root user:

    unshare -U -r bash
    nc -l -U user.socket.$$ &

.. and verify from within that same session that Netlink UNIX socket
diagnostics report the socket's UID as 0. Also verify that Netlink UNIX
socket diagnostics report the socket's UID as the user's UID from an
unprivileged process in a different session. Verify the same from
a root process.

Signed-off-by: Felipe Gasper <felipe@felipegasper.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/unix_diag.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/unix_diag.h b/include/uapi/linux/unix_diag.h
index 5c502fdf7a42..a1988576fa8a 100644
--- a/include/uapi/linux/unix_diag.h
+++ b/include/uapi/linux/unix_diag.h
@@ -20,6 +20,7 @@ struct unix_diag_req {
 #define UDIAG_SHOW_ICONS	0x00000008	/* show pending connections */
 #define UDIAG_SHOW_RQLEN	0x00000010	/* show skb receive queue len */
 #define UDIAG_SHOW_MEMINFO	0x00000020	/* show memory info of a socket */
+#define UDIAG_SHOW_UID		0x00000040	/* show socket's UID */
 
 struct unix_diag_msg {
 	__u8	udiag_family;
@@ -40,6 +41,7 @@ enum {
 	UNIX_DIAG_RQLEN,
 	UNIX_DIAG_MEMINFO,
 	UNIX_DIAG_SHUTDOWN,
+	UNIX_DIAG_UID,
 
 	__UNIX_DIAG_MAX,
 };
-- 
cgit v1.2.3


From 980066e6d9642fa5854bed8e592b1a30ea885b76 Mon Sep 17 00:00:00 2001
From: Trent Piepho <tpiepho@impinj.com>
Date: Wed, 22 May 2019 18:43:21 +0000
Subject: dt-bindings: phy: dp83867: Add documentation for disabling clock
 output

The clock output is generally only used for testing and development and
not used to daisy-chain PHYs.  It's just a source of RF noise afterward.

Add a mux value for "off".  I've added it as another enumeration to the
output property.  In the actual PHY, the mux and the output enable are
independently controllable.  However, it doesn't seem useful to be able
to describe the mux setting when the output is disabled.

Document that the PHY's default setting will be left as is if the
property is omitted.

Cc: Rob Herring <robh+dt@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Trent Piepho <tpiepho@impinj.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/ti,dp83867.txt | 6 ++++--
 include/dt-bindings/net/ti-dp83867.h                 | 2 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/net/ti,dp83867.txt b/Documentation/devicetree/bindings/net/ti,dp83867.txt
index 99b8681bde49..db6aa3f2215b 100644
--- a/Documentation/devicetree/bindings/net/ti,dp83867.txt
+++ b/Documentation/devicetree/bindings/net/ti,dp83867.txt
@@ -33,8 +33,10 @@ Optional property:
 				    software needs to take when this pin is
 				    strapped in these modes. See data manual
 				    for details.
-	- ti,clk-output-sel - Muxing option for CLK_OUT pin - see dt-bindings/net/ti-dp83867.h
-				    for applicable values.
+	- ti,clk-output-sel - Muxing option for CLK_OUT pin.  See dt-bindings/net/ti-dp83867.h
+			      for applicable values.  The CLK_OUT pin can also
+			      be disabled by this property.  When omitted, the
+			      PHY's default will be left as is.
 
 Note: ti,min-output-impedance and ti,max-output-impedance are mutually
       exclusive. When both properties are present ti,max-output-impedance
diff --git a/include/dt-bindings/net/ti-dp83867.h b/include/dt-bindings/net/ti-dp83867.h
index 7b1656427cbe..192b79439eb7 100644
--- a/include/dt-bindings/net/ti-dp83867.h
+++ b/include/dt-bindings/net/ti-dp83867.h
@@ -56,4 +56,6 @@
 #define DP83867_CLK_O_SEL_CHN_C_TCLK		0xA
 #define DP83867_CLK_O_SEL_CHN_D_TCLK		0xB
 #define DP83867_CLK_O_SEL_REF_CLK		0xC
+/* Special flag to indicate clock should be off */
+#define DP83867_CLK_O_SEL_OFF			0xFFFFFFFF
 #endif
-- 
cgit v1.2.3


From b2557764d0ebf387da7a11967fd955f3b226b172 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 22 May 2019 20:47:03 +0200
Subject: net: phy: Add support for 100BaseT1 and 1000BaseT1

Add link modes for 100Mbps and 1Gbps over a single pair.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy-core.c   | 4 +++-
 include/uapi/linux/ethtool.h | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c
index 3daf0214a242..16667fbac8bf 100644
--- a/drivers/net/phy/phy-core.c
+++ b/drivers/net/phy/phy-core.c
@@ -8,7 +8,7 @@
 
 const char *phy_speed_to_str(int speed)
 {
-	BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 67,
+	BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 69,
 		"Enum ethtool_link_mode_bit_indices and phylib are out of sync. "
 		"If a speed or mode has been added please update phy_speed_to_str "
 		"and the PHY settings array.\n");
@@ -131,9 +131,11 @@ static const struct phy_setting settings[] = {
 	PHY_SETTING(   1000, FULL,   1000baseKX_Full		),
 	PHY_SETTING(   1000, FULL,   1000baseT_Full		),
 	PHY_SETTING(   1000, HALF,   1000baseT_Half		),
+	PHY_SETTING(   1000, FULL,   1000baseT1_Full		),
 	PHY_SETTING(   1000, FULL,   1000baseX_Full		),
 	/* 100M */
 	PHY_SETTING(    100, FULL,    100baseT_Full		),
+	PHY_SETTING(    100, FULL,    100baseT1_Full		),
 	PHY_SETTING(    100, HALF,    100baseT_Half		),
 	/* 10M */
 	PHY_SETTING(     10, FULL,     10baseT_Full		),
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 3534ce157ae9..dd06302aa93e 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1483,6 +1483,8 @@ enum ethtool_link_mode_bit_indices {
 	ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT = 64,
 	ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT	 = 65,
 	ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT	 = 66,
+	ETHTOOL_LINK_MODE_100baseT1_Full_BIT		 = 67,
+	ETHTOOL_LINK_MODE_1000baseT1_Full_BIT		 = 68,
 
 	/* must be last entry */
 	__ETHTOOL_LINK_MODE_MASK_NBITS
-- 
cgit v1.2.3


From 68a9b13d9219a52cd272bd8e93f7fdfd1c22eba1 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 12:04:39 -0700
Subject: ipv6: Add delete route hook to stubs

Add ip6_del_rt to the IPv6 stub. The hook is needed by the nexthop
code to remove entries linked to a nexthop that is getting deleted.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6_stubs.h | 1 +
 net/ipv6/addrconf_core.c | 6 ++++++
 net/ipv6/af_inet6.c      | 1 +
 3 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index 6c0c4fde16f8..307114a46eee 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -45,6 +45,7 @@ struct ipv6_stub {
 			    struct fib6_config *cfg, gfp_t gfp_flags,
 			    struct netlink_ext_ack *extack);
 	void (*fib6_nh_release)(struct fib6_nh *fib6_nh);
+	int (*ip6_del_rt)(struct net *net, struct fib6_info *rt);
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
 			      const struct in6_addr *solicited_addr,
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 5b1246635e02..783f3c1466da 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -183,6 +183,11 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 	return -EAFNOSUPPORT;
 }
 
+static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt)
+{
+	return -EAFNOSUPPORT;
+}
+
 const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
 	.ipv6_dst_lookup   = eafnosupport_ipv6_dst_lookup,
 	.ipv6_route_input  = eafnosupport_ipv6_route_input,
@@ -192,6 +197,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
 	.fib6_select_path  = eafnosupport_fib6_select_path,
 	.ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
 	.fib6_nh_init	   = eafnosupport_fib6_nh_init,
+	.ip6_del_rt	   = eafnosupport_ip6_del_rt,
 };
 EXPORT_SYMBOL_GPL(ipv6_stub);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index c04ae282f4e4..bc2ca61a020a 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -926,6 +926,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
 	.ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
 	.fib6_nh_init	   = fib6_nh_init,
 	.fib6_nh_release   = fib6_nh_release,
+	.ip6_del_rt	   = ip6_del_rt,
 	.udpv6_encap_enable = udpv6_encap_enable,
 	.ndisc_send_na = ndisc_send_na,
 	.nd_tbl	= &nd_tbl,
-- 
cgit v1.2.3


From cdaa16a4f70cfa6c55641588c3a3eb9b53abd56b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 12:04:40 -0700
Subject: ipv6: Add hook to bump sernum for a route to stubs

Add hook to ipv6 stub to bump the sernum up to the root node for a
route. This is needed by the nexthop code when a nexthop config changes.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h    | 1 +
 include/net/ipv6_stubs.h | 1 +
 net/ipv6/af_inet6.c      | 1 +
 net/ipv6/ip6_fib.c       | 8 ++++++++
 4 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 525f701653ca..d038d02cbc3c 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -485,6 +485,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb);
 
 void fib6_update_sernum(struct net *net, struct fib6_info *rt);
 void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt);
+void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i);
 
 void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val);
 static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index 307114a46eee..97f42e16b3b3 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -45,6 +45,7 @@ struct ipv6_stub {
 			    struct fib6_config *cfg, gfp_t gfp_flags,
 			    struct netlink_ext_ack *extack);
 	void (*fib6_nh_release)(struct fib6_nh *fib6_nh);
+	void (*fib6_update_sernum)(struct net *net, struct fib6_info *rt);
 	int (*ip6_del_rt)(struct net *net, struct fib6_info *rt);
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index bc2ca61a020a..55138f0d2b9d 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -926,6 +926,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
 	.ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
 	.fib6_nh_init	   = fib6_nh_init,
 	.fib6_nh_release   = fib6_nh_release,
+	.fib6_update_sernum = fib6_update_sernum_stub,
 	.ip6_del_rt	   = ip6_del_rt,
 	.udpv6_encap_enable = udpv6_encap_enable,
 	.ndisc_send_na = ndisc_send_na,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 008421b550c6..df726fb8f70f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1222,6 +1222,14 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
 	__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
 }
 
+/* allow ipv4 to update sernum via ipv6_stub */
+void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
+{
+	spin_lock_bh(&f6i->fib6_table->tb6_lock);
+	fib6_update_sernum_upto_root(net, f6i);
+	spin_unlock_bh(&f6i->fib6_table->tb6_lock);
+}
+
 /*
  *	Add routing information to the routing tree.
  *	<destination addr>/<source addr>
-- 
cgit v1.2.3


From 19a3b7eea42402accf52bcb9ddb51bfdb4d7a13b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 12:04:41 -0700
Subject: ipv6: export function to send route updates

Add fib6_rt_update to send RTM_NEWROUTE with NLM_F_REPLACE set. This
helper will be used by the nexthop code to notify userspace of routes
that are impacted when a nexthop config is updated via replace.

This notification is needed for legacy apps that do not understand
the new nexthop object. Apps that are nexthop aware can use the
RTA_NH_ID attribute in the route notification to just ignore it.

In the future this should be wrapped in a sysctl to allow OS'es that
are fully updated to avoid the notification storm.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h    |  6 ++++++
 include/net/ipv6_stubs.h |  3 +++
 net/ipv6/af_inet6.c      |  1 +
 net/ipv6/ip6_fib.c       |  8 ++++----
 net/ipv6/route.c         | 32 ++++++++++++++++++++++++++++++++
 5 files changed, 46 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index d038d02cbc3c..0d0d06b1cd26 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -452,6 +452,12 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 		 struct netlink_ext_ack *extack);
 void fib6_nh_release(struct fib6_nh *fib6_nh);
 
+int call_fib6_entry_notifiers(struct net *net,
+			      enum fib_event_type event_type,
+			      struct fib6_info *rt,
+			      struct netlink_ext_ack *extack);
+void fib6_rt_update(struct net *net, struct fib6_info *rt,
+		    struct nl_info *info);
 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
 		     unsigned int flags);
 
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index 97f42e16b3b3..5c93e942c50b 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -47,6 +47,9 @@ struct ipv6_stub {
 	void (*fib6_nh_release)(struct fib6_nh *fib6_nh);
 	void (*fib6_update_sernum)(struct net *net, struct fib6_info *rt);
 	int (*ip6_del_rt)(struct net *net, struct fib6_info *rt);
+	void (*fib6_rt_update)(struct net *net, struct fib6_info *rt,
+			       struct nl_info *info);
+
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
 			      const struct in6_addr *solicited_addr,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 55138f0d2b9d..cc6f8d0c625a 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -927,6 +927,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
 	.fib6_nh_init	   = fib6_nh_init,
 	.fib6_nh_release   = fib6_nh_release,
 	.fib6_update_sernum = fib6_update_sernum_stub,
+	.fib6_rt_update	   = fib6_rt_update,
 	.ip6_del_rt	   = ip6_del_rt,
 	.udpv6_encap_enable = udpv6_encap_enable,
 	.ndisc_send_na = ndisc_send_na,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index df726fb8f70f..7958cf91895a 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -393,10 +393,10 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
 	return call_fib6_notifier(nb, net, event_type, &info.info);
 }
 
-static int call_fib6_entry_notifiers(struct net *net,
-				     enum fib_event_type event_type,
-				     struct fib6_info *rt,
-				     struct netlink_ext_ack *extack)
+int call_fib6_entry_notifiers(struct net *net,
+			      enum fib_event_type event_type,
+			      struct fib6_info *rt,
+			      struct netlink_ext_ack *extack)
 {
 	struct fib6_entry_notifier_info info = {
 		.info.extack = extack,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7a014ca877ed..c52a7f49d096 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5123,6 +5123,38 @@ errout:
 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
 }
 
+void fib6_rt_update(struct net *net, struct fib6_info *rt,
+		    struct nl_info *info)
+{
+	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	/* call_fib6_entry_notifiers will be removed when in-kernel notifier
+	 * is implemented and supported for nexthop objects
+	 */
+	call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL);
+
+	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+	if (!skb)
+		goto errout;
+
+	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
+			    RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
+		    info->nlh, gfp_any());
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
+}
+
 static int ip6_route_dev_notify(struct notifier_block *this,
 				unsigned long event, void *ptr)
 {
-- 
cgit v1.2.3


From 1bff1a0c9bbda06f1646030082123baf23ea8e7f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 12:04:42 -0700
Subject: ipv4: Add function to send route updates

Add fib_info_notify_update to walk the fib and send RTM_NEWROUTE
notifications with NLM_F_REPLACE set for entries linked to a fib_info
that have nh_updated flag set. This helper will be used by the nexthop
code to notify userspace of routes that are impacted when a nexthop
config is updated via replace. The new function and its helper are
similar to how fib_flush and fib_table_flush work for address delete
and link down events.

This notification is needed for legacy apps that do not understand
the new nexthop object. Apps that are nexthop aware can use the
RTA_NH_ID attribute in the route notification to just ignore it.

In the future this should be wrapped in a sysctl to allow OS'es that
are fully updated to avoid the notification storm.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h |  2 ++
 net/ipv4/fib_trie.c  | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index d0e28f4ab099..ec6496c08f48 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -150,6 +150,7 @@ struct fib_info {
 #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1]
 	int			fib_nhs;
 	bool			fib_nh_is_v6;
+	bool			nh_updated;
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
 #define fib_dev		fib_nh[0].fib_nh_dev
@@ -231,6 +232,7 @@ int call_fib4_notifiers(struct net *net, enum fib_event_type event_type,
 int __net_init fib4_notifier_init(struct net *net);
 void __net_exit fib4_notifier_exit(struct net *net);
 
+void fib_info_notify_update(struct net *net, struct nl_info *info);
 void fib_notify(struct net *net, struct notifier_block *nb);
 
 struct fib_table {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 334f723bdf80..ea7df7ebf597 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1943,6 +1943,78 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
 	return found;
 }
 
+/* derived from fib_trie_free */
+static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
+				     struct nl_info *info)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct key_vector *pn = t->kv;
+	unsigned long cindex = 1;
+	struct fib_alias *fa;
+
+	for (;;) {
+		struct key_vector *n;
+
+		if (!(cindex--)) {
+			t_key pkey = pn->key;
+
+			if (IS_TRIE(pn))
+				break;
+
+			n = pn;
+			pn = node_parent(pn);
+			cindex = get_index(pkey, pn);
+			continue;
+		}
+
+		/* grab the next available node */
+		n = get_child(pn, cindex);
+		if (!n)
+			continue;
+
+		if (IS_TNODE(n)) {
+			/* record pn and cindex for leaf walking */
+			pn = n;
+			cindex = 1ul << n->bits;
+
+			continue;
+		}
+
+		hlist_for_each_entry(fa, &n->leaf, fa_list) {
+			struct fib_info *fi = fa->fa_info;
+
+			if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id)
+				continue;
+
+			rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
+				  KEYLENGTH - fa->fa_slen, tb->tb_id,
+				  info, NLM_F_REPLACE);
+
+			/* call_fib_entry_notifiers will be removed when
+			 * in-kernel notifier is implemented and supported
+			 * for nexthop objects
+			 */
+			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
+						 n->key,
+						 KEYLENGTH - fa->fa_slen, fa,
+						 NULL);
+		}
+	}
+}
+
+void fib_info_notify_update(struct net *net, struct nl_info *info)
+{
+	unsigned int h;
+
+	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+		struct fib_table *tb;
+
+		hlist_for_each_entry_rcu(tb, head, tb_hlist)
+			__fib_info_notify_update(net, tb, info);
+	}
+}
+
 static void fib_leaf_notify(struct net *net, struct key_vector *l,
 			    struct fib_table *tb, struct notifier_block *nb)
 {
-- 
cgit v1.2.3


From ac1fab2d139447d84b10d99f80bec5d7b08c365a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 12:04:43 -0700
Subject: ipv4: export fib_check_nh

Change fib_check_nh to take net, table and scope as input arguments
over struct fib_config and export for use by nexthop code.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |  2 ++
 net/ipv4/fib_semantics.c | 12 ++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index ec6496c08f48..27d7c89ca9c4 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -436,6 +436,8 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu);
 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
 		       const struct sk_buff *skb, struct flow_keys *flkeys);
 #endif
+int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
+		 struct netlink_ext_ack *extack);
 void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
 		     struct flowi4 *fl4, const struct sk_buff *skb);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d3da6a10f86f..4541121426fb 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1092,15 +1092,13 @@ out:
 	return err;
 }
 
-static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
-			struct netlink_ext_ack *extack)
+int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
+		 struct netlink_ext_ack *extack)
 {
-	struct net *net = cfg->fc_nlinfo.nl_net;
-	u32 table = cfg->fc_table;
 	int err;
 
 	if (nh->fib_nh_gw_family == AF_INET)
-		err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack);
+		err = fib_check_nh_v4_gw(net, nh, table, scope, extack);
 	else if (nh->fib_nh_gw_family == AF_INET6)
 		err = fib_check_nh_v6_gw(net, nh, table, extack);
 	else
@@ -1377,7 +1375,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 		int linkdown = 0;
 
 		change_nexthops(fi) {
-			err = fib_check_nh(cfg, nexthop_nh, extack);
+			err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh,
+					   cfg->fc_table, cfg->fc_scope,
+					   extack);
 			if (err != 0)
 				goto failure;
 			if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN)
-- 
cgit v1.2.3


From 9bd836679210534396a93a02f2fcf3ece64f45f7 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 12:04:44 -0700
Subject: ipv4: export fib_flush

As nexthops are deleted, fib entries referencing it are marked dead.
Export fib_flush so those entries can be removed in a timely manner.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    | 1 +
 net/ipv4/fib_frontend.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 27d7c89ca9c4..79c18bd6a059 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -473,6 +473,7 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
 #endif
 }
 
+void fib_flush(struct net *net);
 void free_fib_info(struct fib_info *fi);
 
 static inline void fib_info_hold(struct fib_info *fi)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b298255f6fdb..dfa57a84ac14 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -192,7 +192,7 @@ int fib_unmerge(struct net *net)
 	return 0;
 }
 
-static void fib_flush(struct net *net)
+void fib_flush(struct net *net)
 {
 	int flushed = 0;
 	unsigned int h;
-- 
cgit v1.2.3


From c3669486b5127165fd348daf4a785996820ac8f2 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 12:04:45 -0700
Subject: ipv4: export fib_info_update_nh_saddr

Add scope as input argument versus relying on fib_info reference in
fib_nh, and export fib_info_update_nh_saddr.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |  3 ++-
 net/ipv4/fib_semantics.c | 11 +++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 79c18bd6a059..8511ebb6f7be 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -201,7 +201,8 @@ static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel)
 #define FIB_TABLE_HASHSZ 2
 #endif
 
-__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
+__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh,
+				unsigned char scope);
 __be32 fib_result_prefsrc(struct net *net, struct fib_result *res);
 
 #define FIB_RES_NHC(res)		((res).nhc)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4541121426fb..bd8c51d2c59b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1189,11 +1189,10 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
 	fib_info_hash_free(old_laddrhash, bytes);
 }
 
-__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
+__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh,
+				unsigned char scope)
 {
-	nh->nh_saddr = inet_select_addr(nh->fib_nh_dev,
-					nh->fib_nh_gw4,
-					nh->nh_parent->fib_scope);
+	nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope);
 	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
 
 	return nh->nh_saddr;
@@ -1211,7 +1210,7 @@ __be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
 	if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid))
 		return nh->nh_saddr;
 
-	return fib_info_update_nh_saddr(net, nh);
+	return fib_info_update_nh_saddr(net, nh, res->fi->fib_scope);
 }
 
 static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
@@ -1393,7 +1392,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 	}
 
 	change_nexthops(fi) {
-		fib_info_update_nh_saddr(net, nexthop_nh);
+		fib_info_update_nh_saddr(net, nexthop_nh, fi->fib_scope);
 		if (nexthop_nh->fib_nh_gw_family == AF_INET6)
 			fi->fib_nh_is_v6 = true;
 	} endfor_nexthops(fi)
-- 
cgit v1.2.3


From 06c77c3e67b0352473345a162ab17729a132e7db Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 12:04:46 -0700
Subject: ipv4: Rename and export nh_update_mtu

Rename nh_update_mtu to fib_nhc_update_mtu and export for use by the
nexthop code.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     | 1 +
 net/ipv4/fib_semantics.c | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 8511ebb6f7be..70ba0302c8c9 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -432,6 +432,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned char nh_flags);
 void fib_sync_mtu(struct net_device *dev, u32 orig_mtu);
+void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig);
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index bd8c51d2c59b..78648072783e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1712,7 +1712,7 @@ static int call_fib_nh_notifiers(struct fib_nh *nh,
  * - if the new MTU is greater than the PMTU, don't make any change
  * - otherwise, unlock and set PMTU
  */
-static void nh_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
+void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
 {
 	struct fnhe_hash_bucket *bucket;
 	int i;
@@ -1748,7 +1748,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
 
 	hlist_for_each_entry(nh, head, nh_hash) {
 		if (nh->fib_nh_dev == dev)
-			nh_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
+			fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
 	}
 }
 
-- 
cgit v1.2.3


From 75425657fe3ad853b300976966d8fafa3f209b89 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 12:07:43 -0700
Subject: net: Set strict_start_type for routes and rules

New userspace on an older kernel can send unknown and unsupported
attributes resulting in an incomplete config which is almost
always wrong for routing (few exceptions are passthrough settings
like the protocol that installed the route).

Set strict_start_type in the policies for IPv4 and IPv6 routes and
rules to detect new, unsupported attributes and fail the route add.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h | 1 +
 net/ipv4/fib_frontend.c | 1 +
 net/ipv6/route.c        | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index b473df5b9512..eba8465e1d86 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -103,6 +103,7 @@ struct fib_rule_notifier_info {
 };
 
 #define FRA_GENERIC_POLICY \
+	[FRA_UNSPEC]	= { .strict_start_type = FRA_DPORT_RANGE + 1 }, \
 	[FRA_IIFNAME]	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \
 	[FRA_OIFNAME]	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \
 	[FRA_PRIORITY]	= { .type = NLA_U32 }, \
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index dfa57a84ac14..76055c66326a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -645,6 +645,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
 }
 
 const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
+	[RTA_UNSPEC]		= { .strict_start_type = RTA_DPORT + 1 },
 	[RTA_DST]		= { .type = NLA_U32 },
 	[RTA_SRC]		= { .type = NLA_U32 },
 	[RTA_IIF]		= { .type = NLA_U32 },
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c52a7f49d096..5f0661c18624 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4221,6 +4221,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
 }
 
 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
+	[RTA_UNSPEC]		= { .strict_start_type = RTA_DPORT + 1 },
 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
 	[RTA_OIF]               = { .type = NLA_U32 },
-- 
cgit v1.2.3


From fc651001d2c5ca4f8b87efae2edb69fca94a6365 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 12:22:21 -0700
Subject: neighbor: Add tracepoint to __neigh_create

Add tracepoint to __neigh_create to enable debugging of new entries.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/neigh.h | 49 ++++++++++++++++++++++++++++++++++++++++++++
 net/core/neighbour.c         |  2 ++
 2 files changed, 51 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/neigh.h b/include/trace/events/neigh.h
index 0bdb08557763..62bb17516713 100644
--- a/include/trace/events/neigh.h
+++ b/include/trace/events/neigh.h
@@ -20,6 +20,55 @@
 		{ NUD_NOARP, "noarp" },			\
 		{ NUD_PERMANENT, "permanent"})
 
+TRACE_EVENT(neigh_create,
+
+	TP_PROTO(struct neigh_table *tbl, struct net_device *dev,
+		 const void *pkey, const struct neighbour *n,
+		 bool exempt_from_gc),
+
+	TP_ARGS(tbl, dev, pkey, n, exempt_from_gc),
+
+	TP_STRUCT__entry(
+		__field(u32, family)
+		__dynamic_array(char,  dev,   IFNAMSIZ )
+		__field(int, entries)
+		__field(u8, created)
+		__field(u8, gc_exempt)
+		__array(u8, primary_key4, 4)
+		__array(u8, primary_key6, 16)
+	),
+
+	TP_fast_assign(
+		struct in6_addr *pin6;
+		__be32 *p32;
+
+		__entry->family = tbl->family;
+		__assign_str(dev, (dev ? dev->name : "NULL"));
+		__entry->entries = atomic_read(&tbl->gc_entries);
+		__entry->created = n != NULL;
+		__entry->gc_exempt = exempt_from_gc;
+		pin6 = (struct in6_addr *)__entry->primary_key6;
+		p32 = (__be32 *)__entry->primary_key4;
+
+		if (tbl->family == AF_INET)
+			*p32 = *(__be32 *)pkey;
+		else
+			*p32 = 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+		if (tbl->family == AF_INET6) {
+			pin6 = (struct in6_addr *)__entry->primary_key6;
+			*pin6 = *(struct in6_addr *)pkey;
+		}
+#endif
+	),
+
+	TP_printk("family %d dev %s entries %d primary_key4 %pI4 primary_key6 %pI6c created %d gc_exempt %d",
+		  __entry->family, __get_str(dev), __entry->entries,
+		  __entry->primary_key4, __entry->primary_key6,
+		  __entry->created, __entry->gc_exempt)
+);
+
 TRACE_EVENT(neigh_update,
 
 	TP_PROTO(struct neighbour *n, const u8 *lladdr, u8 new,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index dfa871061f14..a5556e4d3f96 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -587,6 +587,8 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl,
 	int error;
 	struct neigh_hash_table *nht;
 
+	trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc);
+
 	if (!n) {
 		rc = ERR_PTR(-ENOBUFS);
 		goto out;
-- 
cgit v1.2.3


From 0db355d499f10a79b6a5161e77c7eba8f062bde4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 22 May 2019 15:00:25 -0700
Subject: ipv4/igmp: shrink struct ip_sf_list

Removing two 4-byte holes allows the use of the kmalloc-32
kmem cache instead of kmalloc-64 on 64-bit kernels.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 9c94b2ea789c..6649cb78de4a 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -65,8 +65,8 @@ struct ip_mc_socklist {
 
 struct ip_sf_list {
 	struct ip_sf_list	*sf_next;
-	__be32			sf_inaddr;
 	unsigned long		sf_count[2];	/* include/exclude counts */
+	__be32			sf_inaddr;
 	unsigned char		sf_gsresp;	/* include in g & s response? */
 	unsigned char		sf_oldin;	/* change state */
 	unsigned char		sf_crcount;	/* retrans. left to send */
-- 
cgit v1.2.3


From 136bf27fc0e9376525b9b6d9a1aa08508a0d1ac2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 23 May 2019 10:43:35 +0200
Subject: devlink: add warning in case driver does not set port type

Prevent misbehavior of drivers that do not set the port type for a longer
period of time. Drivers should always set the port type. WARN if that
happens.

Note that it is perfectly fine to temporarily not have the type set,
during initialization and port type change.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h |  2 ++
 net/core/devlink.c    | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 1c4adfb4195a..151eb930d329 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -17,6 +17,7 @@
 #include <linux/list.h>
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
+#include <linux/workqueue.h>
 #include <net/net_namespace.h>
 #include <uapi/linux/devlink.h>
 
@@ -64,6 +65,7 @@ struct devlink_port {
 	enum devlink_port_type desired_type;
 	void *type_dev;
 	struct devlink_port_attrs attrs;
+	struct delayed_work type_warn_dw;
 };
 
 struct devlink_sb_pool_info {
diff --git a/net/core/devlink.c b/net/core/devlink.c
index d43bc52b8840..9716a7f382cb 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -21,6 +21,7 @@
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
 #include <linux/refcount.h>
+#include <linux/workqueue.h>
 #include <rdma/ib_verbs.h>
 #include <net/netlink.h>
 #include <net/genetlink.h>
@@ -5390,6 +5391,38 @@ void devlink_free(struct devlink *devlink)
 }
 EXPORT_SYMBOL_GPL(devlink_free);
 
+static void devlink_port_type_warn(struct work_struct *work)
+{
+	WARN(true, "Type was not set for devlink port.");
+}
+
+static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
+{
+	/* Ignore CPU and DSA flavours. */
+	return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
+	       devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA;
+}
+
+#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 30)
+
+static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port)
+{
+	if (!devlink_port_type_should_warn(devlink_port))
+		return;
+	/* Schedule a work to WARN in case driver does not set port
+	 * type within timeout.
+	 */
+	schedule_delayed_work(&devlink_port->type_warn_dw,
+			      DEVLINK_PORT_TYPE_WARN_TIMEOUT);
+}
+
+static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
+{
+	if (!devlink_port_type_should_warn(devlink_port))
+		return;
+	cancel_delayed_work_sync(&devlink_port->type_warn_dw);
+}
+
 /**
  *	devlink_port_register - Register devlink port
  *
@@ -5419,6 +5452,8 @@ int devlink_port_register(struct devlink *devlink,
 	list_add_tail(&devlink_port->list, &devlink->port_list);
 	INIT_LIST_HEAD(&devlink_port->param_list);
 	mutex_unlock(&devlink->lock);
+	INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
+	devlink_port_type_warn_schedule(devlink_port);
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
 	return 0;
 }
@@ -5433,6 +5468,7 @@ void devlink_port_unregister(struct devlink_port *devlink_port)
 {
 	struct devlink *devlink = devlink_port->devlink;
 
+	devlink_port_type_warn_cancel(devlink_port);
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
 	mutex_lock(&devlink->lock);
 	list_del(&devlink_port->list);
@@ -5446,6 +5482,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
 {
 	if (WARN_ON(!devlink_port->registered))
 		return;
+	devlink_port_type_warn_cancel(devlink_port);
 	spin_lock(&devlink_port->type_lock);
 	devlink_port->type = type;
 	devlink_port->type_dev = type_dev;
@@ -5519,6 +5556,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
 void devlink_port_type_clear(struct devlink_port *devlink_port)
 {
 	__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
+	devlink_port_type_warn_schedule(devlink_port);
 }
 EXPORT_SYMBOL_GPL(devlink_port_type_clear);
 
-- 
cgit v1.2.3


From 1bd33bf0fe6d3012410db0302187199871b510a0 Mon Sep 17 00:00:00 2001
From: Esben Haabendal <esben@geanix.com>
Date: Thu, 23 May 2019 14:02:20 +0200
Subject: net: ll_temac: Prepare indirect register access for multicast support

With .ndo_set_rx_mode/temac_set_multicast_list() being called in atomic
context (holding addr_list_lock), and temac_set_multicast_list() needing
to access temac indirect registers, the mutex used to synchronize indirect
register access is a no-no.

Replace it with a spinlock, and avoid sleeping in
temac_indirect_busywait().

To avoid excessive holding of the lock, which is now a spinlock, the
temac_device_reset() function is changed to only hold the lock for short
periods.  With timeouts, it could be holding the spinlock for more than
2 seconds.

Signed-off-by: Esben Haabendal <esben@geanix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xilinx/ll_temac.h        |   5 +-
 drivers/net/ethernet/xilinx/ll_temac_main.c   | 240 ++++++++++++++++++--------
 drivers/net/ethernet/xilinx/ll_temac_mdio.c   |  20 +--
 include/linux/platform_data/xilinx-ll-temac.h |   3 +-
 4 files changed, 179 insertions(+), 89 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/xilinx/ll_temac.h b/drivers/net/ethernet/xilinx/ll_temac.h
index 1aeda084b8f1..276292bca334 100644
--- a/drivers/net/ethernet/xilinx/ll_temac.h
+++ b/drivers/net/ethernet/xilinx/ll_temac.h
@@ -361,7 +361,7 @@ struct temac_local {
 	/* For synchronization of indirect register access.  Must be
 	 * shared mutex between interfaces in same TEMAC block.
 	 */
-	struct mutex *indirect_mutex;
+	spinlock_t *indirect_lock;
 	u32 options;			/* Current options word */
 	int last_link;
 	unsigned int temac_features;
@@ -388,8 +388,9 @@ struct temac_local {
 /* xilinx_temac.c */
 int temac_indirect_busywait(struct temac_local *lp);
 u32 temac_indirect_in32(struct temac_local *lp, int reg);
+u32 temac_indirect_in32_locked(struct temac_local *lp, int reg);
 void temac_indirect_out32(struct temac_local *lp, int reg, u32 value);
-
+void temac_indirect_out32_locked(struct temac_local *lp, int reg, u32 value);
 
 /* xilinx_temac_mdio.c */
 int temac_mdio_setup(struct temac_local *lp, struct platform_device *pdev);
diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index 65fb549241b2..cc58bd8c12f6 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -53,6 +53,7 @@
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/dma-mapping.h>
+#include <linux/processor.h>
 #include <linux/platform_data/xilinx-ll-temac.h>
 
 #include "ll_temac.h"
@@ -84,51 +85,118 @@ static void _temac_iow_le(struct temac_local *lp, int offset, u32 value)
 	return iowrite32(value, lp->regs + offset);
 }
 
+static bool hard_acs_rdy(struct temac_local *lp)
+{
+	return temac_ior(lp, XTE_RDY0_OFFSET) & XTE_RDY0_HARD_ACS_RDY_MASK;
+}
+
+static bool hard_acs_rdy_or_timeout(struct temac_local *lp, ktime_t timeout)
+{
+	ktime_t cur = ktime_get();
+
+	return hard_acs_rdy(lp) || ktime_after(cur, timeout);
+}
+
+/* Poll for maximum 20 ms.  This is similar to the 2 jiffies @ 100 Hz
+ * that was used before, and should cover MDIO bus speed down to 3200
+ * Hz.
+ */
+#define HARD_ACS_RDY_POLL_NS (20 * NSEC_PER_MSEC)
+
+/**
+ * temac_indirect_busywait - Wait for current indirect register access
+ * to complete.
+ */
 int temac_indirect_busywait(struct temac_local *lp)
 {
-	unsigned long end = jiffies + 2;
+	ktime_t timeout = ktime_add_ns(ktime_get(), HARD_ACS_RDY_POLL_NS);
 
-	while (!(temac_ior(lp, XTE_RDY0_OFFSET) & XTE_RDY0_HARD_ACS_RDY_MASK)) {
-		if (time_before_eq(end, jiffies)) {
-			WARN_ON(1);
-			return -ETIMEDOUT;
-		}
-		usleep_range(500, 1000);
-	}
-	return 0;
+	spin_until_cond(hard_acs_rdy_or_timeout(lp, timeout));
+	if (WARN_ON(!hard_acs_rdy(lp)))
+		return -ETIMEDOUT;
+	else
+		return 0;
 }
 
 /**
- * temac_indirect_in32
- *
- * lp->indirect_mutex must be held when calling this function
+ * temac_indirect_in32 - Indirect register read access.  This function
+ * must be called without lp->indirect_lock being held.
  */
 u32 temac_indirect_in32(struct temac_local *lp, int reg)
 {
-	u32 val;
+	unsigned long flags;
+	int val;
+
+	spin_lock_irqsave(lp->indirect_lock, flags);
+	val = temac_indirect_in32_locked(lp, reg);
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
+	return val;
+}
 
-	if (temac_indirect_busywait(lp))
+/**
+ * temac_indirect_in32_locked - Indirect register read access.  This
+ * function must be called with lp->indirect_lock being held.  Use
+ * this together with spin_lock_irqsave/spin_unlock_irqrestore to avoid
+ * repeated lock/unlock and to ensure uninterrupted access to indirect
+ * registers.
+ */
+u32 temac_indirect_in32_locked(struct temac_local *lp, int reg)
+{
+	/* This initial wait should normally not spin, as we always
+	 * try to wait for indirect access to complete before
+	 * releasing the indirect_lock.
+	 */
+	if (WARN_ON(temac_indirect_busywait(lp)))
 		return -ETIMEDOUT;
+	/* Initiate read from indirect register */
 	temac_iow(lp, XTE_CTL0_OFFSET, reg);
-	if (temac_indirect_busywait(lp))
+	/* Wait for indirect register access to complete.  We really
+	 * should not see timeouts, and could even end up causing
+	 * problem for following indirect access, so let's make a bit
+	 * of WARN noise.
+	 */
+	if (WARN_ON(temac_indirect_busywait(lp)))
 		return -ETIMEDOUT;
-	val = temac_ior(lp, XTE_LSW0_OFFSET);
-
-	return val;
+	/* Value is ready now */
+	return temac_ior(lp, XTE_LSW0_OFFSET);
 }
 
 /**
- * temac_indirect_out32
- *
- * lp->indirect_mutex must be held when calling this function
+ * temac_indirect_out32 - Indirect register write access.  This function
+ * must be called without lp->indirect_lock being held.
  */
 void temac_indirect_out32(struct temac_local *lp, int reg, u32 value)
 {
-	if (temac_indirect_busywait(lp))
+	unsigned long flags;
+
+	spin_lock_irqsave(lp->indirect_lock, flags);
+	temac_indirect_out32_locked(lp, reg, value);
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
+}
+
+/**
+ * temac_indirect_out32_locked - Indirect register write access.  This
+ * function must be called with lp->indirect_lock being held.  Use
+ * this together with spin_lock_irqsave/spin_unlock_irqrestore to avoid
+ * repeated lock/unlock and to ensure uninterrupted access to indirect
+ * registers.
+ */
+void temac_indirect_out32_locked(struct temac_local *lp, int reg, u32 value)
+{
+	/* As in temac_indirect_in32_locked(), we should normally not
+	 * spin here.  And if it happens, we actually end up silently
+	 * ignoring the write request.  Ouch.
+	 */
+	if (WARN_ON(temac_indirect_busywait(lp)))
 		return;
+	/* Initiate write to indirect register */
 	temac_iow(lp, XTE_LSW0_OFFSET, value);
 	temac_iow(lp, XTE_CTL0_OFFSET, CNTLREG_WRITE_ENABLE_MASK | reg);
-	temac_indirect_busywait(lp);
+	/* As in temac_indirect_in32_locked(), we should not see timeouts
+	 * here.  And if it happens, we continue before the write has
+	 * completed.  Not good.
+	 */
+	WARN_ON(temac_indirect_busywait(lp));
 }
 
 /**
@@ -344,20 +412,21 @@ out:
 static void temac_do_set_mac_address(struct net_device *ndev)
 {
 	struct temac_local *lp = netdev_priv(ndev);
+	unsigned long flags;
 
 	/* set up unicast MAC address filter set its mac address */
-	mutex_lock(lp->indirect_mutex);
-	temac_indirect_out32(lp, XTE_UAW0_OFFSET,
-			     (ndev->dev_addr[0]) |
-			     (ndev->dev_addr[1] << 8) |
-			     (ndev->dev_addr[2] << 16) |
-			     (ndev->dev_addr[3] << 24));
+	spin_lock_irqsave(lp->indirect_lock, flags);
+	temac_indirect_out32_locked(lp, XTE_UAW0_OFFSET,
+				    (ndev->dev_addr[0]) |
+				    (ndev->dev_addr[1] << 8) |
+				    (ndev->dev_addr[2] << 16) |
+				    (ndev->dev_addr[3] << 24));
 	/* There are reserved bits in EUAW1
 	 * so don't affect them Set MAC bits [47:32] in EUAW1 */
-	temac_indirect_out32(lp, XTE_UAW1_OFFSET,
-			     (ndev->dev_addr[4] & 0x000000ff) |
-			     (ndev->dev_addr[5] << 8));
-	mutex_unlock(lp->indirect_mutex);
+	temac_indirect_out32_locked(lp, XTE_UAW1_OFFSET,
+				    (ndev->dev_addr[4] & 0x000000ff) |
+				    (ndev->dev_addr[5] << 8));
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
 }
 
 static int temac_init_mac_address(struct net_device *ndev, const void *address)
@@ -383,42 +452,56 @@ static int temac_set_mac_address(struct net_device *ndev, void *p)
 static void temac_set_multicast_list(struct net_device *ndev)
 {
 	struct temac_local *lp = netdev_priv(ndev);
-	u32 multi_addr_msw, multi_addr_lsw, val;
+	u32 multi_addr_msw, multi_addr_lsw;
 	int i;
+	unsigned long flags;
+	bool promisc_mode_disabled = false;
 
-	mutex_lock(lp->indirect_mutex);
-	if (ndev->flags & (IFF_ALLMULTI | IFF_PROMISC) ||
-	    netdev_mc_count(ndev) > MULTICAST_CAM_TABLE_NUM) {
+	if (ndev->flags & (IFF_PROMISC | IFF_ALLMULTI) ||
+	    (netdev_mc_count(ndev) > MULTICAST_CAM_TABLE_NUM)) {
 		temac_indirect_out32(lp, XTE_AFM_OFFSET, XTE_AFM_EPPRM_MASK);
 		dev_info(&ndev->dev, "Promiscuous mode enabled.\n");
-	} else if (!netdev_mc_empty(ndev)) {
+		return;
+	}
+
+	spin_lock_irqsave(lp->indirect_lock, flags);
+
+	if (!netdev_mc_empty(ndev)) {
 		struct netdev_hw_addr *ha;
 
 		i = 0;
 		netdev_for_each_mc_addr(ha, ndev) {
-			if (i >= MULTICAST_CAM_TABLE_NUM)
+			if (WARN_ON(i >= MULTICAST_CAM_TABLE_NUM))
 				break;
 			multi_addr_msw = ((ha->addr[3] << 24) |
 					  (ha->addr[2] << 16) |
 					  (ha->addr[1] << 8) |
 					  (ha->addr[0]));
-			temac_indirect_out32(lp, XTE_MAW0_OFFSET,
-					     multi_addr_msw);
+			temac_indirect_out32_locked(lp, XTE_MAW0_OFFSET,
+						    multi_addr_msw);
 			multi_addr_lsw = ((ha->addr[5] << 8) |
 					  (ha->addr[4]) | (i << 16));
-			temac_indirect_out32(lp, XTE_MAW1_OFFSET,
-					     multi_addr_lsw);
+			temac_indirect_out32_locked(lp, XTE_MAW1_OFFSET,
+						    multi_addr_lsw);
 			i++;
 		}
 	} else {
-		val = temac_indirect_in32(lp, XTE_AFM_OFFSET);
-		temac_indirect_out32(lp, XTE_AFM_OFFSET,
-				     val & ~XTE_AFM_EPPRM_MASK);
-		temac_indirect_out32(lp, XTE_MAW0_OFFSET, 0);
-		temac_indirect_out32(lp, XTE_MAW1_OFFSET, 0);
-		dev_info(&ndev->dev, "Promiscuous mode disabled.\n");
+		temac_indirect_out32_locked(lp, XTE_MAW0_OFFSET, 0);
+		temac_indirect_out32_locked(lp, XTE_MAW1_OFFSET, i << 16);
+		}
 	}
-	mutex_unlock(lp->indirect_mutex);
+
+	/* Enable address filter block if currently disabled */
+	if (temac_indirect_in32_locked(lp, XTE_AFM_OFFSET)
+	    & XTE_AFM_EPPRM_MASK) {
+		temac_indirect_out32_locked(lp, XTE_AFM_OFFSET, 0);
+		promisc_mode_disabled = true;
+	}
+
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
+
+	if (promisc_mode_disabled)
+		dev_info(&ndev->dev, "Promiscuous mode disabled.\n");
 }
 
 static struct temac_option {
@@ -509,17 +592,19 @@ static u32 temac_setoptions(struct net_device *ndev, u32 options)
 	struct temac_local *lp = netdev_priv(ndev);
 	struct temac_option *tp = &temac_options[0];
 	int reg;
+	unsigned long flags;
 
-	mutex_lock(lp->indirect_mutex);
+	spin_lock_irqsave(lp->indirect_lock, flags);
 	while (tp->opt) {
-		reg = temac_indirect_in32(lp, tp->reg) & ~tp->m_or;
-		if (options & tp->opt)
+		reg = temac_indirect_in32_locked(lp, tp->reg) & ~tp->m_or;
+		if (options & tp->opt) {
 			reg |= tp->m_or;
-		temac_indirect_out32(lp, tp->reg, reg);
+			temac_indirect_out32_locked(lp, tp->reg, reg);
+		}
 		tp++;
 	}
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
 	lp->options |= options;
-	mutex_unlock(lp->indirect_mutex);
 
 	return 0;
 }
@@ -530,6 +615,7 @@ static void temac_device_reset(struct net_device *ndev)
 	struct temac_local *lp = netdev_priv(ndev);
 	u32 timeout;
 	u32 val;
+	unsigned long flags;
 
 	/* Perform a software reset */
 
@@ -538,7 +624,6 @@ static void temac_device_reset(struct net_device *ndev)
 
 	dev_dbg(&ndev->dev, "%s()\n", __func__);
 
-	mutex_lock(lp->indirect_mutex);
 	/* Reset the receiver and wait for it to finish reset */
 	temac_indirect_out32(lp, XTE_RXC1_OFFSET, XTE_RXC1_RXRST_MASK);
 	timeout = 1000;
@@ -564,8 +649,11 @@ static void temac_device_reset(struct net_device *ndev)
 	}
 
 	/* Disable the receiver */
-	val = temac_indirect_in32(lp, XTE_RXC1_OFFSET);
-	temac_indirect_out32(lp, XTE_RXC1_OFFSET, val & ~XTE_RXC1_RXEN_MASK);
+	spin_lock_irqsave(lp->indirect_lock, flags);
+	val = temac_indirect_in32_locked(lp, XTE_RXC1_OFFSET);
+	temac_indirect_out32_locked(lp, XTE_RXC1_OFFSET,
+				    val & ~XTE_RXC1_RXEN_MASK);
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
 
 	/* Reset Local Link (DMA) */
 	lp->dma_out(lp, DMA_CONTROL_REG, DMA_CONTROL_RST);
@@ -585,12 +673,12 @@ static void temac_device_reset(struct net_device *ndev)
 				"temac_device_reset descriptor allocation failed\n");
 	}
 
-	temac_indirect_out32(lp, XTE_RXC0_OFFSET, 0);
-	temac_indirect_out32(lp, XTE_RXC1_OFFSET, 0);
-	temac_indirect_out32(lp, XTE_TXC_OFFSET, 0);
-	temac_indirect_out32(lp, XTE_FCC_OFFSET, XTE_FCC_RXFLO_MASK);
-
-	mutex_unlock(lp->indirect_mutex);
+	spin_lock_irqsave(lp->indirect_lock, flags);
+	temac_indirect_out32_locked(lp, XTE_RXC0_OFFSET, 0);
+	temac_indirect_out32_locked(lp, XTE_RXC1_OFFSET, 0);
+	temac_indirect_out32_locked(lp, XTE_TXC_OFFSET, 0);
+	temac_indirect_out32_locked(lp, XTE_FCC_OFFSET, XTE_FCC_RXFLO_MASK);
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
 
 	/* Sync default options with HW
 	 * but leave receiver and transmitter disabled.  */
@@ -614,13 +702,14 @@ static void temac_adjust_link(struct net_device *ndev)
 	struct phy_device *phy = ndev->phydev;
 	u32 mii_speed;
 	int link_state;
+	unsigned long flags;
 
 	/* hash together the state values to decide if something has changed */
 	link_state = phy->speed | (phy->duplex << 1) | phy->link;
 
-	mutex_lock(lp->indirect_mutex);
 	if (lp->last_link != link_state) {
-		mii_speed = temac_indirect_in32(lp, XTE_EMCFG_OFFSET);
+		spin_lock_irqsave(lp->indirect_lock, flags);
+		mii_speed = temac_indirect_in32_locked(lp, XTE_EMCFG_OFFSET);
 		mii_speed &= ~XTE_EMCFG_LINKSPD_MASK;
 
 		switch (phy->speed) {
@@ -630,11 +719,12 @@ static void temac_adjust_link(struct net_device *ndev)
 		}
 
 		/* Write new speed setting out to TEMAC */
-		temac_indirect_out32(lp, XTE_EMCFG_OFFSET, mii_speed);
+		temac_indirect_out32_locked(lp, XTE_EMCFG_OFFSET, mii_speed);
+		spin_unlock_irqrestore(lp->indirect_lock, flags);
+
 		lp->last_link = link_state;
 		phy_print_status(phy);
 	}
-	mutex_unlock(lp->indirect_mutex);
 }
 
 #ifdef CONFIG_64BIT
@@ -1096,17 +1186,17 @@ static int temac_probe(struct platform_device *pdev)
 
 	/* Setup mutex for synchronization of indirect register access */
 	if (pdata) {
-		if (!pdata->indirect_mutex) {
+		if (!pdata->indirect_lock) {
 			dev_err(&pdev->dev,
-				"indirect_mutex missing in platform_data\n");
+				"indirect_lock missing in platform_data\n");
 			return -EINVAL;
 		}
-		lp->indirect_mutex = pdata->indirect_mutex;
+		lp->indirect_lock = pdata->indirect_lock;
 	} else {
-		lp->indirect_mutex = devm_kmalloc(&pdev->dev,
-						  sizeof(*lp->indirect_mutex),
-						  GFP_KERNEL);
-		mutex_init(lp->indirect_mutex);
+		lp->indirect_lock = devm_kmalloc(&pdev->dev,
+						 sizeof(*lp->indirect_lock),
+						 GFP_KERNEL);
+		spin_lock_init(lp->indirect_lock);
 	}
 
 	/* map device registers */
diff --git a/drivers/net/ethernet/xilinx/ll_temac_mdio.c b/drivers/net/ethernet/xilinx/ll_temac_mdio.c
index a4667326f745..6fd2dea4e60f 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_mdio.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_mdio.c
@@ -25,14 +25,15 @@ static int temac_mdio_read(struct mii_bus *bus, int phy_id, int reg)
 {
 	struct temac_local *lp = bus->priv;
 	u32 rc;
+	unsigned long flags;
 
 	/* Write the PHY address to the MIIM Access Initiator register.
 	 * When the transfer completes, the PHY register value will appear
 	 * in the LSW0 register */
-	mutex_lock(lp->indirect_mutex);
+	spin_lock_irqsave(lp->indirect_lock, flags);
 	temac_iow(lp, XTE_LSW0_OFFSET, (phy_id << 5) | reg);
-	rc = temac_indirect_in32(lp, XTE_MIIMAI_OFFSET);
-	mutex_unlock(lp->indirect_mutex);
+	rc = temac_indirect_in32_locked(lp, XTE_MIIMAI_OFFSET);
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
 
 	dev_dbg(lp->dev, "temac_mdio_read(phy_id=%i, reg=%x) == %x\n",
 		phy_id, reg, rc);
@@ -43,6 +44,7 @@ static int temac_mdio_read(struct mii_bus *bus, int phy_id, int reg)
 static int temac_mdio_write(struct mii_bus *bus, int phy_id, int reg, u16 val)
 {
 	struct temac_local *lp = bus->priv;
+	unsigned long flags;
 
 	dev_dbg(lp->dev, "temac_mdio_write(phy_id=%i, reg=%x, val=%x)\n",
 		phy_id, reg, val);
@@ -50,10 +52,10 @@ static int temac_mdio_write(struct mii_bus *bus, int phy_id, int reg, u16 val)
 	/* First write the desired value into the write data register
 	 * and then write the address into the access initiator register
 	 */
-	mutex_lock(lp->indirect_mutex);
-	temac_indirect_out32(lp, XTE_MGTDR_OFFSET, val);
-	temac_indirect_out32(lp, XTE_MIIMAI_OFFSET, (phy_id << 5) | reg);
-	mutex_unlock(lp->indirect_mutex);
+	spin_lock_irqsave(lp->indirect_lock, flags);
+	temac_indirect_out32_locked(lp, XTE_MGTDR_OFFSET, val);
+	temac_indirect_out32_locked(lp, XTE_MIIMAI_OFFSET, (phy_id << 5) | reg);
+	spin_unlock_irqrestore(lp->indirect_lock, flags);
 
 	return 0;
 }
@@ -87,9 +89,7 @@ int temac_mdio_setup(struct temac_local *lp, struct platform_device *pdev)
 
 	/* Enable the MDIO bus by asserting the enable bit and writing
 	 * in the clock config */
-	mutex_lock(lp->indirect_mutex);
 	temac_indirect_out32(lp, XTE_MC_OFFSET, 1 << 6 | clk_div);
-	mutex_unlock(lp->indirect_mutex);
 
 	bus = devm_mdiobus_alloc(&pdev->dev);
 	if (!bus)
@@ -116,10 +116,8 @@ int temac_mdio_setup(struct temac_local *lp, struct platform_device *pdev)
 	if (rc)
 		return rc;
 
-	mutex_lock(lp->indirect_mutex);
 	dev_dbg(lp->dev, "MDIO bus registered;  MC:%x\n",
 		temac_indirect_in32(lp, XTE_MC_OFFSET));
-	mutex_unlock(lp->indirect_mutex);
 	return 0;
 }
 
diff --git a/include/linux/platform_data/xilinx-ll-temac.h b/include/linux/platform_data/xilinx-ll-temac.h
index 368530f98176..f4a68136afa6 100644
--- a/include/linux/platform_data/xilinx-ll-temac.h
+++ b/include/linux/platform_data/xilinx-ll-temac.h
@@ -4,6 +4,7 @@
 
 #include <linux/if_ether.h>
 #include <linux/phy.h>
+#include <linux/spinlock.h>
 
 struct ll_temac_platform_data {
 	bool txcsum;		/* Enable/disable TX checksum */
@@ -21,7 +22,7 @@ struct ll_temac_platform_data {
 	 * TEMAC IP block, the same mutex should be passed here, as
 	 * they share the same DCR bus bridge.
 	 */
-	struct mutex *indirect_mutex;
+	spinlock_t *indirect_lock;
 	/* DMA channel control setup */
 	u8 tx_irq_timeout;	/* TX Interrupt Delay Time-out */
 	u8 tx_irq_count;	/* TX Interrupt Coalescing Threshold Count */
-- 
cgit v1.2.3


From 9395da4efbd46661f0049d24d54d1cea63241fc9 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Wed, 22 May 2019 14:21:07 -0600
Subject: net: qualcomm: rmnet: Move common struct definitions to include

Create if_rmnet.h and move the rmnet MAP packet structs to this
common include file. To account for portability, add little and
big endian bitfield definitions similar to the ip & tcp headers.

The definitions in the headers can now be re-used by the
upcoming ipa driver series as well as qmi_wwan.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h | 25 +----------
 include/linux/if_rmnet.h                        | 55 +++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 24 deletions(-)
 create mode 100644 include/linux/if_rmnet.h

(limited to 'include')

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
index 884f1f52dcc2..991d7e285736 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -12,6 +12,7 @@
 
 #ifndef _RMNET_MAP_H_
 #define _RMNET_MAP_H_
+#include <linux/if_rmnet.h>
 
 struct rmnet_map_control_command {
 	u8  command_name;
@@ -39,30 +40,6 @@ enum rmnet_map_commands {
 	RMNET_MAP_COMMAND_ENUM_LENGTH
 };
 
-struct rmnet_map_header {
-	u8  pad_len:6;
-	u8  reserved_bit:1;
-	u8  cd_bit:1;
-	u8  mux_id;
-	__be16 pkt_len;
-}  __aligned(1);
-
-struct rmnet_map_dl_csum_trailer {
-	u8  reserved1;
-	u8  valid:1;
-	u8  reserved2:7;
-	u16 csum_start_offset;
-	u16 csum_length;
-	__be16 csum_value;
-} __aligned(1);
-
-struct rmnet_map_ul_csum_header {
-	__be16 csum_start_offset;
-	u16 csum_insert_offset:14;
-	u16 udp_ip4_ind:1;
-	u16 csum_enabled:1;
-} __aligned(1);
-
 #define RMNET_MAP_GET_MUX_ID(Y) (((struct rmnet_map_header *) \
 				 (Y)->data)->mux_id)
 #define RMNET_MAP_GET_CD_BIT(Y) (((struct rmnet_map_header *) \
diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h
new file mode 100644
index 000000000000..b4f5403383fc
--- /dev/null
+++ b/include/linux/if_rmnet.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-only
+ * Copyright (c) 2013-2019, The Linux Foundation. All rights reserved.
+ */
+
+#ifndef _LINUX_IF_RMNET_H_
+#define _LINUX_IF_RMNET_H_
+
+struct rmnet_map_header {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u8  pad_len:6;
+	u8  reserved_bit:1;
+	u8  cd_bit:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	u8  cd_bit:1;
+	u8  reserved_bit:1;
+	u8  pad_len:6;
+#else
+#error	"Please fix <asm/byteorder.h>"
+#endif
+	u8  mux_id;
+	__be16 pkt_len;
+}  __aligned(1);
+
+struct rmnet_map_dl_csum_trailer {
+	u8  reserved1;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u8  valid:1;
+	u8  reserved2:7;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	u8  reserved2:7;
+	u8  valid:1;
+#else
+#error	"Please fix <asm/byteorder.h>"
+#endif
+	u16 csum_start_offset;
+	u16 csum_length;
+	__be16 csum_value;
+} __aligned(1);
+
+struct rmnet_map_ul_csum_header {
+	__be16 csum_start_offset;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u16 csum_insert_offset:14;
+	u16 udp_ip4_ind:1;
+	u16 csum_enabled:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	u16 csum_enabled:1;
+	u16 udp_ip4_ind:1;
+	u16 csum_insert_offset:14;
+#else
+#error	"Please fix <asm/byteorder.h>"
+#endif
+} __aligned(1);
+
+#endif /* !(_LINUX_IF_RMNET_H_) */
-- 
cgit v1.2.3


From a8f500af0ccffc3d2aaf9018537981cb173865a1 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 21 May 2019 20:17:06 -0700
Subject: bpf: split explored_states

Split explored_states into a prune_point boolean mark
and a linked list of explored states.
This removes the STATE_LIST_MARK hack and allows marks to be separate from states.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 31 +++++++++++++------------------
 2 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 1305ccbd8fe6..02bba09a0ea1 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -233,6 +233,7 @@ struct bpf_insn_aux_data {
 	int sanitize_stack_off; /* stack slot to be cleared */
 	bool seen; /* this insn was processed by the verifier */
 	u8 alu_state; /* used in combination with alu_limit */
+	bool prune_point;
 	unsigned int orig_idx; /* original instruction index */
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 736b5a0d4848..6a3e69ba891e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5436,7 +5436,6 @@ enum {
 	BRANCH = 2,
 };
 
-#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)
 static struct bpf_verifier_state_list **explored_state(
 					struct bpf_verifier_env *env,
 					int idx)
@@ -5446,7 +5445,7 @@ static struct bpf_verifier_state_list **explored_state(
 
 static void init_explored_state(struct bpf_verifier_env *env, int idx)
 {
-	env->explored_states[idx] = STATE_LIST_MARK;
+	env->insn_aux_data[idx].prune_point = true;
 }
 
 /* t, w, e - match pseudo-code above:
@@ -6018,10 +6017,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
 	int i;
 
 	sl = *explored_state(env, insn);
-	if (!sl)
-		return;
-
-	while (sl != STATE_LIST_MARK) {
+	while (sl) {
 		if (sl->state.curframe != cur->curframe)
 			goto next;
 		for (i = 0; i <= cur->curframe; i++)
@@ -6376,18 +6372,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	struct bpf_verifier_state *cur = env->cur_state, *new;
 	int i, j, err, states_cnt = 0;
 
-	pprev = explored_state(env, insn_idx);
-	sl = *pprev;
-
-	if (!sl)
+	if (!env->insn_aux_data[insn_idx].prune_point)
 		/* this 'insn_idx' instruction wasn't marked, so we will not
 		 * be doing state search here
 		 */
 		return 0;
 
+	pprev = explored_state(env, insn_idx);
+	sl = *pprev;
+
 	clean_live_states(env, insn_idx, cur);
 
-	while (sl != STATE_LIST_MARK) {
+	while (sl) {
 		if (states_equal(env, &sl->state, cur)) {
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -8145,13 +8141,12 @@ static void free_states(struct bpf_verifier_env *env)
 	for (i = 0; i < env->prog->len; i++) {
 		sl = env->explored_states[i];
 
-		if (sl)
-			while (sl != STATE_LIST_MARK) {
-				sln = sl->next;
-				free_verifier_state(&sl->state, false);
-				kfree(sl);
-				sl = sln;
-			}
+		while (sl) {
+			sln = sl->next;
+			free_verifier_state(&sl->state, false);
+			kfree(sl);
+			sl = sln;
+		}
 	}
 
 	kvfree(env->explored_states);
-- 
cgit v1.2.3


From dc2a4ebc0b44a212fcf72242210e56aa17e7317b Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 21 May 2019 20:17:07 -0700
Subject: bpf: convert explored_states to hash table

All prune points inside a callee bpf function most likely will have
different callsites. For example, if function foo() is called from
two callsites, half of the explored states in all prune points in foo()
will be useless for subsequent walking of one of those callsites.
Fortunately, explored_states pruning heuristics keep the number of states
per prune point small, but walking these states is still a waste of cpu
time when the callsite of the current state is different from the callsite
of the explored state.

To improve the pruning logic, convert explored_states into a hash table and
use a simple insn_idx ^ callsite hash to select the hash bucket.
This optimization has no effect on programs without bpf2bpf calls
and drastically improves programs with calls.
In the latter case it reduces total memory consumption in 1M scale tests
by almost 3 times (peak_states drops from 5752 to 2016).

Care should be taken when comparing the states for equivalency.
Since the same hash bucket can now contain states with different indices,
the insn_idx has to be part of verifier_state and compared.

Different hash table sizes and different hash functions were explored,
but the results were not significantly better vs this patch.
They can be improved in the future.

Hit/miss heuristic is not counting index miscompare as a miss.
Otherwise verifier stats become unstable when experimenting
with different hash functions.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 23 ++++++++++++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 02bba09a0ea1..405b502283c5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -187,6 +187,7 @@ struct bpf_func_state {
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
+	u32 insn_idx;
 	u32 curframe;
 	u32 active_spin_lock;
 	bool speculative;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6a3e69ba891e..550091c7a46a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5436,11 +5436,19 @@ enum {
 	BRANCH = 2,
 };
 
+static u32 state_htab_size(struct bpf_verifier_env *env)
+{
+	return env->prog->len;
+}
+
 static struct bpf_verifier_state_list **explored_state(
 					struct bpf_verifier_env *env,
 					int idx)
 {
-	return &env->explored_states[idx];
+	struct bpf_verifier_state *cur = env->cur_state;
+	struct bpf_func_state *state = cur->frame[cur->curframe];
+
+	return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
 }
 
 static void init_explored_state(struct bpf_verifier_env *env, int idx)
@@ -6018,7 +6026,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
 
 	sl = *explored_state(env, insn);
 	while (sl) {
-		if (sl->state.curframe != cur->curframe)
+		if (sl->state.insn_idx != insn ||
+		    sl->state.curframe != cur->curframe)
 			goto next;
 		for (i = 0; i <= cur->curframe; i++)
 			if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
@@ -6384,6 +6393,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	clean_live_states(env, insn_idx, cur);
 
 	while (sl) {
+		states_cnt++;
+		if (sl->state.insn_idx != insn_idx)
+			goto next;
 		if (states_equal(env, &sl->state, cur)) {
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -6401,7 +6413,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				return err;
 			return 1;
 		}
-		states_cnt++;
 		sl->miss_cnt++;
 		/* heuristic to determine whether this state is beneficial
 		 * to keep checking from state equivalence point of view.
@@ -6428,6 +6439,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			sl = *pprev;
 			continue;
 		}
+next:
 		pprev = &sl->next;
 		sl = *pprev;
 	}
@@ -6459,6 +6471,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		kfree(new_sl);
 		return err;
 	}
+	new->insn_idx = insn_idx;
 	new_sl->next = *explored_state(env, insn_idx);
 	*explored_state(env, insn_idx) = new_sl;
 	/* connect new state to parentage chain. Current frame needs all
@@ -8138,7 +8151,7 @@ static void free_states(struct bpf_verifier_env *env)
 	if (!env->explored_states)
 		return;
 
-	for (i = 0; i < env->prog->len; i++) {
+	for (i = 0; i < state_htab_size(env); i++) {
 		sl = env->explored_states[i];
 
 		while (sl) {
@@ -8246,7 +8259,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 			goto skip_full_check;
 	}
 
-	env->explored_states = kvcalloc(env->prog->len,
+	env->explored_states = kvcalloc(state_htab_size(env),
 				       sizeof(struct bpf_verifier_state_list *),
 				       GFP_USER);
 	ret = -ENOMEM;
-- 
cgit v1.2.3


From f40b6ae2b612446dc970d7b51eeec47bd1619f82 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 20:27:55 -0700
Subject: ipv6: Move pcpu cached routes to fib6_nh

rt6_info are specific instances of a fib entry and are tied to a
device and gateway - i.e., a nexthop. Before nexthop objects, IPv6 fib
entries have separate fib6_info for each nexthop in a multipath route,
so the location of the pcpu cache in the fib6_info struct worked.
However, with nexthop objects a fib6_info can point to a set of nexthops
(yet another alignment of ipv6 with ipv4). Accordingly, the pcpu
cache needs to be moved to the fib6_nh struct so the cached entries
are local to the nexthop specification used to create the rt6_info.

Initialization and free of the pcpu entries moved to fib6_nh_init and
fib6_nh_release.

Change in location only, from fib6_info down to fib6_nh; no other
functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  3 ++-
 net/ipv6/addrconf.c   |  6 +++---
 net/ipv6/ip6_fib.c    | 34 ++++++----------------------------
 net/ipv6/route.c      | 29 +++++++++++++++++++++++++++--
 4 files changed, 38 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 0d0d06b1cd26..38e87ef81b7e 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -131,6 +131,8 @@ struct fib6_nh {
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	unsigned long		last_probe;
 #endif
+
+	struct rt6_info * __percpu *rt6i_pcpu;
 };
 
 struct fib6_info {
@@ -156,7 +158,6 @@ struct fib6_info {
 	struct rt6key			fib6_src;
 	struct rt6key			fib6_prefsrc;
 
-	struct rt6_info * __percpu	*rt6i_pcpu;
 	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
 
 	u32				fib6_metric;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f96d1de79509..4bc35dd02b56 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6341,16 +6341,16 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
 	list_for_each_entry(ifa, &idev->addr_list, if_list) {
 		spin_lock(&ifa->lock);
 		if (ifa->rt) {
-			struct fib6_info *rt = ifa->rt;
+			struct fib6_nh *nh = &ifa->rt->fib6_nh;
 			int cpu;
 
 			rcu_read_lock();
 			ifa->rt->dst_nopolicy = val ? true : false;
-			if (rt->rt6i_pcpu) {
+			if (nh->rt6i_pcpu) {
 				for_each_possible_cpu(cpu) {
 					struct rt6_info **rtp;
 
-					rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu);
+					rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu);
 					addrconf_set_nopolicy(*rtp, val);
 				}
 			}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 7958cf91895a..274f1243866f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -155,12 +155,6 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
 	if (!f6i)
 		return NULL;
 
-	f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
-	if (!f6i->rt6i_pcpu) {
-		kfree(f6i);
-		return NULL;
-	}
-
 	INIT_LIST_HEAD(&f6i->fib6_siblings);
 	refcount_set(&f6i->fib6_ref, 1);
 
@@ -177,25 +171,6 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
 	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
 	kfree(bucket);
 
-	if (f6i->rt6i_pcpu) {
-		int cpu;
-
-		for_each_possible_cpu(cpu) {
-			struct rt6_info **ppcpu_rt;
-			struct rt6_info *pcpu_rt;
-
-			ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
-			pcpu_rt = *ppcpu_rt;
-			if (pcpu_rt) {
-				dst_dev_put(&pcpu_rt->dst);
-				dst_release(&pcpu_rt->dst);
-				*ppcpu_rt = NULL;
-			}
-		}
-
-		free_percpu(f6i->rt6i_pcpu);
-	}
-
 	fib6_nh_release(&f6i->fib6_nh);
 
 	ip_fib_metrics_put(f6i->fib6_metrics);
@@ -902,8 +877,12 @@ insert_above:
 static void fib6_drop_pcpu_from(struct fib6_info *f6i,
 				const struct fib6_table *table)
 {
+	struct fib6_nh *fib6_nh = &f6i->fib6_nh;
 	int cpu;
 
+	if (!fib6_nh->rt6i_pcpu)
+		return;
+
 	/* Make sure rt6_make_pcpu_route() wont add other percpu routes
 	 * while we are cleaning them here.
 	 */
@@ -917,7 +896,7 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
 		struct rt6_info **ppcpu_rt;
 		struct rt6_info *pcpu_rt;
 
-		ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+		ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
 		pcpu_rt = *ppcpu_rt;
 		if (pcpu_rt) {
 			struct fib6_info *from;
@@ -933,8 +912,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
 {
 	struct fib6_table *table = rt->fib6_table;
 
-	if (rt->rt6i_pcpu)
-		fib6_drop_pcpu_from(rt, table);
+	fib6_drop_pcpu_from(rt, table);
 
 	if (refcount_read(&rt->fib6_ref) != 1) {
 		/* This route is used as dummy address holder in some split
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 5f0661c18624..e404813c9844 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1270,7 +1270,7 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
 {
 	struct rt6_info *pcpu_rt, **p;
 
-	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
+	p = this_cpu_ptr(res->nh->rt6i_pcpu);
 	pcpu_rt = *p;
 
 	if (pcpu_rt)
@@ -1291,7 +1291,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net,
 	}
 
 	dst_hold(&pcpu_rt->dst);
-	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
+	p = this_cpu_ptr(res->nh->rt6i_pcpu);
 	prev = cmpxchg(p, NULL, pcpu_rt);
 	BUG_ON(prev);
 
@@ -3068,6 +3068,12 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 	    !netif_carrier_ok(dev))
 		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
 
+	fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+	if (!fib6_nh->rt6i_pcpu) {
+		err = -ENOMEM;
+		goto out;
+	}
+
 	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
 				 cfg->fc_encap_type, cfg, gfp_flags, extack);
 	if (err)
@@ -3092,6 +3098,25 @@ out:
 
 void fib6_nh_release(struct fib6_nh *fib6_nh)
 {
+	if (fib6_nh->rt6i_pcpu) {
+		int cpu;
+
+		for_each_possible_cpu(cpu) {
+			struct rt6_info **ppcpu_rt;
+			struct rt6_info *pcpu_rt;
+
+			ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
+			pcpu_rt = *ppcpu_rt;
+			if (pcpu_rt) {
+				dst_dev_put(&pcpu_rt->dst);
+				dst_release(&pcpu_rt->dst);
+				*ppcpu_rt = NULL;
+			}
+		}
+
+		free_percpu(fib6_nh->rt6i_pcpu);
+	}
+
 	fib_nh_common_release(&fib6_nh->nh_common);
 }
 
-- 
cgit v1.2.3


From cc5c073a693fa6ed7a207b0436114f68cce72434 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 20:27:58 -0700
Subject: ipv6: Move exception bucket to fib6_nh

Similar to the pcpu routes, exceptions are really per nexthop, so move
rt6i_exception_bucket from fib6_info to fib6_nh.

To avoid additional increases to the size of fib6_nh for a 1-bit flag,
use the lowest bit in the allocated memory pointer for the flushed flag.
Add helpers for retrieving the bucket pointer to mask off the flag.

The cleanup of the exception bucket is moved to fib6_nh_release.

fib6_nh_flush_exceptions can now be called from 2 contexts:
1. deleting a fib entry
2. deleting a fib6_nh

For 1., fib6_nh_flush_exceptions is called for a specific fib6_info that
is getting deleted. All exceptions in the cache using the entry are
deleted. For 2., the fib6_nh itself is getting destroyed, so
fib6_nh_flush_exceptions is called for a NULL fib6_info which means
flush all entries.

The pmtu.sh selftest exercises the affected code paths - from creating
exceptions to cleaning them up on device delete. All tests pass without
any rcu locking or memleak warnings.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |   8 +--
 net/ipv6/ip6_fib.c    |   6 --
 net/ipv6/route.c      | 185 +++++++++++++++++++++++++++++++++-----------------
 3 files changed, 126 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 38e87ef81b7e..6b4852cf2fc2 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -133,6 +133,7 @@ struct fib6_nh {
 #endif
 
 	struct rt6_info * __percpu *rt6i_pcpu;
+	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
 };
 
 struct fib6_info {
@@ -158,18 +159,15 @@ struct fib6_info {
 	struct rt6key			fib6_src;
 	struct rt6key			fib6_prefsrc;
 
-	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
-
 	u32				fib6_metric;
 	u8				fib6_protocol;
 	u8				fib6_type;
-	u8				exception_bucket_flushed:1,
-					should_flush:1,
+	u8				should_flush:1,
 					dst_nocount:1,
 					dst_nopolicy:1,
 					dst_host:1,
 					fib6_destroying:1,
-					unused:2;
+					unused:3;
 
 	struct fib6_nh			fib6_nh;
 	struct rcu_head			rcu;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 178a9c2d2d34..87ac82f850d2 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -164,17 +164,11 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
 void fib6_info_destroy_rcu(struct rcu_head *head)
 {
 	struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
-	struct rt6_exception_bucket *bucket;
 
 	WARN_ON(f6i->fib6_node);
 
-	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
-	kfree(bucket);
-
 	fib6_nh_release(&f6i->fib6_nh);
-
 	ip_fib_metrics_put(f6i->fib6_metrics);
-
 	kfree(f6i);
 }
 EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8bfaa7349e10..b01118a3c42e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1461,25 +1461,74 @@ static unsigned int fib6_mtu(const struct fib6_result *res)
 	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
 }
 
+#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL
+
+/* used when the flushed bit is not relevant, only access to the bucket
+ * (ie., all bucket users except rt6_insert_exception);
+ *
+ * called under rcu lock; sometimes called with rt6_exception_lock held
+ */
+static
+struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
+						       spinlock_t *lock)
+{
+	struct rt6_exception_bucket *bucket;
+
+	if (lock)
+		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+						   lockdep_is_held(lock));
+	else
+		bucket = rcu_dereference(nh->rt6i_exception_bucket);
+
+	/* remove bucket flushed bit if set */
+	if (bucket) {
+		unsigned long p = (unsigned long)bucket;
+
+		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
+		bucket = (struct rt6_exception_bucket *)p;
+	}
+
+	return bucket;
+}
+
+static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
+{
+	unsigned long p = (unsigned long)bucket;
+
+	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
+}
+
+/* called with rt6_exception_lock held */
+static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
+					      spinlock_t *lock)
+{
+	struct rt6_exception_bucket *bucket;
+	unsigned long p;
+
+	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+					   lockdep_is_held(lock));
+
+	p = (unsigned long)bucket;
+	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
+	bucket = (struct rt6_exception_bucket *)p;
+	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
+}
+
 static int rt6_insert_exception(struct rt6_info *nrt,
 				const struct fib6_result *res)
 {
 	struct net *net = dev_net(nrt->dst.dev);
 	struct rt6_exception_bucket *bucket;
+	struct fib6_info *f6i = res->f6i;
 	struct in6_addr *src_key = NULL;
 	struct rt6_exception *rt6_ex;
-	struct fib6_info *f6i = res->f6i;
+	struct fib6_nh *nh = res->nh;
 	int err = 0;
 
 	spin_lock_bh(&rt6_exception_lock);
 
-	if (f6i->exception_bucket_flushed) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
-					lockdep_is_held(&rt6_exception_lock));
+	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+					  lockdep_is_held(&rt6_exception_lock));
 	if (!bucket) {
 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
 				 GFP_ATOMIC);
@@ -1487,7 +1536,10 @@ static int rt6_insert_exception(struct rt6_info *nrt,
 			err = -ENOMEM;
 			goto out;
 		}
-		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
+		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
+	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
+		err = -EINVAL;
+		goto out;
 	}
 
 #ifdef CONFIG_IPV6_SUBTREES
@@ -1550,21 +1602,24 @@ static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
 	int i;
 
 	spin_lock_bh(&rt6_exception_lock);
-	/* Prevent rt6_insert_exception() to recreate the bucket list */
-	from->exception_bucket_flushed = 1;
 
-	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
-				    lockdep_is_held(&rt6_exception_lock));
+	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
 	if (!bucket)
 		goto out;
 
+	/* Prevent rt6_insert_exception() to recreate the bucket list */
+	if (!from)
+		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
+
 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
-		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
-			rt6_remove_exception(bucket, rt6_ex);
-		WARN_ON_ONCE(bucket->depth);
+		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
+			if (!from ||
+			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
+				rt6_remove_exception(bucket, rt6_ex);
+		}
+		WARN_ON_ONCE(!from && bucket->depth);
 		bucket++;
 	}
-
 out:
 	spin_unlock_bh(&rt6_exception_lock);
 }
@@ -1602,7 +1657,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
 		src_key = saddr;
 find_ex:
 #endif
-	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
+	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
 
 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
@@ -1620,7 +1675,7 @@ find_ex:
 }
 
 /* Remove the passed in cached rt from the hash table that contains it */
-static int fib6_nh_remove_exception(const struct fib6_info *from, int plen,
+static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
 				    const struct rt6_info *rt)
 {
 	const struct in6_addr *src_key = NULL;
@@ -1628,15 +1683,16 @@ static int fib6_nh_remove_exception(const struct fib6_info *from, int plen,
 	struct rt6_exception *rt6_ex;
 	int err;
 
-	if (!rcu_access_pointer(from->rt6i_exception_bucket))
+	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
 		return -ENOENT;
 
 	spin_lock_bh(&rt6_exception_lock);
-	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
-				    lockdep_is_held(&rt6_exception_lock));
+	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
+
 #ifdef CONFIG_IPV6_SUBTREES
-	/* plen != 0 indicates 'from' is in subtree and exception
-	 * table is indexed by a hash of both rt6i_dst and rt6i_src.
+	/* rt6i_src.plen != 0 indicates 'from' is in subtree
+	 * and exception table is indexed by a hash of
+	 * both rt6i_dst and rt6i_src.
 	 * Otherwise, the exception table is indexed by
 	 * a hash of only rt6i_dst.
 	 */
@@ -1662,37 +1718,35 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
 	struct fib6_info *from;
 
 	from = rcu_dereference(rt->from);
-	if (!from ||
-	    !(rt->rt6i_flags & RTF_CACHE))
+	if (!from || !(rt->rt6i_flags & RTF_CACHE))
 		return -EINVAL;
 
-	return fib6_nh_remove_exception(from, from->fib6_src.plen, rt);
+	return fib6_nh_remove_exception(&from->fib6_nh,
+					from->fib6_src.plen, rt);
 }
 
 /* Find rt6_ex which contains the passed in rt cache and
  * refresh its stamp
  */
-static void fib6_nh_update_exception(const struct fib6_info *from, int plen,
+static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
 				     const struct rt6_info *rt)
 {
 	const struct in6_addr *src_key = NULL;
 	struct rt6_exception_bucket *bucket;
 	struct rt6_exception *rt6_ex;
 
-	bucket = rcu_dereference(from->rt6i_exception_bucket);
-
+	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
 #ifdef CONFIG_IPV6_SUBTREES
-	/* plen != 0 indicates 'from' is in subtree and exception
-	 * table is indexed by a hash of both rt6i_dst and rt6i_src.
+	/* rt6i_src.plen != 0 indicates 'from' is in subtree
+	 * and exception table is indexed by a hash of
+	 * both rt6i_dst and rt6i_src.
 	 * Otherwise, the exception table is indexed by
 	 * a hash of only rt6i_dst.
 	 */
 	if (plen)
 		src_key = &rt->rt6i_src.addr;
 #endif
-	rt6_ex = __rt6_find_exception_rcu(&bucket,
-					  &rt->rt6i_dst.addr,
-					  src_key);
+	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
 	if (rt6_ex)
 		rt6_ex->stamp = jiffies;
 }
@@ -1707,7 +1761,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
 	if (!from || !(rt->rt6i_flags & RTF_CACHE))
 		goto unlock;
 
-	fib6_nh_update_exception(from, from->fib6_src.plen, rt);
+	fib6_nh_update_exception(&from->fib6_nh, from->fib6_src.plen, rt);
 unlock:
 	rcu_read_unlock();
 }
@@ -1735,15 +1789,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
 }
 
 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
-				       struct fib6_info *rt, int mtu)
+				       const struct fib6_nh *nh, int mtu)
 {
 	struct rt6_exception_bucket *bucket;
 	struct rt6_exception *rt6_ex;
 	int i;
 
-	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
-					lockdep_is_held(&rt6_exception_lock));
-
+	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
 	if (!bucket)
 		return;
 
@@ -1765,21 +1817,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
 
 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
 
-static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
-					struct in6_addr *gateway)
+static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
+					    const struct in6_addr *gateway)
 {
 	struct rt6_exception_bucket *bucket;
 	struct rt6_exception *rt6_ex;
 	struct hlist_node *tmp;
 	int i;
 
-	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
 		return;
 
 	spin_lock_bh(&rt6_exception_lock);
-	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
-				     lockdep_is_held(&rt6_exception_lock));
-
+	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
 	if (bucket) {
 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
 			hlist_for_each_entry_safe(rt6_ex, tmp,
@@ -1844,7 +1894,7 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
 	gc_args->more++;
 }
 
-static void fib6_nh_age_exceptions(struct fib6_info *rt,
+static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
 				   struct fib6_gc_args *gc_args,
 				   unsigned long now)
 {
@@ -1853,14 +1903,12 @@ static void fib6_nh_age_exceptions(struct fib6_info *rt,
 	struct hlist_node *tmp;
 	int i;
 
-	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
 		return;
 
 	rcu_read_lock_bh();
 	spin_lock(&rt6_exception_lock);
-	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
-				    lockdep_is_held(&rt6_exception_lock));
-
+	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
 	if (bucket) {
 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
 			hlist_for_each_entry_safe(rt6_ex, tmp,
@@ -1875,11 +1923,11 @@ static void fib6_nh_age_exceptions(struct fib6_info *rt,
 	rcu_read_unlock_bh();
 }
 
-void rt6_age_exceptions(struct fib6_info *rt,
+void rt6_age_exceptions(struct fib6_info *f6i,
 			struct fib6_gc_args *gc_args,
 			unsigned long now)
 {
-	fib6_nh_age_exceptions(rt, gc_args, now);
+	fib6_nh_age_exceptions(&f6i->fib6_nh, gc_args, now);
 }
 
 /* must be called with rcu lock held */
@@ -3122,6 +3170,19 @@ out:
 
 void fib6_nh_release(struct fib6_nh *fib6_nh)
 {
+	struct rt6_exception_bucket *bucket;
+
+	rcu_read_lock();
+
+	fib6_nh_flush_exceptions(fib6_nh, NULL);
+	bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
+	if (bucket) {
+		rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
+		kfree(bucket);
+	}
+
+	rcu_read_unlock();
+
 	if (fib6_nh->rt6i_pcpu) {
 		int cpu;
 
@@ -3411,9 +3472,11 @@ static int ip6_route_del(struct fib6_config *cfg,
 		for_each_fib6_node_rt_rcu(fn) {
 			struct fib6_nh *nh;
 
+			nh = &rt->fib6_nh;
 			if (cfg->fc_flags & RTF_CACHE) {
 				struct fib6_result res = {
 					.f6i = rt,
+					.nh = nh,
 				};
 				int rc;
 
@@ -3430,7 +3493,6 @@ static int ip6_route_del(struct fib6_config *cfg,
 				continue;
 			}
 
-			nh = &rt->fib6_nh;
 			if (cfg->fc_ifindex &&
 			    (!nh->fib_nh_dev ||
 			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
@@ -3947,18 +4009,17 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
 {
 	struct in6_addr *gateway = (struct in6_addr *)arg;
+	struct fib6_nh *nh = &rt->fib6_nh;
 
 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
-	    rt->fib6_nh.fib_nh_gw_family &&
-	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
+	    nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
 		return -1;
-	}
 
 	/* Further clean up cached routes in exception table.
 	 * This is needed because cached route may have a different
 	 * gateway than its 'parent' in the case of an ip redirect.
 	 */
-	rt6_exceptions_clean_tohost(rt, gateway);
+	fib6_nh_exceptions_clean_tohost(nh, gateway);
 
 	return 0;
 }
@@ -4225,10 +4286,10 @@ struct rt6_mtu_change_arg {
 	struct fib6_info *f6i;
 };
 
-static int fib6_nh_mtu_change(struct fib6_info *f6i, void *_arg)
+static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
 {
 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
-	struct fib6_nh *nh = &f6i->fib6_nh;
+	struct fib6_info *f6i = arg->f6i;
 
 	/* For administrative MTU increase, there is no way to discover
 	 * IPv6 PMTU increase, so PMTU increase should be updated here.
@@ -4244,7 +4305,7 @@ static int fib6_nh_mtu_change(struct fib6_info *f6i, void *_arg)
 			fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
 
 		spin_lock_bh(&rt6_exception_lock);
-		rt6_exceptions_update_pmtu(idev, f6i, arg->mtu);
+		rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
 		spin_unlock_bh(&rt6_exception_lock);
 	}
 
@@ -4270,7 +4331,7 @@ static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
 		return 0;
 
 	arg->f6i = f6i;
-	return fib6_nh_mtu_change(f6i, arg);
+	return fib6_nh_mtu_change(&f6i->fib6_nh, arg);
 }
 
 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
-- 
cgit v1.2.3


From 1cf844c747d5424abe76f7b599c00b1ac17d3fce Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 20:27:59 -0700
Subject: ipv6: Make fib6_nh optional at the end of fib6_info

Move fib6_nh to the end of fib6_info and make it an array of
size 0. Pass a flag to fib6_info_alloc indicating if the
allocation needs to add space for a fib6_nh.

The current code path always has a fib6_nh allocated with a
fib6_info; with nexthop objects they will be separate.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |  31 ++---
 include/net/ip6_fib.h                              |   6 +-
 include/net/ip6_route.h                            |   4 +-
 net/ipv6/addrconf.c                                |   6 +-
 net/ipv6/ip6_fib.c                                 |  18 +--
 net/ipv6/ndisc.c                                   |   8 +-
 net/ipv6/route.c                                   | 134 ++++++++++-----------
 7 files changed, 106 insertions(+), 101 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 1cda8a248b12..0ec52be7cc33 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -2886,7 +2886,7 @@ mlxsw_sp_nexthop6_group_cmp(const struct mlxsw_sp_nexthop_group *nh_grp,
 		return false;
 
 	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
-		struct fib6_nh *fib6_nh = &mlxsw_sp_rt6->rt->fib6_nh;
+		struct fib6_nh *fib6_nh = mlxsw_sp_rt6->rt->fib6_nh;
 		struct in6_addr *gw;
 		int ifindex, weight;
 
@@ -2958,7 +2958,7 @@ mlxsw_sp_nexthop6_group_hash(struct mlxsw_sp_fib6_entry *fib6_entry, u32 seed)
 	struct net_device *dev;
 
 	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
-		dev = mlxsw_sp_rt6->rt->fib6_nh.fib_nh_dev;
+		dev = mlxsw_sp_rt6->rt->fib6_nh->fib_nh_dev;
 		val ^= dev->ifindex;
 	}
 
@@ -3960,9 +3960,9 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp,
 		struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i];
 		struct fib6_info *rt = mlxsw_sp_rt6->rt;
 
-		if (nh->rif && nh->rif->dev == rt->fib6_nh.fib_nh_dev &&
+		if (nh->rif && nh->rif->dev == rt->fib6_nh->fib_nh_dev &&
 		    ipv6_addr_equal((const struct in6_addr *) &nh->gw_addr,
-				    &rt->fib6_nh.fib_nh_gw6))
+				    &rt->fib6_nh->fib_nh_gw6))
 			return nh;
 		continue;
 	}
@@ -4022,13 +4022,13 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry)
 	if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL ||
 	    fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_BLACKHOLE) {
 		list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6,
-				 list)->rt->fib6_nh.fib_nh_flags |= RTNH_F_OFFLOAD;
+				 list)->rt->fib6_nh->fib_nh_flags |= RTNH_F_OFFLOAD;
 		return;
 	}
 
 	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
 		struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group;
-		struct fib6_nh *fib6_nh = &mlxsw_sp_rt6->rt->fib6_nh;
+		struct fib6_nh *fib6_nh = mlxsw_sp_rt6->rt->fib6_nh;
 		struct mlxsw_sp_nexthop *nh;
 
 		nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6);
@@ -4050,7 +4050,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry)
 	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
 		struct fib6_info *rt = mlxsw_sp_rt6->rt;
 
-		rt->fib6_nh.fib_nh_flags &= ~RTNH_F_OFFLOAD;
+		rt->fib6_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD;
 	}
 }
 
@@ -4928,7 +4928,8 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6)
 static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt)
 {
 	/* RTF_CACHE routes are ignored */
-	return !(rt->fib6_flags & RTF_ADDRCONF) && rt->fib6_nh.fib_nh_gw_family;
+	return !(rt->fib6_flags & RTF_ADDRCONF) &&
+		rt->fib6_nh->fib_nh_gw_family;
 }
 
 static struct fib6_info *
@@ -4987,8 +4988,8 @@ static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp,
 					const struct fib6_info *rt,
 					enum mlxsw_sp_ipip_type *ret)
 {
-	return rt->fib6_nh.fib_nh_dev &&
-	       mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->fib6_nh.fib_nh_dev, ret);
+	return rt->fib6_nh->fib_nh_dev &&
+	       mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->fib6_nh->fib_nh_dev, ret);
 }
 
 static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp,
@@ -4998,7 +4999,7 @@ static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp,
 {
 	const struct mlxsw_sp_ipip_ops *ipip_ops;
 	struct mlxsw_sp_ipip_entry *ipip_entry;
-	struct net_device *dev = rt->fib6_nh.fib_nh_dev;
+	struct net_device *dev = rt->fib6_nh->fib_nh_dev;
 	struct mlxsw_sp_rif *rif;
 	int err;
 
@@ -5041,11 +5042,11 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp,
 				  struct mlxsw_sp_nexthop *nh,
 				  const struct fib6_info *rt)
 {
-	struct net_device *dev = rt->fib6_nh.fib_nh_dev;
+	struct net_device *dev = rt->fib6_nh->fib_nh_dev;
 
 	nh->nh_grp = nh_grp;
-	nh->nh_weight = rt->fib6_nh.fib_nh_weight;
-	memcpy(&nh->gw_addr, &rt->fib6_nh.fib_nh_gw6, sizeof(nh->gw_addr));
+	nh->nh_weight = rt->fib6_nh->fib_nh_weight;
+	memcpy(&nh->gw_addr, &rt->fib6_nh->fib_nh_gw6, sizeof(nh->gw_addr));
 	mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh);
 
 	list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list);
@@ -5068,7 +5069,7 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp,
 static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp,
 				    const struct fib6_info *rt)
 {
-	return rt->fib6_nh.fib_nh_gw_family ||
+	return rt->fib6_nh->fib_nh_gw_family ||
 	       mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL);
 }
 
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 6b4852cf2fc2..ebe5d65f97e0 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -169,8 +169,8 @@ struct fib6_info {
 					fib6_destroying:1,
 					unused:3;
 
-	struct fib6_nh			fib6_nh;
 	struct rcu_head			rcu;
+	struct fib6_nh			fib6_nh[0];
 };
 
 struct rt6_info {
@@ -280,7 +280,7 @@ static inline void ip6_rt_put(struct rt6_info *rt)
 	dst_release(&rt->dst);
 }
 
-struct fib6_info *fib6_info_alloc(gfp_t gfp_flags);
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh);
 void fib6_info_destroy_rcu(struct rcu_head *head);
 
 static inline void fib6_info_hold(struct fib6_info *f6i)
@@ -443,7 +443,7 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
 
 static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 {
-	return f6i->fib6_nh.fib_nh_dev;
+	return f6i->fib6_nh->fib_nh_dev;
 }
 
 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 4790beaa86e0..a6ce6ea856b9 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -70,7 +70,7 @@ static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
 {
 	/* the RTF_ADDRCONF flag filters out RA's */
 	return !(f6i->fib6_flags & RTF_ADDRCONF) &&
-		f6i->fib6_nh.fib_nh_gw_family;
+		f6i->fib6_nh->fib_nh_gw_family;
 }
 
 void ip6_route_input(struct sk_buff *skb);
@@ -275,7 +275,7 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
 
 static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
 {
-	struct fib6_nh *nha = &a->fib6_nh, *nhb = &b->fib6_nh;
+	struct fib6_nh *nha = a->fib6_nh, *nhb = b->fib6_nh;
 
 	return nha->fib_nh_dev == nhb->fib_nh_dev &&
 	       ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) &&
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4bc35dd02b56..683613e7355b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2421,9 +2421,9 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 		goto out;
 
 	for_each_fib6_node_rt_rcu(fn) {
-		if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex)
+		if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex)
 			continue;
-		if (no_gw && rt->fib6_nh.fib_nh_gw_family)
+		if (no_gw && rt->fib6_nh->fib_nh_gw_family)
 			continue;
 		if ((rt->fib6_flags & flags) != flags)
 			continue;
@@ -6341,7 +6341,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
 	list_for_each_entry(ifa, &idev->addr_list, if_list) {
 		spin_lock(&ifa->lock);
 		if (ifa->rt) {
-			struct fib6_nh *nh = &ifa->rt->fib6_nh;
+			struct fib6_nh *nh = ifa->rt->fib6_nh;
 			int cpu;
 
 			rcu_read_lock();
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 87ac82f850d2..cdfb8500ccae 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -147,11 +147,15 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
 	       addr[fn_bit >> 5];
 }
 
-struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
 {
 	struct fib6_info *f6i;
+	size_t sz = sizeof(*f6i);
 
-	f6i = kzalloc(sizeof(*f6i), gfp_flags);
+	if (with_fib6_nh)
+		sz += sizeof(struct fib6_nh);
+
+	f6i = kzalloc(sz, gfp_flags);
 	if (!f6i)
 		return NULL;
 
@@ -167,7 +171,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
 
 	WARN_ON(f6i->fib6_node);
 
-	fib6_nh_release(&f6i->fib6_nh);
+	fib6_nh_release(f6i->fib6_nh);
 	ip_fib_metrics_put(f6i->fib6_metrics);
 	kfree(f6i);
 }
@@ -912,7 +916,7 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
 	f6i->fib6_destroying = 1;
 	mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
 
-	fib6_nh = &f6i->fib6_nh;
+	fib6_nh = f6i->fib6_nh;
 	__fib6_drop_pcpu_from(fib6_nh, f6i, table);
 }
 
@@ -2301,14 +2305,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 #else
 	seq_puts(seq, "00000000000000000000000000000000 00 ");
 #endif
-	if (rt->fib6_nh.fib_nh_gw_family) {
+	if (rt->fib6_nh->fib_nh_gw_family) {
 		flags |= RTF_GATEWAY;
-		seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6);
+		seq_printf(seq, "%pi6", &rt->fib6_nh->fib_nh_gw6);
 	} else {
 		seq_puts(seq, "00000000000000000000000000000000");
 	}
 
-	dev = rt->fib6_nh.fib_nh_dev;
+	dev = rt->fib6_nh->fib_nh_dev;
 	seq_printf(seq, " %08x %08x %08x %08x %8s\n",
 		   rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
 		   flags, dev ? dev->name : "");
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 4c8e2ea8bf19..f874dde1ee85 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1293,8 +1293,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
 
 	if (rt) {
-		neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6,
-					 rt->fib6_nh.fib_nh_dev, NULL,
+		neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
+					 rt->fib6_nh->fib_nh_dev, NULL,
 					  &ipv6_hdr(skb)->saddr);
 		if (!neigh) {
 			ND_PRINTK(0, err,
@@ -1323,8 +1323,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 			return;
 		}
 
-		neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6,
-					 rt->fib6_nh.fib_nh_dev, NULL,
+		neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
+					 rt->fib6_nh->fib_nh_dev, NULL,
 					  &ipv6_hdr(skb)->saddr);
 		if (!neigh) {
 			ND_PRINTK(0, err,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b01118a3c42e..f248ce807116 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -441,12 +441,12 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
 	if (!fl6->mp_hash)
 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
 
-	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
+	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
 		goto out;
 
 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
 				 fib6_siblings) {
-		const struct fib6_nh *nh = &sibling->fib6_nh;
+		const struct fib6_nh *nh = sibling->fib6_nh;
 		int nh_upper_bound;
 
 		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
@@ -460,7 +460,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
 
 out:
 	res->f6i = match;
-	res->nh = &match->fib6_nh;
+	res->nh = match->fib6_nh;
 }
 
 /*
@@ -496,13 +496,13 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
 	struct fib6_nh *nh;
 
 	if (!oif && ipv6_addr_any(saddr)) {
-		nh = &f6i->fib6_nh;
+		nh = f6i->fib6_nh;
 		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
 			goto out;
 	}
 
 	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
-		nh = &spf6i->fib6_nh;
+		nh = spf6i->fib6_nh;
 		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
 			res->f6i = spf6i;
 			goto out;
@@ -511,14 +511,14 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
 
 	if (oif && flags & RT6_LOOKUP_F_IFACE) {
 		res->f6i = net->ipv6.fib6_null_entry;
-		nh = &res->f6i->fib6_nh;
+		nh = res->f6i->fib6_nh;
 		goto out;
 	}
 
-	nh = &f6i->fib6_nh;
+	nh = f6i->fib6_nh;
 	if (nh->fib_nh_flags & RTNH_F_DEAD) {
 		res->f6i = net->ipv6.fib6_null_entry;
-		nh = &res->f6i->fib6_nh;
+		nh = res->f6i->fib6_nh;
 	}
 out:
 	res->nh = nh;
@@ -714,7 +714,7 @@ static void __find_rr_leaf(struct fib6_info *f6i_start,
 		if (fib6_check_expired(f6i))
 			continue;
 
-		nh = &f6i->fib6_nh;
+		nh = f6i->fib6_nh;
 		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
 			res->f6i = f6i;
 			res->nh = nh;
@@ -796,7 +796,7 @@ static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
 out:
 	if (!res->f6i) {
 		res->f6i = net->ipv6.fib6_null_entry;
-		res->nh = &res->f6i->fib6_nh;
+		res->nh = res->f6i->fib6_nh;
 		res->fib6_flags = res->f6i->fib6_flags;
 		res->fib6_type = res->f6i->fib6_type;
 	}
@@ -1626,7 +1626,7 @@ out:
 
 void rt6_flush_exceptions(struct fib6_info *f6i)
 {
-	fib6_nh_flush_exceptions(&f6i->fib6_nh, f6i);
+	fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
 }
 
 /* Find cached rt in the hash table inside passed in rt
@@ -1721,7 +1721,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
 	if (!from || !(rt->rt6i_flags & RTF_CACHE))
 		return -EINVAL;
 
-	return fib6_nh_remove_exception(&from->fib6_nh,
+	return fib6_nh_remove_exception(from->fib6_nh,
 					from->fib6_src.plen, rt);
 }
 
@@ -1761,7 +1761,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
 	if (!from || !(rt->rt6i_flags & RTF_CACHE))
 		goto unlock;
 
-	fib6_nh_update_exception(&from->fib6_nh, from->fib6_src.plen, rt);
+	fib6_nh_update_exception(from->fib6_nh, from->fib6_src.plen, rt);
 unlock:
 	rcu_read_unlock();
 }
@@ -1927,7 +1927,7 @@ void rt6_age_exceptions(struct fib6_info *f6i,
 			struct fib6_gc_args *gc_args,
 			unsigned long now)
 {
-	fib6_nh_age_exceptions(&f6i->fib6_nh, gc_args, now);
+	fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
 }
 
 /* must be called with rcu lock held */
@@ -2456,7 +2456,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 			rcu_read_unlock();
 			return;
 		}
-		res.nh = &res.f6i->fib6_nh;
+		res.nh = res.f6i->fib6_nh;
 		res.fib6_flags = res.f6i->fib6_flags;
 		res.fib6_type = res.f6i->fib6_type;
 
@@ -2599,7 +2599,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 restart:
 	for_each_fib6_node_rt_rcu(fn) {
 		res.f6i = rt;
-		res.nh = &rt->fib6_nh;
+		res.nh = rt->fib6_nh;
 
 		if (fib6_check_expired(rt))
 			continue;
@@ -2623,7 +2623,7 @@ restart:
 	}
 
 	res.f6i = rt;
-	res.nh = &rt->fib6_nh;
+	res.nh = rt->fib6_nh;
 out:
 	if (ret) {
 		ip6_hold_safe(net, &ret);
@@ -3264,7 +3264,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 
 	err = -ENOMEM;
-	rt = fib6_info_alloc(gfp_flags);
+	rt = fib6_info_alloc(gfp_flags, true);
 	if (!rt)
 		goto out;
 
@@ -3304,7 +3304,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
 	rt->fib6_src.plen = cfg->fc_src_len;
 #endif
-	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
+	err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
 	if (err)
 		goto out;
 
@@ -3312,7 +3312,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 	 * they would result in kernel looping; promote them to reject routes
 	 */
 	addr_type = ipv6_addr_type(&cfg->fc_dst);
-	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
+	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type))
 		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
 
 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
@@ -3472,7 +3472,7 @@ static int ip6_route_del(struct fib6_config *cfg,
 		for_each_fib6_node_rt_rcu(fn) {
 			struct fib6_nh *nh;
 
-			nh = &rt->fib6_nh;
+			nh = rt->fib6_nh;
 			if (cfg->fc_flags & RTF_CACHE) {
 				struct fib6_result res = {
 					.f6i = rt,
@@ -3614,7 +3614,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	if (!res.f6i)
 		goto out;
 
-	res.nh = &res.f6i->fib6_nh;
+	res.nh = res.f6i->fib6_nh;
 	res.fib6_flags = res.f6i->fib6_flags;
 	res.fib6_type = res.f6i->fib6_type;
 	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
@@ -3666,12 +3666,12 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
 		goto out;
 
 	for_each_fib6_node_rt_rcu(fn) {
-		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
+		if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
 			continue;
 		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
-		    !rt->fib6_nh.fib_nh_gw_family)
+		    !rt->fib6_nh->fib_nh_gw_family)
 			continue;
-		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
+		if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
 			continue;
 		if (!fib6_info_hold_safe(rt))
 			continue;
@@ -3729,7 +3729,7 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
 
 	rcu_read_lock();
 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
-		struct fib6_nh *nh = &rt->fib6_nh;
+		struct fib6_nh *nh = rt->fib6_nh;
 
 		if (dev == nh->fib_nh_dev &&
 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
@@ -3981,7 +3981,7 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
 
-	if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
+	if (((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
 	    rt != net->ipv6.fib6_null_entry &&
 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
 		spin_lock_bh(&rt6_exception_lock);
@@ -4009,7 +4009,7 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
 {
 	struct in6_addr *gateway = (struct in6_addr *)arg;
-	struct fib6_nh *nh = &rt->fib6_nh;
+	struct fib6_nh *nh = rt->fib6_nh;
 
 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
 	    nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
@@ -4059,9 +4059,9 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
 
 static bool rt6_is_dead(const struct fib6_info *rt)
 {
-	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
-	    (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
-	     ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
+	if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
+	    (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
+	     ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
 		return true;
 
 	return false;
@@ -4073,11 +4073,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt)
 	int total = 0;
 
 	if (!rt6_is_dead(rt))
-		total += rt->fib6_nh.fib_nh_weight;
+		total += rt->fib6_nh->fib_nh_weight;
 
 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
 		if (!rt6_is_dead(iter))
-			total += iter->fib6_nh.fib_nh_weight;
+			total += iter->fib6_nh->fib_nh_weight;
 	}
 
 	return total;
@@ -4088,11 +4088,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
 	int upper_bound = -1;
 
 	if (!rt6_is_dead(rt)) {
-		*weight += rt->fib6_nh.fib_nh_weight;
+		*weight += rt->fib6_nh->fib_nh_weight;
 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
 						    total) - 1;
 	}
-	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
+	atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
 }
 
 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
@@ -4136,8 +4136,8 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
 	struct net *net = dev_net(arg->dev);
 
 	if (rt != net->ipv6.fib6_null_entry &&
-	    rt->fib6_nh.fib_nh_dev == arg->dev) {
-		rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
+	    rt->fib6_nh->fib_nh_dev == arg->dev) {
+		rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
 		fib6_update_sernum_upto_root(net, rt);
 		rt6_multipath_rebalance(rt);
 	}
@@ -4165,10 +4165,10 @@ static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
 {
 	struct fib6_info *iter;
 
-	if (rt->fib6_nh.fib_nh_dev == dev)
+	if (rt->fib6_nh->fib_nh_dev == dev)
 		return true;
 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
-		if (iter->fib6_nh.fib_nh_dev == dev)
+		if (iter->fib6_nh->fib_nh_dev == dev)
 			return true;
 
 	return false;
@@ -4189,12 +4189,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
 	struct fib6_info *iter;
 	unsigned int dead = 0;
 
-	if (rt->fib6_nh.fib_nh_dev == down_dev ||
-	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
+	if (rt->fib6_nh->fib_nh_dev == down_dev ||
+	    rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
 		dead++;
 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
-		if (iter->fib6_nh.fib_nh_dev == down_dev ||
-		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
+		if (iter->fib6_nh->fib_nh_dev == down_dev ||
+		    iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
 			dead++;
 
 	return dead;
@@ -4206,11 +4206,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
 {
 	struct fib6_info *iter;
 
-	if (rt->fib6_nh.fib_nh_dev == dev)
-		rt->fib6_nh.fib_nh_flags |= nh_flags;
+	if (rt->fib6_nh->fib_nh_dev == dev)
+		rt->fib6_nh->fib_nh_flags |= nh_flags;
 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
-		if (iter->fib6_nh.fib_nh_dev == dev)
-			iter->fib6_nh.fib_nh_flags |= nh_flags;
+		if (iter->fib6_nh->fib_nh_dev == dev)
+			iter->fib6_nh->fib_nh_flags |= nh_flags;
 }
 
 /* called with write lock held for table with rt */
@@ -4225,12 +4225,12 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
 
 	switch (arg->event) {
 	case NETDEV_UNREGISTER:
-		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
+		return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
 	case NETDEV_DOWN:
 		if (rt->should_flush)
 			return -1;
 		if (!rt->fib6_nsiblings)
-			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
+			return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
 		if (rt6_multipath_uses_dev(rt, dev)) {
 			unsigned int count;
 
@@ -4246,10 +4246,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
 		}
 		return -2;
 	case NETDEV_CHANGE:
-		if (rt->fib6_nh.fib_nh_dev != dev ||
+		if (rt->fib6_nh->fib_nh_dev != dev ||
 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
 			break;
-		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
+		rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
 		rt6_multipath_rebalance(rt);
 		break;
 	}
@@ -4331,7 +4331,7 @@ static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
 		return 0;
 
 	arg->f6i = f6i;
-	return fib6_nh_mtu_change(&f6i->fib6_nh, arg);
+	return fib6_nh_mtu_change(f6i->fib6_nh, arg);
 }
 
 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
@@ -4611,7 +4611,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 			goto cleanup;
 		}
 
-		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
+		rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
 
 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
 					    rt, &r_cfg);
@@ -4778,7 +4778,7 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
 			    + NLA_ALIGN(sizeof(struct rtnexthop))
 			    + nla_total_size(16) /* RTA_GATEWAY */
-			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
+			    + lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws);
 
 		nexthop_len *= rt->fib6_nsiblings;
 	}
@@ -4796,7 +4796,7 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
 	       + nla_total_size(sizeof(struct rta_cacheinfo))
 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
 	       + nla_total_size(1) /* RTA_PREF */
-	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
+	       + lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws)
 	       + nexthop_len;
 }
 
@@ -4916,14 +4916,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 		if (!mp)
 			goto nla_put_failure;
 
-		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
-				    rt->fib6_nh.fib_nh_weight) < 0)
+		if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
+				    rt->fib6_nh->fib_nh_weight) < 0)
 			goto nla_put_failure;
 
 		list_for_each_entry_safe(sibling, next_sibling,
 					 &rt->fib6_siblings, fib6_siblings) {
-			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
-					    sibling->fib6_nh.fib_nh_weight) < 0)
+			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
+					    sibling->fib6_nh->fib_nh_weight) < 0)
 				goto nla_put_failure;
 		}
 
@@ -4931,7 +4931,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	} else {
 		unsigned char nh_flags = 0;
 
-		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
+		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
 				     &nh_flags, false) < 0)
 			goto nla_put_failure;
 
@@ -4961,7 +4961,7 @@ nla_put_failure:
 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
 			       const struct net_device *dev)
 {
-	if (f6i->fib6_nh.fib_nh_dev == dev)
+	if (f6i->fib6_nh->fib_nh_dev == dev)
 		return true;
 
 	if (f6i->fib6_nsiblings) {
@@ -4969,7 +4969,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i,
 
 		list_for_each_entry_safe(sibling, next_sibling,
 					 &f6i->fib6_siblings, fib6_siblings) {
-			if (sibling->fib6_nh.fib_nh_dev == dev)
+			if (sibling->fib6_nh->fib_nh_dev == dev)
 				return true;
 		}
 	}
@@ -5290,7 +5290,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
 		return NOTIFY_OK;
 
 	if (event == NETDEV_REGISTER) {
-		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
+		net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
 		net->ipv6.ip6_null_entry->dst.dev = dev;
 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -5484,11 +5484,11 @@ static int __net_init ip6_route_net_init(struct net *net)
 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
 		goto out_ip6_dst_ops;
 
-	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
-					    sizeof(*net->ipv6.fib6_null_entry),
-					    GFP_KERNEL);
+	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
 	if (!net->ipv6.fib6_null_entry)
 		goto out_ip6_dst_entries;
+	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
+	       sizeof(*net->ipv6.fib6_null_entry));
 
 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
 					   sizeof(*net->ipv6.ip6_null_entry),
@@ -5625,7 +5625,7 @@ void __init ip6_route_init_special_entries(void)
 	/* Registering of the loopback is done before this portion of code,
 	 * the loopback reference in rt6_info will not be taken, do it
 	 * manually for init_net */
-	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
+	init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
-- 
cgit v1.2.3


From 4618d6719743b60f1da4b8112c4518ee46110b94 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 23 May 2019 20:06:49 +0200
Subject: net: phy: add interface mode PHY_INTERFACE_MODE_USXGMII

Add support for interface mode PHY_INTERFACE_MODE_USXGMII.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 073fb151b5a9..7180b1d1e5e3 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -103,6 +103,7 @@ typedef enum {
 	PHY_INTERFACE_MODE_XAUI,
 	/* 10GBASE-KR, XFI, SFI - single lane 10G Serdes */
 	PHY_INTERFACE_MODE_10GKR,
+	PHY_INTERFACE_MODE_USXGMII,
 	PHY_INTERFACE_MODE_MAX,
 } phy_interface_t;
 
@@ -178,6 +179,8 @@ static inline const char *phy_modes(phy_interface_t interface)
 		return "xaui";
 	case PHY_INTERFACE_MODE_10GKR:
 		return "10gbase-kr";
+	case PHY_INTERFACE_MODE_USXGMII:
+		return "usxgmii";
 	default:
 		return "unknown";
 	}
-- 
cgit v1.2.3


From 8b401f9ed2441ad9e219953927a842d24ed051fc Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 23 May 2019 14:47:45 -0700
Subject: bpf: implement bpf_send_signal() helper

This patch tries to solve the following specific use case.

Currently, a bpf program can already collect stack traces
through the kernel function get_perf_callchain()
when certain events happen (e.g., a cache miss counter or
cpu clock counter overflows). But such stack traces are
not enough for jitted programs, e.g., hhvm (jitted php).
To get real stack trace, jit engine internal data structures
need to be traversed in order to get the real user functions.

bpf program itself may not be the best place to traverse
the jit engine as the traversing logic could be complex and
it is not a stable interface either.

Instead, hhvm implements a signal handler,
e.g. for SIGALRM, and a set of program locations at which
it can dump stack traces. When it receives a signal, it will
dump the stack at the next such program location.

Such a mechanism can be implemented in the following way:
  . a perf ring buffer is created between bpf program
    and tracing app.
  . once a particular event happens, bpf program writes
    to the ring buffer and the tracing app gets notified.
  . the tracing app sends the signal SIGALRM to hhvm.

But this method could incur large delays, skewing the
profiling results.

This patch implements bpf_send_signal() helper to send
a signal to hhvm in real time, resulting in intended stack traces.

Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 17 +++++++++++-
 kernel/trace/bpf_trace.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 63e0cf66f01a..68d4470523a0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2672,6 +2672,20 @@ union bpf_attr {
  *		0 on success.
  *
  *		**-ENOENT** if the bpf-local-storage cannot be found.
+ *
+ * int bpf_send_signal(u32 sig)
+ *	Description
+ *		Send signal *sig* to the current task.
+ *	Return
+ *		0 on success or successfully queued.
+ *
+ *		**-EBUSY** if work queue under nmi is full.
+ *
+ *		**-EINVAL** if *sig* is invalid.
+ *
+ *		**-EPERM** if no permission to send the *sig*.
+ *
+ *		**-EAGAIN** if bpf program can try again.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2782,7 +2796,8 @@ union bpf_attr {
 	FN(strtol),			\
 	FN(strtoul),			\
 	FN(sk_storage_get),		\
-	FN(sk_storage_delete),
+	FN(sk_storage_delete),		\
+	FN(send_signal),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f92d6ad5e080..70029eafc71f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -567,6 +567,63 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+struct send_signal_irq_work {
+	struct irq_work irq_work;
+	struct task_struct *task;
+	u32 sig;
+};
+
+static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
+
+static void do_bpf_send_signal(struct irq_work *entry)
+{
+	struct send_signal_irq_work *work;
+
+	work = container_of(entry, struct send_signal_irq_work, irq_work);
+	group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID);
+}
+
+BPF_CALL_1(bpf_send_signal, u32, sig)
+{
+	struct send_signal_irq_work *work = NULL;
+
+	/* Similar to bpf_probe_write_user, task needs to be
+	 * in a sound condition and kernel memory access be
+	 * permitted in order to send signal to the current
+	 * task.
+	 */
+	if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
+		return -EPERM;
+	if (unlikely(uaccess_kernel()))
+		return -EPERM;
+	if (unlikely(!nmi_uaccess_okay()))
+		return -EPERM;
+
+	if (in_nmi()) {
+		work = this_cpu_ptr(&send_signal_work);
+		if (work->irq_work.flags & IRQ_WORK_BUSY)
+			return -EBUSY;
+
+		/* Add the current task, which is the target of sending signal,
+		 * to the irq_work. The current task may change when queued
+		 * irq works get executed.
+		 */
+		work->task = current;
+		work->sig = sig;
+		irq_work_queue(&work->irq_work);
+		return 0;
+	}
+
+	return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
+}
+
+static const struct bpf_func_proto bpf_send_signal_proto = {
+	.func		= bpf_send_signal,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -617,6 +674,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_current_cgroup_id:
 		return &bpf_get_current_cgroup_id_proto;
 #endif
+	case BPF_FUNC_send_signal:
+		return &bpf_send_signal_proto;
 	default:
 		return NULL;
 	}
@@ -1343,5 +1402,18 @@ static int __init bpf_event_init(void)
 	return 0;
 }
 
+static int __init send_signal_irq_work_init(void)
+{
+	int cpu;
+	struct send_signal_irq_work *work;
+
+	for_each_possible_cpu(cpu) {
+		work = per_cpu_ptr(&send_signal_work, cpu);
+		init_irq_work(&work->irq_work, do_bpf_send_signal);
+	}
+	return 0;
+}
+
 fs_initcall(bpf_event_init);
+subsys_initcall(send_signal_irq_work_init);
 #endif /* CONFIG_MODULES */
-- 
cgit v1.2.3


From 5327ed3d44b754f5cc51d5b3f18e442eaebacff5 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 24 May 2019 23:25:12 +0100
Subject: bpf: verifier: mark verified-insn with sub-register zext flag

The eBPF ISA specification requires the high 32 bits to be cleared when the
low 32-bit sub-register is written. This applies to the destination register
of ALU32 etc. JIT back-ends must guarantee this semantic when doing code-gen.
The x86_64 and AArch64 ISAs have the same semantics, so the corresponding JIT
back-ends don't need to do extra work.

However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches
(PowerPC, SPARC etc) need to do explicit zero extension to meet this
requirement, otherwise code like the following will fail.

  u64_value = (u64) u32_value
  ... other uses of u64_value

This is because the compiler could exploit the semantic described above and
omit the zero extensions when extending u32_value to u64_value. These JIT
back-ends are expected to guarantee this by inserting extra zero
extensions, which however could significantly increase the code size.
Some benchmarks show that ~40% of total insns could be sub-register writes,
meaning at least ~40% extra code-gen.

One observation is that these extra zero extensions are not always necessary.
Taking the above code snippet as an example, it is possible that u32_value
will never be cast to a u64; the high 32 bits of u32_value can then be
ignored and the extra zero extension eliminated.

This patch implements this idea: insns defining sub-registers will be
marked when the high 32 bits of the defined sub-register matter. For
unmarked insns, it is safe to eliminate the high 32-bit clearance.

Algo:
 - Split read flags into READ32 and READ64.

 - Record index of insn that does sub-register write. Keep the index inside
   reg state and update it during verifier insn walking.

 - A full register read on a sub-register marks its definition insn as
   needing zero extension on dst register.

   A new sub-register write overrides the old one.

 - When propagating read64 during path pruning, also mark any insn defining
   a sub-register that is read in the pruned path as full-register.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  14 +++-
 kernel/bpf/verifier.c        | 173 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 171 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 405b502283c5..704ed7971472 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -36,9 +36,11 @@
  */
 enum bpf_reg_liveness {
 	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
-	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
-	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
-	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
+	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
+	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
+	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
+	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
+	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
 };
 
 struct bpf_reg_state {
@@ -131,6 +133,11 @@ struct bpf_reg_state {
 	 * pointing to bpf_func_state.
 	 */
 	u32 frameno;
+	/* Tracks subreg definition. The stored value is the insn_idx of the
+	 * writing insn. This is safe because subreg_def is used before any insn
+	 * patching which only happens after main verification finished.
+	 */
+	s32 subreg_def;
 	enum bpf_reg_liveness live;
 };
 
@@ -233,6 +240,7 @@ struct bpf_insn_aux_data {
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
 	int sanitize_stack_off; /* stack slot to be cleared */
 	bool seen; /* this insn was processed by the verifier */
+	bool zext_dst; /* this insn zero extends dst reg */
 	u8 alu_state; /* used in combination with alu_limit */
 	bool prune_point;
 	unsigned int orig_idx; /* original instruction index */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 550091c7a46a..f6b4c7148c3e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -982,6 +982,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
 	__mark_reg_not_init(regs + regno);
 }
 
+#define DEF_NOT_SUBREG	(0)
 static void init_reg_state(struct bpf_verifier_env *env,
 			   struct bpf_func_state *state)
 {
@@ -992,6 +993,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
 		mark_reg_not_init(env, regs, i);
 		regs[i].live = REG_LIVE_NONE;
 		regs[i].parent = NULL;
+		regs[i].subreg_def = DEF_NOT_SUBREG;
 	}
 
 	/* frame pointer */
@@ -1137,7 +1139,7 @@ next:
  */
 static int mark_reg_read(struct bpf_verifier_env *env,
 			 const struct bpf_reg_state *state,
-			 struct bpf_reg_state *parent)
+			 struct bpf_reg_state *parent, u8 flag)
 {
 	bool writes = parent == state->parent; /* Observe write marks */
 	int cnt = 0;
@@ -1152,17 +1154,26 @@ static int mark_reg_read(struct bpf_verifier_env *env,
 				parent->var_off.value, parent->off);
 			return -EFAULT;
 		}
-		if (parent->live & REG_LIVE_READ)
+		/* The first condition is more likely to be true than the
+		 * second, checked it first.
+		 */
+		if ((parent->live & REG_LIVE_READ) == flag ||
+		    parent->live & REG_LIVE_READ64)
 			/* The parentage chain never changes and
 			 * this parent was already marked as LIVE_READ.
 			 * There is no need to keep walking the chain again and
 			 * keep re-marking all parents as LIVE_READ.
 			 * This case happens when the same register is read
 			 * multiple times without writes into it in-between.
+			 * Also, if parent has the stronger REG_LIVE_READ64 set,
+			 * then no need to set the weak REG_LIVE_READ32.
 			 */
 			break;
 		/* ... then we depend on parent's value */
-		parent->live |= REG_LIVE_READ;
+		parent->live |= flag;
+		/* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
+		if (flag == REG_LIVE_READ64)
+			parent->live &= ~REG_LIVE_READ32;
 		state = parent;
 		parent = state->parent;
 		writes = true;
@@ -1174,12 +1185,111 @@ static int mark_reg_read(struct bpf_verifier_env *env,
 	return 0;
 }
 
+/* This function is supposed to be used by the following 32-bit optimization
+ * code only. It returns TRUE if the source or destination register operates
+ * on 64-bit, otherwise return FALSE.
+ */
+static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
+		     u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
+{
+	u8 code, class, op;
+
+	code = insn->code;
+	class = BPF_CLASS(code);
+	op = BPF_OP(code);
+	if (class == BPF_JMP) {
+		/* BPF_EXIT for "main" will reach here. Return TRUE
+		 * conservatively.
+		 */
+		if (op == BPF_EXIT)
+			return true;
+		if (op == BPF_CALL) {
+			/* BPF to BPF call will reach here because of marking
+			 * caller saved clobber with DST_OP_NO_MARK for which we
+			 * don't care the register def because they are anyway
+			 * marked as NOT_INIT already.
+			 */
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				return false;
+			/* Helper call will reach here because of arg type
+			 * check, conservatively return TRUE.
+			 */
+			if (t == SRC_OP)
+				return true;
+
+			return false;
+		}
+	}
+
+	if (class == BPF_ALU64 || class == BPF_JMP ||
+	    /* BPF_END always use BPF_ALU class. */
+	    (class == BPF_ALU && op == BPF_END && insn->imm == 64))
+		return true;
+
+	if (class == BPF_ALU || class == BPF_JMP32)
+		return false;
+
+	if (class == BPF_LDX) {
+		if (t != SRC_OP)
+			return BPF_SIZE(code) == BPF_DW;
+		/* LDX source must be ptr. */
+		return true;
+	}
+
+	if (class == BPF_STX) {
+		if (reg->type != SCALAR_VALUE)
+			return true;
+		return BPF_SIZE(code) == BPF_DW;
+	}
+
+	if (class == BPF_LD) {
+		u8 mode = BPF_MODE(code);
+
+		/* LD_IMM64 */
+		if (mode == BPF_IMM)
+			return true;
+
+		/* Both LD_IND and LD_ABS return 32-bit data. */
+		if (t != SRC_OP)
+			return  false;
+
+		/* Implicit ctx ptr. */
+		if (regno == BPF_REG_6)
+			return true;
+
+		/* Explicit source could be any width. */
+		return true;
+	}
+
+	if (class == BPF_ST)
+		/* The only source register for BPF_ST is a ptr. */
+		return true;
+
+	/* Conservatively return true at default. */
+	return true;
+}
+
+static void mark_insn_zext(struct bpf_verifier_env *env,
+			   struct bpf_reg_state *reg)
+{
+	s32 def_idx = reg->subreg_def;
+
+	if (def_idx == DEF_NOT_SUBREG)
+		return;
+
+	env->insn_aux_data[def_idx - 1].zext_dst = true;
+	/* The dst will be zero extended, so won't be sub-register anymore. */
+	reg->subreg_def = DEF_NOT_SUBREG;
+}
+
 static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			 enum reg_arg_type t)
 {
 	struct bpf_verifier_state *vstate = env->cur_state;
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
+	struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
 	struct bpf_reg_state *reg, *regs = state->regs;
+	bool rw64;
 
 	if (regno >= MAX_BPF_REG) {
 		verbose(env, "R%d is invalid\n", regno);
@@ -1187,6 +1297,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 	}
 
 	reg = &regs[regno];
+	rw64 = is_reg64(env, insn, regno, reg, t);
 	if (t == SRC_OP) {
 		/* check whether register used as source operand can be read */
 		if (reg->type == NOT_INIT) {
@@ -1197,7 +1308,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 		if (regno == BPF_REG_FP)
 			return 0;
 
-		return mark_reg_read(env, reg, reg->parent);
+		if (rw64)
+			mark_insn_zext(env, reg);
+
+		return mark_reg_read(env, reg, reg->parent,
+				     rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
@@ -1205,6 +1320,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		reg->live |= REG_LIVE_WRITTEN;
+		reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
 		if (t == DST_OP)
 			mark_reg_unknown(env, regs, regno);
 	}
@@ -1384,7 +1500,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
 			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
 		}
 		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
-			      reg_state->stack[spi].spilled_ptr.parent);
+			      reg_state->stack[spi].spilled_ptr.parent,
+			      REG_LIVE_READ64);
 		return 0;
 	} else {
 		int zeros = 0;
@@ -1401,7 +1518,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
 			return -EACCES;
 		}
 		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
-			      reg_state->stack[spi].spilled_ptr.parent);
+			      reg_state->stack[spi].spilled_ptr.parent,
+			      REG_LIVE_READ64);
 		if (value_regno >= 0) {
 			if (zeros == size) {
 				/* any size read into register is zero extended,
@@ -2110,6 +2228,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 						    value_regno);
 				if (reg_type_may_be_null(reg_type))
 					regs[value_regno].id = ++env->id_gen;
+				/* A load of ctx field could have different
+				 * actual load size with the one encoded in the
+				 * insn. When the dst is PTR, it is for sure not
+				 * a sub-register.
+				 */
+				regs[value_regno].subreg_def = DEF_NOT_SUBREG;
 			}
 			regs[value_regno].type = reg_type;
 		}
@@ -2369,7 +2493,8 @@ mark:
 		 * the whole slot to be marked as 'read'
 		 */
 		mark_reg_read(env, &state->stack[spi].spilled_ptr,
-			      state->stack[spi].spilled_ptr.parent);
+			      state->stack[spi].spilled_ptr.parent,
+			      REG_LIVE_READ64);
 	}
 	return update_stack_depth(env, state, min_off);
 }
@@ -3333,6 +3458,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
 	}
 
+	/* helper call returns 64-bit value. */
+	regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
 	/* update return register (already marked as written above) */
 	if (fn->ret_type == RET_INTEGER) {
 		/* sets type to SCALAR_VALUE */
@@ -4264,6 +4392,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				 */
 				*dst_reg = *src_reg;
 				dst_reg->live |= REG_LIVE_WRITTEN;
+				dst_reg->subreg_def = DEF_NOT_SUBREG;
 			} else {
 				/* R1 = (u32) R2 */
 				if (is_pointer_value(env, insn->src_reg)) {
@@ -4274,6 +4403,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				} else if (src_reg->type == SCALAR_VALUE) {
 					*dst_reg = *src_reg;
 					dst_reg->live |= REG_LIVE_WRITTEN;
+					dst_reg->subreg_def = env->insn_idx + 1;
 				} else {
 					mark_reg_unknown(env, regs,
 							 insn->dst_reg);
@@ -5353,6 +5483,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	 * Already marked as written above.
 	 */
 	mark_reg_unknown(env, regs, BPF_REG_0);
+	/* ld_abs load up to 32-bit skb data. */
+	regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
 	return 0;
 }
 
@@ -6309,20 +6441,33 @@ static bool states_equal(struct bpf_verifier_env *env,
 	return true;
 }
 
+/* Return 0 if no propagation happened. Return negative error code if error
+ * happened. Otherwise, return the propagated bit.
+ */
 static int propagate_liveness_reg(struct bpf_verifier_env *env,
 				  struct bpf_reg_state *reg,
 				  struct bpf_reg_state *parent_reg)
 {
+	u8 parent_flag = parent_reg->live & REG_LIVE_READ;
+	u8 flag = reg->live & REG_LIVE_READ;
 	int err;
 
-	if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ))
+	/* When comes here, read flags of PARENT_REG or REG could be any of
+	 * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
+	 * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
+	 */
+	if (parent_flag == REG_LIVE_READ64 ||
+	    /* Or if there is no read flag from REG. */
+	    !flag ||
+	    /* Or if the read flag from REG is the same as PARENT_REG. */
+	    parent_flag == flag)
 		return 0;
 
-	err = mark_reg_read(env, reg, parent_reg);
+	err = mark_reg_read(env, reg, parent_reg, flag);
 	if (err)
 		return err;
 
-	return 0;
+	return flag;
 }
 
 /* A write screens off any subsequent reads; but write marks come from the
@@ -6356,8 +6501,10 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 		for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
 			err = propagate_liveness_reg(env, &state_reg[i],
 						     &parent_reg[i]);
-			if (err)
+			if (err < 0)
 				return err;
+			if (err == REG_LIVE_READ64)
+				mark_insn_zext(env, &parent_reg[i]);
 		}
 
 		/* Propagate stack slots. */
@@ -6367,11 +6514,11 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 			state_reg = &state->stack[i].spilled_ptr;
 			err = propagate_liveness_reg(env, state_reg,
 						     parent_reg);
-			if (err)
+			if (err < 0)
 				return err;
 		}
 	}
-	return err;
+	return 0;
 }
 
 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
-- 
cgit v1.2.3


From 7d134041a89610ae552501fc88652805addcdee4 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 24 May 2019 23:25:14 +0100
Subject: bpf: introduce new mov32 variant for doing explicit zero extension

The encoding for this new variant is based on BPF_X format. "imm" field was
0 only, now it could be 1 which means doing zero extension unconditionally

  .code = BPF_ALU | BPF_MOV | BPF_X
  .dst_reg = DST
  .src_reg = SRC
  .imm  = 1

We use this new form for doing zero extension for which verifier will
guarantee SRC == DST.

Implications on JIT back-ends when doing code-gen for
BPF_ALU | BPF_MOV | BPF_X:
  1. No change if hardware already does zero extension unconditionally for
     sub-register write.
  2. Otherwise, when seeing imm == 1, just generate insns to clear high
     32-bit. No need to generate insns for the move because when imm == 1,
     dst_reg is the same as src_reg at the moment.

The interpreter doesn't need any change either. It already does zero
extension unconditionally for mov32.

One helper macro BPF_ZEXT_REG is added to help creating zero extension
insn using this new mov32 variant.

One helper function insn_is_zext is added for checking whether an insn is a
zero extension on dst. This will be widely used by a few JIT back-ends in
later patches in this set.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7148bab96943..bb10ffb88452 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -160,6 +160,20 @@ struct ctl_table_header;
 		.off   = 0,					\
 		.imm   = IMM })
 
+/* Special form of mov32, used for doing explicit zero extension on dst. */
+#define BPF_ZEXT_REG(DST)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = DST,					\
+		.off   = 0,					\
+		.imm   = 1 })
+
+static inline bool insn_is_zext(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1;
+}
+
 /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
 #define BPF_LD_IMM64(DST, IMM)					\
 	BPF_LD_IMM64_RAW(DST, 0, IMM)
-- 
cgit v1.2.3


From a4b1d3c1ddf6cb441187b6c130a473c16a05a356 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 24 May 2019 23:25:15 +0100
Subject: bpf: verifier: insert zero extension according to analysis result

After previous patches, verifier will mark an insn if it really needs zero
extension on dst_reg.

It is then for back-ends to decide how to use such information to eliminate
unnecessary zero extension code-gen during JIT compilation.

One approach is verifier insert explicit zero extension for those insns
that need zero extension in a generic way, JIT back-ends then do not
generate zero extension for sub-register write at default.

However, only those back-ends which do not have hardware zero extension
want this optimization. Back-ends like x86_64 and AArch64 have hardware
zero extension support, so the insertion should be disabled for them.

This patch introduces new target hook "bpf_jit_needs_zext" which returns
false at default, meaning verifier zero extension insertion is disabled at
default. A back-end could override this hook to return true if it doesn't
have hardware support and want verifier insert zero extension explicitly.

Offload targets do not use this native target hook, instead, they could
get the optimization results using bpf_prog_offload_ops.finalize.

NOTE: arches could have diversified features, it is possible for one arch
to have hardware zero extension support for some sub-register write insns
but not for all. For example, PowerPC, SPARC have zero extended loads, but
not for alu32. So when verifier zero extension insertion enabled, these JIT
back-ends need to peephole insns to remove those zero extension inserted
for insn that actually has hardware zero extension support. The peephole
could be as simple as looking the next insn, if it is a special zero
extension insn then it is safe to eliminate it if the current insn has
hardware zero extension support.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h    |  1 +
 include/linux/filter.h |  1 +
 kernel/bpf/core.c      |  9 +++++++++
 kernel/bpf/verifier.c  | 41 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 52 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4fb3aa2dc975..d98141edb74b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -370,6 +370,7 @@ struct bpf_prog_aux {
 	u32 id;
 	u32 func_cnt; /* used by non-func prog as the number of func progs */
 	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
+	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
 	bool offload_requested;
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index bb10ffb88452..ba8b65270e0d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -825,6 +825,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
+bool bpf_jit_needs_zext(void);
 bool bpf_helper_changes_pkt_data(void *func);
 
 static inline bool bpf_dump_raw_ok(void)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 242a643af82f..3675b19ecb90 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2090,6 +2090,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
 	return false;
 }
 
+/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
+ * analysis code and wants explicit zero extension inserted by verifier.
+ * Otherwise, return FALSE.
+ */
+bool __weak bpf_jit_needs_zext(void)
+{
+	return false;
+}
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
  */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a6af3166acae..d4394a84b9eb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7640,6 +7640,38 @@ static int opt_remove_nops(struct bpf_verifier_env *env)
 	return 0;
 }
 
+static int opt_subreg_zext_lo32(struct bpf_verifier_env *env)
+{
+	struct bpf_insn_aux_data *aux = env->insn_aux_data;
+	struct bpf_insn *insns = env->prog->insnsi;
+	int i, delta = 0, len = env->prog->len;
+	struct bpf_insn zext_patch[2];
+	struct bpf_prog *new_prog;
+
+	zext_patch[1] = BPF_ZEXT_REG(0);
+	for (i = 0; i < len; i++) {
+		int adj_idx = i + delta;
+		struct bpf_insn insn;
+
+		if (!aux[adj_idx].zext_dst)
+			continue;
+
+		insn = insns[adj_idx];
+		zext_patch[0] = insn;
+		zext_patch[1].dst_reg = insn.dst_reg;
+		zext_patch[1].src_reg = insn.dst_reg;
+		new_prog = bpf_patch_insn_data(env, adj_idx, zext_patch, 2);
+		if (!new_prog)
+			return -ENOMEM;
+		env->prog = new_prog;
+		insns = new_prog->insnsi;
+		aux = env->insn_aux_data;
+		delta += 2;
+	}
+
+	return 0;
+}
+
 /* convert load instructions that access fields of a context type into a
  * sequence of instructions that access fields of the underlying structure:
  *     struct __sk_buff    -> struct sk_buff
@@ -8490,6 +8522,15 @@ skip_full_check:
 	if (ret == 0)
 		ret = fixup_bpf_calls(env);
 
+	/* do 32-bit optimization after insn patching has done so those patched
+	 * insns could be handled correctly.
+	 */
+	if (ret == 0 && bpf_jit_needs_zext() &&
+	    !bpf_prog_is_dev_bound(env->prog->aux)) {
+		ret = opt_subreg_zext_lo32(env);
+		env->prog->aux->verifier_zext = !ret;
+	}
+
 	if (ret == 0)
 		ret = fixup_call_args(env);
 
-- 
cgit v1.2.3


From c240eff63a1cf1c4edc768e0cfc374811c02f069 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 24 May 2019 23:25:16 +0100
Subject: bpf: introduce new bpf prog load flags "BPF_F_TEST_RND_HI32"

x86_64 and AArch64 are perhaps the two arches that run the bpf testsuite
frequently; however, the zero extension insertion pass is not enabled for
them because of their hardware support.

It is critical to guarantee the pass's correctness as it is supposed to be
enabled at default for a couple of other arches, for example PowerPC,
SPARC, arm, NFP etc. Therefore, it would be very useful if there is a way
to test this pass on for example x86_64.

The test methodology employed by this set is "poisoning" useless bits. High
32-bit of a definition is randomized if it is identified as not used by any
later insn. Such randomization is only enabled under testing mode which is
gated by the new bpf prog load flags "BPF_F_TEST_RND_HI32".

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 18 ++++++++++++++++++
 kernel/bpf/syscall.c     |  4 +++-
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 68d4470523a0..7c6aef253173 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -260,6 +260,24 @@ enum bpf_attach_type {
  */
 #define BPF_F_ANY_ALIGNMENT	(1U << 1)
 
+/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose.
+ * Verifier does sub-register def/use analysis and identifies instructions whose
+ * def only matters for low 32-bit, high 32-bit is never referenced later
+ * through implicit zero extension. Therefore verifier notifies JIT back-ends
+ * that it is safe to ignore clearing high 32-bit for these instructions. This
+ * saves some back-ends a lot of code-gen. However such optimization is not
+ * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends
+ * hence hasn't used verifier's analysis result. But, we really want to have a
+ * way to be able to verify the correctness of the described optimization on
+ * x86_64 on which testsuites are frequently exercised.
+ *
+ * So, this flag is introduced. Once it is set, verifier will randomize high
+ * 32-bit for those instructions who has been identified as safe to ignore them.
+ * Then, if verifier is not doing correct analysis, such randomization will
+ * regress tests to expose bugs.
+ */
+#define BPF_F_TEST_RND_HI32	(1U << 2)
+
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * two extensions:
  *
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb5440b02e82..3d546b6f4646 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1604,7 +1604,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	if (CHECK_ATTR(BPF_PROG_LOAD))
 		return -EINVAL;
 
-	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT))
+	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
+				 BPF_F_ANY_ALIGNMENT |
+				 BPF_F_TEST_RND_HI32))
 		return -EINVAL;
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
-- 
cgit v1.2.3


From 08eb1fb0f77b0036568d2228f3425f2595d671bb Mon Sep 17 00:00:00 2001
From: Michal Kalderon <michal.kalderon@marvell.com>
Date: Sun, 26 May 2019 15:22:22 +0300
Subject: qed*: Change hwfn used for sb initialization

When initializing status blocks use the affined hwfn
instead of the leading one for RDMA / Storage

Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/qedr/main.c            |  3 +-
 drivers/net/ethernet/qlogic/qed/qed_main.c   | 47 ++++++++++++++++------------
 drivers/net/ethernet/qlogic/qede/qede_main.c |  3 +-
 include/linux/qed/qed_if.h                   | 10 +++++-
 4 files changed, 40 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 083c2c00a8e9..806b3d0e57d8 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -312,7 +312,8 @@ static void qedr_free_mem_sb(struct qedr_dev *dev,
 			     struct qed_sb_info *sb_info, int sb_id)
 {
 	if (sb_info->sb_virt) {
-		dev->ops->common->sb_release(dev->cdev, sb_info, sb_id);
+		dev->ops->common->sb_release(dev->cdev, sb_info, sb_id,
+					     QED_SB_TYPE_CNQ);
 		dma_free_coherent(&dev->pdev->dev, sizeof(*sb_info->sb_virt),
 				  (void *)sb_info->sb_virt, sb_info->sb_phys);
 	}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 6de23b56b294..7f19fefe0d79 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1301,26 +1301,21 @@ static u32 qed_sb_init(struct qed_dev *cdev,
 {
 	struct qed_hwfn *p_hwfn;
 	struct qed_ptt *p_ptt;
-	int hwfn_index;
 	u16 rel_sb_id;
-	u8 n_hwfns;
 	u32 rc;
 
-	/* RoCE uses single engine and CMT uses two engines. When using both
-	 * we force only a single engine. Storage uses only engine 0 too.
-	 */
-	if (type == QED_SB_TYPE_L2_QUEUE)
-		n_hwfns = cdev->num_hwfns;
-	else
-		n_hwfns = 1;
-
-	hwfn_index = sb_id % n_hwfns;
-	p_hwfn = &cdev->hwfns[hwfn_index];
-	rel_sb_id = sb_id / n_hwfns;
+	/* RoCE/Storage use a single engine in CMT mode while L2 uses both */
+	if (type == QED_SB_TYPE_L2_QUEUE) {
+		p_hwfn = &cdev->hwfns[sb_id % cdev->num_hwfns];
+		rel_sb_id = sb_id / cdev->num_hwfns;
+	} else {
+		p_hwfn = QED_AFFIN_HWFN(cdev);
+		rel_sb_id = sb_id;
+	}
 
 	DP_VERBOSE(cdev, NETIF_MSG_INTR,
 		   "hwfn [%d] <--[init]-- SB %04x [0x%04x upper]\n",
-		   hwfn_index, rel_sb_id, sb_id);
+		   IS_LEAD_HWFN(p_hwfn) ? 0 : 1, rel_sb_id, sb_id);
 
 	if (IS_PF(p_hwfn->cdev)) {
 		p_ptt = qed_ptt_acquire(p_hwfn);
@@ -1339,20 +1334,26 @@ static u32 qed_sb_init(struct qed_dev *cdev,
 }
 
 static u32 qed_sb_release(struct qed_dev *cdev,
-			  struct qed_sb_info *sb_info, u16 sb_id)
+			  struct qed_sb_info *sb_info,
+			  u16 sb_id,
+			  enum qed_sb_type type)
 {
 	struct qed_hwfn *p_hwfn;
-	int hwfn_index;
 	u16 rel_sb_id;
 	u32 rc;
 
-	hwfn_index = sb_id % cdev->num_hwfns;
-	p_hwfn = &cdev->hwfns[hwfn_index];
-	rel_sb_id = sb_id / cdev->num_hwfns;
+	/* RoCE/Storage use a single engine in CMT mode while L2 uses both */
+	if (type == QED_SB_TYPE_L2_QUEUE) {
+		p_hwfn = &cdev->hwfns[sb_id % cdev->num_hwfns];
+		rel_sb_id = sb_id / cdev->num_hwfns;
+	} else {
+		p_hwfn = QED_AFFIN_HWFN(cdev);
+		rel_sb_id = sb_id;
+	}
 
 	DP_VERBOSE(cdev, NETIF_MSG_INTR,
 		   "hwfn [%d] <--[init]-- SB %04x [0x%04x upper]\n",
-		   hwfn_index, rel_sb_id, sb_id);
+		   IS_LEAD_HWFN(p_hwfn) ? 0 : 1, rel_sb_id, sb_id);
 
 	rc = qed_int_sb_release(p_hwfn, sb_info, rel_sb_id);
 
@@ -2372,6 +2373,11 @@ static int qed_read_module_eeprom(struct qed_dev *cdev, char *buf,
 	return rc;
 }
 
+static u8 qed_get_affin_hwfn_idx(struct qed_dev *cdev)
+{
+	return QED_AFFIN_HWFN_IDX(cdev);
+}
+
 static struct qed_selftest_ops qed_selftest_ops_pass = {
 	.selftest_memory = &qed_selftest_memory,
 	.selftest_interrupt = &qed_selftest_interrupt,
@@ -2419,6 +2425,7 @@ const struct qed_common_ops qed_common_ops_pass = {
 	.db_recovery_add = &qed_db_recovery_add,
 	.db_recovery_del = &qed_db_recovery_del,
 	.read_module_eeprom = &qed_read_module_eeprom,
+	.get_affin_hwfn_idx = &qed_get_affin_hwfn_idx,
 };
 
 void qed_get_protocol_stats(struct qed_dev *cdev,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 02a97c659e29..a9684a881f2a 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1306,7 +1306,8 @@ static void qede_free_mem_sb(struct qede_dev *edev, struct qed_sb_info *sb_info,
 			     u16 sb_id)
 {
 	if (sb_info->sb_virt) {
-		edev->ops->common->sb_release(edev->cdev, sb_info, sb_id);
+		edev->ops->common->sb_release(edev->cdev, sb_info, sb_id,
+					      QED_SB_TYPE_L2_QUEUE);
 		dma_free_coherent(&edev->pdev->dev, sizeof(*sb_info->sb_virt),
 				  (void *)sb_info->sb_virt, sb_info->sb_phys);
 		memset(sb_info, 0, sizeof(*sb_info));
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 48841e5dab90..eef02e64b422 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -907,7 +907,8 @@ struct qed_common_ops {
 
 	u32		(*sb_release)(struct qed_dev *cdev,
 				      struct qed_sb_info *sb_info,
-				      u16 sb_id);
+				      u16 sb_id,
+				      enum qed_sb_type type);
 
 	void		(*simd_handler_config)(struct qed_dev *cdev,
 					       void *token,
@@ -1123,6 +1124,13 @@ struct qed_common_ops {
  */
 	int (*read_module_eeprom)(struct qed_dev *cdev,
 				  char *buf, u8 dev_addr, u32 offset, u32 len);
+
+/**
+ * @brief get_affin_hwfn_idx
+ *
+ * @param cdev
+ */
+	u8 (*get_affin_hwfn_idx)(struct qed_dev *cdev);
 };
 
 #define MASK_FIELD(_name, _value) \
-- 
cgit v1.2.3


From 3576e99e08217f291290ac62431c7e330ac111c4 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <michal.kalderon@marvell.com>
Date: Sun, 26 May 2019 15:22:27 +0300
Subject: qed*: Add iWARP 100g support

Add iWARP engine affinity setting for supporting iWARP over 100g.
iWARP cannot be distinguished by the LLH from L2, hence the
engine division will affect L2 as well. For this reason we add
a parameter to devlink to determine the engine division.

Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/qedr/main.c          | 13 +++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_rdma.c | 31 ++++++++++++++++++++++++++++++
 include/linux/qed/qed_rdma_if.h            |  2 ++
 3 files changed, 46 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 055a63144480..5ebf3c53b3fb 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -871,7 +871,16 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev,
 	dev->user_dpm_enabled = dev_info.user_dpm_enabled;
 	dev->rdma_type = dev_info.rdma_type;
 	dev->num_hwfns = dev_info.common.num_hwfns;
+
+	if (IS_IWARP(dev) && QEDR_IS_CMT(dev)) {
+		rc = dev->ops->iwarp_set_engine_affin(cdev, false);
+		if (rc) {
+			DP_ERR(dev, "iWARP is disabled over a 100g device Enabling it may impact L2 performance. To enable it run devlink dev param set <dev> name iwarp_cmt value true cmode runtime\n");
+			goto init_err;
+		}
+	}
 	dev->affin_hwfn_idx = dev->ops->common->get_affin_hwfn_idx(cdev);
+
 	dev->rdma_ctx = dev->ops->rdma_get_rdma_ctx(cdev);
 
 	dev->num_cnq = dev->ops->rdma_get_min_cnq_msix(cdev);
@@ -932,6 +941,10 @@ static void qedr_remove(struct qedr_dev *dev)
 	qedr_stop_hw(dev);
 	qedr_sync_free_irqs(dev);
 	qedr_free_resources(dev);
+
+	if (IS_IWARP(dev) && QEDR_IS_CMT(dev))
+		dev->ops->iwarp_set_engine_affin(dev->cdev, true);
+
 	ib_dealloc_device(&dev->ibdev);
 }
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
index e4d63359864e..f900fde448db 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
@@ -1916,6 +1916,36 @@ static int qed_roce_ll2_set_mac_filter(struct qed_dev *cdev,
 	return rc;
 }
 
+static int qed_iwarp_set_engine_affin(struct qed_dev *cdev, bool b_reset)
+{
+	enum qed_eng eng;
+	u8 ppfid = 0;
+	int rc;
+
+	/* Make sure iwarp cmt mode is enabled before setting affinity */
+	if (!cdev->iwarp_cmt)
+		return -EINVAL;
+
+	if (b_reset)
+		eng = QED_BOTH_ENG;
+	else
+		eng = cdev->l2_affin_hint ? QED_ENG1 : QED_ENG0;
+
+	rc = qed_llh_set_ppfid_affinity(cdev, ppfid, eng);
+	if (rc) {
+		DP_NOTICE(cdev,
+			  "Failed to set the engine affinity of ppfid %d\n",
+			  ppfid);
+		return rc;
+	}
+
+	DP_VERBOSE(cdev, (QED_MSG_RDMA | QED_MSG_SP),
+		   "LLH: Set the engine affinity of non-RoCE packets as %d\n",
+		   eng);
+
+	return 0;
+}
+
 static const struct qed_rdma_ops qed_rdma_ops_pass = {
 	.common = &qed_common_ops_pass,
 	.fill_dev_info = &qed_fill_rdma_dev_info,
@@ -1955,6 +1985,7 @@ static const struct qed_rdma_ops qed_rdma_ops_pass = {
 	.ll2_set_fragment_of_tx_packet = &qed_ll2_set_fragment_of_tx_packet,
 	.ll2_set_mac_filter = &qed_roce_ll2_set_mac_filter,
 	.ll2_get_stats = &qed_ll2_get_stats,
+	.iwarp_set_engine_affin = &qed_iwarp_set_engine_affin,
 	.iwarp_connect = &qed_iwarp_connect,
 	.iwarp_create_listen = &qed_iwarp_create_listen,
 	.iwarp_destroy_listen = &qed_iwarp_destroy_listen,
diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h
index d15f8e4815e3..898f595ea3d6 100644
--- a/include/linux/qed/qed_rdma_if.h
+++ b/include/linux/qed/qed_rdma_if.h
@@ -670,6 +670,8 @@ struct qed_rdma_ops {
 	int (*ll2_set_mac_filter)(struct qed_dev *cdev,
 				  u8 *old_mac_address, u8 *new_mac_address);
 
+	int (*iwarp_set_engine_affin)(struct qed_dev *cdev, bool b_reset);
+
 	int (*iwarp_connect)(void *rdma_cxt,
 			     struct qed_iwarp_connect_in *iparams,
 			     struct qed_iwarp_connect_out *oparams);
-- 
cgit v1.2.3


From 6ce3b4dcee4f96a5000d3f790403eb6997e3d553 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 24 May 2019 09:03:30 -0700
Subject: inet: rename netns_frags to fqdir

1) struct netns_frags is renamed to struct fqdir
  This structure is really holding many frag queues in a hash table.

2) (struct inet_frag_queue)->net field is renamed to fqdir
  since net is generally associated with a 'struct net' pointer
  in the networking stack.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 | 29 +++++++++---------
 include/net/netns/ieee802154_6lowpan.h  |  2 +-
 include/net/netns/ipv4.h                |  2 +-
 include/net/netns/ipv6.h                |  4 +--
 net/ieee802154/6lowpan/reassembly.c     |  2 +-
 net/ipv4/inet_fragment.c                | 52 ++++++++++++++++-----------------
 net/ipv4/ip_fragment.c                  | 20 ++++++-------
 net/ipv6/netfilter/nf_conntrack_reasm.c |  4 +--
 net/ipv6/reassembly.c                   |  6 ++--
 9 files changed, 61 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 378904ee9129..b19b1ba44ac5 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -4,7 +4,8 @@
 
 #include <linux/rhashtable-types.h>
 
-struct netns_frags {
+/* Per netns frag queues directory */
+struct fqdir {
 	/* sysctls */
 	long			high_thresh;
 	long			low_thresh;
@@ -64,7 +65,7 @@ struct frag_v6_compare_key {
  * @meat: length of received fragments so far
  * @flags: fragment queue flags
  * @max_size: maximum received fragment size
- * @net: namespace that this frag belongs to
+ * @fqdir: pointer to struct fqdir
  * @rcu: rcu head for freeing deferall
  */
 struct inet_frag_queue {
@@ -84,7 +85,7 @@ struct inet_frag_queue {
 	int			meat;
 	__u8			flags;
 	u16			max_size;
-	struct netns_frags      *net;
+	struct fqdir		*fqdir;
 	struct rcu_head		rcu;
 };
 
@@ -103,16 +104,16 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline int inet_frags_init_net(struct netns_frags *nf)
+static inline int inet_frags_init_net(struct fqdir *fqdir)
 {
-	atomic_long_set(&nf->mem, 0);
-	return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
+	atomic_long_set(&fqdir->mem, 0);
+	return rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
 }
-void inet_frags_exit_net(struct netns_frags *nf);
+void inet_frags_exit_net(struct fqdir *fqdir);
 
 void inet_frag_kill(struct inet_frag_queue *q);
 void inet_frag_destroy(struct inet_frag_queue *q);
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
+struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);
 
 /* Free all skbs in the queue; return the sum of their truesizes. */
 unsigned int inet_frag_rbtree_purge(struct rb_root *root);
@@ -125,19 +126,19 @@ static inline void inet_frag_put(struct inet_frag_queue *q)
 
 /* Memory Tracking Functions. */
 
-static inline long frag_mem_limit(const struct netns_frags *nf)
+static inline long frag_mem_limit(const struct fqdir *fqdir)
 {
-	return atomic_long_read(&nf->mem);
+	return atomic_long_read(&fqdir->mem);
 }
 
-static inline void sub_frag_mem_limit(struct netns_frags *nf, long val)
+static inline void sub_frag_mem_limit(struct fqdir *fqdir, long val)
 {
-	atomic_long_sub(val, &nf->mem);
+	atomic_long_sub(val, &fqdir->mem);
 }
 
-static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
+static inline void add_frag_mem_limit(struct fqdir *fqdir, long val)
 {
-	atomic_long_add(val, &nf->mem);
+	atomic_long_add(val, &fqdir->mem);
 }
 
 /* RFC 3168 support :
diff --git a/include/net/netns/ieee802154_6lowpan.h b/include/net/netns/ieee802154_6lowpan.h
index 736aeac52f56..48897cbcb538 100644
--- a/include/net/netns/ieee802154_6lowpan.h
+++ b/include/net/netns/ieee802154_6lowpan.h
@@ -16,7 +16,7 @@ struct netns_sysctl_lowpan {
 
 struct netns_ieee802154_lowpan {
 	struct netns_sysctl_lowpan sysctl;
-	struct netns_frags	frags;
+	struct fqdir	frags;
 };
 
 #endif
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 7698460a3dd1..22f712141962 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -72,7 +72,7 @@ struct netns_ipv4 {
 
 	struct inet_peer_base	*peers;
 	struct sock  * __percpu	*tcp_sk;
-	struct netns_frags	frags;
+	struct fqdir	frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*iptable_filter;
 	struct xt_table		*iptable_mangle;
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 5e61b5a8635d..a22e8702d828 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -58,7 +58,7 @@ struct netns_ipv6 {
 	struct ipv6_devconf	*devconf_all;
 	struct ipv6_devconf	*devconf_dflt;
 	struct inet_peer_base	*peers;
-	struct netns_frags	frags;
+	struct fqdir	frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*ip6table_filter;
 	struct xt_table		*ip6table_mangle;
@@ -116,7 +116,7 @@ struct netns_ipv6 {
 
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 struct netns_nf_frag {
-	struct netns_frags	frags;
+	struct fqdir	frags;
 };
 #endif
 
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 4196bcd4105a..8551d307f214 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -139,7 +139,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
 		fq->q.flags |= INET_FRAG_FIRST_IN;
 
 	fq->q.meat += skb->len;
-	add_frag_mem_limit(fq->q.net, skb->truesize);
+	add_frag_mem_limit(fq->q.fqdir, skb->truesize);
 
 	if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    fq->q.meat == fq->q.len) {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 737808e27f8b..f8de2860e3a3 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -145,11 +145,11 @@ static void inet_frags_free_cb(void *ptr, void *arg)
 	inet_frag_put(fq);
 }
 
-void inet_frags_exit_net(struct netns_frags *nf)
+void inet_frags_exit_net(struct fqdir *fqdir)
 {
-	nf->high_thresh = 0; /* prevent creation of new frags */
+	fqdir->high_thresh = 0; /* prevent creation of new frags */
 
-	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
+	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
 }
 EXPORT_SYMBOL(inet_frags_exit_net);
 
@@ -159,10 +159,10 @@ void inet_frag_kill(struct inet_frag_queue *fq)
 		refcount_dec(&fq->refcnt);
 
 	if (!(fq->flags & INET_FRAG_COMPLETE)) {
-		struct netns_frags *nf = fq->net;
+		struct fqdir *fqdir = fq->fqdir;
 
 		fq->flags |= INET_FRAG_COMPLETE;
-		rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
+		rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, fqdir->f->rhash_params);
 		refcount_dec(&fq->refcnt);
 	}
 }
@@ -172,7 +172,7 @@ static void inet_frag_destroy_rcu(struct rcu_head *head)
 {
 	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
 						 rcu);
-	struct inet_frags *f = q->net->f;
+	struct inet_frags *f = q->fqdir->f;
 
 	if (f->destructor)
 		f->destructor(q);
@@ -203,7 +203,7 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge);
 
 void inet_frag_destroy(struct inet_frag_queue *q)
 {
-	struct netns_frags *nf;
+	struct fqdir *fqdir;
 	unsigned int sum, sum_truesize = 0;
 	struct inet_frags *f;
 
@@ -211,18 +211,18 @@ void inet_frag_destroy(struct inet_frag_queue *q)
 	WARN_ON(del_timer(&q->timer) != 0);
 
 	/* Release all fragment data. */
-	nf = q->net;
-	f = nf->f;
+	fqdir = q->fqdir;
+	f = fqdir->f;
 	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
 	sum = sum_truesize + f->qsize;
 
 	call_rcu(&q->rcu, inet_frag_destroy_rcu);
 
-	sub_frag_mem_limit(nf, sum);
+	sub_frag_mem_limit(fqdir, sum);
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
+static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
 					       struct inet_frags *f,
 					       void *arg)
 {
@@ -232,9 +232,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 	if (!q)
 		return NULL;
 
-	q->net = nf;
+	q->fqdir = fqdir;
 	f->constructor(q, arg);
-	add_frag_mem_limit(nf, f->qsize);
+	add_frag_mem_limit(fqdir, f->qsize);
 
 	timer_setup(&q->timer, f->frag_expire, 0);
 	spin_lock_init(&q->lock);
@@ -243,21 +243,21 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 	return q;
 }
 
-static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
+static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
 						void *arg,
 						struct inet_frag_queue **prev)
 {
-	struct inet_frags *f = nf->f;
+	struct inet_frags *f = fqdir->f;
 	struct inet_frag_queue *q;
 
-	q = inet_frag_alloc(nf, f, arg);
+	q = inet_frag_alloc(fqdir, f, arg);
 	if (!q) {
 		*prev = ERR_PTR(-ENOMEM);
 		return NULL;
 	}
-	mod_timer(&q->timer, jiffies + nf->timeout);
+	mod_timer(&q->timer, jiffies + fqdir->timeout);
 
-	*prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
+	*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
 						 &q->node, f->rhash_params);
 	if (*prev) {
 		q->flags |= INET_FRAG_COMPLETE;
@@ -269,18 +269,18 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
 }
 
 /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
+struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
 {
 	struct inet_frag_queue *fq = NULL, *prev;
 
-	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+	if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh)
 		return NULL;
 
 	rcu_read_lock();
 
-	prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
+	prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
 	if (!prev)
-		fq = inet_frag_create(nf, key, &prev);
+		fq = inet_frag_create(fqdir, key, &prev);
 	if (prev && !IS_ERR(prev)) {
 		fq = prev;
 		if (!refcount_inc_not_zero(&fq->refcnt))
@@ -391,7 +391,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
 
 	delta += head->truesize;
 	if (delta)
-		add_frag_mem_limit(q->net, delta);
+		add_frag_mem_limit(q->fqdir, delta);
 
 	/* If the first fragment is fragmented itself, we split
 	 * it to two chunks: the first with data and paged part
@@ -413,7 +413,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
 		head->truesize += clone->truesize;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
-		add_frag_mem_limit(q->net, clone->truesize);
+		add_frag_mem_limit(q->fqdir, clone->truesize);
 		skb_shinfo(head)->frag_list = clone;
 		nextp = &clone->next;
 	} else {
@@ -466,7 +466,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
 			rbn = rbnext;
 		}
 	}
-	sub_frag_mem_limit(q->net, head->truesize);
+	sub_frag_mem_limit(q->fqdir, head->truesize);
 
 	*nextp = NULL;
 	skb_mark_not_on_list(head);
@@ -494,7 +494,7 @@ struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
 	if (head == q->fragments_tail)
 		q->fragments_tail = NULL;
 
-	sub_frag_mem_limit(q->net, head->truesize);
+	sub_frag_mem_limit(q->fqdir, head->truesize);
 
 	return head;
 }
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cf2b0a6a3337..c93e27cb0a8d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -82,7 +82,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
 {
 	struct ipq *qp = container_of(q, struct ipq, q);
-	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
+	struct netns_ipv4 *ipv4 = container_of(q->fqdir, struct netns_ipv4,
 					       frags);
 	struct net *net = container_of(ipv4, struct net, ipv4);
 
@@ -90,7 +90,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
 
 	q->key.v4 = *key;
 	qp->ecn = 0;
-	qp->peer = q->net->max_dist ?
+	qp->peer = q->fqdir->max_dist ?
 		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
 		NULL;
 }
@@ -142,7 +142,7 @@ static void ip_expire(struct timer_list *t)
 	int err;
 
 	qp = container_of(frag, struct ipq, q);
-	net = container_of(qp->q.net, struct net, ipv4.frags);
+	net = container_of(qp->q.fqdir, struct net, ipv4.frags);
 
 	rcu_read_lock();
 	spin_lock(&qp->q.lock);
@@ -222,7 +222,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
 static int ip_frag_too_far(struct ipq *qp)
 {
 	struct inet_peer *peer = qp->peer;
-	unsigned int max = qp->q.net->max_dist;
+	unsigned int max = qp->q.fqdir->max_dist;
 	unsigned int start, end;
 
 	int rc;
@@ -239,7 +239,7 @@ static int ip_frag_too_far(struct ipq *qp)
 	if (rc) {
 		struct net *net;
 
-		net = container_of(qp->q.net, struct net, ipv4.frags);
+		net = container_of(qp->q.fqdir, struct net, ipv4.frags);
 		__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 	}
 
@@ -250,13 +250,13 @@ static int ip_frag_reinit(struct ipq *qp)
 {
 	unsigned int sum_truesize = 0;
 
-	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
+	if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
 		refcount_inc(&qp->q.refcnt);
 		return -ETIMEDOUT;
 	}
 
 	sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
-	sub_frag_mem_limit(qp->q.net, sum_truesize);
+	sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
 
 	qp->q.flags = 0;
 	qp->q.len = 0;
@@ -273,7 +273,7 @@ static int ip_frag_reinit(struct ipq *qp)
 /* Add new segment to existing queue. */
 static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
-	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+	struct net *net = container_of(qp->q.fqdir, struct net, ipv4.frags);
 	int ihl, end, flags, offset;
 	struct sk_buff *prev_tail;
 	struct net_device *dev;
@@ -352,7 +352,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
 	qp->ecn |= ecn;
-	add_frag_mem_limit(qp->q.net, skb->truesize);
+	add_frag_mem_limit(qp->q.fqdir, skb->truesize);
 	if (offset == 0)
 		qp->q.flags |= INET_FRAG_FIRST_IN;
 
@@ -399,7 +399,7 @@ err:
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 			 struct sk_buff *prev_tail, struct net_device *dev)
 {
-	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+	struct net *net = container_of(qp->q.fqdir, struct net, ipv4.frags);
 	struct iphdr *iph;
 	void *reasm_data;
 	int len, err;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 3de0e9b0a482..5b877d732b2f 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -151,7 +151,7 @@ static void nf_ct_frag6_expire(struct timer_list *t)
 	struct net *net;
 
 	fq = container_of(frag, struct frag_queue, q);
-	net = container_of(fq->q.net, struct net, nf_frag.frags);
+	net = container_of(fq->q.fqdir, struct net, nf_frag.frags);
 
 	ip6frag_expire_frag_queue(net, fq);
 }
@@ -276,7 +276,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 	fq->ecn |= ecn;
 	if (payload_len > fq->q.max_size)
 		fq->q.max_size = payload_len;
-	add_frag_mem_limit(fq->q.net, skb->truesize);
+	add_frag_mem_limit(fq->q.fqdir, skb->truesize);
 
 	/* The first fragment.
 	 * nhoffset is obtained from the first fragment, of course.
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 1a832f5e190b..acd5a9a04415 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -79,7 +79,7 @@ static void ip6_frag_expire(struct timer_list *t)
 	struct net *net;
 
 	fq = container_of(frag, struct frag_queue, q);
-	net = container_of(fq->q.net, struct net, ipv6.frags);
+	net = container_of(fq->q.fqdir, struct net, ipv6.frags);
 
 	ip6frag_expire_frag_queue(net, fq);
 }
@@ -200,7 +200,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	fq->q.stamp = skb->tstamp;
 	fq->q.meat += skb->len;
 	fq->ecn |= ecn;
-	add_frag_mem_limit(fq->q.net, skb->truesize);
+	add_frag_mem_limit(fq->q.fqdir, skb->truesize);
 
 	fragsize = -skb_network_offset(skb) + skb->len;
 	if (fragsize > fq->q.max_size)
@@ -254,7 +254,7 @@ err:
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
 			  struct sk_buff *prev_tail, struct net_device *dev)
 {
-	struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
+	struct net *net = container_of(fq->q.fqdir, struct net, ipv6.frags);
 	unsigned int nhoff;
 	void *reasm_data;
 	int payload_len;
-- 
cgit v1.2.3


From 89fb900514d1623cf6019848f39d0557a3d31890 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 24 May 2019 09:03:31 -0700
Subject: net: rename inet_frags_exit_net() to fqdir_exit()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 | 2 +-
 net/ieee802154/6lowpan/reassembly.c     | 4 ++--
 net/ipv4/inet_fragment.c                | 4 ++--
 net/ipv4/ip_fragment.c                  | 4 ++--
 net/ipv6/netfilter/nf_conntrack_reasm.c | 4 ++--
 net/ipv6/reassembly.c                   | 4 ++--
 6 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index b19b1ba44ac5..d1bfd5dbe2d4 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -109,7 +109,7 @@ static inline int inet_frags_init_net(struct fqdir *fqdir)
 	atomic_long_set(&fqdir->mem, 0);
 	return rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
 }
-void inet_frags_exit_net(struct fqdir *fqdir);
+void fqdir_exit(struct fqdir *fqdir);
 
 void inet_frag_kill(struct inet_frag_queue *q);
 void inet_frag_destroy(struct inet_frag_queue *q);
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 8551d307f214..dc73452d3224 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -464,7 +464,7 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 		return res;
 	res = lowpan_frags_ns_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&ieee802154_lowpan->frags);
+		fqdir_exit(&ieee802154_lowpan->frags);
 	return res;
 }
 
@@ -474,7 +474,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
 		net_ieee802154_lowpan(net);
 
 	lowpan_frags_ns_sysctl_unregister(net);
-	inet_frags_exit_net(&ieee802154_lowpan->frags);
+	fqdir_exit(&ieee802154_lowpan->frags);
 }
 
 static struct pernet_operations lowpan_frags_ops = {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index f8de2860e3a3..a5ec5d956793 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -145,13 +145,13 @@ static void inet_frags_free_cb(void *ptr, void *arg)
 	inet_frag_put(fq);
 }
 
-void inet_frags_exit_net(struct fqdir *fqdir)
+void fqdir_exit(struct fqdir *fqdir)
 {
 	fqdir->high_thresh = 0; /* prevent creation of new frags */
 
 	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
 }
-EXPORT_SYMBOL(inet_frags_exit_net);
+EXPORT_SYMBOL(fqdir_exit);
 
 void inet_frag_kill(struct inet_frag_queue *fq)
 {
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index c93e27cb0a8d..9de13b5d23e3 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -685,14 +685,14 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 		return res;
 	res = ip4_frags_ns_ctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->ipv4.frags);
+		fqdir_exit(&net->ipv4.frags);
 	return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
 {
 	ip4_frags_ns_ctl_unregister(net);
-	inet_frags_exit_net(&net->ipv4.frags);
+	fqdir_exit(&net->ipv4.frags);
 }
 
 static struct pernet_operations ip4_frags_ops = {
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 5b877d732b2f..f08e1422c56d 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -506,14 +506,14 @@ static int nf_ct_net_init(struct net *net)
 		return res;
 	res = nf_ct_frag6_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->nf_frag.frags);
+		fqdir_exit(&net->nf_frag.frags);
 	return res;
 }
 
 static void nf_ct_net_exit(struct net *net)
 {
 	nf_ct_frags6_sysctl_unregister(net);
-	inet_frags_exit_net(&net->nf_frag.frags);
+	fqdir_exit(&net->nf_frag.frags);
 }
 
 static struct pernet_operations nf_ct_net_ops = {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index acd5a9a04415..f1142f5d5075 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -528,14 +528,14 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 
 	res = ip6_frags_ns_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->ipv6.frags);
+		fqdir_exit(&net->ipv6.frags);
 	return res;
 }
 
 static void __net_exit ipv6_frags_exit_net(struct net *net)
 {
 	ip6_frags_ns_sysctl_unregister(net);
-	inet_frags_exit_net(&net->ipv6.frags);
+	fqdir_exit(&net->ipv6.frags);
 }
 
 static struct pernet_operations ip6_frags_ops = {
-- 
cgit v1.2.3


From 803fdd99684714b3cdcbed4364473d41abbd6afe Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 24 May 2019 09:03:32 -0700
Subject: net: rename struct fqdir fields

Rename the @frags fields in structs netns_ipv4, netns_ipv6,
netns_nf_frag and netns_ieee802154_lowpan to @fqdir.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ieee802154_6lowpan.h  |  2 +-
 include/net/netns/ipv4.h                |  2 +-
 include/net/netns/ipv6.h                |  4 +--
 net/ieee802154/6lowpan/reassembly.c     | 36 +++++++++++------------
 net/ipv4/ip_fragment.c                  | 52 ++++++++++++++++-----------------
 net/ipv4/proc.c                         |  4 +--
 net/ipv6/netfilter/nf_conntrack_reasm.c | 40 ++++++++++++-------------
 net/ipv6/proc.c                         |  4 +--
 net/ipv6/reassembly.c                   | 40 ++++++++++++-------------
 9 files changed, 92 insertions(+), 92 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/ieee802154_6lowpan.h b/include/net/netns/ieee802154_6lowpan.h
index 48897cbcb538..d27ac64f8dfe 100644
--- a/include/net/netns/ieee802154_6lowpan.h
+++ b/include/net/netns/ieee802154_6lowpan.h
@@ -16,7 +16,7 @@ struct netns_sysctl_lowpan {
 
 struct netns_ieee802154_lowpan {
 	struct netns_sysctl_lowpan sysctl;
-	struct fqdir	frags;
+	struct fqdir		fqdir;
 };
 
 #endif
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 22f712141962..3c270baa32e0 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -72,7 +72,7 @@ struct netns_ipv4 {
 
 	struct inet_peer_base	*peers;
 	struct sock  * __percpu	*tcp_sk;
-	struct fqdir	frags;
+	struct fqdir		fqdir;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*iptable_filter;
 	struct xt_table		*iptable_mangle;
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index a22e8702d828..3dd2ae2a38e2 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -58,7 +58,7 @@ struct netns_ipv6 {
 	struct ipv6_devconf	*devconf_all;
 	struct ipv6_devconf	*devconf_dflt;
 	struct inet_peer_base	*peers;
-	struct fqdir	frags;
+	struct fqdir		fqdir;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*ip6table_filter;
 	struct xt_table		*ip6table_mangle;
@@ -116,7 +116,7 @@ struct netns_ipv6 {
 
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 struct netns_nf_frag {
-	struct fqdir	frags;
+	struct fqdir	fqdir;
 };
 #endif
 
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index dc73452d3224..955047fe797a 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -79,7 +79,7 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
 	key.src = *src;
 	key.dst = *dst;
 
-	q = inet_frag_find(&ieee802154_lowpan->frags, &key);
+	q = inet_frag_find(&ieee802154_lowpan->fqdir, &key);
 	if (!q)
 		return NULL;
 
@@ -326,23 +326,23 @@ err:
 static struct ctl_table lowpan_frags_ns_ctl_table[] = {
 	{
 		.procname	= "6lowpanfrag_high_thresh",
-		.data		= &init_net.ieee802154_lowpan.frags.high_thresh,
+		.data		= &init_net.ieee802154_lowpan.fqdir.high_thresh,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
-		.extra1		= &init_net.ieee802154_lowpan.frags.low_thresh
+		.extra1		= &init_net.ieee802154_lowpan.fqdir.low_thresh
 	},
 	{
 		.procname	= "6lowpanfrag_low_thresh",
-		.data		= &init_net.ieee802154_lowpan.frags.low_thresh,
+		.data		= &init_net.ieee802154_lowpan.fqdir.low_thresh,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
-		.extra2		= &init_net.ieee802154_lowpan.frags.high_thresh
+		.extra2		= &init_net.ieee802154_lowpan.fqdir.high_thresh
 	},
 	{
 		.procname	= "6lowpanfrag_time",
-		.data		= &init_net.ieee802154_lowpan.frags.timeout,
+		.data		= &init_net.ieee802154_lowpan.fqdir.timeout,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
@@ -377,11 +377,11 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net)
 		if (table == NULL)
 			goto err_alloc;
 
-		table[0].data = &ieee802154_lowpan->frags.high_thresh;
-		table[0].extra1 = &ieee802154_lowpan->frags.low_thresh;
-		table[1].data = &ieee802154_lowpan->frags.low_thresh;
-		table[1].extra2 = &ieee802154_lowpan->frags.high_thresh;
-		table[2].data = &ieee802154_lowpan->frags.timeout;
+		table[0].data = &ieee802154_lowpan->fqdir.high_thresh;
+		table[0].extra1 = &ieee802154_lowpan->fqdir.low_thresh;
+		table[1].data = &ieee802154_lowpan->fqdir.low_thresh;
+		table[1].extra2 = &ieee802154_lowpan->fqdir.high_thresh;
+		table[2].data = &ieee802154_lowpan->fqdir.timeout;
 
 		/* Don't export sysctls to unprivileged users */
 		if (net->user_ns != &init_user_ns)
@@ -454,17 +454,17 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 		net_ieee802154_lowpan(net);
 	int res;
 
-	ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
-	ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
-	ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
-	ieee802154_lowpan->frags.f = &lowpan_frags;
+	ieee802154_lowpan->fqdir.high_thresh = IPV6_FRAG_HIGH_THRESH;
+	ieee802154_lowpan->fqdir.low_thresh = IPV6_FRAG_LOW_THRESH;
+	ieee802154_lowpan->fqdir.timeout = IPV6_FRAG_TIMEOUT;
+	ieee802154_lowpan->fqdir.f = &lowpan_frags;
 
-	res = inet_frags_init_net(&ieee802154_lowpan->frags);
+	res = inet_frags_init_net(&ieee802154_lowpan->fqdir);
 	if (res < 0)
 		return res;
 	res = lowpan_frags_ns_sysctl_register(net);
 	if (res < 0)
-		fqdir_exit(&ieee802154_lowpan->frags);
+		fqdir_exit(&ieee802154_lowpan->fqdir);
 	return res;
 }
 
@@ -474,7 +474,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
 		net_ieee802154_lowpan(net);
 
 	lowpan_frags_ns_sysctl_unregister(net);
-	fqdir_exit(&ieee802154_lowpan->frags);
+	fqdir_exit(&ieee802154_lowpan->fqdir);
 }
 
 static struct pernet_operations lowpan_frags_ops = {
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 9de13b5d23e3..f1831367cc2b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -83,7 +83,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
 {
 	struct ipq *qp = container_of(q, struct ipq, q);
 	struct netns_ipv4 *ipv4 = container_of(q->fqdir, struct netns_ipv4,
-					       frags);
+					       fqdir);
 	struct net *net = container_of(ipv4, struct net, ipv4);
 
 	const struct frag_v4_compare_key *key = a;
@@ -142,7 +142,7 @@ static void ip_expire(struct timer_list *t)
 	int err;
 
 	qp = container_of(frag, struct ipq, q);
-	net = container_of(qp->q.fqdir, struct net, ipv4.frags);
+	net = container_of(qp->q.fqdir, struct net, ipv4.fqdir);
 
 	rcu_read_lock();
 	spin_lock(&qp->q.lock);
@@ -211,7 +211,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
 	};
 	struct inet_frag_queue *q;
 
-	q = inet_frag_find(&net->ipv4.frags, &key);
+	q = inet_frag_find(&net->ipv4.fqdir, &key);
 	if (!q)
 		return NULL;
 
@@ -239,7 +239,7 @@ static int ip_frag_too_far(struct ipq *qp)
 	if (rc) {
 		struct net *net;
 
-		net = container_of(qp->q.fqdir, struct net, ipv4.frags);
+		net = container_of(qp->q.fqdir, struct net, ipv4.fqdir);
 		__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 	}
 
@@ -273,7 +273,7 @@ static int ip_frag_reinit(struct ipq *qp)
 /* Add new segment to existing queue. */
 static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
-	struct net *net = container_of(qp->q.fqdir, struct net, ipv4.frags);
+	struct net *net = container_of(qp->q.fqdir, struct net, ipv4.fqdir);
 	int ihl, end, flags, offset;
 	struct sk_buff *prev_tail;
 	struct net_device *dev;
@@ -399,7 +399,7 @@ err:
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 			 struct sk_buff *prev_tail, struct net_device *dev)
 {
-	struct net *net = container_of(qp->q.fqdir, struct net, ipv4.frags);
+	struct net *net = container_of(qp->q.fqdir, struct net, ipv4.fqdir);
 	struct iphdr *iph;
 	void *reasm_data;
 	int len, err;
@@ -544,30 +544,30 @@ static int dist_min;
 static struct ctl_table ip4_frags_ns_ctl_table[] = {
 	{
 		.procname	= "ipfrag_high_thresh",
-		.data		= &init_net.ipv4.frags.high_thresh,
+		.data		= &init_net.ipv4.fqdir.high_thresh,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
-		.extra1		= &init_net.ipv4.frags.low_thresh
+		.extra1		= &init_net.ipv4.fqdir.low_thresh
 	},
 	{
 		.procname	= "ipfrag_low_thresh",
-		.data		= &init_net.ipv4.frags.low_thresh,
+		.data		= &init_net.ipv4.fqdir.low_thresh,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
-		.extra2		= &init_net.ipv4.frags.high_thresh
+		.extra2		= &init_net.ipv4.fqdir.high_thresh
 	},
 	{
 		.procname	= "ipfrag_time",
-		.data		= &init_net.ipv4.frags.timeout,
+		.data		= &init_net.ipv4.fqdir.timeout,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{
 		.procname	= "ipfrag_max_dist",
-		.data		= &init_net.ipv4.frags.max_dist,
+		.data		= &init_net.ipv4.fqdir.max_dist,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -600,12 +600,12 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
 		if (!table)
 			goto err_alloc;
 
-		table[0].data = &net->ipv4.frags.high_thresh;
-		table[0].extra1 = &net->ipv4.frags.low_thresh;
-		table[1].data = &net->ipv4.frags.low_thresh;
-		table[1].extra2 = &net->ipv4.frags.high_thresh;
-		table[2].data = &net->ipv4.frags.timeout;
-		table[3].data = &net->ipv4.frags.max_dist;
+		table[0].data = &net->ipv4.fqdir.high_thresh;
+		table[0].extra1 = &net->ipv4.fqdir.low_thresh;
+		table[1].data = &net->ipv4.fqdir.low_thresh;
+		table[1].extra2 = &net->ipv4.fqdir.high_thresh;
+		table[2].data = &net->ipv4.fqdir.timeout;
+		table[3].data = &net->ipv4.fqdir.max_dist;
 	}
 
 	hdr = register_net_sysctl(net, "net/ipv4", table);
@@ -668,31 +668,31 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 	 * we will prune down to 3MB, making room for approx 8 big 64K
 	 * fragments 8x128k.
 	 */
-	net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
-	net->ipv4.frags.low_thresh  = 3 * 1024 * 1024;
+	net->ipv4.fqdir.high_thresh = 4 * 1024 * 1024;
+	net->ipv4.fqdir.low_thresh  = 3 * 1024 * 1024;
 	/*
 	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
 	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
 	 * by TTL.
 	 */
-	net->ipv4.frags.timeout = IP_FRAG_TIME;
+	net->ipv4.fqdir.timeout = IP_FRAG_TIME;
 
-	net->ipv4.frags.max_dist = 64;
-	net->ipv4.frags.f = &ip4_frags;
+	net->ipv4.fqdir.max_dist = 64;
+	net->ipv4.fqdir.f = &ip4_frags;
 
-	res = inet_frags_init_net(&net->ipv4.frags);
+	res = inet_frags_init_net(&net->ipv4.fqdir);
 	if (res < 0)
 		return res;
 	res = ip4_frags_ns_ctl_register(net);
 	if (res < 0)
-		fqdir_exit(&net->ipv4.frags);
+		fqdir_exit(&net->ipv4.fqdir);
 	return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
 {
 	ip4_frags_ns_ctl_unregister(net);
-	fqdir_exit(&net->ipv4.frags);
+	fqdir_exit(&net->ipv4.fqdir);
 }
 
 static struct pernet_operations ip4_frags_ops = {
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index c3610b37bb4c..3927e00084e8 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -72,8 +72,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "RAW: inuse %d\n",
 		   sock_prot_inuse_get(net, &raw_prot));
 	seq_printf(seq,  "FRAG: inuse %u memory %lu\n",
-		   atomic_read(&net->ipv4.frags.rhashtable.nelems),
-		   frag_mem_limit(&net->ipv4.frags));
+		   atomic_read(&net->ipv4.fqdir.rhashtable.nelems),
+		   frag_mem_limit(&net->ipv4.fqdir));
 	return 0;
 }
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index f08e1422c56d..46073e9a6c56 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -58,26 +58,26 @@ static struct inet_frags nf_frags;
 static struct ctl_table nf_ct_frag6_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_frag6_timeout",
-		.data		= &init_net.nf_frag.frags.timeout,
+		.data		= &init_net.nf_frag.fqdir.timeout,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{
 		.procname	= "nf_conntrack_frag6_low_thresh",
-		.data		= &init_net.nf_frag.frags.low_thresh,
+		.data		= &init_net.nf_frag.fqdir.low_thresh,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
-		.extra2		= &init_net.nf_frag.frags.high_thresh
+		.extra2		= &init_net.nf_frag.fqdir.high_thresh
 	},
 	{
 		.procname	= "nf_conntrack_frag6_high_thresh",
-		.data		= &init_net.nf_frag.frags.high_thresh,
+		.data		= &init_net.nf_frag.fqdir.high_thresh,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
-		.extra1		= &init_net.nf_frag.frags.low_thresh
+		.extra1		= &init_net.nf_frag.fqdir.low_thresh
 	},
 	{ }
 };
@@ -94,12 +94,12 @@ static int nf_ct_frag6_sysctl_register(struct net *net)
 		if (table == NULL)
 			goto err_alloc;
 
-		table[0].data = &net->nf_frag.frags.timeout;
-		table[1].data = &net->nf_frag.frags.low_thresh;
-		table[1].extra2 = &net->nf_frag.frags.high_thresh;
-		table[2].data = &net->nf_frag.frags.high_thresh;
-		table[2].extra1 = &net->nf_frag.frags.low_thresh;
-		table[2].extra2 = &init_net.nf_frag.frags.high_thresh;
+		table[0].data = &net->nf_frag.fqdir.timeout;
+		table[1].data = &net->nf_frag.fqdir.low_thresh;
+		table[1].extra2 = &net->nf_frag.fqdir.high_thresh;
+		table[2].data = &net->nf_frag.fqdir.high_thresh;
+		table[2].extra1 = &net->nf_frag.fqdir.low_thresh;
+		table[2].extra2 = &init_net.nf_frag.fqdir.high_thresh;
 	}
 
 	hdr = register_net_sysctl(net, "net/netfilter", table);
@@ -151,7 +151,7 @@ static void nf_ct_frag6_expire(struct timer_list *t)
 	struct net *net;
 
 	fq = container_of(frag, struct frag_queue, q);
-	net = container_of(fq->q.fqdir, struct net, nf_frag.frags);
+	net = container_of(fq->q.fqdir, struct net, nf_frag.fqdir);
 
 	ip6frag_expire_frag_queue(net, fq);
 }
@@ -169,7 +169,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
 	};
 	struct inet_frag_queue *q;
 
-	q = inet_frag_find(&net->nf_frag.frags, &key);
+	q = inet_frag_find(&net->nf_frag.fqdir, &key);
 	if (!q)
 		return NULL;
 
@@ -496,24 +496,24 @@ static int nf_ct_net_init(struct net *net)
 {
 	int res;
 
-	net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
-	net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
-	net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
-	net->nf_frag.frags.f = &nf_frags;
+	net->nf_frag.fqdir.high_thresh = IPV6_FRAG_HIGH_THRESH;
+	net->nf_frag.fqdir.low_thresh = IPV6_FRAG_LOW_THRESH;
+	net->nf_frag.fqdir.timeout = IPV6_FRAG_TIMEOUT;
+	net->nf_frag.fqdir.f = &nf_frags;
 
-	res = inet_frags_init_net(&net->nf_frag.frags);
+	res = inet_frags_init_net(&net->nf_frag.fqdir);
 	if (res < 0)
 		return res;
 	res = nf_ct_frag6_sysctl_register(net);
 	if (res < 0)
-		fqdir_exit(&net->nf_frag.frags);
+		fqdir_exit(&net->nf_frag.fqdir);
 	return res;
 }
 
 static void nf_ct_net_exit(struct net *net)
 {
 	nf_ct_frags6_sysctl_unregister(net);
-	fqdir_exit(&net->nf_frag.frags);
+	fqdir_exit(&net->nf_frag.fqdir);
 }
 
 static struct pernet_operations nf_ct_net_ops = {
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 2356b4af7309..f3e3118393c4 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -48,8 +48,8 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "RAW6: inuse %d\n",
 		       sock_prot_inuse_get(net, &rawv6_prot));
 	seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
-		   atomic_read(&net->ipv6.frags.rhashtable.nelems),
-		   frag_mem_limit(&net->ipv6.frags));
+		   atomic_read(&net->ipv6.fqdir.rhashtable.nelems),
+		   frag_mem_limit(&net->ipv6.fqdir));
 	return 0;
 }
 
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index f1142f5d5075..5160fd9ed223 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -79,7 +79,7 @@ static void ip6_frag_expire(struct timer_list *t)
 	struct net *net;
 
 	fq = container_of(frag, struct frag_queue, q);
-	net = container_of(fq->q.fqdir, struct net, ipv6.frags);
+	net = container_of(fq->q.fqdir, struct net, ipv6.fqdir);
 
 	ip6frag_expire_frag_queue(net, fq);
 }
@@ -100,7 +100,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
 					    IPV6_ADDR_LINKLOCAL)))
 		key.iif = 0;
 
-	q = inet_frag_find(&net->ipv6.frags, &key);
+	q = inet_frag_find(&net->ipv6.fqdir, &key);
 	if (!q)
 		return NULL;
 
@@ -254,7 +254,7 @@ err:
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
 			  struct sk_buff *prev_tail, struct net_device *dev)
 {
-	struct net *net = container_of(fq->q.fqdir, struct net, ipv6.frags);
+	struct net *net = container_of(fq->q.fqdir, struct net, ipv6.fqdir);
 	unsigned int nhoff;
 	void *reasm_data;
 	int payload_len;
@@ -401,23 +401,23 @@ static const struct inet6_protocol frag_protocol = {
 static struct ctl_table ip6_frags_ns_ctl_table[] = {
 	{
 		.procname	= "ip6frag_high_thresh",
-		.data		= &init_net.ipv6.frags.high_thresh,
+		.data		= &init_net.ipv6.fqdir.high_thresh,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
-		.extra1		= &init_net.ipv6.frags.low_thresh
+		.extra1		= &init_net.ipv6.fqdir.low_thresh
 	},
 	{
 		.procname	= "ip6frag_low_thresh",
-		.data		= &init_net.ipv6.frags.low_thresh,
+		.data		= &init_net.ipv6.fqdir.low_thresh,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
-		.extra2		= &init_net.ipv6.frags.high_thresh
+		.extra2		= &init_net.ipv6.fqdir.high_thresh
 	},
 	{
 		.procname	= "ip6frag_time",
-		.data		= &init_net.ipv6.frags.timeout,
+		.data		= &init_net.ipv6.fqdir.timeout,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
@@ -449,11 +449,11 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
 		if (!table)
 			goto err_alloc;
 
-		table[0].data = &net->ipv6.frags.high_thresh;
-		table[0].extra1 = &net->ipv6.frags.low_thresh;
-		table[1].data = &net->ipv6.frags.low_thresh;
-		table[1].extra2 = &net->ipv6.frags.high_thresh;
-		table[2].data = &net->ipv6.frags.timeout;
+		table[0].data = &net->ipv6.fqdir.high_thresh;
+		table[0].extra1 = &net->ipv6.fqdir.low_thresh;
+		table[1].data = &net->ipv6.fqdir.low_thresh;
+		table[1].extra2 = &net->ipv6.fqdir.high_thresh;
+		table[2].data = &net->ipv6.fqdir.timeout;
 	}
 
 	hdr = register_net_sysctl(net, "net/ipv6", table);
@@ -517,25 +517,25 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 {
 	int res;
 
-	net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
-	net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
-	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
-	net->ipv6.frags.f = &ip6_frags;
+	net->ipv6.fqdir.high_thresh = IPV6_FRAG_HIGH_THRESH;
+	net->ipv6.fqdir.low_thresh = IPV6_FRAG_LOW_THRESH;
+	net->ipv6.fqdir.timeout = IPV6_FRAG_TIMEOUT;
+	net->ipv6.fqdir.f = &ip6_frags;
 
-	res = inet_frags_init_net(&net->ipv6.frags);
+	res = inet_frags_init_net(&net->ipv6.fqdir);
 	if (res < 0)
 		return res;
 
 	res = ip6_frags_ns_sysctl_register(net);
 	if (res < 0)
-		fqdir_exit(&net->ipv6.frags);
+		fqdir_exit(&net->ipv6.fqdir);
 	return res;
 }
 
 static void __net_exit ipv6_frags_exit_net(struct net *net)
 {
 	ip6_frags_ns_sysctl_unregister(net);
-	fqdir_exit(&net->ipv6.frags);
+	fqdir_exit(&net->ipv6.fqdir);
 }
 
 static struct pernet_operations ip6_frags_ops = {
-- 
cgit v1.2.3


From 9cce45f22ceedf639cbb5fb5dfe612a278d36b58 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 24 May 2019 09:03:37 -0700
Subject: net: rename inet_frags_init_net() to fqdir_init()

Also pass the struct inet_frags pointer as an extra parameter,
since we will soon dynamically allocate fqdir structures.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 | 3 ++-
 net/ieee802154/6lowpan/reassembly.c     | 3 +--
 net/ipv4/ip_fragment.c                  | 3 +--
 net/ipv6/netfilter/nf_conntrack_reasm.c | 3 +--
 net/ipv6/reassembly.c                   | 3 +--
 5 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index d1bfd5dbe2d4..fca246b0abd8 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -104,8 +104,9 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline int inet_frags_init_net(struct fqdir *fqdir)
+static inline int fqdir_init(struct fqdir *fqdir, struct inet_frags *f)
 {
+	fqdir->f = f;
 	atomic_long_set(&fqdir->mem, 0);
 	return rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
 }
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 4bbd6999c58f..82db76ce0e61 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -452,9 +452,8 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 	ieee802154_lowpan->fqdir.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	ieee802154_lowpan->fqdir.low_thresh = IPV6_FRAG_LOW_THRESH;
 	ieee802154_lowpan->fqdir.timeout = IPV6_FRAG_TIMEOUT;
-	ieee802154_lowpan->fqdir.f = &lowpan_frags;
 
-	res = inet_frags_init_net(&ieee802154_lowpan->fqdir);
+	res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags);
 	if (res < 0)
 		return res;
 	res = lowpan_frags_ns_sysctl_register(net);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index fb035f4f36ca..d95592d52981 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -672,9 +672,8 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 	net->ipv4.fqdir.timeout = IP_FRAG_TIME;
 
 	net->ipv4.fqdir.max_dist = 64;
-	net->ipv4.fqdir.f = &ip4_frags;
 
-	res = inet_frags_init_net(&net->ipv4.fqdir);
+	res = fqdir_init(&net->ipv4.fqdir, &ip4_frags);
 	if (res < 0)
 		return res;
 	res = ip4_frags_ns_ctl_register(net);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 3387ce530409..e72a1cc42163 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -494,9 +494,8 @@ static int nf_ct_net_init(struct net *net)
 	net->nf_frag.fqdir.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->nf_frag.fqdir.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->nf_frag.fqdir.timeout = IPV6_FRAG_TIMEOUT;
-	net->nf_frag.fqdir.f = &nf_frags;
 
-	res = inet_frags_init_net(&net->nf_frag.fqdir);
+	res = fqdir_init(&net->nf_frag.fqdir, &nf_frags);
 	if (res < 0)
 		return res;
 	res = nf_ct_frag6_sysctl_register(net);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index aabc9b2e83e4..8235c5a8e8fe 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -515,9 +515,8 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 	net->ipv6.fqdir.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->ipv6.fqdir.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->ipv6.fqdir.timeout = IPV6_FRAG_TIMEOUT;
-	net->ipv6.fqdir.f = &ip6_frags;
 
-	res = inet_frags_init_net(&net->ipv6.fqdir);
+	res = fqdir_init(&net->ipv6.fqdir, &ip6_frags);
 	if (res < 0)
 		return res;
 
-- 
cgit v1.2.3


From a39aca678a0626941aa99c18c1c452ca758e7865 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 24 May 2019 09:03:38 -0700
Subject: net: add a net pointer to struct fqdir

fqdir will soon be dynamically allocated.

We need to reach the struct net pointer from fqdir,
so add a net field to struct fqdir, and replace the various
container_of() constructs with direct access to the new field.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 |  5 ++++-
 net/ieee802154/6lowpan/reassembly.c     |  2 +-
 net/ipv4/ip_fragment.c                  | 20 +++++++-------------
 net/ipv6/netfilter/nf_conntrack_reasm.c |  6 ++----
 net/ipv6/reassembly.c                   |  8 +++-----
 5 files changed, 17 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index fca246b0abd8..37cde5c1498c 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -12,6 +12,7 @@ struct fqdir {
 	int			timeout;
 	int			max_dist;
 	struct inet_frags	*f;
+	struct net		*net;
 
 	struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
 
@@ -104,9 +105,11 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline int fqdir_init(struct fqdir *fqdir, struct inet_frags *f)
+static inline int fqdir_init(struct fqdir *fqdir, struct inet_frags *f,
+			     struct net *net)
 {
 	fqdir->f = f;
+	fqdir->net = net;
 	atomic_long_set(&fqdir->mem, 0);
 	return rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
 }
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 82db76ce0e61..03a444c9e191 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -453,7 +453,7 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 	ieee802154_lowpan->fqdir.low_thresh = IPV6_FRAG_LOW_THRESH;
 	ieee802154_lowpan->fqdir.timeout = IPV6_FRAG_TIMEOUT;
 
-	res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags);
+	res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net);
 	if (res < 0)
 		return res;
 	res = lowpan_frags_ns_sysctl_register(net);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index d95592d52981..d59269bbe1b6 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -82,9 +82,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
 {
 	struct ipq *qp = container_of(q, struct ipq, q);
-	struct netns_ipv4 *ipv4 = container_of(q->fqdir, struct netns_ipv4,
-					       fqdir);
-	struct net *net = container_of(ipv4, struct net, ipv4);
+	struct net *net = q->fqdir->net;
 
 	const struct frag_v4_compare_key *key = a;
 
@@ -142,7 +140,7 @@ static void ip_expire(struct timer_list *t)
 	int err;
 
 	qp = container_of(frag, struct ipq, q);
-	net = container_of(qp->q.fqdir, struct net, ipv4.fqdir);
+	net = qp->q.fqdir->net;
 
 	rcu_read_lock();
 	spin_lock(&qp->q.lock);
@@ -236,12 +234,8 @@ static int ip_frag_too_far(struct ipq *qp)
 
 	rc = qp->q.fragments_tail && (end - start) > max;
 
-	if (rc) {
-		struct net *net;
-
-		net = container_of(qp->q.fqdir, struct net, ipv4.fqdir);
-		__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
-	}
+	if (rc)
+		__IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS);
 
 	return rc;
 }
@@ -273,7 +267,7 @@ static int ip_frag_reinit(struct ipq *qp)
 /* Add new segment to existing queue. */
 static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
-	struct net *net = container_of(qp->q.fqdir, struct net, ipv4.fqdir);
+	struct net *net = qp->q.fqdir->net;
 	int ihl, end, flags, offset;
 	struct sk_buff *prev_tail;
 	struct net_device *dev;
@@ -399,7 +393,7 @@ err:
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 			 struct sk_buff *prev_tail, struct net_device *dev)
 {
-	struct net *net = container_of(qp->q.fqdir, struct net, ipv4.fqdir);
+	struct net *net = qp->q.fqdir->net;
 	struct iphdr *iph;
 	void *reasm_data;
 	int len, err;
@@ -673,7 +667,7 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 
 	net->ipv4.fqdir.max_dist = 64;
 
-	res = fqdir_init(&net->ipv4.fqdir, &ip4_frags);
+	res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
 	if (res < 0)
 		return res;
 	res = ip4_frags_ns_ctl_register(net);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index e72a1cc42163..b6f7385ed93c 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -143,12 +143,10 @@ static void nf_ct_frag6_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
 	struct frag_queue *fq;
-	struct net *net;
 
 	fq = container_of(frag, struct frag_queue, q);
-	net = container_of(fq->q.fqdir, struct net, nf_frag.fqdir);
 
-	ip6frag_expire_frag_queue(net, fq);
+	ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
 }
 
 /* Creation primitives. */
@@ -495,7 +493,7 @@ static int nf_ct_net_init(struct net *net)
 	net->nf_frag.fqdir.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->nf_frag.fqdir.timeout = IPV6_FRAG_TIMEOUT;
 
-	res = fqdir_init(&net->nf_frag.fqdir, &nf_frags);
+	res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net);
 	if (res < 0)
 		return res;
 	res = nf_ct_frag6_sysctl_register(net);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 8235c5a8e8fe..a6f26aa648fb 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -76,12 +76,10 @@ static void ip6_frag_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
 	struct frag_queue *fq;
-	struct net *net;
 
 	fq = container_of(frag, struct frag_queue, q);
-	net = container_of(fq->q.fqdir, struct net, ipv6.fqdir);
 
-	ip6frag_expire_frag_queue(net, fq);
+	ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
 }
 
 static struct frag_queue *
@@ -254,7 +252,7 @@ err:
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
 			  struct sk_buff *prev_tail, struct net_device *dev)
 {
-	struct net *net = container_of(fq->q.fqdir, struct net, ipv6.fqdir);
+	struct net *net = fq->q.fqdir->net;
 	unsigned int nhoff;
 	void *reasm_data;
 	int payload_len;
@@ -516,7 +514,7 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 	net->ipv6.fqdir.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->ipv6.fqdir.timeout = IPV6_FRAG_TIMEOUT;
 
-	res = fqdir_init(&net->ipv6.fqdir, &ip6_frags);
+	res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net);
 	if (res < 0)
 		return res;
 
-- 
cgit v1.2.3


From 4907abc605e328d61bee56e4e89db4f56ade2090 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 24 May 2019 09:03:39 -0700
Subject: net: dynamically allocate fqdir structures

The following patch will add an RCU grace period before fqdir
rhashtable destruction, so we need to dynamically allocate
fqdir structures to avoid forcing expensive synchronize_rcu()
calls in the netns dismantle path.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 | 17 ++++++++++++++---
 include/net/netns/ieee802154_6lowpan.h  |  2 +-
 include/net/netns/ipv4.h                |  2 +-
 include/net/netns/ipv6.h                |  4 ++--
 net/ieee802154/6lowpan/reassembly.c     | 24 +++++++++++++-----------
 net/ipv4/inet_fragment.c                |  1 +
 net/ipv4/ip_fragment.c                  | 32 ++++++++++++++++----------------
 net/ipv4/proc.c                         |  4 ++--
 net/ipv6/netfilter/nf_conntrack_reasm.c | 27 ++++++++++++++-------------
 net/ipv6/proc.c                         |  4 ++--
 net/ipv6/reassembly.c                   | 24 ++++++++++++------------
 11 files changed, 78 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 37cde5c1498c..5f754c660cfa 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -105,14 +105,25 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline int fqdir_init(struct fqdir *fqdir, struct inet_frags *f,
+static inline int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f,
 			     struct net *net)
 {
+	struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
+	int res;
+
+	if (!fqdir)
+		return -ENOMEM;
 	fqdir->f = f;
 	fqdir->net = net;
-	atomic_long_set(&fqdir->mem, 0);
-	return rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
+	res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
+	if (res < 0) {
+		kfree(fqdir);
+		return res;
+	}
+	*fqdirp = fqdir;
+	return 0;
 }
+
 void fqdir_exit(struct fqdir *fqdir);
 
 void inet_frag_kill(struct inet_frag_queue *q);
diff --git a/include/net/netns/ieee802154_6lowpan.h b/include/net/netns/ieee802154_6lowpan.h
index d27ac64f8dfe..95406e1342cb 100644
--- a/include/net/netns/ieee802154_6lowpan.h
+++ b/include/net/netns/ieee802154_6lowpan.h
@@ -16,7 +16,7 @@ struct netns_sysctl_lowpan {
 
 struct netns_ieee802154_lowpan {
 	struct netns_sysctl_lowpan sysctl;
-	struct fqdir		fqdir;
+	struct fqdir		*fqdir;
 };
 
 #endif
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 3c270baa32e0..c07cee1e0c9e 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -72,7 +72,7 @@ struct netns_ipv4 {
 
 	struct inet_peer_base	*peers;
 	struct sock  * __percpu	*tcp_sk;
-	struct fqdir		fqdir;
+	struct fqdir		*fqdir;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*iptable_filter;
 	struct xt_table		*iptable_mangle;
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 3dd2ae2a38e2..022a0fd1a5a4 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -58,7 +58,7 @@ struct netns_ipv6 {
 	struct ipv6_devconf	*devconf_all;
 	struct ipv6_devconf	*devconf_dflt;
 	struct inet_peer_base	*peers;
-	struct fqdir		fqdir;
+	struct fqdir		*fqdir;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*ip6table_filter;
 	struct xt_table		*ip6table_mangle;
@@ -116,7 +116,7 @@ struct netns_ipv6 {
 
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 struct netns_nf_frag {
-	struct fqdir	fqdir;
+	struct fqdir	*fqdir;
 };
 #endif
 
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 03a444c9e191..e59c3b708969 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -79,7 +79,7 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
 	key.src = *src;
 	key.dst = *dst;
 
-	q = inet_frag_find(&ieee802154_lowpan->fqdir, &key);
+	q = inet_frag_find(ieee802154_lowpan->fqdir, &key);
 	if (!q)
 		return NULL;
 
@@ -377,11 +377,11 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net)
 			table[0].procname = NULL;
 	}
 
-	table[0].data	= &ieee802154_lowpan->fqdir.high_thresh;
-	table[0].extra1	= &ieee802154_lowpan->fqdir.low_thresh;
-	table[1].data	= &ieee802154_lowpan->fqdir.low_thresh;
-	table[1].extra2	= &ieee802154_lowpan->fqdir.high_thresh;
-	table[2].data	= &ieee802154_lowpan->fqdir.timeout;
+	table[0].data	= &ieee802154_lowpan->fqdir->high_thresh;
+	table[0].extra1	= &ieee802154_lowpan->fqdir->low_thresh;
+	table[1].data	= &ieee802154_lowpan->fqdir->low_thresh;
+	table[1].extra2	= &ieee802154_lowpan->fqdir->high_thresh;
+	table[2].data	= &ieee802154_lowpan->fqdir->timeout;
 
 	hdr = register_net_sysctl(net, "net/ieee802154/6lowpan", table);
 	if (hdr == NULL)
@@ -449,16 +449,18 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 		net_ieee802154_lowpan(net);
 	int res;
 
-	ieee802154_lowpan->fqdir.high_thresh = IPV6_FRAG_HIGH_THRESH;
-	ieee802154_lowpan->fqdir.low_thresh = IPV6_FRAG_LOW_THRESH;
-	ieee802154_lowpan->fqdir.timeout = IPV6_FRAG_TIMEOUT;
 
 	res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net);
 	if (res < 0)
 		return res;
+
+	ieee802154_lowpan->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+	ieee802154_lowpan->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+	ieee802154_lowpan->fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
 	res = lowpan_frags_ns_sysctl_register(net);
 	if (res < 0)
-		fqdir_exit(&ieee802154_lowpan->fqdir);
+		fqdir_exit(ieee802154_lowpan->fqdir);
 	return res;
 }
 
@@ -468,7 +470,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
 		net_ieee802154_lowpan(net);
 
 	lowpan_frags_ns_sysctl_unregister(net);
-	fqdir_exit(&ieee802154_lowpan->fqdir);
+	fqdir_exit(ieee802154_lowpan->fqdir);
 }
 
 static struct pernet_operations lowpan_frags_ops = {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index a5ec5d956793..b4432f209c71 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -150,6 +150,7 @@ void fqdir_exit(struct fqdir *fqdir)
 	fqdir->high_thresh = 0; /* prevent creation of new frags */
 
 	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
+	kfree(fqdir);
 }
 EXPORT_SYMBOL(fqdir_exit);
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index d59269bbe1b6..1ffaec056821 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -209,7 +209,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
 	};
 	struct inet_frag_queue *q;
 
-	q = inet_frag_find(&net->ipv4.fqdir, &key);
+	q = inet_frag_find(net->ipv4.fqdir, &key);
 	if (!q)
 		return NULL;
 
@@ -589,12 +589,12 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
 			goto err_alloc;
 
 	}
-	table[0].data	= &net->ipv4.fqdir.high_thresh;
-	table[0].extra1	= &net->ipv4.fqdir.low_thresh;
-	table[1].data	= &net->ipv4.fqdir.low_thresh;
-	table[1].extra2	= &net->ipv4.fqdir.high_thresh;
-	table[2].data	= &net->ipv4.fqdir.timeout;
-	table[3].data	= &net->ipv4.fqdir.max_dist;
+	table[0].data	= &net->ipv4.fqdir->high_thresh;
+	table[0].extra1	= &net->ipv4.fqdir->low_thresh;
+	table[1].data	= &net->ipv4.fqdir->low_thresh;
+	table[1].extra2	= &net->ipv4.fqdir->high_thresh;
+	table[2].data	= &net->ipv4.fqdir->timeout;
+	table[3].data	= &net->ipv4.fqdir->max_dist;
 
 	hdr = register_net_sysctl(net, "net/ipv4", table);
 	if (!hdr)
@@ -642,6 +642,9 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 {
 	int res;
 
+	res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
+	if (res < 0)
+		return res;
 	/* Fragment cache limits.
 	 *
 	 * The fragment memory accounting code, (tries to) account for
@@ -656,30 +659,27 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 	 * we will prune down to 3MB, making room for approx 8 big 64K
 	 * fragments 8x128k.
 	 */
-	net->ipv4.fqdir.high_thresh = 4 * 1024 * 1024;
-	net->ipv4.fqdir.low_thresh  = 3 * 1024 * 1024;
+	net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024;
+	net->ipv4.fqdir->low_thresh  = 3 * 1024 * 1024;
 	/*
 	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
 	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
 	 * by TTL.
 	 */
-	net->ipv4.fqdir.timeout = IP_FRAG_TIME;
+	net->ipv4.fqdir->timeout = IP_FRAG_TIME;
 
-	net->ipv4.fqdir.max_dist = 64;
+	net->ipv4.fqdir->max_dist = 64;
 
-	res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
-	if (res < 0)
-		return res;
 	res = ip4_frags_ns_ctl_register(net);
 	if (res < 0)
-		fqdir_exit(&net->ipv4.fqdir);
+		fqdir_exit(net->ipv4.fqdir);
 	return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
 {
 	ip4_frags_ns_ctl_unregister(net);
-	fqdir_exit(&net->ipv4.fqdir);
+	fqdir_exit(net->ipv4.fqdir);
 }
 
 static struct pernet_operations ip4_frags_ops = {
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3927e00084e8..b613572c6616 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -72,8 +72,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "RAW: inuse %d\n",
 		   sock_prot_inuse_get(net, &raw_prot));
 	seq_printf(seq,  "FRAG: inuse %u memory %lu\n",
-		   atomic_read(&net->ipv4.fqdir.rhashtable.nelems),
-		   frag_mem_limit(&net->ipv4.fqdir));
+		   atomic_read(&net->ipv4.fqdir->rhashtable.nelems),
+		   frag_mem_limit(net->ipv4.fqdir));
 	return 0;
 }
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index b6f7385ed93c..c5d59fa568d6 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -90,12 +90,12 @@ static int nf_ct_frag6_sysctl_register(struct net *net)
 			goto err_alloc;
 	}
 
-	table[0].data	= &net->nf_frag.fqdir.timeout;
-	table[1].data	= &net->nf_frag.fqdir.low_thresh;
-	table[1].extra2	= &net->nf_frag.fqdir.high_thresh;
-	table[2].data	= &net->nf_frag.fqdir.high_thresh;
-	table[2].extra1	= &net->nf_frag.fqdir.low_thresh;
-	table[2].extra2	= &init_net.nf_frag.fqdir.high_thresh;
+	table[0].data	= &net->nf_frag.fqdir->timeout;
+	table[1].data	= &net->nf_frag.fqdir->low_thresh;
+	table[1].extra2	= &net->nf_frag.fqdir->high_thresh;
+	table[2].data	= &net->nf_frag.fqdir->high_thresh;
+	table[2].extra1	= &net->nf_frag.fqdir->low_thresh;
+	table[2].extra2	= &init_net.nf_frag.fqdir->high_thresh;
 
 	hdr = register_net_sysctl(net, "net/netfilter", table);
 	if (hdr == NULL)
@@ -162,7 +162,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
 	};
 	struct inet_frag_queue *q;
 
-	q = inet_frag_find(&net->nf_frag.fqdir, &key);
+	q = inet_frag_find(net->nf_frag.fqdir, &key);
 	if (!q)
 		return NULL;
 
@@ -489,23 +489,24 @@ static int nf_ct_net_init(struct net *net)
 {
 	int res;
 
-	net->nf_frag.fqdir.high_thresh = IPV6_FRAG_HIGH_THRESH;
-	net->nf_frag.fqdir.low_thresh = IPV6_FRAG_LOW_THRESH;
-	net->nf_frag.fqdir.timeout = IPV6_FRAG_TIMEOUT;
-
 	res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net);
 	if (res < 0)
 		return res;
+
+	net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+	net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+	net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
 	res = nf_ct_frag6_sysctl_register(net);
 	if (res < 0)
-		fqdir_exit(&net->nf_frag.fqdir);
+		fqdir_exit(net->nf_frag.fqdir);
 	return res;
 }
 
 static void nf_ct_net_exit(struct net *net)
 {
 	nf_ct_frags6_sysctl_unregister(net);
-	fqdir_exit(&net->nf_frag.fqdir);
+	fqdir_exit(net->nf_frag.fqdir);
 }
 
 static struct pernet_operations nf_ct_net_ops = {
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index f3e3118393c4..0bbefc440bcd 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -48,8 +48,8 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "RAW6: inuse %d\n",
 		       sock_prot_inuse_get(net, &rawv6_prot));
 	seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
-		   atomic_read(&net->ipv6.fqdir.rhashtable.nelems),
-		   frag_mem_limit(&net->ipv6.fqdir));
+		   atomic_read(&net->ipv6.fqdir->rhashtable.nelems),
+		   frag_mem_limit(net->ipv6.fqdir));
 	return 0;
 }
 
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index a6f26aa648fb..836ea964cf14 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -98,7 +98,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
 					    IPV6_ADDR_LINKLOCAL)))
 		key.iif = 0;
 
-	q = inet_frag_find(&net->ipv6.fqdir, &key);
+	q = inet_frag_find(net->ipv6.fqdir, &key);
 	if (!q)
 		return NULL;
 
@@ -443,11 +443,11 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
 			goto err_alloc;
 
 	}
-	table[0].data	= &net->ipv6.fqdir.high_thresh;
-	table[0].extra1	= &net->ipv6.fqdir.low_thresh;
-	table[1].data	= &net->ipv6.fqdir.low_thresh;
-	table[1].extra2	= &net->ipv6.fqdir.high_thresh;
-	table[2].data	= &net->ipv6.fqdir.timeout;
+	table[0].data	= &net->ipv6.fqdir->high_thresh;
+	table[0].extra1	= &net->ipv6.fqdir->low_thresh;
+	table[1].data	= &net->ipv6.fqdir->low_thresh;
+	table[1].extra2	= &net->ipv6.fqdir->high_thresh;
+	table[2].data	= &net->ipv6.fqdir->timeout;
 
 	hdr = register_net_sysctl(net, "net/ipv6", table);
 	if (!hdr)
@@ -510,24 +510,24 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 {
 	int res;
 
-	net->ipv6.fqdir.high_thresh = IPV6_FRAG_HIGH_THRESH;
-	net->ipv6.fqdir.low_thresh = IPV6_FRAG_LOW_THRESH;
-	net->ipv6.fqdir.timeout = IPV6_FRAG_TIMEOUT;
-
 	res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net);
 	if (res < 0)
 		return res;
 
+	net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+	net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+	net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
 	res = ip6_frags_ns_sysctl_register(net);
 	if (res < 0)
-		fqdir_exit(&net->ipv6.fqdir);
+		fqdir_exit(net->ipv6.fqdir);
 	return res;
 }
 
 static void __net_exit ipv6_frags_exit_net(struct net *net)
 {
 	ip6_frags_ns_sysctl_unregister(net);
-	fqdir_exit(&net->ipv6.fqdir);
+	fqdir_exit(net->ipv6.fqdir);
 }
 
 static struct pernet_operations ip6_frags_ops = {
-- 
cgit v1.2.3


From 3c8fc87820446ce5b948dc17648509340102b818 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 24 May 2019 09:03:40 -0700
Subject: inet: frags: rework rhashtable dismantle

syzbot found an interesting use-after-free [1] happening
while IPv4 fragment rhashtable was destroyed at netns dismantle.

While no insertions can possibly happen at the time a dismantling
netns is destroying this rhashtable, timers can still fire and
attempt to remove elements from this rhashtable.

This is forbidden, since rhashtable_free_and_destroy() has
no synchronization against concurrent inserts and deletes.

Add a new fqdir->dead flag so that timers do not attempt
a rhashtable_remove_fast() operation.

We also have to respect an RCU grace period before starting
the rhashtable_free_and_destroy() from process context,
thus we use rcu_work infrastructure.

This is a refinement of a prior rough attempt to fix this bug :
https://marc.info/?l=linux-netdev&m=153845936820900&w=2

Since the rhashtable cleanup is now deferred to a work queue,
netns dismantles should be slightly faster.

[1]
BUG: KASAN: use-after-free in __read_once_size include/linux/compiler.h:194 [inline]
BUG: KASAN: use-after-free in rhashtable_last_table+0x162/0x180 lib/rhashtable.c:212
Read of size 8 at addr ffff8880a6497b70 by task kworker/0:0/5

CPU: 0 PID: 5 Comm: kworker/0:0 Not tainted 5.2.0-rc1+ #2
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: events rht_deferred_worker
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188
 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317
 kasan_report+0x12/0x20 mm/kasan/common.c:614
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
 __read_once_size include/linux/compiler.h:194 [inline]
 rhashtable_last_table+0x162/0x180 lib/rhashtable.c:212
 rht_deferred_worker+0x111/0x2030 lib/rhashtable.c:411
 process_one_work+0x989/0x1790 kernel/workqueue.c:2269
 worker_thread+0x98/0xe40 kernel/workqueue.c:2415
 kthread+0x354/0x420 kernel/kthread.c:255
 ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352

Allocated by task 32687:
 save_stack+0x23/0x90 mm/kasan/common.c:71
 set_track mm/kasan/common.c:79 [inline]
 __kasan_kmalloc mm/kasan/common.c:489 [inline]
 __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:462
 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:503
 __do_kmalloc_node mm/slab.c:3620 [inline]
 __kmalloc_node+0x4e/0x70 mm/slab.c:3627
 kmalloc_node include/linux/slab.h:590 [inline]
 kvmalloc_node+0x68/0x100 mm/util.c:431
 kvmalloc include/linux/mm.h:637 [inline]
 kvzalloc include/linux/mm.h:645 [inline]
 bucket_table_alloc+0x90/0x480 lib/rhashtable.c:178
 rhashtable_init+0x3f4/0x7b0 lib/rhashtable.c:1057
 inet_frags_init_net include/net/inet_frag.h:109 [inline]
 ipv4_frags_init_net+0x182/0x410 net/ipv4/ip_fragment.c:683
 ops_init+0xb3/0x410 net/core/net_namespace.c:130
 setup_net+0x2d3/0x740 net/core/net_namespace.c:316
 copy_net_ns+0x1df/0x340 net/core/net_namespace.c:439
 create_new_namespaces+0x400/0x7b0 kernel/nsproxy.c:107
 unshare_nsproxy_namespaces+0xc2/0x200 kernel/nsproxy.c:206
 ksys_unshare+0x440/0x980 kernel/fork.c:2692
 __do_sys_unshare kernel/fork.c:2760 [inline]
 __se_sys_unshare kernel/fork.c:2758 [inline]
 __x64_sys_unshare+0x31/0x40 kernel/fork.c:2758
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 7:
 save_stack+0x23/0x90 mm/kasan/common.c:71
 set_track mm/kasan/common.c:79 [inline]
 __kasan_slab_free+0x102/0x150 mm/kasan/common.c:451
 kasan_slab_free+0xe/0x10 mm/kasan/common.c:459
 __cache_free mm/slab.c:3432 [inline]
 kfree+0xcf/0x220 mm/slab.c:3755
 kvfree+0x61/0x70 mm/util.c:460
 bucket_table_free+0x69/0x150 lib/rhashtable.c:108
 rhashtable_free_and_destroy+0x165/0x8b0 lib/rhashtable.c:1155
 inet_frags_exit_net+0x3d/0x50 net/ipv4/inet_fragment.c:152
 ipv4_frags_exit_net+0x73/0x90 net/ipv4/ip_fragment.c:695
 ops_exit_list.isra.0+0xaa/0x150 net/core/net_namespace.c:154
 cleanup_net+0x3fb/0x960 net/core/net_namespace.c:553
 process_one_work+0x989/0x1790 kernel/workqueue.c:2269
 worker_thread+0x98/0xe40 kernel/workqueue.c:2415
 kthread+0x354/0x420 kernel/kthread.c:255
 ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352

The buggy address belongs to the object at ffff8880a6497b40
 which belongs to the cache kmalloc-1k of size 1024
The buggy address is located 48 bytes inside of
 1024-byte region [ffff8880a6497b40, ffff8880a6497f40)
The buggy address belongs to the page:
page:ffffea0002992580 refcount:1 mapcount:0 mapping:ffff8880aa400ac0 index:0xffff8880a64964c0 compound_mapcount: 0
flags: 0x1fffc0000010200(slab|head)
raw: 01fffc0000010200 ffffea0002916e88 ffffea000218fe08 ffff8880aa400ac0
raw: ffff8880a64964c0 ffff8880a6496040 0000000100000005 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8880a6497a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff8880a6497a80: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
>ffff8880a6497b00: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
                                                             ^
 ffff8880a6497b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff8880a6497c00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h  |  4 ++++
 net/ipv4/inet_fragment.c | 49 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 41 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 5f754c660cfa..002f23c1a1a7 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -13,11 +13,13 @@ struct fqdir {
 	int			max_dist;
 	struct inet_frags	*f;
 	struct net		*net;
+	bool			dead;
 
 	struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
 
 	/* Keep atomic mem on separate cachelines in structs that include it */
 	atomic_long_t		mem ____cacheline_aligned_in_smp;
+	struct rcu_work		destroy_rwork;
 };
 
 /**
@@ -26,11 +28,13 @@ struct fqdir {
  * @INET_FRAG_FIRST_IN: first fragment has arrived
  * @INET_FRAG_LAST_IN: final fragment has arrived
  * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
+ * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable
  */
 enum {
 	INET_FRAG_FIRST_IN	= BIT(0),
 	INET_FRAG_LAST_IN	= BIT(1),
 	INET_FRAG_COMPLETE	= BIT(2),
+	INET_FRAG_HASH_DEAD	= BIT(3),
 };
 
 struct frag_v4_compare_key {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index b4432f209c71..6ca9523374da 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -124,34 +124,49 @@ void inet_frags_fini(struct inet_frags *f)
 }
 EXPORT_SYMBOL(inet_frags_fini);
 
+/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
 static void inet_frags_free_cb(void *ptr, void *arg)
 {
 	struct inet_frag_queue *fq = ptr;
+	int count;
 
-	/* If we can not cancel the timer, it means this frag_queue
-	 * is already disappearing, we have nothing to do.
-	 * Otherwise, we own a refcount until the end of this function.
-	 */
-	if (!del_timer(&fq->timer))
-		return;
+	count = del_timer_sync(&fq->timer) ? 1 : 0;
 
 	spin_lock_bh(&fq->lock);
 	if (!(fq->flags & INET_FRAG_COMPLETE)) {
 		fq->flags |= INET_FRAG_COMPLETE;
-		refcount_dec(&fq->refcnt);
+		count++;
+	} else if (fq->flags & INET_FRAG_HASH_DEAD) {
+		count++;
 	}
 	spin_unlock_bh(&fq->lock);
 
-	inet_frag_put(fq);
+	if (refcount_sub_and_test(count, &fq->refcnt))
+		inet_frag_destroy(fq);
 }
 
-void fqdir_exit(struct fqdir *fqdir)
+static void fqdir_rwork_fn(struct work_struct *work)
 {
-	fqdir->high_thresh = 0; /* prevent creation of new frags */
+	struct fqdir *fqdir = container_of(to_rcu_work(work),
+					   struct fqdir, destroy_rwork);
 
 	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
 	kfree(fqdir);
 }
+
+void fqdir_exit(struct fqdir *fqdir)
+{
+	fqdir->high_thresh = 0; /* prevent creation of new frags */
+
+	/* paired with READ_ONCE() in inet_frag_kill() :
+	 * We want to prevent rhashtable_remove_fast() calls
+	 */
+	smp_store_release(&fqdir->dead, true);
+
+	INIT_RCU_WORK(&fqdir->destroy_rwork, fqdir_rwork_fn);
+	queue_rcu_work(system_wq, &fqdir->destroy_rwork);
+
+}
 EXPORT_SYMBOL(fqdir_exit);
 
 void inet_frag_kill(struct inet_frag_queue *fq)
@@ -163,8 +178,18 @@ void inet_frag_kill(struct inet_frag_queue *fq)
 		struct fqdir *fqdir = fq->fqdir;
 
 		fq->flags |= INET_FRAG_COMPLETE;
-		rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, fqdir->f->rhash_params);
-		refcount_dec(&fq->refcnt);
+		rcu_read_lock();
+		/* This READ_ONCE() is paired with smp_store_release()
+		 * in inet_frags_exit_net().
+		 */
+		if (!READ_ONCE(fqdir->dead)) {
+			rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
+					       fqdir->f->rhash_params);
+			refcount_dec(&fq->refcnt);
+		} else {
+			fq->flags |= INET_FRAG_HASH_DEAD;
+		}
+		rcu_read_unlock();
 	}
 }
 EXPORT_SYMBOL(inet_frag_kill);
-- 
cgit v1.2.3


From a3ce17d1495b65d62fceb2c7a704bb97133c5de9 Mon Sep 17 00:00:00 2001
From: Chaitanya Tata <chaitanya.tata@bluwireless.co.uk>
Date: Wed, 1 May 2019 18:25:24 +0530
Subject: cfg80211: Handle bss expiry during connection

If the BSS is expired during connection, the connect result will
trigger a kernel warning. Ideally cfg80211 should hold the BSS
before the connection is attempted, but as the BSSID is not known
in case of auth/assoc MLME offload (connect op) it doesn't.

For those drivers without the connect op cfg80211 holds down the
reference so it will not be removed from the list.

Fix this by removing the warning and silently adding the BSS back to
the bss list which is returned by the driver (with proper BSSID set) or
in case the BSS is already added use that.

The requirements for drivers are documented in the APIs.

Signed-off-by: Chaitanya Tata <chaitanya.tata@bluwireless.co.uk>
[formatting fixes, keep old timestamp]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 15 +++++++++++----
 net/wireless/core.h    |  4 ++++
 net/wireless/scan.c    | 12 +++++++-----
 net/wireless/sme.c     | 32 ++++++++++++++++++++++++++++----
 4 files changed, 50 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 87dae868707e..c19687833493 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6231,8 +6231,11 @@ struct cfg80211_fils_resp_params {
  *	case.
  * @bssid: The BSSID of the AP (may be %NULL)
  * @bss: Entry of bss to which STA got connected to, can be obtained through
- *	cfg80211_get_bss() (may be %NULL). Only one parameter among @bssid and
- *	@bss needs to be specified.
+ *	cfg80211_get_bss() (may be %NULL). But it is recommended to store the
+ *	bss from the connect_request and hold a reference to it and return
+ *	through this param to avoid a warning if the bss is expired during the
+ *	connection, esp. for those drivers implementing connect op.
+ *	Only one parameter among @bssid and @bss needs to be specified.
  * @req_ie: Association request IEs (may be %NULL)
  * @req_ie_len: Association request IEs length
  * @resp_ie: Association response IEs (may be %NULL)
@@ -6280,8 +6283,12 @@ void cfg80211_connect_done(struct net_device *dev,
  *
  * @dev: network device
  * @bssid: the BSSID of the AP
- * @bss: entry of bss to which STA got connected to, can be obtained
- *	through cfg80211_get_bss (may be %NULL)
+ * @bss: Entry of bss to which STA got connected to, can be obtained through
+ *	cfg80211_get_bss() (may be %NULL). But it is recommended to store the
+ *	bss from the connect_request and hold a reference to it and return
+ *	through this param to avoid a warning if the bss is expired during the
+ *	connection, esp. for those drivers implementing connect op.
+ *	Only one parameter among @bssid and @bss needs to be specified.
  * @req_ie: association request IEs (maybe be %NULL)
  * @req_ie_len: association request IEs length
  * @resp_ie: association response IEs (may be %NULL)
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 84d36ca7a7ab..ee8388fe4a92 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -531,6 +531,10 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
 void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
 		       struct wireless_dev *wdev);
 
+struct cfg80211_internal_bss *
+cfg80211_bss_update(struct cfg80211_registered_device *rdev,
+		    struct cfg80211_internal_bss *tmp,
+		    bool signal_valid, unsigned long ts);
 #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS
 #define CFG80211_DEV_WARN_ON(cond)	WARN_ON(cond)
 #else
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index c04f5451f89b..f347387f195a 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1092,17 +1092,17 @@ struct cfg80211_non_tx_bss {
 };
 
 /* Returned bss is reference counted and must be cleaned up appropriately. */
-static struct cfg80211_internal_bss *
+struct cfg80211_internal_bss *
 cfg80211_bss_update(struct cfg80211_registered_device *rdev,
 		    struct cfg80211_internal_bss *tmp,
-		    bool signal_valid)
+		    bool signal_valid, unsigned long ts)
 {
 	struct cfg80211_internal_bss *found = NULL;
 
 	if (WARN_ON(!tmp->pub.channel))
 		return NULL;
 
-	tmp->ts = jiffies;
+	tmp->ts = ts;
 
 	spin_lock_bh(&rdev->bss_lock);
 
@@ -1425,7 +1425,8 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
 
 	signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
 		wiphy->max_adj_channel_rssi_comp;
-	res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid);
+	res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid,
+				  jiffies);
 	if (!res)
 		return NULL;
 
@@ -1842,7 +1843,8 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
 
 	signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
 		wiphy->max_adj_channel_rssi_comp;
-	res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid);
+	res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid,
+				  jiffies);
 	if (!res)
 		return NULL;
 
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 7d34cb884840..7a6c38ddc65a 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -796,12 +796,36 @@ void cfg80211_connect_done(struct net_device *dev,
 	u8 *next;
 
 	if (params->bss) {
-		/* Make sure the bss entry provided by the driver is valid. */
 		struct cfg80211_internal_bss *ibss = bss_from_pub(params->bss);
 
-		if (WARN_ON(list_empty(&ibss->list))) {
-			cfg80211_put_bss(wdev->wiphy, params->bss);
-			return;
+		if (list_empty(&ibss->list)) {
+			struct cfg80211_bss *found = NULL, *tmp = params->bss;
+
+			found = cfg80211_get_bss(wdev->wiphy, NULL,
+						 params->bss->bssid,
+						 wdev->ssid, wdev->ssid_len,
+						 wdev->conn_bss_type,
+						 IEEE80211_PRIVACY_ANY);
+			if (found) {
+				/* The same BSS is already updated so use it
+				 * instead, as it has latest info.
+				 */
+				params->bss = found;
+			} else {
+				/* Update with BSS provided by driver, it will
+				 * be freshly added and ref cnted, we can free
+				 * the old one.
+				 *
+				 * signal_valid can be false, as we are not
+				 * expecting the BSS to be found.
+				 *
+				 * keep the old timestamp to avoid confusion
+				 */
+				cfg80211_bss_update(rdev, ibss, false,
+						    ibss->ts);
+			}
+
+			cfg80211_put_bss(wdev->wiphy, tmp);
 		}
 	}
 
-- 
cgit v1.2.3


From 4bfc0bb2c60e2f4cc8eb60f03cf8dfa72336272a Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Sat, 25 May 2019 09:37:39 -0700
Subject: bpf: decouple the lifetime of cgroup_bpf from cgroup itself

Currently the lifetime of bpf programs attached to a cgroup is bound
to the lifetime of the cgroup itself. It means that if a user
forgets (or intentionally avoids) to detach a bpf program before
removing the cgroup, it will stay attached up to the release of the
cgroup. Since the cgroup can stay in the dying state (the state
between being rmdir()'ed and being released) for a very long time, it
leads to a waste of memory. Also, it blocks a possibility to implement
the memcg-based memory accounting for bpf objects, because a circular
reference dependency will occur. Charged memory pages are pinning the
corresponding memory cgroup, and if the memory cgroup is pinning
the attached bpf program, nothing will be ever released.

A dying cgroup can not contain any processes, so the only chance for
an attached bpf program to be executed is a live socket associated
with the cgroup. So in order to release all bpf data early, let's
count associated sockets using a new percpu refcounter. On cgroup
removal the counter is transitioned to the atomic mode, and as soon
as it reaches 0, all bpf programs are detached.

Because cgroup_bpf_release() can block, it can't be called from
the percpu ref counter callback directly, so instead an asynchronous
work is scheduled.

The reference counter is not socket specific, and can be used for any
other types of programs, which can be executed from a cgroup-bpf hook
outside of the process context, should such a need arise in the future.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: jolsa@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup.h | 11 +++++++++--
 include/linux/cgroup.h     | 18 ++++++++++++++++++
 kernel/bpf/cgroup.c        | 41 +++++++++++++++++++++++++++++++++++++----
 kernel/cgroup/cgroup.c     | 11 ++++++++---
 4 files changed, 72 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index cb3c6b3b89c8..9f100fc422c3 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -6,6 +6,7 @@
 #include <linux/errno.h>
 #include <linux/jump_label.h>
 #include <linux/percpu.h>
+#include <linux/percpu-refcount.h>
 #include <linux/rbtree.h>
 #include <uapi/linux/bpf.h>
 
@@ -72,10 +73,16 @@ struct cgroup_bpf {
 
 	/* temp storage for effective prog array used by prog_attach/detach */
 	struct bpf_prog_array __rcu *inactive;
+
+	/* reference counter used to detach bpf programs after cgroup removal */
+	struct percpu_ref refcnt;
+
+	/* cgroup_bpf is released using a work queue */
+	struct work_struct release_work;
 };
 
-void cgroup_bpf_put(struct cgroup *cgrp);
 int cgroup_bpf_inherit(struct cgroup *cgrp);
+void cgroup_bpf_offline(struct cgroup *cgrp);
 
 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 flags);
@@ -283,8 +290,8 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
 
 struct bpf_prog;
 struct cgroup_bpf {};
-static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
+static inline void cgroup_bpf_offline(struct cgroup *cgrp) {}
 
 static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 					 enum bpf_prog_type ptype,
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c0077adeea83..49e8facf7c4a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -924,4 +924,22 @@ static inline bool cgroup_task_frozen(struct task_struct *task)
 
 #endif /* !CONFIG_CGROUPS */
 
+#ifdef CONFIG_CGROUP_BPF
+static inline void cgroup_bpf_get(struct cgroup *cgrp)
+{
+	percpu_ref_get(&cgrp->bpf.refcnt);
+}
+
+static inline void cgroup_bpf_put(struct cgroup *cgrp)
+{
+	percpu_ref_put(&cgrp->bpf.refcnt);
+}
+
+#else /* CONFIG_CGROUP_BPF */
+
+static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
+static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
+
+#endif /* CONFIG_CGROUP_BPF */
+
 #endif /* _LINUX_CGROUP_H */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index fcde0f7b2585..d995edbe816d 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -22,12 +22,21 @@
 DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 
+void cgroup_bpf_offline(struct cgroup *cgrp)
+{
+	cgroup_get(cgrp);
+	percpu_ref_kill(&cgrp->bpf.refcnt);
+}
+
 /**
- * cgroup_bpf_put() - put references of all bpf programs
- * @cgrp: the cgroup to modify
+ * cgroup_bpf_release() - put references of all bpf programs and
+ *                        release all cgroup bpf data
+ * @work: work structure embedded into the cgroup to modify
  */
-void cgroup_bpf_put(struct cgroup *cgrp)
+static void cgroup_bpf_release(struct work_struct *work)
 {
+	struct cgroup *cgrp = container_of(work, struct cgroup,
+					   bpf.release_work);
 	enum bpf_cgroup_storage_type stype;
 	unsigned int type;
 
@@ -47,6 +56,22 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 		}
 		bpf_prog_array_free(cgrp->bpf.effective[type]);
 	}
+
+	percpu_ref_exit(&cgrp->bpf.refcnt);
+	cgroup_put(cgrp);
+}
+
+/**
+ * cgroup_bpf_release_fn() - callback used to schedule releasing
+ *                           of bpf cgroup data
+ * @ref: percpu ref counter structure
+ */
+static void cgroup_bpf_release_fn(struct percpu_ref *ref)
+{
+	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
+
+	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
+	queue_work(system_wq, &cgrp->bpf.release_work);
 }
 
 /* count number of elements in the list.
@@ -167,7 +192,12 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
  */
 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
 	struct bpf_prog_array __rcu *arrays[NR] = {};
-	int i;
+	int ret, i;
+
+	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
+			      GFP_KERNEL);
+	if (ret)
+		return ret;
 
 	for (i = 0; i < NR; i++)
 		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -183,6 +213,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
 cleanup:
 	for (i = 0; i < NR; i++)
 		bpf_prog_array_free(arrays[i]);
+
+	percpu_ref_exit(&cgrp->bpf.refcnt);
+
 	return -ENOMEM;
 }
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 217cec4e22c6..ef9cfbfc82a9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4955,8 +4955,6 @@ static void css_release_work_fn(struct work_struct *work)
 		if (cgrp->kn)
 			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
 					 NULL);
-
-		cgroup_bpf_put(cgrp);
 	}
 
 	mutex_unlock(&cgroup_mutex);
@@ -5482,6 +5480,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 
 	cgroup1_check_for_release(parent);
 
+	cgroup_bpf_offline(cgrp);
+
 	/* put the base reference */
 	percpu_ref_kill(&cgrp->self.refcnt);
 
@@ -6221,6 +6221,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 		 * Don't use cgroup_get_live().
 		 */
 		cgroup_get(sock_cgroup_ptr(skcd));
+		cgroup_bpf_get(sock_cgroup_ptr(skcd));
 		return;
 	}
 
@@ -6232,6 +6233,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 		cset = task_css_set(current);
 		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
 			skcd->val = (unsigned long)cset->dfl_cgrp;
+			cgroup_bpf_get(cset->dfl_cgrp);
 			break;
 		}
 		cpu_relax();
@@ -6242,7 +6244,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 
 void cgroup_sk_free(struct sock_cgroup_data *skcd)
 {
-	cgroup_put(sock_cgroup_ptr(skcd));
+	struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+	cgroup_bpf_put(cgrp);
+	cgroup_put(cgrp);
 }
 
 #endif	/* CONFIG_SOCK_CGROUP_DATA */
-- 
cgit v1.2.3


From 6b73d19711d0989cbdcd19c61faa0f79a1a5e466 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 27 May 2019 16:56:47 -0700
Subject: inet: frags: uninline fqdir_init()

fqdir_init() is not fast path and is getting bigger.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h  | 20 +-------------------
 net/ipv4/inet_fragment.c | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 002f23c1a1a7..94092b1ef22e 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -109,25 +109,7 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f,
-			     struct net *net)
-{
-	struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
-	int res;
-
-	if (!fqdir)
-		return -ENOMEM;
-	fqdir->f = f;
-	fqdir->net = net;
-	res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
-	if (res < 0) {
-		kfree(fqdir);
-		return res;
-	}
-	*fqdirp = fqdir;
-	return 0;
-}
-
+int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net);
 void fqdir_exit(struct fqdir *fqdir);
 
 void inet_frag_kill(struct inet_frag_queue *q);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 6ca9523374da..7c07aae969e6 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -154,6 +154,25 @@ static void fqdir_rwork_fn(struct work_struct *work)
 	kfree(fqdir);
 }
 
+int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
+{
+	struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
+	int res;
+
+	if (!fqdir)
+		return -ENOMEM;
+	fqdir->f = f;
+	fqdir->net = net;
+	res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
+	if (res < 0) {
+		kfree(fqdir);
+		return res;
+	}
+	*fqdirp = fqdir;
+	return 0;
+}
+EXPORT_SYMBOL(fqdir_init);
+
 void fqdir_exit(struct fqdir *fqdir)
 {
 	fqdir->high_thresh = 0; /* prevent creation of new frags */
-- 
cgit v1.2.3


From dc93f46bc4e00899eaf4579962cfac8cf2f9966d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 27 May 2019 16:56:49 -0700
Subject: inet: frags: fix use-after-free read in inet_frag_destroy_rcu

As caught by syzbot [1], the rcu grace period that is respected
before fqdir_rwork_fn() proceeds and frees fqdir is not enough
to prevent inet_frag_destroy_rcu() being run after the freeing.

We need a proper rcu_barrier() synchronization to replace
the one we had in inet_frags_fini()

We also have to fix a potential problem at module removal :
inet_frags_fini() needs to make sure that all queued work queues
(fqdir_rwork_fn) have completed, otherwise we might
call kmem_cache_destroy() too soon and get another use-after-free.

[1]
BUG: KASAN: use-after-free in inet_frag_destroy_rcu+0xd9/0xe0 net/ipv4/inet_fragment.c:201
Read of size 8 at addr ffff88806ed47a18 by task swapper/1/0

CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.2.0-rc1+ #2
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188
 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317
 kasan_report+0x12/0x20 mm/kasan/common.c:614
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
 inet_frag_destroy_rcu+0xd9/0xe0 net/ipv4/inet_fragment.c:201
 __rcu_reclaim kernel/rcu/rcu.h:222 [inline]
 rcu_do_batch kernel/rcu/tree.c:2092 [inline]
 invoke_rcu_callbacks kernel/rcu/tree.c:2310 [inline]
 rcu_core+0xba5/0x1500 kernel/rcu/tree.c:2291
 __do_softirq+0x25c/0x94c kernel/softirq.c:293
 invoke_softirq kernel/softirq.c:374 [inline]
 irq_exit+0x180/0x1d0 kernel/softirq.c:414
 exiting_irq arch/x86/include/asm/apic.h:536 [inline]
 smp_apic_timer_interrupt+0x13b/0x550 arch/x86/kernel/apic/apic.c:1068
 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:806
 </IRQ>
RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61
Code: ff ff 48 89 df e8 f2 95 8c fa eb 82 e9 07 00 00 00 0f 00 2d e4 45 4b 00 f4 c3 66 90 e9 07 00 00 00 0f 00 2d d4 45 4b 00 fb f4 <c3> 90 55 48 89 e5 41 57 41 56 41 55 41 54 53 e8 8e 18 42 fa e8 99
RSP: 0018:ffff8880a98e7d78 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
RAX: 1ffffffff1164e11 RBX: ffff8880a98d4340 RCX: 0000000000000000
RDX: dffffc0000000000 RSI: 0000000000000006 RDI: ffff8880a98d4bbc
RBP: ffff8880a98e7da8 R08: ffff8880a98d4340 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
R13: ffffffff88b27078 R14: 0000000000000001 R15: 0000000000000000
 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571
 default_idle_call+0x36/0x90 kernel/sched/idle.c:94
 cpuidle_idle_call kernel/sched/idle.c:154 [inline]
 do_idle+0x377/0x560 kernel/sched/idle.c:263
 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:354
 start_secondary+0x34e/0x4c0 arch/x86/kernel/smpboot.c:267
 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:243

Allocated by task 8877:
 save_stack+0x23/0x90 mm/kasan/common.c:71
 set_track mm/kasan/common.c:79 [inline]
 __kasan_kmalloc mm/kasan/common.c:489 [inline]
 __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:462
 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:503
 kmem_cache_alloc_trace+0x151/0x750 mm/slab.c:3555
 kmalloc include/linux/slab.h:547 [inline]
 kzalloc include/linux/slab.h:742 [inline]
 fqdir_init include/net/inet_frag.h:115 [inline]
 ipv6_frags_init_net+0x48/0x460 net/ipv6/reassembly.c:513
 ops_init+0xb3/0x410 net/core/net_namespace.c:130
 setup_net+0x2d3/0x740 net/core/net_namespace.c:316
 copy_net_ns+0x1df/0x340 net/core/net_namespace.c:439
 create_new_namespaces+0x400/0x7b0 kernel/nsproxy.c:107
 unshare_nsproxy_namespaces+0xc2/0x200 kernel/nsproxy.c:206
 ksys_unshare+0x440/0x980 kernel/fork.c:2692
 __do_sys_unshare kernel/fork.c:2760 [inline]
 __se_sys_unshare kernel/fork.c:2758 [inline]
 __x64_sys_unshare+0x31/0x40 kernel/fork.c:2758
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 17:
 save_stack+0x23/0x90 mm/kasan/common.c:71
 set_track mm/kasan/common.c:79 [inline]
 __kasan_slab_free+0x102/0x150 mm/kasan/common.c:451
 kasan_slab_free+0xe/0x10 mm/kasan/common.c:459
 __cache_free mm/slab.c:3432 [inline]
 kfree+0xcf/0x220 mm/slab.c:3755
 fqdir_rwork_fn+0x33/0x40 net/ipv4/inet_fragment.c:154
 process_one_work+0x989/0x1790 kernel/workqueue.c:2269
 worker_thread+0x98/0xe40 kernel/workqueue.c:2415
 kthread+0x354/0x420 kernel/kthread.c:255
 ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352

The buggy address belongs to the object at ffff88806ed47a00
 which belongs to the cache kmalloc-512 of size 512
The buggy address is located 24 bytes inside of
 512-byte region [ffff88806ed47a00, ffff88806ed47c00)
The buggy address belongs to the page:
page:ffffea0001bb51c0 refcount:1 mapcount:0 mapping:ffff8880aa400940 index:0x0
flags: 0x1fffc0000000200(slab)
raw: 01fffc0000000200 ffffea000282a788 ffffea0001bb53c8 ffff8880aa400940
raw: 0000000000000000 ffff88806ed47000 0000000100000006 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff88806ed47900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff88806ed47980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff88806ed47a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                            ^
 ffff88806ed47a80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff88806ed47b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

Fixes: 3c8fc8782044 ("inet: frags: rework rhashtable dismantle")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h  |  3 +++
 net/ipv4/inet_fragment.c | 20 ++++++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 94092b1ef22e..e91b79ad4e4a 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -3,6 +3,7 @@
 #define __NET_FRAG_H__
 
 #include <linux/rhashtable-types.h>
+#include <linux/completion.h>
 
 /* Per netns frag queues directory */
 struct fqdir {
@@ -104,6 +105,8 @@ struct inet_frags {
 	struct kmem_cache	*frags_cachep;
 	const char		*frags_cache_name;
 	struct rhashtable_params rhash_params;
+	refcount_t		refcnt;
+	struct completion	completion;
 };
 
 int inet_frags_init(struct inet_frags *);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 7c07aae969e6..2b816f1ebbb4 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -110,14 +110,18 @@ int inet_frags_init(struct inet_frags *f)
 	if (!f->frags_cachep)
 		return -ENOMEM;
 
+	refcount_set(&f->refcnt, 1);
+	init_completion(&f->completion);
 	return 0;
 }
 EXPORT_SYMBOL(inet_frags_init);
 
 void inet_frags_fini(struct inet_frags *f)
 {
-	/* We must wait that all inet_frag_destroy_rcu() have completed. */
-	rcu_barrier();
+	if (refcount_dec_and_test(&f->refcnt))
+		complete(&f->completion);
+
+	wait_for_completion(&f->completion);
 
 	kmem_cache_destroy(f->frags_cachep);
 	f->frags_cachep = NULL;
@@ -149,8 +153,19 @@ static void fqdir_rwork_fn(struct work_struct *work)
 {
 	struct fqdir *fqdir = container_of(to_rcu_work(work),
 					   struct fqdir, destroy_rwork);
+	struct inet_frags *f = fqdir->f;
 
 	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
+
+	/* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
+	 * have completed, since they need to dereference fqdir.
+	 * Would it not be nice to have kfree_rcu_barrier() ? :)
+	 */
+	rcu_barrier();
+
+	if (refcount_dec_and_test(&f->refcnt))
+		complete(&f->completion);
+
 	kfree(fqdir);
 }
 
@@ -168,6 +183,7 @@ int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
 		kfree(fqdir);
 		return res;
 	}
+	refcount_inc(&f->refcnt);
 	*fqdirp = fqdir;
 	return 0;
 }
-- 
cgit v1.2.3


From 65ee00a9409f751188a8cdc0988167858eb4a536 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 24 May 2019 14:43:03 -0700
Subject: net: nexthop uapi

New UAPI for nexthops as standalone objects:
- defines netlink ancillary header, struct nhmsg
- RTM commands for nexthop objects, RTM_*NEXTHOP,
- RTNLGRP for nexthop notifications, RTNLGRP_NEXTHOP,
- Attributes for creating nexthops, NHA_*
- Attribute for route specs to specify a nexthop by id, RTA_NH_ID.

The nexthop attributes and semantics follow the route and RTA ones for
device, gateway and lwt encap. Unique to nexthop objects are a blackhole
and a group which contains references to other nexthop objects. With the
exception of blackhole and group, nexthop objects MUST contain a device.
Gateway and encap are optional. Nexthop groups can only reference other
pre-existing nexthops by id. If the NHA_ID attribute is present that id
is used for the nexthop. If not specified, one is auto assigned.

Dump requests can include attributes:
- NHA_GROUPS to return only nexthop groups,
- NHA_MASTER to limit dumps to nexthops with devices enslaved to the
  given master (e.g., VRF)
- NHA_OIF to limit dumps to nexthops using given device

nlmsg_route_perms in selinux code is updated for the new RTM commands.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/nexthop.h   | 56 ++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/rtnetlink.h | 10 ++++++++
 security/selinux/nlmsgtab.c    |  5 +++-
 3 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/nexthop.h

(limited to 'include')

diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h
new file mode 100644
index 000000000000..7b61867e9848
--- /dev/null
+++ b/include/uapi/linux/nexthop.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_NEXTHOP_H
+#define _UAPI_LINUX_NEXTHOP_H
+
+#include <linux/types.h>
+
+struct nhmsg {
+	unsigned char	nh_family;
+	unsigned char	nh_scope;     /* return only */
+	unsigned char	nh_protocol;  /* Routing protocol that installed nh */
+	unsigned char	resvd;
+	unsigned int	nh_flags;     /* RTNH_F flags */
+};
+
+/* entry in a nexthop group */
+struct nexthop_grp {
+	__u32	id;	  /* nexthop id - must exist */
+	__u8	weight;   /* weight of this nexthop */
+	__u8	resvd1;
+	__u16	resvd2;
+};
+
+enum {
+	NEXTHOP_GRP_TYPE_MPATH,  /* default type if not specified */
+	__NEXTHOP_GRP_TYPE_MAX,
+};
+
+#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1)
+
+enum {
+	NHA_UNSPEC,
+	NHA_ID,		/* u32; id for nexthop. id == 0 means auto-assign */
+
+	NHA_GROUP,	/* array of nexthop_grp */
+	NHA_GROUP_TYPE,	/* u16 one of NEXTHOP_GRP_TYPE */
+	/* if NHA_GROUP attribute is added, no other attributes can be set */
+
+	NHA_BLACKHOLE,	/* flag; nexthop used to blackhole packets */
+	/* if NHA_BLACKHOLE is added, OIF, GATEWAY, ENCAP can not be set */
+
+	NHA_OIF,	/* u32; nexthop device */
+	NHA_GATEWAY,	/* be32 (IPv4) or in6_addr (IPv6) gw address */
+	NHA_ENCAP_TYPE, /* u16; lwt encap type */
+	NHA_ENCAP,	/* lwt encap data */
+
+	/* NHA_OIF can be appended to dump request to return only
+	 * nexthops using given device
+	 */
+	NHA_GROUPS,	/* flag; only return nexthop groups in dump */
+	NHA_MASTER,	/* u32;  only return nexthops with given master dev */
+
+	__NHA_MAX,
+};
+
+#define NHA_MAX	(__NHA_MAX - 1)
+#endif
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 46399367627f..ce2a623abb75 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -157,6 +157,13 @@ enum {
 	RTM_GETCHAIN,
 #define RTM_GETCHAIN RTM_GETCHAIN
 
+	RTM_NEWNEXTHOP = 104,
+#define RTM_NEWNEXTHOP	RTM_NEWNEXTHOP
+	RTM_DELNEXTHOP,
+#define RTM_DELNEXTHOP	RTM_DELNEXTHOP
+	RTM_GETNEXTHOP,
+#define RTM_GETNEXTHOP	RTM_GETNEXTHOP
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
@@ -342,6 +349,7 @@ enum rtattr_type_t {
 	RTA_IP_PROTO,
 	RTA_SPORT,
 	RTA_DPORT,
+	RTA_NH_ID,
 	__RTA_MAX
 };
 
@@ -704,6 +712,8 @@ enum rtnetlink_groups {
 #define RTNLGRP_IPV4_MROUTE_R	RTNLGRP_IPV4_MROUTE_R
 	RTNLGRP_IPV6_MROUTE_R,
 #define RTNLGRP_IPV6_MROUTE_R	RTNLGRP_IPV6_MROUTE_R
+	RTNLGRP_NEXTHOP,
+#define RTNLGRP_NEXTHOP		RTNLGRP_NEXTHOP
 	__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index 9cec81209617..2c75d823d8e2 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -83,6 +83,9 @@ static const struct nlmsg_perm nlmsg_route_perms[] =
 	{ RTM_NEWCHAIN,		NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
 	{ RTM_DELCHAIN,		NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
 	{ RTM_GETCHAIN,		NETLINK_ROUTE_SOCKET__NLMSG_READ  },
+	{ RTM_NEWNEXTHOP,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+	{ RTM_DELNEXTHOP,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+	{ RTM_GETNEXTHOP,	NETLINK_ROUTE_SOCKET__NLMSG_READ  },
 };
 
 static const struct nlmsg_perm nlmsg_tcpdiag_perms[] =
@@ -166,7 +169,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
 		 * structures at the top of this file with the new mappings
 		 * before updating the BUILD_BUG_ON() macro!
 		 */
-		BUILD_BUG_ON(RTM_MAX != (RTM_NEWCHAIN + 3));
+		BUILD_BUG_ON(RTM_MAX != (RTM_NEWNEXTHOP + 3));
 		err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms,
 				 sizeof(nlmsg_route_perms));
 		break;
-- 
cgit v1.2.3


From ab84be7e54fc3d9b248285f1a14067558d858819 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 24 May 2019 14:43:04 -0700
Subject: net: Initial nexthop code

Barebones start point for nexthops. Implementation for RTM commands,
notifications, management of rbtree for holding nexthops by id, and
kernel side data structures for nexthops and nexthop config.

Nexthops are maintained in an rbtree sorted by id. Similar to routes,
nexthops are configured per namespace using netns_nexthop struct added
to struct net.

Nexthop notifications are sent when a nexthop is added or deleted,
but NOT if the delete is due to a device event or network namespace
teardown (which also involves device events). Applications are
expected to use the device down event to flush nexthops and any
routes used by the nexthops.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h |   2 +
 include/net/netns/nexthop.h |  18 ++
 include/net/nexthop.h       |  88 ++++++
 net/ipv4/Makefile           |   2 +-
 net/ipv4/nexthop.c          | 722 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 831 insertions(+), 1 deletion(-)
 create mode 100644 include/net/netns/nexthop.h
 create mode 100644 include/net/nexthop.h
 create mode 100644 net/ipv4/nexthop.c

(limited to 'include')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 12689ddfc24c..abb4f92456e1 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -19,6 +19,7 @@
 #include <net/netns/packet.h>
 #include <net/netns/ipv4.h>
 #include <net/netns/ipv6.h>
+#include <net/netns/nexthop.h>
 #include <net/netns/ieee802154_6lowpan.h>
 #include <net/netns/sctp.h>
 #include <net/netns/dccp.h>
@@ -108,6 +109,7 @@ struct net {
 	struct netns_mib	mib;
 	struct netns_packet	packet;
 	struct netns_unix	unx;
+	struct netns_nexthop	nexthop;
 	struct netns_ipv4	ipv4;
 #if IS_ENABLED(CONFIG_IPV6)
 	struct netns_ipv6	ipv6;
diff --git a/include/net/netns/nexthop.h b/include/net/netns/nexthop.h
new file mode 100644
index 000000000000..c712ee5eebd9
--- /dev/null
+++ b/include/net/netns/nexthop.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * nexthops in net namespaces
+ */
+
+#ifndef __NETNS_NEXTHOP_H__
+#define __NETNS_NEXTHOP_H__
+
+#include <linux/rbtree.h>
+
+struct netns_nexthop {
+	struct rb_root		rb_root;	/* tree of nexthops by id */
+	struct hlist_head	*devhash;	/* nexthops by device */
+
+	unsigned int		seq;		/* protected by rtnl_mutex */
+	u32			last_id_allocated;
+};
+#endif
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
new file mode 100644
index 000000000000..18e1f512f866
--- /dev/null
+++ b/include/net/nexthop.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Generic nexthop implementation
+ *
+ * Copyright (c) 2017-19 Cumulus Networks
+ * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
+ */
+
+#ifndef __LINUX_NEXTHOP_H
+#define __LINUX_NEXTHOP_H
+
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <net/ip_fib.h>
+#include <net/netlink.h>
+
+#define NEXTHOP_VALID_USER_FLAGS RTNH_F_ONLINK
+
+struct nexthop;
+
+struct nh_config {
+	u32		nh_id;
+
+	u8		nh_family;
+	u8		nh_protocol;
+	u8		nh_blackhole;
+	u32		nh_flags;
+
+	int		nh_ifindex;
+	struct net_device *dev;
+
+	u32		nlflags;
+	struct nl_info	nlinfo;
+};
+
+struct nh_info {
+	struct hlist_node	dev_hash;    /* entry on netns devhash */
+	struct nexthop		*nh_parent;
+
+	u8			family;
+	bool			reject_nh;
+
+	union {
+		struct fib_nh_common	fib_nhc;
+	};
+};
+
+struct nexthop {
+	struct rb_node		rb_node;    /* entry on netns rbtree */
+	struct net		*net;
+
+	u32			id;
+
+	u8			protocol;   /* app managing this nh */
+	u8			nh_flags;
+
+	refcount_t		refcnt;
+	struct rcu_head		rcu;
+
+	union {
+		struct nh_info	__rcu *nh_info;
+	};
+};
+
+/* caller is holding rcu or rtnl; no reference taken to nexthop */
+struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
+void nexthop_free_rcu(struct rcu_head *head);
+
+static inline bool nexthop_get(struct nexthop *nh)
+{
+	return refcount_inc_not_zero(&nh->refcnt);
+}
+
+static inline void nexthop_put(struct nexthop *nh)
+{
+	if (refcount_dec_and_test(&nh->refcnt))
+		call_rcu(&nh->rcu, nexthop_free_rcu);
+}
+
+/* called with rcu lock */
+static inline bool nexthop_is_blackhole(const struct nexthop *nh)
+{
+	const struct nh_info *nhi;
+
+	nhi = rcu_dereference(nh->nh_info);
+	return nhi->reject_nh;
+}
+#endif
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 000a61994c8f..d57ecfaf89d4 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -14,7 +14,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
 	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
-	     metrics.o netlink.o
+	     metrics.o netlink.o nexthop.o
 
 obj-$(CONFIG_BPFILTER) += bpfilter/
 
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
new file mode 100644
index 000000000000..ec0ccf2ed873
--- /dev/null
+++ b/net/ipv4/nexthop.c
@@ -0,0 +1,722 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Generic nexthop implementation
+ *
+ * Copyright (c) 2017-19 Cumulus Networks
+ * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
+ */
+
+#include <linux/nexthop.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/nexthop.h>
+#include <net/sock.h>
+
+static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
+	[NHA_UNSPEC]		= { .strict_start_type = NHA_UNSPEC + 1 },
+	[NHA_ID]		= { .type = NLA_U32 },
+	[NHA_GROUP]		= { .type = NLA_BINARY },
+	[NHA_GROUP_TYPE]	= { .type = NLA_U16 },
+	[NHA_BLACKHOLE]		= { .type = NLA_FLAG },
+	[NHA_OIF]		= { .type = NLA_U32 },
+	[NHA_GATEWAY]		= { .type = NLA_BINARY },
+	[NHA_ENCAP_TYPE]	= { .type = NLA_U16 },
+	[NHA_ENCAP]		= { .type = NLA_NESTED },
+	[NHA_GROUPS]		= { .type = NLA_FLAG },
+	[NHA_MASTER]		= { .type = NLA_U32 },
+};
+
+void nexthop_free_rcu(struct rcu_head *head)
+{
+	struct nexthop *nh = container_of(head, struct nexthop, rcu);
+	struct nh_info *nhi;
+
+	nhi = rcu_dereference_raw(nh->nh_info);
+	kfree(nhi);
+
+	kfree(nh);
+}
+EXPORT_SYMBOL_GPL(nexthop_free_rcu);
+
+static struct nexthop *nexthop_alloc(void)
+{
+	struct nexthop *nh;
+
+	nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
+	return nh;
+}
+
+static void nh_base_seq_inc(struct net *net)
+{
+	while (++net->nexthop.seq == 0)
+		;
+}
+
+/* no reference taken; rcu lock or rtnl must be held */
+struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
+{
+	struct rb_node **pp, *parent = NULL, *next;
+
+	pp = &net->nexthop.rb_root.rb_node;
+	while (1) {
+		struct nexthop *nh;
+
+		next = rcu_dereference_raw(*pp);
+		if (!next)
+			break;
+		parent = next;
+
+		nh = rb_entry(parent, struct nexthop, rb_node);
+		if (id < nh->id)
+			pp = &next->rb_left;
+		else if (id > nh->id)
+			pp = &next->rb_right;
+		else
+			return nh;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_find_by_id);
+
+/* used for auto id allocation; called with rtnl held */
+static u32 nh_find_unused_id(struct net *net)
+{
+	u32 id_start = net->nexthop.last_id_allocated;
+
+	while (1) {
+		net->nexthop.last_id_allocated++;
+		if (net->nexthop.last_id_allocated == id_start)
+			break;
+
+		if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
+			return net->nexthop.last_id_allocated;
+	}
+	return 0;
+}
+
+static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
+			int event, u32 portid, u32 seq, unsigned int nlflags)
+{
+	struct nlmsghdr *nlh;
+	struct nh_info *nhi;
+	struct nhmsg *nhm;
+
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	nhm = nlmsg_data(nlh);
+	nhm->nh_family = AF_UNSPEC;
+	nhm->nh_flags = nh->nh_flags;
+	nhm->nh_protocol = nh->protocol;
+	nhm->nh_scope = 0;
+	nhm->resvd = 0;
+
+	if (nla_put_u32(skb, NHA_ID, nh->id))
+		goto nla_put_failure;
+
+	nhi = rtnl_dereference(nh->nh_info);
+	nhm->nh_family = nhi->family;
+	if (nhi->reject_nh) {
+		if (nla_put_flag(skb, NHA_BLACKHOLE))
+			goto nla_put_failure;
+		goto out;
+	}
+
+out:
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static size_t nh_nlmsg_size(struct nexthop *nh)
+{
+	size_t sz = nla_total_size(4);    /* NHA_ID */
+
+	/* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
+	 * are mutually exclusive
+	 */
+	sz += nla_total_size(4);  /* NHA_OIF */
+
+	return sz;
+}
+
+static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
+{
+	unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
+	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
+	if (!skb)
+		goto errout;
+
+	err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in nh_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
+		    info->nlh, gfp_any());
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
+}
+
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+			   bool skip_fib, struct nl_info *nlinfo)
+{
+	/* remove from the tree */
+	rb_erase(&nh->rb_node, &net->nexthop.rb_root);
+
+	if (nlinfo)
+		nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
+
+	nh_base_seq_inc(net);
+
+	nexthop_put(nh);
+}
+
+static int replace_nexthop(struct net *net, struct nexthop *old,
+			   struct nexthop *new, struct netlink_ext_ack *extack)
+{
+	return -EEXIST;
+}
+
+/* called with rtnl_lock held */
+static int insert_nexthop(struct net *net, struct nexthop *new_nh,
+			  struct nh_config *cfg, struct netlink_ext_ack *extack)
+{
+	struct rb_node **pp, *parent = NULL, *next;
+	struct rb_root *root = &net->nexthop.rb_root;
+	bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
+	bool create = !!(cfg->nlflags & NLM_F_CREATE);
+	u32 new_id = new_nh->id;
+	int rc = -EEXIST;
+
+	pp = &root->rb_node;
+	while (1) {
+		struct nexthop *nh;
+
+		next = rtnl_dereference(*pp);
+		if (!next)
+			break;
+
+		parent = next;
+
+		nh = rb_entry(parent, struct nexthop, rb_node);
+		if (new_id < nh->id) {
+			pp = &next->rb_left;
+		} else if (new_id > nh->id) {
+			pp = &next->rb_right;
+		} else if (replace) {
+			rc = replace_nexthop(net, nh, new_nh, extack);
+			if (!rc)
+				new_nh = nh; /* send notification with old nh */
+			goto out;
+		} else {
+			/* id already exists and not a replace */
+			goto out;
+		}
+	}
+
+	if (replace && !create) {
+		NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
+		rc = -ENOENT;
+		goto out;
+	}
+
+	rb_link_node_rcu(&new_nh->rb_node, parent, pp);
+	rb_insert_color(&new_nh->rb_node, root);
+	rc = 0;
+out:
+	if (!rc) {
+		nh_base_seq_inc(net);
+		nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
+	}
+
+	return rc;
+}
+
+/* rtnl; called when net namespace is deleted */
+static void flush_all_nexthops(struct net *net)
+{
+	struct rb_root *root = &net->nexthop.rb_root;
+	struct rb_node *node;
+	struct nexthop *nh;
+
+	while ((node = rb_first(root))) {
+		nh = rb_entry(node, struct nexthop, rb_node);
+		remove_nexthop(net, nh, false, NULL);
+		cond_resched();
+	}
+}
+
+static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
+				      struct netlink_ext_ack *extack)
+{
+	struct nh_info *nhi;
+	struct nexthop *nh;
+	int err = 0;
+
+	nh = nexthop_alloc();
+	if (!nh)
+		return ERR_PTR(-ENOMEM);
+
+	nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
+	if (!nhi) {
+		kfree(nh);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nh->nh_flags = cfg->nh_flags;
+	nh->net = net;
+
+	nhi->nh_parent = nh;
+	nhi->family = cfg->nh_family;
+	nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
+
+	if (cfg->nh_blackhole) {
+		nhi->reject_nh = 1;
+		cfg->nh_ifindex = net->loopback_dev->ifindex;
+	}
+
+	if (err) {
+		kfree(nhi);
+		kfree(nh);
+		return ERR_PTR(err);
+	}
+
+	rcu_assign_pointer(nh->nh_info, nhi);
+
+	return nh;
+}
+
+/* called with rtnl lock held */
+static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
+				   struct netlink_ext_ack *extack)
+{
+	struct nexthop *nh;
+	int err;
+
+	if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
+		NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!cfg->nh_id) {
+		cfg->nh_id = nh_find_unused_id(net);
+		if (!cfg->nh_id) {
+			NL_SET_ERR_MSG(extack, "No unused id");
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	nh = nexthop_create(net, cfg, extack);
+	if (IS_ERR(nh))
+		return nh;
+
+	refcount_set(&nh->refcnt, 1);
+	nh->id = cfg->nh_id;
+	nh->protocol = cfg->nh_protocol;
+	nh->net = net;
+
+	err = insert_nexthop(net, nh, cfg, extack);
+	if (err) {
+		nexthop_put(nh);
+		nh = ERR_PTR(err);
+	}
+
+	return nh;
+}
+
+static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
+			    struct nlmsghdr *nlh, struct nh_config *cfg,
+			    struct netlink_ext_ack *extack)
+{
+	struct nhmsg *nhm = nlmsg_data(nlh);
+	struct nlattr *tb[NHA_MAX + 1];
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
+			  extack);
+	if (err < 0)
+		return err;
+
+	err = -EINVAL;
+	if (nhm->resvd || nhm->nh_scope) {
+		NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
+		goto out;
+	}
+	if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
+		NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
+		goto out;
+	}
+
+	switch (nhm->nh_family) {
+	default:
+		NL_SET_ERR_MSG(extack, "Invalid address family");
+		goto out;
+	}
+
+	if (tb[NHA_GROUPS] || tb[NHA_MASTER]) {
+		NL_SET_ERR_MSG(extack, "Invalid attributes in request");
+		goto out;
+	}
+
+	memset(cfg, 0, sizeof(*cfg));
+	cfg->nlflags = nlh->nlmsg_flags;
+	cfg->nlinfo.portid = NETLINK_CB(skb).portid;
+	cfg->nlinfo.nlh = nlh;
+	cfg->nlinfo.nl_net = net;
+
+	cfg->nh_family = nhm->nh_family;
+	cfg->nh_protocol = nhm->nh_protocol;
+	cfg->nh_flags = nhm->nh_flags;
+
+	if (tb[NHA_ID])
+		cfg->nh_id = nla_get_u32(tb[NHA_ID]);
+
+	if (tb[NHA_BLACKHOLE]) {
+		if (tb[NHA_GATEWAY] || tb[NHA_OIF]) {
+			NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif");
+			goto out;
+		}
+
+		cfg->nh_blackhole = 1;
+		err = 0;
+		goto out;
+	}
+
+	if (!tb[NHA_OIF]) {
+		NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops");
+		goto out;
+	}
+
+	cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
+	if (cfg->nh_ifindex)
+		cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
+
+	if (!cfg->dev) {
+		NL_SET_ERR_MSG(extack, "Invalid device index");
+		goto out;
+	} else if (!(cfg->dev->flags & IFF_UP)) {
+		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+		err = -ENETDOWN;
+		goto out;
+	} else if (!netif_carrier_ok(cfg->dev)) {
+		NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
+		err = -ENETDOWN;
+		goto out;
+	}
+
+	err = 0;
+out:
+	return err;
+}
+
+/* rtnl */
+static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nh_config cfg;
+	struct nexthop *nh;
+	int err;
+
+	err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
+	if (!err) {
+		nh = nexthop_add(net, &cfg, extack);
+		if (IS_ERR(nh))
+			err = PTR_ERR(nh);
+	}
+
+	return err;
+}
+
+static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id,
+				struct netlink_ext_ack *extack)
+{
+	struct nhmsg *nhm = nlmsg_data(nlh);
+	struct nlattr *tb[NHA_MAX + 1];
+	int err, i;
+
+	err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
+			  extack);
+	if (err < 0)
+		return err;
+
+	err = -EINVAL;
+	for (i = 0; i < __NHA_MAX; ++i) {
+		if (!tb[i])
+			continue;
+
+		switch (i) {
+		case NHA_ID:
+			break;
+		default:
+			NL_SET_ERR_MSG_ATTR(extack, tb[i],
+					    "Unexpected attribute in request");
+			goto out;
+		}
+	}
+	if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
+		NL_SET_ERR_MSG(extack, "Invalid values in header");
+		goto out;
+	}
+
+	if (!tb[NHA_ID]) {
+		NL_SET_ERR_MSG(extack, "Nexthop id is missing");
+		goto out;
+	}
+
+	*id = nla_get_u32(tb[NHA_ID]);
+	if (!(*id))
+		NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+	else
+		err = 0;
+out:
+	return err;
+}
+
+/* rtnl */
+static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nl_info nlinfo = {
+		.nlh = nlh,
+		.nl_net = net,
+		.portid = NETLINK_CB(skb).portid,
+	};
+	struct nexthop *nh;
+	int err;
+	u32 id;
+
+	err = nh_valid_get_del_req(nlh, &id, extack);
+	if (err)
+		return err;
+
+	nh = nexthop_find_by_id(net, id);
+	if (!nh)
+		return -ENOENT;
+
+	remove_nexthop(net, nh, false, &nlinfo);
+
+	return 0;
+}
+
+/* rtnl */
+static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct sk_buff *skb = NULL;
+	struct nexthop *nh;
+	int err;
+	u32 id;
+
+	err = nh_valid_get_del_req(nlh, &id, extack);
+	if (err)
+		return err;
+
+	err = -ENOBUFS;
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		goto out;
+
+	err = -ENOENT;
+	nh = nexthop_find_by_id(net, id);
+	if (!nh)
+		goto errout_free;
+
+	err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
+			   nlh->nlmsg_seq, 0);
+	if (err < 0) {
+		WARN_ON(err == -EMSGSIZE);
+		goto errout_free;
+	}
+
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+out:
+	return err;
+errout_free:
+	kfree_skb(skb);
+	goto out;
+}
+
+static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
+			     int master_idx, u8 family)
+{
+	const struct net_device *dev;
+	const struct nh_info *nhi;
+
+	if (!dev_idx && !master_idx && !family)
+		return false;
+
+	nhi = rtnl_dereference(nh->nh_info);
+	if (family && nhi->family != family)
+		return true;
+
+	dev = nhi->fib_nhc.nhc_dev;
+	if (dev_idx && (!dev || dev->ifindex != dev_idx))
+		return true;
+
+	if (master_idx) {
+		struct net_device *master;
+
+		if (!dev)
+			return true;
+
+		master = netdev_master_upper_dev_get((struct net_device *)dev);
+		if (!master || master->ifindex != master_idx)
+			return true;
+	}
+
+	return false;
+}
+
+static int nh_valid_dump_req(const struct nlmsghdr *nlh,
+			     int *dev_idx, int *master_idx,
+			     struct netlink_callback *cb)
+{
+	struct netlink_ext_ack *extack = cb->extack;
+	struct nlattr *tb[NHA_MAX + 1];
+	struct nhmsg *nhm;
+	int err, i;
+	u32 idx;
+
+	err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
+			  NULL);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= NHA_MAX; ++i) {
+		if (!tb[i])
+			continue;
+
+		switch (i) {
+		case NHA_OIF:
+			idx = nla_get_u32(tb[i]);
+			if (idx > INT_MAX) {
+				NL_SET_ERR_MSG(extack, "Invalid device index");
+				return -EINVAL;
+			}
+			*dev_idx = idx;
+			break;
+		case NHA_MASTER:
+			idx = nla_get_u32(tb[i]);
+			if (idx > INT_MAX) {
+				NL_SET_ERR_MSG(extack, "Invalid master device index");
+				return -EINVAL;
+			}
+			*master_idx = idx;
+			break;
+		default:
+			NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+			return -EINVAL;
+		}
+	}
+
+	nhm = nlmsg_data(nlh);
+	if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
+		NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* rtnl */
+static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nhmsg *nhm = nlmsg_data(cb->nlh);
+	int dev_filter_idx = 0, master_idx = 0;
+	struct net *net = sock_net(skb->sk);
+	struct rb_root *root = &net->nexthop.rb_root;
+	struct rb_node *node;
+	int idx = 0, s_idx;
+	int err;
+
+	err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, cb);
+	if (err < 0)
+		return err;
+
+	s_idx = cb->args[0];
+	for (node = rb_first(root); node; node = rb_next(node)) {
+		struct nexthop *nh;
+
+		if (idx < s_idx)
+			goto cont;
+
+		nh = rb_entry(node, struct nexthop, rb_node);
+		if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
+				     nhm->nh_family))
+			goto cont;
+
+		err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
+				   NETLINK_CB(cb->skb).portid,
+				   cb->nlh->nlmsg_seq, NLM_F_MULTI);
+		if (err < 0) {
+			if (likely(skb->len))
+				goto out;
+
+			goto out_err;
+		}
+cont:
+		idx++;
+	}
+
+out:
+	err = skb->len;
+out_err:
+	cb->args[0] = idx;
+	cb->seq = net->nexthop.seq;
+	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+
+	return err;
+}
+
+static void __net_exit nexthop_net_exit(struct net *net)
+{
+	rtnl_lock();
+	flush_all_nexthops(net);
+	rtnl_unlock();
+}
+
+static int __net_init nexthop_net_init(struct net *net)
+{
+	net->nexthop.rb_root = RB_ROOT;
+
+	return 0;
+}
+
+static struct pernet_operations nexthop_net_ops = {
+	.init = nexthop_net_init,
+	.exit = nexthop_net_exit,
+};
+
+static int __init nexthop_init(void)
+{
+	register_pernet_subsys(&nexthop_net_ops);
+
+	rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+	rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
+	rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
+		      rtm_dump_nexthop, 0);
+
+	rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+	rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
+
+	rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+	rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
+
+	return 0;
+}
+subsys_initcall(nexthop_init);
-- 
cgit v1.2.3


From 597cfe4fc3390a055f42546c254e48601b37009f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 24 May 2019 14:43:05 -0700
Subject: nexthop: Add support for IPv4 nexthops

Add support for IPv4 nexthops. If nh_family is set to AF_INET, then
NHA_GATEWAY is expected to be an IPv4 address.

Register for netdev events to be notified of admin up/down changes as
well as deletes. A hash table is used to track nexthops per device to
quickly convert device events to the affected nexthops.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h |   5 ++
 net/ipv4/nexthop.c    | 208 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 213 insertions(+)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 18e1f512f866..c0e4b0d92c39 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -29,6 +29,10 @@ struct nh_config {
 	int		nh_ifindex;
 	struct net_device *dev;
 
+	union {
+		__be32		ipv4;
+	} gw;
+
 	u32		nlflags;
 	struct nl_info	nlinfo;
 };
@@ -42,6 +46,7 @@ struct nh_info {
 
 	union {
 		struct fib_nh_common	fib_nhc;
+		struct fib_nh		fib_nh;
 	};
 };
 
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index ec0ccf2ed873..79c7b3461e19 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -9,8 +9,12 @@
 #include <linux/rtnetlink.h>
 #include <linux/slab.h>
 #include <net/nexthop.h>
+#include <net/route.h>
 #include <net/sock.h>
 
+#define NH_DEV_HASHBITS  8
+#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
+
 static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
 	[NHA_UNSPEC]		= { .strict_start_type = NHA_UNSPEC + 1 },
 	[NHA_ID]		= { .type = NLA_U32 },
@@ -25,12 +29,39 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
 	[NHA_MASTER]		= { .type = NLA_U32 },
 };
 
+static unsigned int nh_dev_hashfn(unsigned int val)
+{
+	unsigned int mask = NH_DEV_HASHSIZE - 1;
+
+	return (val ^
+		(val >> NH_DEV_HASHBITS) ^
+		(val >> (NH_DEV_HASHBITS * 2))) & mask;
+}
+
+static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
+{
+	struct net_device *dev = nhi->fib_nhc.nhc_dev;
+	struct hlist_head *head;
+	unsigned int hash;
+
+	WARN_ON(!dev);
+
+	hash = nh_dev_hashfn(dev->ifindex);
+	head = &net->nexthop.devhash[hash];
+	hlist_add_head(&nhi->dev_hash, head);
+}
+
 void nexthop_free_rcu(struct rcu_head *head)
 {
 	struct nexthop *nh = container_of(head, struct nexthop, rcu);
 	struct nh_info *nhi;
 
 	nhi = rcu_dereference_raw(nh->nh_info);
+	switch (nhi->family) {
+	case AF_INET:
+		fib_nh_release(nh->net, &nhi->fib_nh);
+		break;
+	}
 	kfree(nhi);
 
 	kfree(nh);
@@ -96,6 +127,7 @@ static u32 nh_find_unused_id(struct net *net)
 static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 			int event, u32 portid, u32 seq, unsigned int nlflags)
 {
+	struct fib_nh *fib_nh;
 	struct nlmsghdr *nlh;
 	struct nh_info *nhi;
 	struct nhmsg *nhm;
@@ -120,6 +152,22 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 		if (nla_put_flag(skb, NHA_BLACKHOLE))
 			goto nla_put_failure;
 		goto out;
+	} else {
+		const struct net_device *dev;
+
+		dev = nhi->fib_nhc.nhc_dev;
+		if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
+			goto nla_put_failure;
+	}
+
+	nhm->nh_scope = nhi->fib_nhc.nhc_scope;
+	switch (nhi->family) {
+	case AF_INET:
+		fib_nh = &nhi->fib_nh;
+		if (fib_nh->fib_nh_gw_family &&
+		    nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
+			goto nla_put_failure;
+		break;
 	}
 
 out:
@@ -132,6 +180,7 @@ nla_put_failure:
 
 static size_t nh_nlmsg_size(struct nexthop *nh)
 {
+	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 	size_t sz = nla_total_size(4);    /* NHA_ID */
 
 	/* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
@@ -139,6 +188,13 @@ static size_t nh_nlmsg_size(struct nexthop *nh)
 	 */
 	sz += nla_total_size(4);  /* NHA_OIF */
 
+	switch (nhi->family) {
+	case AF_INET:
+		if (nhi->fib_nh.fib_nh_gw_family)
+			sz += nla_total_size(4);  /* NHA_GATEWAY */
+		break;
+	}
+
 	return sz;
 }
 
@@ -169,6 +225,15 @@ errout:
 		rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
 }
 
+static void __remove_nexthop(struct net *net, struct nexthop *nh)
+{
+	struct nh_info *nhi;
+
+	nhi = rtnl_dereference(nh->nh_info);
+	if (nhi->fib_nhc.nhc_dev)
+		hlist_del(&nhi->dev_hash);
+}
+
 static void remove_nexthop(struct net *net, struct nexthop *nh,
 			   bool skip_fib, struct nl_info *nlinfo)
 {
@@ -178,6 +243,7 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
 	if (nlinfo)
 		nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
 
+	__remove_nexthop(net, nh);
 	nh_base_seq_inc(net);
 
 	nexthop_put(nh);
@@ -244,6 +310,24 @@ out:
 	return rc;
 }
 
+/* rtnl */
+/* remove all nexthops tied to a device being deleted */
+static void nexthop_flush_dev(struct net_device *dev)
+{
+	unsigned int hash = nh_dev_hashfn(dev->ifindex);
+	struct net *net = dev_net(dev);
+	struct hlist_head *head = &net->nexthop.devhash[hash];
+	struct hlist_node *n;
+	struct nh_info *nhi;
+
+	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+		if (nhi->fib_nhc.nhc_dev != dev)
+			continue;
+
+		remove_nexthop(net, nhi->nh_parent, false, NULL);
+	}
+}
+
 /* rtnl; called when net namespace is deleted */
 static void flush_all_nexthops(struct net *net)
 {
@@ -258,6 +342,38 @@ static void flush_all_nexthops(struct net *net)
 	}
 }
 
+static int nh_create_ipv4(struct net *net, struct nexthop *nh,
+			  struct nh_info *nhi, struct nh_config *cfg,
+			  struct netlink_ext_ack *extack)
+{
+	struct fib_nh *fib_nh = &nhi->fib_nh;
+	struct fib_config fib_cfg = {
+		.fc_oif   = cfg->nh_ifindex,
+		.fc_gw4   = cfg->gw.ipv4,
+		.fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
+		.fc_flags = cfg->nh_flags,
+	};
+	u32 tb_id = l3mdev_fib_table(cfg->dev);
+	int err = -EINVAL;
+
+	err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
+	if (err) {
+		fib_nh_release(net, fib_nh);
+		goto out;
+	}
+
+	/* sets nh_dev if successful */
+	err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
+	if (!err) {
+		nh->nh_flags = fib_nh->fib_nh_flags;
+		fib_info_update_nh_saddr(net, fib_nh, fib_nh->fib_nh_scope);
+	} else {
+		fib_nh_release(net, fib_nh);
+	}
+out:
+	return err;
+}
+
 static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
 				      struct netlink_ext_ack *extack)
 {
@@ -287,12 +403,21 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
 		cfg->nh_ifindex = net->loopback_dev->ifindex;
 	}
 
+	switch (cfg->nh_family) {
+	case AF_INET:
+		err = nh_create_ipv4(net, nh, nhi, cfg, extack);
+		break;
+	}
+
 	if (err) {
 		kfree(nhi);
 		kfree(nh);
 		return ERR_PTR(err);
 	}
 
+	/* add the entry to the device based hash */
+	nexthop_devhash_add(net, nhi);
+
 	rcu_assign_pointer(nh->nh_info, nhi);
 
 	return nh;
@@ -329,6 +454,7 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
 
 	err = insert_nexthop(net, nh, cfg, extack);
 	if (err) {
+		__remove_nexthop(net, nh);
 		nexthop_put(nh);
 		nh = ERR_PTR(err);
 	}
@@ -360,6 +486,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 	}
 
 	switch (nhm->nh_family) {
+	case AF_INET:
+		break;
 	default:
 		NL_SET_ERR_MSG(extack, "Invalid address family");
 		goto out;
@@ -416,6 +544,32 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 		goto out;
 	}
 
+	err = -EINVAL;
+	if (tb[NHA_GATEWAY]) {
+		struct nlattr *gwa = tb[NHA_GATEWAY];
+
+		switch (cfg->nh_family) {
+		case AF_INET:
+			if (nla_len(gwa) != sizeof(u32)) {
+				NL_SET_ERR_MSG(extack, "Invalid gateway");
+				goto out;
+			}
+			cfg->gw.ipv4 = nla_get_be32(gwa);
+			break;
+		default:
+			NL_SET_ERR_MSG(extack,
+				       "Unknown address family for gateway");
+			goto out;
+		}
+	} else {
+		/* device only nexthop (no gateway) */
+		if (cfg->nh_flags & RTNH_F_ONLINK) {
+			NL_SET_ERR_MSG(extack,
+				       "ONLINK flag can not be set for nexthop without a gateway");
+			goto out;
+		}
+	}
+
 	err = 0;
 out:
 	return err;
@@ -683,16 +837,68 @@ out_err:
 	return err;
 }
 
+static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+	unsigned int hash = nh_dev_hashfn(dev->ifindex);
+	struct net *net = dev_net(dev);
+	struct hlist_head *head = &net->nexthop.devhash[hash];
+	struct hlist_node *n;
+	struct nh_info *nhi;
+
+	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+		if (nhi->fib_nhc.nhc_dev == dev) {
+			if (nhi->family == AF_INET)
+				fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
+						   orig_mtu);
+		}
+	}
+}
+
+/* rtnl */
+static int nh_netdev_event(struct notifier_block *this,
+			   unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_notifier_info_ext *info_ext;
+
+	switch (event) {
+	case NETDEV_DOWN:
+	case NETDEV_UNREGISTER:
+		nexthop_flush_dev(dev);
+		break;
+	case NETDEV_CHANGE:
+		if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
+			nexthop_flush_dev(dev);
+		break;
+	case NETDEV_CHANGEMTU:
+		info_ext = ptr;
+		nexthop_sync_mtu(dev, info_ext->ext.mtu);
+		rt_cache_flush(dev_net(dev));
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nh_netdev_notifier = {
+	.notifier_call = nh_netdev_event,
+};
+
 static void __net_exit nexthop_net_exit(struct net *net)
 {
 	rtnl_lock();
 	flush_all_nexthops(net);
 	rtnl_unlock();
+	kfree(net->nexthop.devhash);
 }
 
 static int __net_init nexthop_net_init(struct net *net)
 {
+	size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
+
 	net->nexthop.rb_root = RB_ROOT;
+	net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
+	if (!net->nexthop.devhash)
+		return -ENOMEM;
 
 	return 0;
 }
@@ -706,6 +912,8 @@ static int __init nexthop_init(void)
 {
 	register_pernet_subsys(&nexthop_net_ops);
 
+	register_netdevice_notifier(&nh_netdev_notifier);
+
 	rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
-- 
cgit v1.2.3


From 53010f991a9f5e4ed2db705ddde6ff32709192a2 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 24 May 2019 14:43:06 -0700
Subject: nexthop: Add support for IPv6 gateways

Handle IPv6 gateway in a nexthop spec. If nh_family is set to AF_INET6,
NHA_GATEWAY is expected to be an IPv6 address. Add ipv6 option to gw in
nh_config to hold the address, add fib6_nh to nh_info to leverage the
ipv6 initialization and cleanup code. Update nh_fill_node to dump the v6
address.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h |  3 +++
 net/ipv4/nexthop.c    | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index c0e4b0d92c39..d188f16c0c4f 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -12,6 +12,7 @@
 #include <linux/netdevice.h>
 #include <linux/types.h>
 #include <net/ip_fib.h>
+#include <net/ip6_fib.h>
 #include <net/netlink.h>
 
 #define NEXTHOP_VALID_USER_FLAGS RTNH_F_ONLINK
@@ -31,6 +32,7 @@ struct nh_config {
 
 	union {
 		__be32		ipv4;
+		struct in6_addr	ipv6;
 	} gw;
 
 	u32		nlflags;
@@ -47,6 +49,7 @@ struct nh_info {
 	union {
 		struct fib_nh_common	fib_nhc;
 		struct fib_nh		fib_nh;
+		struct fib6_nh		fib6_nh;
 	};
 };
 
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 79c7b3461e19..f2b237a6735c 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -8,6 +8,7 @@
 #include <linux/nexthop.h>
 #include <linux/rtnetlink.h>
 #include <linux/slab.h>
+#include <net/ipv6_stubs.h>
 #include <net/nexthop.h>
 #include <net/route.h>
 #include <net/sock.h>
@@ -61,6 +62,9 @@ void nexthop_free_rcu(struct rcu_head *head)
 	case AF_INET:
 		fib_nh_release(nh->net, &nhi->fib_nh);
 		break;
+	case AF_INET6:
+		ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
+		break;
 	}
 	kfree(nhi);
 
@@ -127,6 +131,7 @@ static u32 nh_find_unused_id(struct net *net)
 static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 			int event, u32 portid, u32 seq, unsigned int nlflags)
 {
+	struct fib6_nh *fib6_nh;
 	struct fib_nh *fib_nh;
 	struct nlmsghdr *nlh;
 	struct nh_info *nhi;
@@ -168,6 +173,13 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 		    nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
 			goto nla_put_failure;
 		break;
+
+	case AF_INET6:
+		fib6_nh = &nhi->fib6_nh;
+		if (fib6_nh->fib_nh_gw_family &&
+		    nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
+			goto nla_put_failure;
+		break;
 	}
 
 out:
@@ -193,6 +205,12 @@ static size_t nh_nlmsg_size(struct nexthop *nh)
 		if (nhi->fib_nh.fib_nh_gw_family)
 			sz += nla_total_size(4);  /* NHA_GATEWAY */
 		break;
+
+	case AF_INET6:
+		/* NHA_GATEWAY */
+		if (nhi->fib6_nh.fib_nh_gw_family)
+			sz += nla_total_size(sizeof(const struct in6_addr));
+		break;
 	}
 
 	return sz;
@@ -374,6 +392,33 @@ out:
 	return err;
 }
 
+static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
+			  struct nh_info *nhi, struct nh_config *cfg,
+			  struct netlink_ext_ack *extack)
+{
+	struct fib6_nh *fib6_nh = &nhi->fib6_nh;
+	struct fib6_config fib6_cfg = {
+		.fc_table = l3mdev_fib_table(cfg->dev),
+		.fc_ifindex = cfg->nh_ifindex,
+		.fc_gateway = cfg->gw.ipv6,
+		.fc_flags = cfg->nh_flags,
+	};
+	int err = -EINVAL;
+
+	if (!ipv6_addr_any(&cfg->gw.ipv6))
+		fib6_cfg.fc_flags |= RTF_GATEWAY;
+
+	/* sets nh_dev if successful */
+	err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
+				      extack);
+	if (err)
+		ipv6_stub->fib6_nh_release(fib6_nh);
+	else
+		nh->nh_flags = fib6_nh->fib_nh_flags;
+
+	return err;
+}
+
 static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
 				      struct netlink_ext_ack *extack)
 {
@@ -407,6 +452,9 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
 	case AF_INET:
 		err = nh_create_ipv4(net, nh, nhi, cfg, extack);
 		break;
+	case AF_INET6:
+		err = nh_create_ipv6(net, nh, nhi, cfg, extack);
+		break;
 	}
 
 	if (err) {
@@ -487,6 +535,7 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 
 	switch (nhm->nh_family) {
 	case AF_INET:
+	case AF_INET6:
 		break;
 	default:
 		NL_SET_ERR_MSG(extack, "Invalid address family");
@@ -556,6 +605,13 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 			}
 			cfg->gw.ipv4 = nla_get_be32(gwa);
 			break;
+		case AF_INET6:
+			if (nla_len(gwa) != sizeof(struct in6_addr)) {
+				NL_SET_ERR_MSG(extack, "Invalid gateway");
+				goto out;
+			}
+			cfg->gw.ipv6 = nla_get_in6_addr(gwa);
+			break;
 		default:
 			NL_SET_ERR_MSG(extack,
 				       "Unknown address family for gateway");
-- 
cgit v1.2.3


From b513bd035f4044aa2667fb01418918523a049b9c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 24 May 2019 14:43:07 -0700
Subject: nexthop: Add support for lwt encaps

Add support for NHA_ENCAP and NHA_ENCAP_TYPE. Leverages the existing code
for lwtunnel within fib_nh_common, so the only change needed is handling
the attributes in the nexthop code.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h |  3 +++
 net/ipv4/nexthop.c    | 37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 39 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index d188f16c0c4f..7cde03337e14 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -35,6 +35,9 @@ struct nh_config {
 		struct in6_addr	ipv6;
 	} gw;
 
+	struct nlattr	*nh_encap;
+	u16		nh_encap_type;
+
 	u32		nlflags;
 	struct nl_info	nlinfo;
 };
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index f2b237a6735c..3a1cbcb96baa 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -9,6 +9,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/slab.h>
 #include <net/ipv6_stubs.h>
+#include <net/lwtunnel.h>
 #include <net/nexthop.h>
 #include <net/route.h>
 #include <net/sock.h>
@@ -182,6 +183,11 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 		break;
 	}
 
+	if (nhi->fib_nhc.nhc_lwtstate &&
+	    lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
+				NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
+		goto nla_put_failure;
+
 out:
 	nlmsg_end(skb, nlh);
 	return 0;
@@ -213,6 +219,11 @@ static size_t nh_nlmsg_size(struct nexthop *nh)
 		break;
 	}
 
+	if (nhi->fib_nhc.nhc_lwtstate) {
+		sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
+		sz += nla_total_size(2);  /* NHA_ENCAP_TYPE */
+	}
+
 	return sz;
 }
 
@@ -370,6 +381,8 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
 		.fc_gw4   = cfg->gw.ipv4,
 		.fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
 		.fc_flags = cfg->nh_flags,
+		.fc_encap = cfg->nh_encap,
+		.fc_encap_type = cfg->nh_encap_type,
 	};
 	u32 tb_id = l3mdev_fib_table(cfg->dev);
 	int err = -EINVAL;
@@ -402,6 +415,8 @@ static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
 		.fc_ifindex = cfg->nh_ifindex,
 		.fc_gateway = cfg->gw.ipv6,
 		.fc_flags = cfg->nh_flags,
+		.fc_encap = cfg->nh_encap,
+		.fc_encap_type = cfg->nh_encap_type,
 	};
 	int err = -EINVAL;
 
@@ -561,7 +576,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 		cfg->nh_id = nla_get_u32(tb[NHA_ID]);
 
 	if (tb[NHA_BLACKHOLE]) {
-		if (tb[NHA_GATEWAY] || tb[NHA_OIF]) {
+		if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
+		    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
 			NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif");
 			goto out;
 		}
@@ -626,6 +642,25 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 		}
 	}
 
+	if (tb[NHA_ENCAP]) {
+		cfg->nh_encap = tb[NHA_ENCAP];
+
+		if (!tb[NHA_ENCAP_TYPE]) {
+			NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
+			goto out;
+		}
+
+		cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
+		err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
+		if (err < 0)
+			goto out;
+
+	} else if (tb[NHA_ENCAP_TYPE]) {
+		NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
+		goto out;
+	}
+
+
 	err = 0;
 out:
 	return err;
-- 
cgit v1.2.3


From 430a049190de3c9e219f43084de9f1122da04570 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 24 May 2019 14:43:08 -0700
Subject: nexthop: Add support for nexthop groups

Allow the creation of nexthop groups which reference other nexthop
objects to create multipath routes:

                      +--------------+
   +------------+   +--------------+ |
   | nh  nh_grp --->| nh_grp_entry |-+
   +------------+   +---------|----+
     ^                |       |    +------------+
     +----------------+       +--->| nh, weight |
        nh_parent                  +------------+

A group entry points to a nexthop with a weight for that hop within the
group. The nexthop has a list_head, grp_list, for tracking which groups
it is a member of and the group entry has a reference back to the parent.
The grp_list is used when a nexthop is deleted - to efficiently remove
it from groups using it.

If a nexthop group spec is given, no other attributes can be set. Each
nexthop id in a group spec must already exist.

Similar to single nexthops, the specification of a nexthop group can be
updated so that data is managed with rcu locking.

Add a path selection function to account for multiple paths, and add
ipv{4,6}_good_nh helpers to check that a neighbor entry, if one exists,
is in a good state.

Update NETDEV event handling to rebalance multipath nexthop groups if
a nexthop is deleted due to a link event (down or unregister).

When a nexthop is removed, any groups using it are updated. Groups using a
nexthop are tracked via a grp_list.

Nexthop dumps can be limited to groups only by adding NHA_GROUPS to the
request.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h |  98 +++++++++-
 net/ipv4/nexthop.c    | 504 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 578 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 7cde03337e14..6e1b8f53624c 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -35,6 +35,9 @@ struct nh_config {
 		struct in6_addr	ipv6;
 	} gw;
 
+	struct nlattr	*nh_grp;
+	u16		nh_grp_type;
+
 	struct nlattr	*nh_encap;
 	u16		nh_encap_type;
 
@@ -56,20 +59,39 @@ struct nh_info {
 	};
 };
 
+struct nh_grp_entry {
+	struct nexthop	*nh;
+	u8		weight;
+	atomic_t	upper_bound;
+
+	struct list_head nh_list;
+	struct nexthop	*nh_parent;  /* nexthop of group with this entry */
+};
+
+struct nh_group {
+	u16			num_nh;
+	bool			mpath;
+	bool			has_v4;
+	struct nh_grp_entry	nh_entries[0];
+};
+
 struct nexthop {
 	struct rb_node		rb_node;    /* entry on netns rbtree */
+	struct list_head	grp_list;   /* nh group entries using this nh */
 	struct net		*net;
 
 	u32			id;
 
 	u8			protocol;   /* app managing this nh */
 	u8			nh_flags;
+	bool			is_group;
 
 	refcount_t		refcnt;
 	struct rcu_head		rcu;
 
 	union {
 		struct nh_info	__rcu *nh_info;
+		struct nh_group __rcu *nh_grp;
 	};
 };
 
@@ -88,12 +110,86 @@ static inline void nexthop_put(struct nexthop *nh)
 		call_rcu(&nh->rcu, nexthop_free_rcu);
 }
 
+static inline bool nexthop_is_multipath(const struct nexthop *nh)
+{
+	if (nh->is_group) {
+		struct nh_group *nh_grp;
+
+		nh_grp = rcu_dereference_rtnl(nh->nh_grp);
+		return nh_grp->mpath;
+	}
+	return false;
+}
+
+struct nexthop *nexthop_select_path(struct nexthop *nh, int hash);
+
+static inline unsigned int nexthop_num_path(const struct nexthop *nh)
+{
+	unsigned int rc = 1;
+
+	if (nexthop_is_multipath(nh)) {
+		struct nh_group *nh_grp;
+
+		nh_grp = rcu_dereference_rtnl(nh->nh_grp);
+		rc = nh_grp->num_nh;
+	} else {
+		const struct nh_info *nhi;
+
+		nhi = rcu_dereference_rtnl(nh->nh_info);
+		if (nhi->reject_nh)
+			rc = 0;
+	}
+
+	return rc;
+}
+
+static inline
+struct nexthop *nexthop_mpath_select(const struct nexthop *nh, int nhsel)
+{
+	const struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
+
+	/* for_nexthops macros in fib_semantics.c grabs a pointer to
+	 * the nexthop before checking nhsel
+	 */
+	if (nhsel > nhg->num_nh)
+		return NULL;
+
+	return nhg->nh_entries[nhsel].nh;
+}
+
+static inline
+int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh)
+{
+	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+	int i;
+
+	for (i = 0; i < nhg->num_nh; i++) {
+		struct nexthop *nhe = nhg->nh_entries[i].nh;
+		struct nh_info *nhi = rcu_dereference_rtnl(nhe->nh_info);
+		struct fib_nh_common *nhc = &nhi->fib_nhc;
+		int weight = nhg->nh_entries[i].weight;
+
+		if (fib_add_nexthop(skb, nhc, weight) < 0)
+			return -EMSGSIZE;
+	}
+
+	return 0;
+}
+
 /* called with rcu lock */
 static inline bool nexthop_is_blackhole(const struct nexthop *nh)
 {
 	const struct nh_info *nhi;
 
-	nhi = rcu_dereference(nh->nh_info);
+	if (nexthop_is_multipath(nh)) {
+		if (nexthop_num_path(nh) > 1)
+			return false;
+		nh = nexthop_mpath_select(nh, 0);
+		if (!nh)
+			return false;
+	}
+
+	nhi = rcu_dereference_rtnl(nh->nh_info);
 	return nhi->reject_nh;
 }
 #endif
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 3a1cbcb96baa..1af8a329dacb 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -8,12 +8,17 @@
 #include <linux/nexthop.h>
 #include <linux/rtnetlink.h>
 #include <linux/slab.h>
+#include <net/arp.h>
 #include <net/ipv6_stubs.h>
 #include <net/lwtunnel.h>
+#include <net/ndisc.h>
 #include <net/nexthop.h>
 #include <net/route.h>
 #include <net/sock.h>
 
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+			   struct nl_info *nlinfo);
+
 #define NH_DEV_HASHBITS  8
 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
 
@@ -53,9 +58,20 @@ static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
 	hlist_add_head(&nhi->dev_hash, head);
 }
 
-void nexthop_free_rcu(struct rcu_head *head)
+static void nexthop_free_mpath(struct nexthop *nh)
+{
+	struct nh_group *nhg;
+	int i;
+
+	nhg = rcu_dereference_raw(nh->nh_grp);
+	for (i = 0; i < nhg->num_nh; ++i)
+		WARN_ON(nhg->nh_entries[i].nh);
+
+	kfree(nhg);
+}
+
+static void nexthop_free_single(struct nexthop *nh)
 {
-	struct nexthop *nh = container_of(head, struct nexthop, rcu);
 	struct nh_info *nhi;
 
 	nhi = rcu_dereference_raw(nh->nh_info);
@@ -68,6 +84,16 @@ void nexthop_free_rcu(struct rcu_head *head)
 		break;
 	}
 	kfree(nhi);
+}
+
+void nexthop_free_rcu(struct rcu_head *head)
+{
+	struct nexthop *nh = container_of(head, struct nexthop, rcu);
+
+	if (nh->is_group)
+		nexthop_free_mpath(nh);
+	else
+		nexthop_free_single(nh);
 
 	kfree(nh);
 }
@@ -78,9 +104,26 @@ static struct nexthop *nexthop_alloc(void)
 	struct nexthop *nh;
 
 	nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
+	if (nh) {
+		INIT_LIST_HEAD(&nh->grp_list);
+	}
 	return nh;
 }
 
+static struct nh_group *nexthop_grp_alloc(u16 num_nh)
+{
+	size_t sz = offsetof(struct nexthop, nh_grp)
+		    + sizeof(struct nh_group)
+		    + sizeof(struct nh_grp_entry) * num_nh;
+	struct nh_group *nhg;
+
+	nhg = kzalloc(sz, GFP_KERNEL);
+	if (nhg)
+		nhg->num_nh = num_nh;
+
+	return nhg;
+}
+
 static void nh_base_seq_inc(struct net *net)
 {
 	while (++net->nexthop.seq == 0)
@@ -129,6 +172,37 @@ static u32 nh_find_unused_id(struct net *net)
 	return 0;
 }
 
+static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
+{
+	struct nexthop_grp *p;
+	size_t len = nhg->num_nh * sizeof(*p);
+	struct nlattr *nla;
+	u16 group_type = 0;
+	int i;
+
+	if (nhg->mpath)
+		group_type = NEXTHOP_GRP_TYPE_MPATH;
+
+	if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
+		goto nla_put_failure;
+
+	nla = nla_reserve(skb, NHA_GROUP, len);
+	if (!nla)
+		goto nla_put_failure;
+
+	p = nla_data(nla);
+	for (i = 0; i < nhg->num_nh; ++i) {
+		p->id = nhg->nh_entries[i].nh->id;
+		p->weight = nhg->nh_entries[i].weight - 1;
+		p += 1;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 			int event, u32 portid, u32 seq, unsigned int nlflags)
 {
@@ -152,6 +226,14 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 	if (nla_put_u32(skb, NHA_ID, nh->id))
 		goto nla_put_failure;
 
+	if (nh->is_group) {
+		struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+		if (nla_put_nh_group(skb, nhg))
+			goto nla_put_failure;
+		goto out;
+	}
+
 	nhi = rtnl_dereference(nh->nh_info);
 	nhm->nh_family = nhi->family;
 	if (nhi->reject_nh) {
@@ -196,15 +278,24 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
-static size_t nh_nlmsg_size(struct nexthop *nh)
+static size_t nh_nlmsg_size_grp(struct nexthop *nh)
+{
+	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+	size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
+
+	return nla_total_size(sz) +
+	       nla_total_size(2);  /* NHA_GROUP_TYPE */
+}
+
+static size_t nh_nlmsg_size_single(struct nexthop *nh)
 {
 	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
-	size_t sz = nla_total_size(4);    /* NHA_ID */
+	size_t sz;
 
 	/* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
 	 * are mutually exclusive
 	 */
-	sz += nla_total_size(4);  /* NHA_OIF */
+	sz = nla_total_size(4);  /* NHA_OIF */
 
 	switch (nhi->family) {
 	case AF_INET:
@@ -227,6 +318,18 @@ static size_t nh_nlmsg_size(struct nexthop *nh)
 	return sz;
 }
 
+static size_t nh_nlmsg_size(struct nexthop *nh)
+{
+	size_t sz = nla_total_size(4);    /* NHA_ID */
+
+	if (nh->is_group)
+		sz += nh_nlmsg_size_grp(nh);
+	else
+		sz += nh_nlmsg_size_single(nh);
+
+	return sz;
+}
+
 static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
 {
 	unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
@@ -254,17 +357,274 @@ errout:
 		rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
 }
 
-static void __remove_nexthop(struct net *net, struct nexthop *nh)
+static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
+			   struct netlink_ext_ack *extack)
 {
-	struct nh_info *nhi;
+	if (nh->is_group) {
+		struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 
-	nhi = rtnl_dereference(nh->nh_info);
-	if (nhi->fib_nhc.nhc_dev)
-		hlist_del(&nhi->dev_hash);
+		/* nested multipath (group within a group) is not
+		 * supported
+		 */
+		if (nhg->mpath) {
+			NL_SET_ERR_MSG(extack,
+				       "Multipath group can not be a nexthop within a group");
+			return false;
+		}
+	} else {
+		struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+		if (nhi->reject_nh && npaths > 1) {
+			NL_SET_ERR_MSG(extack,
+				       "Blackhole nexthop can not be used in a group with more than 1 path");
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
+			       struct netlink_ext_ack *extack)
+{
+	unsigned int len = nla_len(tb[NHA_GROUP]);
+	struct nexthop_grp *nhg;
+	unsigned int i, j;
+
+	if (len & (sizeof(struct nexthop_grp) - 1)) {
+		NL_SET_ERR_MSG(extack,
+			       "Invalid length for nexthop group attribute");
+		return -EINVAL;
+	}
+
+	/* convert len to number of nexthop ids */
+	len /= sizeof(*nhg);
+
+	nhg = nla_data(tb[NHA_GROUP]);
+	for (i = 0; i < len; ++i) {
+		if (nhg[i].resvd1 || nhg[i].resvd2) {
+			NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
+			return -EINVAL;
+		}
+		if (nhg[i].weight > 254) {
+			NL_SET_ERR_MSG(extack, "Invalid value for weight");
+			return -EINVAL;
+		}
+		for (j = i + 1; j < len; ++j) {
+			if (nhg[i].id == nhg[j].id) {
+				NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
+				return -EINVAL;
+			}
+		}
+	}
+
+	nhg = nla_data(tb[NHA_GROUP]);
+	for (i = 0; i < len; ++i) {
+		struct nexthop *nh;
+
+		nh = nexthop_find_by_id(net, nhg[i].id);
+		if (!nh) {
+			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+			return -EINVAL;
+		}
+		if (!valid_group_nh(nh, len, extack))
+			return -EINVAL;
+	}
+	for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
+		if (!tb[i])
+			continue;
+
+		NL_SET_ERR_MSG(extack,
+			       "No other attributes can be set in nexthop groups");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static bool ipv6_good_nh(const struct fib6_nh *nh)
+{
+	int state = NUD_REACHABLE;
+	struct neighbour *n;
+
+	rcu_read_lock_bh();
+
+	n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
+	if (n)
+		state = n->nud_state;
+
+	rcu_read_unlock_bh();
+
+	return !!(state & NUD_VALID);
+}
+
+static bool ipv4_good_nh(const struct fib_nh *nh)
+{
+	int state = NUD_REACHABLE;
+	struct neighbour *n;
+
+	rcu_read_lock_bh();
+
+	n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
+				      (__force u32)nh->fib_nh_gw4);
+	if (n)
+		state = n->nud_state;
+
+	rcu_read_unlock_bh();
+
+	return !!(state & NUD_VALID);
+}
+
+struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
+{
+	struct nexthop *rc = NULL;
+	struct nh_group *nhg;
+	int i;
+
+	if (!nh->is_group)
+		return nh;
+
+	nhg = rcu_dereference(nh->nh_grp);
+	for (i = 0; i < nhg->num_nh; ++i) {
+		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+		struct nh_info *nhi;
+
+		if (hash > atomic_read(&nhge->upper_bound))
+			continue;
+
+		/* nexthops always check if it is good and does
+		 * not rely on a sysctl for this behavior
+		 */
+		nhi = rcu_dereference(nhge->nh->nh_info);
+		switch (nhi->family) {
+		case AF_INET:
+			if (ipv4_good_nh(&nhi->fib_nh))
+				return nhge->nh;
+			break;
+		case AF_INET6:
+			if (ipv6_good_nh(&nhi->fib6_nh))
+				return nhge->nh;
+			break;
+		}
+
+		if (!rc)
+			rc = nhge->nh;
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(nexthop_select_path);
+
+static void nh_group_rebalance(struct nh_group *nhg)
+{
+	int total = 0;
+	int w = 0;
+	int i;
+
+	for (i = 0; i < nhg->num_nh; ++i)
+		total += nhg->nh_entries[i].weight;
+
+	for (i = 0; i < nhg->num_nh; ++i) {
+		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+		int upper_bound;
+
+		w += nhge->weight;
+		upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
+		atomic_set(&nhge->upper_bound, upper_bound);
+	}
+}
+
+static void remove_nh_grp_entry(struct nh_grp_entry *nhge,
+				struct nh_group *nhg,
+				struct nl_info *nlinfo)
+{
+	struct nexthop *nh = nhge->nh;
+	struct nh_grp_entry *nhges;
+	bool found = false;
+	int i;
+
+	WARN_ON(!nh);
+
+	nhges = nhg->nh_entries;
+	for (i = 0; i < nhg->num_nh; ++i) {
+		if (found) {
+			nhges[i-1].nh = nhges[i].nh;
+			nhges[i-1].weight = nhges[i].weight;
+			list_del(&nhges[i].nh_list);
+			list_add(&nhges[i-1].nh_list, &nhges[i-1].nh->grp_list);
+		} else if (nhg->nh_entries[i].nh == nh) {
+			found = true;
+		}
+	}
+
+	if (WARN_ON(!found))
+		return;
+
+	nhg->num_nh--;
+	nhg->nh_entries[nhg->num_nh].nh = NULL;
+
+	nh_group_rebalance(nhg);
+
+	nexthop_put(nh);
+
+	if (nlinfo)
+		nexthop_notify(RTM_NEWNEXTHOP, nhge->nh_parent, nlinfo);
+}
+
+static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+				       struct nl_info *nlinfo)
+{
+	struct nh_grp_entry *nhge, *tmp;
+
+	list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) {
+		struct nh_group *nhg;
+
+		list_del(&nhge->nh_list);
+		nhg = rtnl_dereference(nhge->nh_parent->nh_grp);
+		remove_nh_grp_entry(nhge, nhg, nlinfo);
+
+		/* if this group has no more entries then remove it */
+		if (!nhg->num_nh)
+			remove_nexthop(net, nhge->nh_parent, nlinfo);
+	}
+}
+
+static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
+{
+	struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
+	int i, num_nh = nhg->num_nh;
+
+	for (i = 0; i < num_nh; ++i) {
+		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+		if (WARN_ON(!nhge->nh))
+			continue;
+
+		list_del(&nhge->nh_list);
+		nexthop_put(nhge->nh);
+		nhge->nh = NULL;
+		nhg->num_nh--;
+	}
+}
+
+static void __remove_nexthop(struct net *net, struct nexthop *nh,
+			     struct nl_info *nlinfo)
+{
+	if (nh->is_group) {
+		remove_nexthop_group(nh, nlinfo);
+	} else {
+		struct nh_info *nhi;
+
+		nhi = rtnl_dereference(nh->nh_info);
+		if (nhi->fib_nhc.nhc_dev)
+			hlist_del(&nhi->dev_hash);
+
+		remove_nexthop_from_groups(net, nh, nlinfo);
+	}
 }
 
 static void remove_nexthop(struct net *net, struct nexthop *nh,
-			   bool skip_fib, struct nl_info *nlinfo)
+			   struct nl_info *nlinfo)
 {
 	/* remove from the tree */
 	rb_erase(&nh->rb_node, &net->nexthop.rb_root);
@@ -272,7 +632,7 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
 	if (nlinfo)
 		nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
 
-	__remove_nexthop(net, nh);
+	__remove_nexthop(net, nh, nlinfo);
 	nh_base_seq_inc(net);
 
 	nexthop_put(nh);
@@ -353,7 +713,7 @@ static void nexthop_flush_dev(struct net_device *dev)
 		if (nhi->fib_nhc.nhc_dev != dev)
 			continue;
 
-		remove_nexthop(net, nhi->nh_parent, false, NULL);
+		remove_nexthop(net, nhi->nh_parent, NULL);
 	}
 }
 
@@ -366,11 +726,69 @@ static void flush_all_nexthops(struct net *net)
 
 	while ((node = rb_first(root))) {
 		nh = rb_entry(node, struct nexthop, rb_node);
-		remove_nexthop(net, nh, false, NULL);
+		remove_nexthop(net, nh, NULL);
 		cond_resched();
 	}
 }
 
+/* Build a group nexthop from the ids/weights carried in cfg->nh_grp. */
+static struct nexthop *nexthop_create_group(struct net *net,
+					    struct nh_config *cfg)
+{
+	struct nlattr *grps_attr = cfg->nh_grp;
+	struct nexthop_grp *entry = nla_data(grps_attr);
+	struct nh_group *nhg;
+	struct nexthop *nh;
+	int i;
+
+	nh = nexthop_alloc();
+	if (!nh)
+		return ERR_PTR(-ENOMEM);
+	nh->is_group = 1;
+
+	nhg = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry));
+	if (!nhg) {
+		kfree(nh);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0; i < nhg->num_nh; ++i) {
+		struct nexthop *nhe;
+		struct nh_info *nhi;
+
+		nhe = nexthop_find_by_id(net, entry[i].id);
+		if (!nexthop_get(nhe))
+			goto out_no_nh;
+
+		nhi = rtnl_dereference(nhe->nh_info);
+		if (nhi->family == AF_INET)
+			nhg->has_v4 = true;
+
+		nhg->nh_entries[i].nh = nhe;
+		nhg->nh_entries[i].weight = entry[i].weight + 1;
+		list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
+		nhg->nh_entries[i].nh_parent = nh;
+	}
+
+	if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
+		nhg->mpath = 1;
+		nh_group_rebalance(nhg);
+	}
+
+	rcu_assign_pointer(nh->nh_grp, nhg);
+	return nh;
+
+out_no_nh:
+	/* entry i was never set up: unwind only the ones before it */
+	for (i--; i >= 0; --i) {
+		list_del(&nhg->nh_entries[i].nh_list);
+		nexthop_put(nhg->nh_entries[i].nh);
+	}
+	kfree(nhg);
+	kfree(nh);
+	return ERR_PTR(-ENOENT);
+}
+
 static int nh_create_ipv4(struct net *net, struct nexthop *nh,
 			  struct nh_info *nhi, struct nh_config *cfg,
 			  struct netlink_ext_ack *extack)
@@ -506,7 +924,11 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
 		}
 	}
 
-	nh = nexthop_create(net, cfg, extack);
+	if (cfg->nh_grp)
+		nh = nexthop_create_group(net, cfg);
+	else
+		nh = nexthop_create(net, cfg, extack);
+
 	if (IS_ERR(nh))
 		return nh;
 
@@ -517,7 +939,7 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
 
 	err = insert_nexthop(net, nh, cfg, extack);
 	if (err) {
-		__remove_nexthop(net, nh);
+		__remove_nexthop(net, nh, NULL);
 		nexthop_put(nh);
 		nh = ERR_PTR(err);
 	}
@@ -552,6 +974,10 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 	case AF_INET:
 	case AF_INET6:
 		break;
+	case AF_UNSPEC:
+		if (tb[NHA_GROUP])
+			break;
+		/* fallthrough */
 	default:
 		NL_SET_ERR_MSG(extack, "Invalid address family");
 		goto out;
@@ -575,6 +1001,27 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 	if (tb[NHA_ID])
 		cfg->nh_id = nla_get_u32(tb[NHA_ID]);
 
+	if (tb[NHA_GROUP]) {
+		if (nhm->nh_family != AF_UNSPEC) {
+			NL_SET_ERR_MSG(extack, "Invalid family for group");
+			goto out;
+		}
+		cfg->nh_grp = tb[NHA_GROUP];
+
+		cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
+		if (tb[NHA_GROUP_TYPE])
+			cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
+
+		if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
+			NL_SET_ERR_MSG(extack, "Invalid group type");
+			goto out;
+		}
+		err = nh_check_attr_group(net, tb, extack);
+
+		/* no other attributes should be set */
+		goto out;
+	}
+
 	if (tb[NHA_BLACKHOLE]) {
 		if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
 		    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
@@ -752,7 +1199,7 @@ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (!nh)
 		return -ENOENT;
 
-	remove_nexthop(net, nh, false, &nlinfo);
+	remove_nexthop(net, nh, &nlinfo);
 
 	return 0;
 }
@@ -796,15 +1243,21 @@ errout_free:
 	goto out;
 }
 
-static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
-			     int master_idx, u8 family)
+static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx,
+			     bool group_filter, u8 family)
 {
 	const struct net_device *dev;
 	const struct nh_info *nhi;
 
+	if (group_filter && !nh->is_group)
+		return true;
+
 	if (!dev_idx && !master_idx && !family)
 		return false;
 
+	if (nh->is_group)
+		return true;
+
 	nhi = rtnl_dereference(nh->nh_info);
 	if (family && nhi->family != family)
 		return true;
@@ -827,8 +1280,8 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
 	return false;
 }
 
-static int nh_valid_dump_req(const struct nlmsghdr *nlh,
-			     int *dev_idx, int *master_idx,
+static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
+			     int *master_idx, bool *group_filter,
 			     struct netlink_callback *cb)
 {
 	struct netlink_ext_ack *extack = cb->extack;
@@ -863,6 +1316,9 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh,
 			}
 			*master_idx = idx;
 			break;
+		case NHA_GROUPS:
+			*group_filter = true;
+			break;
 		default:
 			NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
 			return -EINVAL;
@@ -885,11 +1341,13 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
 	int dev_filter_idx = 0, master_idx = 0;
 	struct net *net = sock_net(skb->sk);
 	struct rb_root *root = &net->nexthop.rb_root;
+	bool group_filter = false;
 	struct rb_node *node;
 	int idx = 0, s_idx;
 	int err;
 
-	err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, cb);
+	err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx,
+				&group_filter, cb);
 	if (err < 0)
 		return err;
 
@@ -902,7 +1360,7 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
 
 		nh = rb_entry(node, struct nexthop, rb_node);
 		if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
-				     nhm->nh_family))
+				     group_filter, nhm->nh_family))
 			goto cont;
 
 		err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
-- 
cgit v1.2.3


From 54e9c9d4b506b611228890752d1cfa960e0965e1 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 28 May 2019 14:14:41 -0700
Subject: bpf: remove __rcu annotations from bpf_prog_array

Drop __rcu annotations and rcu read sections from bpf_prog_array
helper functions. They are not needed since all existing callers
call those helpers from the rcu update side while holding a mutex.
This guarantees that use-after-free could not happen.

In the next patches I'll fix the callers with missing
rcu_dereference_protected to make sparse/lockdep happy, the proper
way to use these helpers is:

	struct bpf_prog_array __rcu *progs = ...;
	struct bpf_prog_array *p;

	mutex_lock(&mtx);
	p = rcu_dereference_protected(progs, lockdep_is_held(&mtx));
	bpf_prog_array_length(p);
	bpf_prog_array_copy_to_user(p, ...);
	bpf_prog_array_delete_safe(p, ...);
	bpf_prog_array_copy_info(p, ...);
	bpf_prog_array_copy(p, ...);
	bpf_prog_array_free(p);
	mutex_unlock(&mtx);

No functional changes! rcu_dereference_protected with lockdep_is_held
should catch any cases where we update prog array without a mutex
(I've looked at existing call sites and I think we hold a mutex
everywhere).

Motivation is to fix sparse warnings:
kernel/bpf/core.c:1803:9: warning: incorrect type in argument 1 (different address spaces)
kernel/bpf/core.c:1803:9:    expected struct callback_head *head
kernel/bpf/core.c:1803:9:    got struct callback_head [noderef] <asn:4> *
kernel/bpf/core.c:1877:44: warning: incorrect type in initializer (different address spaces)
kernel/bpf/core.c:1877:44:    expected struct bpf_prog_array_item *item
kernel/bpf/core.c:1877:44:    got struct bpf_prog_array_item [noderef] <asn:4> *
kernel/bpf/core.c:1901:26: warning: incorrect type in assignment (different address spaces)
kernel/bpf/core.c:1901:26:    expected struct bpf_prog_array_item *existing
kernel/bpf/core.c:1901:26:    got struct bpf_prog_array_item [noderef] <asn:4> *
kernel/bpf/core.c:1935:26: warning: incorrect type in assignment (different address spaces)
kernel/bpf/core.c:1935:26:    expected struct bpf_prog_array_item *[assigned] existing
kernel/bpf/core.c:1935:26:    got struct bpf_prog_array_item [noderef] <asn:4> *

v2:
* remove comment about potential race; that can't happen
  because all callers are in rcu-update section

Cc: Roman Gushchin <guro@fb.com>
Acked-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h | 12 ++++++------
 kernel/bpf/core.c   | 37 +++++++++++++------------------------
 2 files changed, 19 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d98141edb74b..ff3e00ff84d2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -514,17 +514,17 @@ struct bpf_prog_array {
 };
 
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
-void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
-int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
-int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+void bpf_prog_array_free(struct bpf_prog_array *progs);
+int bpf_prog_array_length(struct bpf_prog_array *progs);
+int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
 				__u32 __user *prog_ids, u32 cnt);
 
-void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+void bpf_prog_array_delete_safe(struct bpf_prog_array *progs,
 				struct bpf_prog *old_prog);
-int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_info(struct bpf_prog_array *array,
 			     u32 *prog_ids, u32 request_cnt,
 			     u32 *prog_cnt);
-int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			struct bpf_prog *exclude_prog,
 			struct bpf_prog *include_prog,
 			struct bpf_prog_array **new_array);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3675b19ecb90..33fb292f2e30 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1795,38 +1795,33 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
 	return &empty_prog_array.hdr;
 }
 
-void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+void bpf_prog_array_free(struct bpf_prog_array *progs)
 {
-	if (!progs ||
-	    progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+	if (!progs || progs == &empty_prog_array.hdr)
 		return;
 	kfree_rcu(progs, rcu);
 }
 
-int bpf_prog_array_length(struct bpf_prog_array __rcu *array)
+int bpf_prog_array_length(struct bpf_prog_array *array)
 {
 	struct bpf_prog_array_item *item;
 	u32 cnt = 0;
 
-	rcu_read_lock();
-	item = rcu_dereference(array)->items;
-	for (; item->prog; item++)
+	for (item = array->items; item->prog; item++)
 		if (item->prog != &dummy_bpf_prog.prog)
 			cnt++;
-	rcu_read_unlock();
 	return cnt;
 }
 
 
-static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
+static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
 				     u32 *prog_ids,
 				     u32 request_cnt)
 {
 	struct bpf_prog_array_item *item;
 	int i = 0;
 
-	item = rcu_dereference_check(array, 1)->items;
-	for (; item->prog; item++) {
+	for (item = array->items; item->prog; item++) {
 		if (item->prog == &dummy_bpf_prog.prog)
 			continue;
 		prog_ids[i] = item->prog->aux->id;
@@ -1839,7 +1834,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
 	return !!(item->prog);
 }
 
-int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
 				__u32 __user *prog_ids, u32 cnt)
 {
 	unsigned long err = 0;
@@ -1850,18 +1845,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
 	 * cnt = bpf_prog_array_length();
 	 * if (cnt > 0)
 	 *     bpf_prog_array_copy_to_user(..., cnt);
-	 * so below kcalloc doesn't need extra cnt > 0 check, but
-	 * bpf_prog_array_length() releases rcu lock and
-	 * prog array could have been swapped with empty or larger array,
-	 * so always copy 'cnt' prog_ids to the user.
-	 * In a rare race the user will see zero prog_ids
+	 * so below kcalloc doesn't need extra cnt > 0 check.
 	 */
 	ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
 	if (!ids)
 		return -ENOMEM;
-	rcu_read_lock();
 	nospc = bpf_prog_array_copy_core(array, ids, cnt);
-	rcu_read_unlock();
 	err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
 	kfree(ids);
 	if (err)
@@ -1871,19 +1860,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
 	return 0;
 }
 
-void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array,
+void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
 				struct bpf_prog *old_prog)
 {
-	struct bpf_prog_array_item *item = array->items;
+	struct bpf_prog_array_item *item;
 
-	for (; item->prog; item++)
+	for (item = array->items; item->prog; item++)
 		if (item->prog == old_prog) {
 			WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
 			break;
 		}
 }
 
-int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			struct bpf_prog *exclude_prog,
 			struct bpf_prog *include_prog,
 			struct bpf_prog_array **new_array)
@@ -1947,7 +1936,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 	return 0;
 }
 
-int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_info(struct bpf_prog_array *array,
 			     u32 *prog_ids, u32 request_cnt,
 			     u32 *prog_cnt)
 {
-- 
cgit v1.2.3


From dbcc1ba26e43bd32cb308e50ac4cb4a29d2f5967 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 28 May 2019 14:14:43 -0700
Subject: bpf: cgroup: properly use bpf_prog_array api

Now that we don't have __rcu markers on the bpf_prog_array helpers,
let's use proper rcu_dereference_protected to obtain array pointer
under mutex.

We also don't need __rcu annotations on cgroup_bpf.inactive since
it's not read/updated concurrently.

v4:
* drop cgroup_rcu_xyz wrappers and use rcu APIs directly; presumably
  should be more clear to understand which mutex/refcount protects
  each particular place

v3:
* amend cgroup_rcu_dereference to include percpu_ref_is_dying;
  cgroup_bpf is now reference counted and we don't hold cgroup_mutex
  anymore in cgroup_bpf_release

v2:
* replace xchg with rcu_swap_protected

Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h |  2 +-
 kernel/bpf/cgroup.c        | 28 +++++++++++++++++-----------
 2 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 9f100fc422c3..b631ee75762d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -72,7 +72,7 @@ struct cgroup_bpf {
 	u32 flags[MAX_BPF_ATTACH_TYPE];
 
 	/* temp storage for effective prog array used by prog_attach/detach */
-	struct bpf_prog_array __rcu *inactive;
+	struct bpf_prog_array *inactive;
 
 	/* reference counter used to detach bpf programs after cgroup removal */
 	struct percpu_ref refcnt;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index d995edbe816d..ff594eb86fd7 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -38,6 +38,7 @@ static void cgroup_bpf_release(struct work_struct *work)
 	struct cgroup *cgrp = container_of(work, struct cgroup,
 					   bpf.release_work);
 	enum bpf_cgroup_storage_type stype;
+	struct bpf_prog_array *old_array;
 	unsigned int type;
 
 	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
@@ -54,7 +55,10 @@ static void cgroup_bpf_release(struct work_struct *work)
 			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
-		bpf_prog_array_free(cgrp->bpf.effective[type]);
+		old_array = rcu_dereference_protected(
+				cgrp->bpf.effective[type],
+				percpu_ref_is_dying(&cgrp->bpf.refcnt));
+		bpf_prog_array_free(old_array);
 	}
 
 	percpu_ref_exit(&cgrp->bpf.refcnt);
@@ -126,7 +130,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
  */
 static int compute_effective_progs(struct cgroup *cgrp,
 				   enum bpf_attach_type type,
-				   struct bpf_prog_array __rcu **array)
+				   struct bpf_prog_array **array)
 {
 	enum bpf_cgroup_storage_type stype;
 	struct bpf_prog_array *progs;
@@ -164,17 +168,16 @@ static int compute_effective_progs(struct cgroup *cgrp,
 		}
 	} while ((p = cgroup_parent(p)));
 
-	rcu_assign_pointer(*array, progs);
+	*array = progs;
 	return 0;
 }
 
 static void activate_effective_progs(struct cgroup *cgrp,
 				     enum bpf_attach_type type,
-				     struct bpf_prog_array __rcu *array)
+				     struct bpf_prog_array *old_array)
 {
-	struct bpf_prog_array __rcu *old_array;
-
-	old_array = xchg(&cgrp->bpf.effective[type], array);
+	rcu_swap_protected(cgrp->bpf.effective[type], old_array,
+			   lockdep_is_held(&cgroup_mutex));
 	/* free prog array after grace period, since __cgroup_bpf_run_*()
 	 * might be still walking the array
 	 */
@@ -191,7 +194,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
  * that array below is variable length
  */
 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
-	struct bpf_prog_array __rcu *arrays[NR] = {};
+	struct bpf_prog_array *arrays[NR] = {};
 	int ret, i;
 
 	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
@@ -477,10 +480,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	enum bpf_attach_type type = attr->query.attach_type;
 	struct list_head *progs = &cgrp->bpf.progs[type];
 	u32 flags = cgrp->bpf.flags[type];
+	struct bpf_prog_array *effective;
 	int cnt, ret = 0, i;
 
+	effective = rcu_dereference_protected(cgrp->bpf.effective[type],
+					      lockdep_is_held(&cgroup_mutex));
+
 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
-		cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+		cnt = bpf_prog_array_length(effective);
 	else
 		cnt = prog_list_length(progs);
 
@@ -497,8 +504,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	}
 
 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
-		return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
-						   prog_ids, cnt);
+		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
 	} else {
 		struct bpf_prog_list *pl;
 		u32 id;
-- 
cgit v1.2.3


From 279758f8001f0014b15656a4ef130a20852f6df6 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 28 May 2019 15:02:31 +0800
Subject: rhashtable: Add rht_ptr_rcu and improve rht_ptr

This patch moves common code between rht_ptr and rht_ptr_exclusive
into __rht_ptr.  It also adds a new helper rht_ptr_rcu exclusively
for the RCU case.  This way rht_ptr becomes a lock-only construct
so we can use the lighter rcu_dereference_protected primitive.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 9f8bc06d4136..beb9a9da1699 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -352,37 +352,38 @@ static inline void rht_unlock(struct bucket_table *tbl,
 static inline struct rhash_head __rcu *__rht_ptr(
 	struct rhash_lock_head *const *bkt)
 {
-	return (struct rhash_head __rcu *)((unsigned long)*bkt & ~BIT(0));
+	return (struct rhash_head __rcu *)
+		((unsigned long)*bkt & ~BIT(0) ?:
+		 (unsigned long)RHT_NULLS_MARKER(bkt));
 }
 
 /*
  * Where 'bkt' is a bucket and might be locked:
- *   rht_ptr() dereferences that pointer and clears the lock bit.
+ *   rht_ptr_rcu() dereferences that pointer and clears the lock bit.
+ *   rht_ptr() dereferences in a context where the bucket is locked.
  *   rht_ptr_exclusive() dereferences in a context where exclusive
  *            access is guaranteed, such as when destroying the table.
  */
+static inline struct rhash_head *rht_ptr_rcu(
+	struct rhash_lock_head *const *bkt)
+{
+	struct rhash_head __rcu *p = __rht_ptr(bkt);
+
+	return rcu_dereference(p);
+}
+
 static inline struct rhash_head *rht_ptr(
 	struct rhash_lock_head *const *bkt,
 	struct bucket_table *tbl,
 	unsigned int hash)
 {
-	struct rhash_head __rcu *p = __rht_ptr(bkt);
-
-	if (!p)
-		return RHT_NULLS_MARKER(bkt);
-
-	return rht_dereference_bucket_rcu(p, tbl, hash);
+	return rht_dereference_bucket(__rht_ptr(bkt), tbl, hash);
 }
 
 static inline struct rhash_head *rht_ptr_exclusive(
 	struct rhash_lock_head *const *bkt)
 {
-	struct rhash_head __rcu *p = __rht_ptr(bkt);
-
-	if (!p)
-		return RHT_NULLS_MARKER(bkt);
-
-	return rcu_dereference_protected(p, 1);
+	return rcu_dereference_protected(__rht_ptr(bkt), 1);
 }
 
 static inline void rht_assign_locked(struct rhash_lock_head **bkt,
@@ -509,7 +510,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  */
 #define rht_for_each_rcu(pos, tbl, hash)			\
 	for (({barrier(); }),					\
-	     pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash);	\
+	     pos = rht_ptr_rcu(rht_bucket(tbl, hash));		\
 	     !rht_is_a_nulls(pos);				\
 	     pos = rcu_dereference_raw(pos->next))
 
@@ -546,8 +547,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  */
 #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		   \
 	rht_for_each_entry_rcu_from(tpos, pos,				   \
-				    rht_ptr(rht_bucket(tbl, hash),	   \
-					    tbl, hash),			   \
+				    rht_ptr_rcu(rht_bucket(tbl, hash)),	   \
 				    tbl, hash, member)
 
 /**
@@ -603,7 +603,7 @@ restart:
 	hash = rht_key_hashfn(ht, tbl, key, params);
 	bkt = rht_bucket(tbl, hash);
 	do {
-		rht_for_each_rcu_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+		rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
 			if (params.obj_cmpfn ?
 			    params.obj_cmpfn(&arg, rht_obj(ht, he)) :
 			    rhashtable_compare(&arg, rht_obj(ht, he)))
-- 
cgit v1.2.3


From 24ec483cec981618f8a4782a36d1e3f319d42cad Mon Sep 17 00:00:00 2001
From: Kevin 'ldir' Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
Date: Tue, 28 May 2019 17:03:50 +0000
Subject: net: sched: Introduce act_ctinfo action
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ctinfo is a new tc filter action module.  It is designed to restore
information contained in firewall conntrack marks to other packet fields
and is typically used on packet ingress paths.  At present it has two
independent sub-functions or operating modes, DSCP restoration mode &
skb mark restoration mode.

The DSCP restore mode:

This mode copies DSCP values that have been placed in the firewall
conntrack mark back into the IPv4/v6 diffserv fields of relevant
packets.

The DSCP restoration is intended for use and has been found useful for
restoring ingress classifications based on egress classifications across
links that bleach or otherwise change DSCP, typically home ISP Internet
links.  Restoring DSCP on ingress on the WAN link allows qdiscs such as
but by no means limited to CAKE to shape inbound packets according to
policies that are easier to set & mark on egress.

Ingress classification is traditionally a challenging task since
iptables rules haven't yet run and tc filter/eBPF programs are pre-NAT
lookups, hence are unable to see internal IPv4 addresses as used on the
typical home masquerading gateway.  Thus marking the connection in some
manner on egress for later restoration of classification on ingress is
easier to implement.

Parameters related to DSCP restore mode:

dscpmask - a 32 bit mask of 6 contiguous bits indicating which bits of
the conntrack mark field contain the DSCP value to be restored.

statemask - a 32 bit mask of (usually) 1 bit length, outside the area
specified by dscpmask.  This represents a conditional operation flag
whereby the DSCP is only restored if the flag is set.  This is useful to
implement a 'one shot' iptables based classification where the
'complicated' iptables rules are only run once to classify the
connection on initial (egress) packet and subsequent packets are all
marked/restored with the same DSCP.  A mask of zero disables the
conditional behaviour ie. the conntrack mark DSCP bits are always
restored to the ip diffserv field (assuming the conntrack entry is found
& the skb is an ipv4/ipv6 type)

e.g. dscpmask 0xfc000000 statemask 0x01000000

|----0xFC----conntrack mark----000000---|
| Bits 31-26 | bit 25 | bit24 |~~~ Bit 0|
| DSCP       | unused | flag  |unused   |
|-----------------------0x01---000000---|
      |                   |
      |                   |
      ---|             Conditional flag
         v             only restore if set
|-ip diffserv-|
| 6 bits      |
|-------------|

The skb mark restore mode (cpmark):

This mode copies the firewall conntrack mark to the skb's mark field.
It is completely the functional equivalent of the existing act_connmark
action with the additional feature of being able to apply a mask to the
restored value.

Parameters related to skb mark restore mode:

mask - a 32 bit mask applied to the firewall conntrack mark to mask out
bits unwanted for restoration.  This can be useful where the conntrack
mark is being used for different purposes by different applications.  If
not specified, the whole mark field is copied by default (i.e. a
default mask of 0xffffffff).

e.g. mask 0x00ffffff to mask out the top 8 bits being used by the
aforementioned DSCP restore mode.

|----0x00----conntrack mark----ffffff---|
| Bits 31-24 |                          |
| DSCP & flag|      some value here     |
|---------------------------------------|
			|
			|
			v
|------------skb mark-------------------|
|            |                          |
|  zeroed    |                          |
|---------------------------------------|

Overall parameters:

zone - conntrack zone

control - action related control (reclassify | pipe | drop | continue |
ok | goto chain <CHAIN_INDEX>)

Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_ctinfo.h            |  28 +++
 include/uapi/linux/pkt_cls.h              |   1 +
 include/uapi/linux/tc_act/tc_ctinfo.h     |  34 +++
 net/sched/Kconfig                         |  17 ++
 net/sched/Makefile                        |   1 +
 net/sched/act_ctinfo.c                    | 396 ++++++++++++++++++++++++++++++
 tools/testing/selftests/tc-testing/config |   1 +
 7 files changed, 478 insertions(+)
 create mode 100644 include/net/tc_act/tc_ctinfo.h
 create mode 100644 include/uapi/linux/tc_act/tc_ctinfo.h
 create mode 100644 net/sched/act_ctinfo.c

(limited to 'include')

diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h
new file mode 100644
index 000000000000..d6a688571672
--- /dev/null
+++ b/include/net/tc_act/tc_ctinfo.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_TC_CTINFO_H
+#define __NET_TC_CTINFO_H
+
+#include <net/act_api.h>
+
+struct tcf_ctinfo_params {
+	struct rcu_head rcu;
+	struct net *net;
+	u32 dscpmask;
+	u32 dscpstatemask;
+	u32 cpmarkmask;
+	u16 zone;
+	u8 mode;
+	u8 dscpmaskshift;
+};
+
+struct tcf_ctinfo {
+	struct tc_action common;
+	struct tcf_ctinfo_params __rcu *params;
+	u64 stats_dscp_set;
+	u64 stats_dscp_error;
+	u64 stats_cpmark_set;
+};
+
+#define to_ctinfo(a) ((struct tcf_ctinfo *)a)
+
+#endif /* __NET_TC_CTINFO_H */
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 51a0496f78ea..a93680fc4bfa 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -105,6 +105,7 @@ enum tca_id {
 	TCA_ID_IFE = TCA_ACT_IFE,
 	TCA_ID_SAMPLE = TCA_ACT_SAMPLE,
 	/* other actions go here */
+	TCA_ID_CTINFO,
 	__TCA_ID_MAX = 255
 };
 
diff --git a/include/uapi/linux/tc_act/tc_ctinfo.h b/include/uapi/linux/tc_act/tc_ctinfo.h
new file mode 100644
index 000000000000..da803e05a89b
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_ctinfo.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __UAPI_TC_CTINFO_H
+#define __UAPI_TC_CTINFO_H
+
+#include <linux/types.h>
+#include <linux/pkt_cls.h>
+
+struct tc_ctinfo {
+	tc_gen;
+};
+
+enum {
+	TCA_CTINFO_UNSPEC,
+	TCA_CTINFO_PAD,
+	TCA_CTINFO_TM,
+	TCA_CTINFO_ACT,
+	TCA_CTINFO_ZONE,
+	TCA_CTINFO_PARMS_DSCP_MASK,
+	TCA_CTINFO_PARMS_DSCP_STATEMASK,
+	TCA_CTINFO_PARMS_CPMARK_MASK,
+	TCA_CTINFO_STATS_DSCP_SET,
+	TCA_CTINFO_STATS_DSCP_ERROR,
+	TCA_CTINFO_STATS_CPMARK_SET,
+	__TCA_CTINFO_MAX
+};
+
+#define TCA_CTINFO_MAX (__TCA_CTINFO_MAX - 1)
+
+enum {
+	CTINFO_MODE_DSCP	= (1UL << 0),	/* DSCP restore mode */
+	CTINFO_MODE_CPMARK	= (1UL << 1)	/* skb mark restore mode */
+};
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2c72d95c3050..d104f7ee26c7 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -877,6 +877,23 @@ config NET_ACT_CONNMARK
 	  To compile this code as a module, choose M here: the
 	  module will be called act_connmark.
 
+config NET_ACT_CTINFO
+        tristate "Netfilter Connection Mark Actions"
+        depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+        depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+        help
+	  Say Y here to allow transfer of information stored in a connmark.
+	  Current actions transfer connmark stored DSCP into
+	  ipv4/v6 diffserv and/or transfer connmark to packet
+	  mark.  Both are useful for restoring egress based marks
+	  back onto ingress connections for qdisc priority mapping
+	  purposes.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_ctinfo.
+
 config NET_ACT_SKBMOD
         tristate "skb data modification action"
         depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8a40431d7b5c..d54bfcbd7981 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
 obj-$(CONFIG_NET_ACT_VLAN)	+= act_vlan.o
 obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
 obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
+obj-$(CONFIG_NET_ACT_CTINFO)	+= act_ctinfo.o
 obj-$(CONFIG_NET_ACT_SKBMOD)	+= act_skbmod.o
 obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
 obj-$(CONFIG_NET_IFE_SKBMARK)	+= act_meta_mark.o
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
new file mode 100644
index 000000000000..926109139a81
--- /dev/null
+++ b/net/sched/act_ctinfo.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* net/sched/act_ctinfo.c  netfilter ctinfo connmark actions
+ *
+ * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <uapi/linux/tc_act/tc_ctinfo.h>
+#include <net/tc_act/tc_ctinfo.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static struct tc_action_ops act_ctinfo_ops;
+static unsigned int ctinfo_net_id;
+
+static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
+				struct tcf_ctinfo_params *cp,
+				struct sk_buff *skb, int wlen, int proto)
+{
+	u8 dscp, newdscp;
+
+	newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) &
+		     ~INET_ECN_MASK;
+
+	switch (proto) {
+	case NFPROTO_IPV4:
+		dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK;
+		if (dscp != newdscp) {
+			if (likely(!skb_try_make_writable(skb, wlen))) {
+				ipv4_change_dsfield(ip_hdr(skb),
+						    INET_ECN_MASK,
+						    newdscp);
+				ca->stats_dscp_set++;
+			} else {
+				ca->stats_dscp_error++;
+			}
+		}
+		break;
+	case NFPROTO_IPV6:
+		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK;
+		if (dscp != newdscp) {
+			if (likely(!skb_try_make_writable(skb, wlen))) {
+				ipv6_change_dsfield(ipv6_hdr(skb),
+						    INET_ECN_MASK,
+						    newdscp);
+				ca->stats_dscp_set++;
+			} else {
+				ca->stats_dscp_error++;
+			}
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
+				  struct tcf_ctinfo_params *cp,
+				  struct sk_buff *skb)
+{
+	ca->stats_cpmark_set++;
+	skb->mark = ct->mark & cp->cpmarkmask;
+}
+
+static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	const struct nf_conntrack_tuple_hash *thash = NULL;
+	struct tcf_ctinfo *ca = to_ctinfo(a);
+	struct nf_conntrack_tuple tuple;
+	struct nf_conntrack_zone zone;
+	enum ip_conntrack_info ctinfo;
+	struct tcf_ctinfo_params *cp;
+	struct nf_conn *ct;
+	int proto, wlen;
+	int action;
+
+	cp = rcu_dereference_bh(ca->params);
+
+	tcf_lastuse_update(&ca->tcf_tm);
+	bstats_update(&ca->tcf_bstats, skb);
+	action = READ_ONCE(ca->tcf_action);
+
+	wlen = skb_network_offset(skb);
+	if (tc_skb_protocol(skb) == htons(ETH_P_IP)) {
+		wlen += sizeof(struct iphdr);
+		if (!pskb_may_pull(skb, wlen))
+			goto out;
+
+		proto = NFPROTO_IPV4;
+	} else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) {
+		wlen += sizeof(struct ipv6hdr);
+		if (!pskb_may_pull(skb, wlen))
+			goto out;
+
+		proto = NFPROTO_IPV6;
+	} else {
+		goto out;
+	}
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct) { /* look harder, usually ingress */
+		if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+				       proto, cp->net, &tuple))
+			goto out;
+		zone.id = cp->zone;
+		zone.dir = NF_CT_DEFAULT_ZONE_DIR;
+
+		thash = nf_conntrack_find_get(cp->net, &zone, &tuple);
+		if (!thash)
+			goto out;
+
+		ct = nf_ct_tuplehash_to_ctrack(thash);
+	}
+
+	if (cp->mode & CTINFO_MODE_DSCP)
+		if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask))
+			tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto);
+
+	if (cp->mode & CTINFO_MODE_CPMARK)
+		tcf_ctinfo_cpmark_set(ct, ca, cp, skb);
+
+	if (thash)
+		nf_ct_put(ct);
+out:
+	return action;
+}
+
+static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
+	[TCA_CTINFO_ACT]		  = { .len = sizeof(struct
+							    tc_ctinfo) },
+	[TCA_CTINFO_ZONE]		  = { .type = NLA_U16 },
+	[TCA_CTINFO_PARMS_DSCP_MASK]	  = { .type = NLA_U32 },
+	[TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 },
+	[TCA_CTINFO_PARMS_CPMARK_MASK]	  = { .type = NLA_U32 },
+};
+
+static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
+			   struct nlattr *est, struct tc_action **a,
+			   int ovr, int bind, bool rtnl_held,
+			   struct tcf_proto *tp,
+			   struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+	struct nlattr *tb[TCA_CTINFO_MAX + 1];
+	struct tcf_ctinfo_params *cp_new;
+	struct tcf_chain *goto_ch = NULL;
+	u32 dscpmask = 0, dscpstatemask;
+	struct tc_ctinfo *actparm;
+	struct tcf_ctinfo *ci;
+	u8 dscpmaskshift;
+	int ret = 0, err;
+
+	if (!nla)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_CTINFO_ACT])
+		return -EINVAL;
+	actparm = nla_data(tb[TCA_CTINFO_ACT]);
+
+	/* do some basic validation here before dynamically allocating things */
+	/* that we would otherwise have to clean up.			      */
+	if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) {
+		dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]);
+		/* need contiguous 6 bit mask */
+		dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0;
+		if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f)
+			return -EINVAL;
+		dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ?
+			nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0;
+		/* mask & statemask must not overlap */
+		if (dscpmask & dscpstatemask)
+			return -EINVAL;
+	}
+
+	/* done the validation:now to the actual action allocation */
+	err = tcf_idr_check_alloc(tn, &actparm->index, a, bind);
+	if (!err) {
+		ret = tcf_idr_create(tn, actparm->index, est, a,
+				     &act_ctinfo_ops, bind, false);
+		if (ret) {
+			tcf_idr_cleanup(tn, actparm->index);
+			return ret;
+		}
+	} else if (err > 0) {
+		if (bind) /* don't override defaults */
+			return 0;
+		if (!ovr) {
+			tcf_idr_release(*a, bind);
+			return -EEXIST;
+		}
+	} else {
+		return err;
+	}
+
+	err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack);
+	if (err < 0)
+		goto release_idr;
+
+	ci = to_ctinfo(*a);
+
+	cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL);
+	if (unlikely(!cp_new)) {
+		err = -ENOMEM;
+		goto put_chain;
+	}
+
+	cp_new->net = net;
+	cp_new->zone = tb[TCA_CTINFO_ZONE] ?
+			nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0;
+	if (dscpmask) {
+		cp_new->dscpmask = dscpmask;
+		cp_new->dscpmaskshift = dscpmaskshift;
+		cp_new->dscpstatemask = dscpstatemask;
+		cp_new->mode |= CTINFO_MODE_DSCP;
+	} else {
+		cp_new->mode &= ~CTINFO_MODE_DSCP;
+	}
+
+	if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) {
+		cp_new->cpmarkmask =
+				nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]);
+		cp_new->mode |= CTINFO_MODE_CPMARK;
+	} else {
+		cp_new->mode &= ~CTINFO_MODE_CPMARK;
+	}
+
+	spin_lock_bh(&ci->tcf_lock);
+	goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
+	rcu_swap_protected(ci->params, cp_new,
+			   lockdep_is_held(&ci->tcf_lock));
+	spin_unlock_bh(&ci->tcf_lock);
+
+	if (goto_ch)
+		tcf_chain_put_by_act(goto_ch);
+	if (cp_new)
+		kfree_rcu(cp_new, rcu);
+
+	if (ret == ACT_P_CREATED)
+		tcf_idr_insert(tn, *a);
+
+	return ret;
+
+put_chain:
+	if (goto_ch)
+		tcf_chain_put_by_act(goto_ch);
+release_idr:
+	tcf_idr_release(*a, bind);
+	return err;
+}
+
+static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
+			   int bind, int ref)
+{
+	struct tcf_ctinfo *ci = to_ctinfo(a);
+	struct tc_ctinfo opt = {
+		.index   = ci->tcf_index,
+		.refcnt  = refcount_read(&ci->tcf_refcnt) - ref,
+		.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
+	};
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_ctinfo_params *cp;
+	struct tcf_t t;
+
+	spin_lock_bh(&ci->tcf_lock);
+	cp = rcu_dereference_protected(ci->params,
+				       lockdep_is_held(&ci->tcf_lock));
+
+	tcf_tm_dump(&t, &ci->tcf_tm);
+	if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD))
+		goto nla_put_failure;
+
+	opt.action = ci->tcf_action;
+	if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone))
+		goto nla_put_failure;
+
+	if (cp->mode & CTINFO_MODE_DSCP) {
+		if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK,
+				cp->dscpmask))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK,
+				cp->dscpstatemask))
+			goto nla_put_failure;
+	}
+
+	if (cp->mode & CTINFO_MODE_CPMARK) {
+		if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK,
+				cp->cpmarkmask))
+			goto nla_put_failure;
+	}
+
+	if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET,
+			      ci->stats_dscp_set, TCA_CTINFO_PAD))
+		goto nla_put_failure;
+
+	if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR,
+			      ci->stats_dscp_error, TCA_CTINFO_PAD))
+		goto nla_put_failure;
+
+	if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET,
+			      ci->stats_cpmark_set, TCA_CTINFO_PAD))
+		goto nla_put_failure;
+
+	spin_unlock_bh(&ci->tcf_lock);
+	return skb->len;
+
+nla_put_failure:
+	spin_unlock_bh(&ci->tcf_lock);
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int tcf_ctinfo_walker(struct net *net, struct sk_buff *skb,
+			     struct netlink_callback *cb, int type,
+			     const struct tc_action_ops *ops,
+			     struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+
+	return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ctinfo_ops = {
+	.kind	= "ctinfo",
+	.id	= TCA_ID_CTINFO,
+	.owner	= THIS_MODULE,
+	.act	= tcf_ctinfo_act,
+	.dump	= tcf_ctinfo_dump,
+	.init	= tcf_ctinfo_init,
+	.walk	= tcf_ctinfo_walker,
+	.lookup	= tcf_ctinfo_search,
+	.size	= sizeof(struct tcf_ctinfo),
+};
+
+static __net_init int ctinfo_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+
+	return tc_action_net_init(tn, &act_ctinfo_ops);
+}
+
+static void __net_exit ctinfo_exit_net(struct list_head *net_list)
+{
+	tc_action_net_exit(net_list, ctinfo_net_id);
+}
+
+static struct pernet_operations ctinfo_net_ops = {
+	.init		= ctinfo_init_net,
+	.exit_batch	= ctinfo_exit_net,
+	.id		= &ctinfo_net_id,
+	.size		= sizeof(struct tc_action_net),
+};
+
+static int __init ctinfo_init_module(void)
+{
+	return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops);
+}
+
+static void __exit ctinfo_cleanup_module(void)
+{
+	tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops);
+}
+
+module_init(ctinfo_init_module);
+module_exit(ctinfo_cleanup_module);
+MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>");
+MODULE_DESCRIPTION("Connection tracking mark actions");
+MODULE_LICENSE("GPL");
diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config
index 203302065458..b235efd55367 100644
--- a/tools/testing/selftests/tc-testing/config
+++ b/tools/testing/selftests/tc-testing/config
@@ -38,6 +38,7 @@ CONFIG_NET_ACT_CSUM=m
 CONFIG_NET_ACT_VLAN=m
 CONFIG_NET_ACT_BPF=m
 CONFIG_NET_ACT_CONNMARK=m
+CONFIG_NET_ACT_CTINFO=m
 CONFIG_NET_ACT_SKBMOD=m
 CONFIG_NET_ACT_IFE=m
 CONFIG_NET_ACT_TUNNEL_KEY=m
-- 
cgit v1.2.3


From 44cc27e43fa3b8977373915a8e7f515a9d263343 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Tue, 28 May 2019 20:38:12 +0300
Subject: net: phylink: Add struct phylink_config to PHYLINK API

The phylink_config structure will encapsulate a pointer to a struct
device and the operation type requested for this instance of PHYLINK.
This patch does not make any functional changes, it just transitions the
PHYLINK internals and all its users to the new API.

A pointer to a phylink_config structure will be passed to
phylink_create() instead of the net_device directly. Also, the same
phylink_config pointer will be passed back to all phylink_mac_ops
callbacks instead of the net_device. Using this mechanism, a PHYLINK
user can get the original net_device using a construct such as
'to_net_dev(config->dev)', or get the structure containing the
phylink_config directly using a container_of call.

At the moment, only the PHYLINK_NETDEV is defined as a valid operation
type for PHYLINK. In this mode, a valid reference to a struct device
linked to the original net_device should be passed to PHYLINK through
the phylink_config structure.

This API change is mainly driven by the necessity of adding a new
operation type in PHYLINK that disconnects the phy_device from the
net_device and also works when the net_device is lacking.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/sfp-phylink.rst        |  5 ++-
 drivers/net/ethernet/marvell/mvneta.c           | 36 ++++++++++------
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h      |  1 +
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 43 +++++++++++--------
 drivers/net/phy/phylink.c                       | 26 ++++++++----
 include/linux/phylink.h                         | 56 ++++++++++++++++---------
 include/net/dsa.h                               |  2 +
 net/dsa/slave.c                                 | 31 ++++++++------
 8 files changed, 128 insertions(+), 72 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst
index 5bd26cb07244..91446b431b70 100644
--- a/Documentation/networking/sfp-phylink.rst
+++ b/Documentation/networking/sfp-phylink.rst
@@ -98,6 +98,7 @@ this documentation.
 4. Add::
 
 	struct phylink *phylink;
+	struct phylink_config phylink_config;
 
    to the driver's private data structure.  We shall refer to the
    driver's private data pointer as ``priv`` below, and the driver's
@@ -223,8 +224,10 @@ this documentation.
    .. code-block:: c
 
 	struct phylink *phylink;
+	priv->phylink_config.dev = &dev->dev;
+	priv->phylink_config.type = PHYLINK_NETDEV;
 
-	phylink = phylink_create(dev, node, phy_mode, &phylink_ops);
+	phylink = phylink_create(&priv->phylink_config, node, phy_mode, &phylink_ops);
 	if (IS_ERR(phylink)) {
 		err = PTR_ERR(phylink);
 		fail probe;
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index e758650b2c26..adbbcdde73e6 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -437,6 +437,7 @@ struct mvneta_port {
 	struct device_node *dn;
 	unsigned int tx_csum_limit;
 	struct phylink *phylink;
+	struct phylink_config phylink_config;
 	struct phy *comphy;
 
 	struct mvneta_bm *bm_priv;
@@ -3356,9 +3357,11 @@ static int mvneta_set_mac_addr(struct net_device *dev, void *addr)
 	return 0;
 }
 
-static void mvneta_validate(struct net_device *ndev, unsigned long *supported,
+static void mvneta_validate(struct phylink_config *config,
+			    unsigned long *supported,
 			    struct phylink_link_state *state)
 {
+	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
 
@@ -3408,9 +3411,10 @@ static void mvneta_validate(struct net_device *ndev, unsigned long *supported,
 	phylink_helper_basex_speed(state);
 }
 
-static int mvneta_mac_link_state(struct net_device *ndev,
+static int mvneta_mac_link_state(struct phylink_config *config,
 				 struct phylink_link_state *state)
 {
+	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
 	u32 gmac_stat;
 
@@ -3438,8 +3442,9 @@ static int mvneta_mac_link_state(struct net_device *ndev,
 	return 1;
 }
 
-static void mvneta_mac_an_restart(struct net_device *ndev)
+static void mvneta_mac_an_restart(struct phylink_config *config)
 {
+	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
 	u32 gmac_an = mvreg_read(pp, MVNETA_GMAC_AUTONEG_CONFIG);
 
@@ -3449,9 +3454,10 @@ static void mvneta_mac_an_restart(struct net_device *ndev)
 		    gmac_an & ~MVNETA_GMAC_INBAND_RESTART_AN);
 }
 
-static void mvneta_mac_config(struct net_device *ndev, unsigned int mode,
-	const struct phylink_link_state *state)
+static void mvneta_mac_config(struct phylink_config *config, unsigned int mode,
+			      const struct phylink_link_state *state)
 {
+	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
 	u32 new_ctrl0, gmac_ctrl0 = mvreg_read(pp, MVNETA_GMAC_CTRL_0);
 	u32 new_ctrl2, gmac_ctrl2 = mvreg_read(pp, MVNETA_GMAC_CTRL_2);
@@ -3581,9 +3587,10 @@ static void mvneta_set_eee(struct mvneta_port *pp, bool enable)
 	mvreg_write(pp, MVNETA_LPI_CTRL_1, lpi_ctl1);
 }
 
-static void mvneta_mac_link_down(struct net_device *ndev, unsigned int mode,
-				 phy_interface_t interface)
+static void mvneta_mac_link_down(struct phylink_config *config,
+				 unsigned int mode, phy_interface_t interface)
 {
+	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
 	u32 val;
 
@@ -3600,10 +3607,11 @@ static void mvneta_mac_link_down(struct net_device *ndev, unsigned int mode,
 	mvneta_set_eee(pp, false);
 }
 
-static void mvneta_mac_link_up(struct net_device *ndev, unsigned int mode,
+static void mvneta_mac_link_up(struct phylink_config *config, unsigned int mode,
 			       phy_interface_t interface,
 			       struct phy_device *phy)
 {
+	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
 	u32 val;
 
@@ -4500,8 +4508,14 @@ static int mvneta_probe(struct platform_device *pdev)
 		comphy = NULL;
 	}
 
-	phylink = phylink_create(dev, pdev->dev.fwnode, phy_mode,
-				 &mvneta_phylink_ops);
+	pp = netdev_priv(dev);
+	spin_lock_init(&pp->lock);
+
+	pp->phylink_config.dev = &dev->dev;
+	pp->phylink_config.type = PHYLINK_NETDEV;
+
+	phylink = phylink_create(&pp->phylink_config, pdev->dev.fwnode,
+				 phy_mode, &mvneta_phylink_ops);
 	if (IS_ERR(phylink)) {
 		err = PTR_ERR(phylink);
 		goto err_free_irq;
@@ -4513,8 +4527,6 @@ static int mvneta_probe(struct platform_device *pdev)
 
 	dev->ethtool_ops = &mvneta_eth_tool_ops;
 
-	pp = netdev_priv(dev);
-	spin_lock_init(&pp->lock);
 	pp->phylink = phylink;
 	pp->comphy = comphy;
 	pp->phy_interface = phy_mode;
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
index 18ae8d06b692..d67c970f02e5 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
@@ -915,6 +915,7 @@ struct mvpp2_port {
 
 	phy_interface_t phy_interface;
 	struct phylink *phylink;
+	struct phylink_config phylink_config;
 	struct phy *comphy;
 
 	struct mvpp2_bm_pool *pool_long;
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 3ed713b8dea5..757f8e31645e 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -56,9 +56,9 @@ static struct {
 /* The prototype is added here to be used in start_dev when using ACPI. This
  * will be removed once phylink is used for all modes (dt+ACPI).
  */
-static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
+static void mvpp2_mac_config(struct phylink_config *config, unsigned int mode,
 			     const struct phylink_link_state *state);
-static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode,
+static void mvpp2_mac_link_up(struct phylink_config *config, unsigned int mode,
 			      phy_interface_t interface, struct phy_device *phy);
 
 /* Queue modes */
@@ -3239,9 +3239,9 @@ static void mvpp2_start_dev(struct mvpp2_port *port)
 		struct phylink_link_state state = {
 			.interface = port->phy_interface,
 		};
-		mvpp2_mac_config(port->dev, MLO_AN_INBAND, &state);
-		mvpp2_mac_link_up(port->dev, MLO_AN_INBAND, port->phy_interface,
-				  NULL);
+		mvpp2_mac_config(&port->phylink_config, MLO_AN_INBAND, &state);
+		mvpp2_mac_link_up(&port->phylink_config, MLO_AN_INBAND,
+				  port->phy_interface, NULL);
 	}
 
 	netif_tx_start_all_queues(port->dev);
@@ -4463,11 +4463,12 @@ static void mvpp2_port_copy_mac_addr(struct net_device *dev, struct mvpp2 *priv,
 	eth_hw_addr_random(dev);
 }
 
-static void mvpp2_phylink_validate(struct net_device *dev,
+static void mvpp2_phylink_validate(struct phylink_config *config,
 				   unsigned long *supported,
 				   struct phylink_link_state *state)
 {
-	struct mvpp2_port *port = netdev_priv(dev);
+	struct mvpp2_port *port = container_of(config, struct mvpp2_port,
+					       phylink_config);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
 
 	/* Invalid combinations */
@@ -4591,10 +4592,11 @@ static void mvpp2_gmac_link_state(struct mvpp2_port *port,
 		state->pause |= MLO_PAUSE_TX;
 }
 
-static int mvpp2_phylink_mac_link_state(struct net_device *dev,
+static int mvpp2_phylink_mac_link_state(struct phylink_config *config,
 					struct phylink_link_state *state)
 {
-	struct mvpp2_port *port = netdev_priv(dev);
+	struct mvpp2_port *port = container_of(config, struct mvpp2_port,
+					       phylink_config);
 
 	if (port->priv->hw_version == MVPP22 && port->gop_id == 0) {
 		u32 mode = readl(port->base + MVPP22_XLG_CTRL3_REG);
@@ -4610,9 +4612,10 @@ static int mvpp2_phylink_mac_link_state(struct net_device *dev,
 	return 1;
 }
 
-static void mvpp2_mac_an_restart(struct net_device *dev)
+static void mvpp2_mac_an_restart(struct phylink_config *config)
 {
-	struct mvpp2_port *port = netdev_priv(dev);
+	struct mvpp2_port *port = container_of(config, struct mvpp2_port,
+					       phylink_config);
 	u32 val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 
 	writel(val | MVPP2_GMAC_IN_BAND_RESTART_AN,
@@ -4797,9 +4800,10 @@ static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
 	}
 }
 
-static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
+static void mvpp2_mac_config(struct phylink_config *config, unsigned int mode,
 			     const struct phylink_link_state *state)
 {
+	struct net_device *dev = to_net_dev(config->dev);
 	struct mvpp2_port *port = netdev_priv(dev);
 	bool change_interface = port->phy_interface != state->interface;
 
@@ -4839,9 +4843,10 @@ static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
 	mvpp2_port_enable(port);
 }
 
-static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode,
+static void mvpp2_mac_link_up(struct phylink_config *config, unsigned int mode,
 			      phy_interface_t interface, struct phy_device *phy)
 {
+	struct net_device *dev = to_net_dev(config->dev);
 	struct mvpp2_port *port = netdev_priv(dev);
 	u32 val;
 
@@ -4866,9 +4871,10 @@ static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode,
 	netif_tx_wake_all_queues(dev);
 }
 
-static void mvpp2_mac_link_down(struct net_device *dev, unsigned int mode,
-				phy_interface_t interface)
+static void mvpp2_mac_link_down(struct phylink_config *config,
+				unsigned int mode, phy_interface_t interface)
 {
+	struct net_device *dev = to_net_dev(config->dev);
 	struct mvpp2_port *port = netdev_priv(dev);
 	u32 val;
 
@@ -5125,8 +5131,11 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 
 	/* Phylink isn't used w/ ACPI as of now */
 	if (port_node) {
-		phylink = phylink_create(dev, port_fwnode, phy_mode,
-					 &mvpp2_phylink_ops);
+		port->phylink_config.dev = &dev->dev;
+		port->phylink_config.type = PHYLINK_NETDEV;
+
+		phylink = phylink_create(&port->phylink_config, port_fwnode,
+					 phy_mode, &mvpp2_phylink_ops);
 		if (IS_ERR(phylink)) {
 			err = PTR_ERR(phylink);
 			goto err_free_port_pcpu;
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 83ab83c3edba..5a283bf9d402 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -41,6 +41,7 @@ struct phylink {
 	/* private: */
 	struct net_device *netdev;
 	const struct phylink_mac_ops *ops;
+	struct phylink_config *config;
 
 	unsigned long phylink_disable_state; /* bitmask of disables */
 	struct phy_device *phydev;
@@ -111,7 +112,7 @@ static const char *phylink_an_mode_str(unsigned int mode)
 static int phylink_validate(struct phylink *pl, unsigned long *supported,
 			    struct phylink_link_state *state)
 {
-	pl->ops->validate(pl->netdev, supported, state);
+	pl->ops->validate(pl->config, supported, state);
 
 	return phylink_is_empty_linkmode(supported) ? -EINVAL : 0;
 }
@@ -299,7 +300,7 @@ static void phylink_mac_config(struct phylink *pl,
 		   __ETHTOOL_LINK_MODE_MASK_NBITS, state->advertising,
 		   state->pause, state->link, state->an_enabled);
 
-	pl->ops->mac_config(pl->netdev, pl->link_an_mode, state);
+	pl->ops->mac_config(pl->config, pl->link_an_mode, state);
 }
 
 static void phylink_mac_config_up(struct phylink *pl,
@@ -313,12 +314,11 @@ static void phylink_mac_an_restart(struct phylink *pl)
 {
 	if (pl->link_config.an_enabled &&
 	    phy_interface_mode_is_8023z(pl->link_config.interface))
-		pl->ops->mac_an_restart(pl->netdev);
+		pl->ops->mac_an_restart(pl->config);
 }
 
 static int phylink_get_mac_state(struct phylink *pl, struct phylink_link_state *state)
 {
-	struct net_device *ndev = pl->netdev;
 
 	linkmode_copy(state->advertising, pl->link_config.advertising);
 	linkmode_zero(state->lp_advertising);
@@ -330,7 +330,7 @@ static int phylink_get_mac_state(struct phylink *pl, struct phylink_link_state *
 	state->an_complete = 0;
 	state->link = 1;
 
-	return pl->ops->mac_link_state(ndev, state);
+	return pl->ops->mac_link_state(pl->config, state);
 }
 
 /* The fixed state is... fixed except for the link state,
@@ -400,7 +400,7 @@ static void phylink_mac_link_up(struct phylink *pl,
 {
 	struct net_device *ndev = pl->netdev;
 
-	pl->ops->mac_link_up(ndev, pl->link_an_mode,
+	pl->ops->mac_link_up(pl->config, pl->link_an_mode,
 			     pl->phy_state.interface,
 			     pl->phydev);
 
@@ -418,7 +418,7 @@ static void phylink_mac_link_down(struct phylink *pl)
 	struct net_device *ndev = pl->netdev;
 
 	netif_carrier_off(ndev);
-	pl->ops->mac_link_down(ndev, pl->link_an_mode,
+	pl->ops->mac_link_down(pl->config, pl->link_an_mode,
 			       pl->phy_state.interface);
 	netdev_info(ndev, "Link is Down\n");
 }
@@ -553,7 +553,7 @@ static int phylink_register_sfp(struct phylink *pl,
  * Returns a pointer to a &struct phylink, or an error-pointer value. Users
  * must use IS_ERR() to check for errors from this function.
  */
-struct phylink *phylink_create(struct net_device *ndev,
+struct phylink *phylink_create(struct phylink_config *config,
 			       struct fwnode_handle *fwnode,
 			       phy_interface_t iface,
 			       const struct phylink_mac_ops *ops)
@@ -567,7 +567,15 @@ struct phylink *phylink_create(struct net_device *ndev,
 
 	mutex_init(&pl->state_mutex);
 	INIT_WORK(&pl->resolve, phylink_resolve);
-	pl->netdev = ndev;
+
+	pl->config = config;
+	if (config->type == PHYLINK_NETDEV) {
+		pl->netdev = to_net_dev(config->dev);
+	} else {
+		kfree(pl);
+		return ERR_PTR(-EINVAL);
+	}
+
 	pl->phy_state.interface = iface;
 	pl->link_interface = iface;
 	if (iface == PHY_INTERFACE_MODE_MOCA)
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 6411c624f63a..67f35f07ac4b 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -54,6 +54,20 @@ struct phylink_link_state {
 	unsigned int an_complete:1;
 };
 
+enum phylink_op_type {
+	PHYLINK_NETDEV = 0,
+};
+
+/**
+ * struct phylink_config - PHYLINK configuration structure
+ * @dev: a pointer to a struct device associated with the MAC
+ * @type: operation type of PHYLINK instance
+ */
+struct phylink_config {
+	struct device *dev;
+	enum phylink_op_type type;
+};
+
 /**
  * struct phylink_mac_ops - MAC operations structure.
  * @validate: Validate and update the link configuration.
@@ -66,16 +80,17 @@ struct phylink_link_state {
  * The individual methods are described more fully below.
  */
 struct phylink_mac_ops {
-	void (*validate)(struct net_device *ndev, unsigned long *supported,
+	void (*validate)(struct phylink_config *config,
+			 unsigned long *supported,
 			 struct phylink_link_state *state);
-	int (*mac_link_state)(struct net_device *ndev,
+	int (*mac_link_state)(struct phylink_config *config,
 			      struct phylink_link_state *state);
-	void (*mac_config)(struct net_device *ndev, unsigned int mode,
+	void (*mac_config)(struct phylink_config *config, unsigned int mode,
 			   const struct phylink_link_state *state);
-	void (*mac_an_restart)(struct net_device *ndev);
-	void (*mac_link_down)(struct net_device *ndev, unsigned int mode,
+	void (*mac_an_restart)(struct phylink_config *config);
+	void (*mac_link_down)(struct phylink_config *config, unsigned int mode,
 			      phy_interface_t interface);
-	void (*mac_link_up)(struct net_device *ndev, unsigned int mode,
+	void (*mac_link_up)(struct phylink_config *config, unsigned int mode,
 			    phy_interface_t interface,
 			    struct phy_device *phy);
 };
@@ -83,7 +98,7 @@ struct phylink_mac_ops {
 #if 0 /* For kernel-doc purposes only. */
 /**
  * validate - Validate and update the link configuration
- * @ndev: a pointer to a &struct net_device for the MAC.
+ * @config: a pointer to a &struct phylink_config.
  * @supported: ethtool bitmask for supported link modes.
  * @state: a pointer to a &struct phylink_link_state.
  *
@@ -100,12 +115,12 @@ struct phylink_mac_ops {
  * based on @state->advertising and/or @state->speed and update
  * @state->interface accordingly.
  */
-void validate(struct net_device *ndev, unsigned long *supported,
+void validate(struct phylink_config *config, unsigned long *supported,
 	      struct phylink_link_state *state);
 
 /**
  * mac_link_state() - Read the current link state from the hardware
- * @ndev: a pointer to a &struct net_device for the MAC.
+ * @config: a pointer to a &struct phylink_config.
  * @state: a pointer to a &struct phylink_link_state.
  *
  * Read the current link state from the MAC, reporting the current
@@ -114,12 +129,12 @@ void validate(struct net_device *ndev, unsigned long *supported,
  * negotiation completion state in @state->an_complete, and link
  * up state in @state->link.
  */
-int mac_link_state(struct net_device *ndev,
+int mac_link_state(struct phylink_config *config,
 		   struct phylink_link_state *state);
 
 /**
  * mac_config() - configure the MAC for the selected mode and state
- * @ndev: a pointer to a &struct net_device for the MAC.
+ * @config: a pointer to a &struct phylink_config.
  * @mode: one of %MLO_AN_FIXED, %MLO_AN_PHY, %MLO_AN_INBAND.
  * @state: a pointer to a &struct phylink_link_state.
  *
@@ -157,18 +172,18 @@ int mac_link_state(struct net_device *ndev,
  * down.  This "update" behaviour is critical to avoid bouncing the
  * link up status.
  */
-void mac_config(struct net_device *ndev, unsigned int mode,
+void mac_config(struct phylink_config *config, unsigned int mode,
 		const struct phylink_link_state *state);
 
 /**
  * mac_an_restart() - restart 802.3z BaseX autonegotiation
- * @ndev: a pointer to a &struct net_device for the MAC.
+ * @config: a pointer to a &struct phylink_config.
  */
-void mac_an_restart(struct net_device *ndev);
+void mac_an_restart(struct phylink_config *config);
 
 /**
  * mac_link_down() - take the link down
- * @ndev: a pointer to a &struct net_device for the MAC.
+ * @config: a pointer to a &struct phylink_config.
  * @mode: link autonegotiation mode
  * @interface: link &typedef phy_interface_t mode
  *
@@ -177,12 +192,12 @@ void mac_an_restart(struct net_device *ndev);
  * Energy Efficient Ethernet MAC configuration. Interface type
  * selection must be done in mac_config().
  */
-void mac_link_down(struct net_device *ndev, unsigned int mode,
+void mac_link_down(struct phylink_config *config, unsigned int mode,
 		   phy_interface_t interface);
 
 /**
  * mac_link_up() - allow the link to come up
- * @ndev: a pointer to a &struct net_device for the MAC.
+ * @config: a pointer to a &struct phylink_config.
  * @mode: link autonegotiation mode
  * @interface: link &typedef phy_interface_t mode
  * @phy: any attached phy
@@ -193,13 +208,14 @@ void mac_link_down(struct net_device *ndev, unsigned int mode,
  * phy_init_eee() and perform appropriate MAC configuration for EEE.
  * Interface type selection must be done in mac_config().
  */
-void mac_link_up(struct net_device *ndev, unsigned int mode,
+void mac_link_up(struct phylink_config *config, unsigned int mode,
 		 phy_interface_t interface,
 		 struct phy_device *phy);
 #endif
 
-struct phylink *phylink_create(struct net_device *, struct fwnode_handle *,
-	phy_interface_t iface, const struct phylink_mac_ops *ops);
+struct phylink *phylink_create(struct phylink_config *, struct fwnode_handle *,
+			       phy_interface_t iface,
+			       const struct phylink_mac_ops *ops);
 void phylink_destroy(struct phylink *);
 
 int phylink_connect_phy(struct phylink *, struct phy_device *);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 685294817712..a7f36219904f 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -22,6 +22,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/phy.h>
 #include <linux/platform_data/dsa.h>
+#include <linux/phylink.h>
 #include <net/devlink.h>
 #include <net/switchdev.h>
 
@@ -193,6 +194,7 @@ struct dsa_port {
 	struct net_device	*bridge_dev;
 	struct devlink_port	devlink_port;
 	struct phylink		*pl;
+	struct phylink_config	pl_config;
 
 	struct work_struct	xmit_work;
 	struct sk_buff_head	xmit_queue;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 9892ca1f6859..48e017637d4f 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1164,11 +1164,11 @@ static struct device_type dsa_type = {
 	.name	= "dsa",
 };
 
-static void dsa_slave_phylink_validate(struct net_device *dev,
+static void dsa_slave_phylink_validate(struct phylink_config *config,
 				       unsigned long *supported,
 				       struct phylink_link_state *state)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
 	struct dsa_switch *ds = dp->ds;
 
 	if (!ds->ops->phylink_validate)
@@ -1177,10 +1177,10 @@ static void dsa_slave_phylink_validate(struct net_device *dev,
 	ds->ops->phylink_validate(ds, dp->index, supported, state);
 }
 
-static int dsa_slave_phylink_mac_link_state(struct net_device *dev,
+static int dsa_slave_phylink_mac_link_state(struct phylink_config *config,
 					    struct phylink_link_state *state)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
 	struct dsa_switch *ds = dp->ds;
 
 	/* Only called for SGMII and 802.3z */
@@ -1190,11 +1190,11 @@ static int dsa_slave_phylink_mac_link_state(struct net_device *dev,
 	return ds->ops->phylink_mac_link_state(ds, dp->index, state);
 }
 
-static void dsa_slave_phylink_mac_config(struct net_device *dev,
+static void dsa_slave_phylink_mac_config(struct phylink_config *config,
 					 unsigned int mode,
 					 const struct phylink_link_state *state)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
 	struct dsa_switch *ds = dp->ds;
 
 	if (!ds->ops->phylink_mac_config)
@@ -1203,9 +1203,9 @@ static void dsa_slave_phylink_mac_config(struct net_device *dev,
 	ds->ops->phylink_mac_config(ds, dp->index, mode, state);
 }
 
-static void dsa_slave_phylink_mac_an_restart(struct net_device *dev)
+static void dsa_slave_phylink_mac_an_restart(struct phylink_config *config)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
 	struct dsa_switch *ds = dp->ds;
 
 	if (!ds->ops->phylink_mac_an_restart)
@@ -1214,11 +1214,12 @@ static void dsa_slave_phylink_mac_an_restart(struct net_device *dev)
 	ds->ops->phylink_mac_an_restart(ds, dp->index);
 }
 
-static void dsa_slave_phylink_mac_link_down(struct net_device *dev,
+static void dsa_slave_phylink_mac_link_down(struct phylink_config *config,
 					    unsigned int mode,
 					    phy_interface_t interface)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+	struct net_device *dev = dp->slave;
 	struct dsa_switch *ds = dp->ds;
 
 	if (!ds->ops->phylink_mac_link_down) {
@@ -1230,12 +1231,13 @@ static void dsa_slave_phylink_mac_link_down(struct net_device *dev,
 	ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
 }
 
-static void dsa_slave_phylink_mac_link_up(struct net_device *dev,
+static void dsa_slave_phylink_mac_link_up(struct phylink_config *config,
 					  unsigned int mode,
 					  phy_interface_t interface,
 					  struct phy_device *phydev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+	struct net_device *dev = dp->slave;
 	struct dsa_switch *ds = dp->ds;
 
 	if (!ds->ops->phylink_mac_link_up) {
@@ -1303,7 +1305,10 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev)
 	if (mode < 0)
 		mode = PHY_INTERFACE_MODE_NA;
 
-	dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode,
+	dp->pl_config.dev = &slave_dev->dev;
+	dp->pl_config.type = PHYLINK_NETDEV;
+
+	dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode,
 				&dsa_slave_phylink_mac_ops);
 	if (IS_ERR(dp->pl)) {
 		netdev_err(slave_dev,
-- 
cgit v1.2.3


From 43de61959b999279bafb031c0c9bdf0f6cd1c501 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Tue, 28 May 2019 20:38:13 +0300
Subject: net: phylink: Add PHYLINK_DEV operation type

In the PHYLINK_DEV operation type, the PHYLINK infrastructure can work
without an attached net_device. For printing use cases, a struct
device * should instead be passed to PHYLINK using the phylink_config
structure.

Also, netif_carrier_* calls are guarded by the presence of a valid
net_device. When using the PHYLINK_DEV operation type, we cannot check
link status using the netif_carrier_ok() API, so instead we keep an
internal state of the MAC and call mac_link_{down,up} only when the
link state has changed.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 25 ++++++++++++++++++++-----
 include/linux/phylink.h   |  1 +
 2 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 5a283bf9d402..5f6120f3fa3f 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -42,6 +42,8 @@ struct phylink {
 	struct net_device *netdev;
 	const struct phylink_mac_ops *ops;
 	struct phylink_config *config;
+	struct device *dev;
+	unsigned int old_link_state:1;
 
 	unsigned long phylink_disable_state; /* bitmask of disables */
 	struct phy_device *phydev;
@@ -404,7 +406,8 @@ static void phylink_mac_link_up(struct phylink *pl,
 			     pl->phy_state.interface,
 			     pl->phydev);
 
-	netif_carrier_on(ndev);
+	if (ndev)
+		netif_carrier_on(ndev);
 
 	netdev_info(ndev,
 		    "Link is Up - %s/%s - flow control %s\n",
@@ -417,7 +420,8 @@ static void phylink_mac_link_down(struct phylink *pl)
 {
 	struct net_device *ndev = pl->netdev;
 
-	netif_carrier_off(ndev);
+	if (ndev)
+		netif_carrier_off(ndev);
 	pl->ops->mac_link_down(pl->config, pl->link_an_mode,
 			       pl->phy_state.interface);
 	netdev_info(ndev, "Link is Down\n");
@@ -428,6 +432,7 @@ static void phylink_resolve(struct work_struct *w)
 	struct phylink *pl = container_of(w, struct phylink, resolve);
 	struct phylink_link_state link_state;
 	struct net_device *ndev = pl->netdev;
+	int link_changed;
 
 	mutex_lock(&pl->state_mutex);
 	if (pl->phylink_disable_state) {
@@ -470,7 +475,13 @@ static void phylink_resolve(struct work_struct *w)
 		}
 	}
 
-	if (link_state.link != netif_carrier_ok(ndev)) {
+	if (pl->netdev)
+		link_changed = (link_state.link != netif_carrier_ok(ndev));
+	else
+		link_changed = (link_state.link != pl->old_link_state);
+
+	if (link_changed) {
+		pl->old_link_state = link_state.link;
 		if (!link_state.link)
 			phylink_mac_link_down(pl);
 		else
@@ -571,6 +582,8 @@ struct phylink *phylink_create(struct phylink_config *config,
 	pl->config = config;
 	if (config->type == PHYLINK_NETDEV) {
 		pl->netdev = to_net_dev(config->dev);
+	} else if (config->type == PHYLINK_DEV) {
+		pl->dev = config->dev;
 	} else {
 		kfree(pl);
 		return ERR_PTR(-EINVAL);
@@ -910,7 +923,8 @@ void phylink_start(struct phylink *pl)
 		    phy_modes(pl->link_config.interface));
 
 	/* Always set the carrier off */
-	netif_carrier_off(pl->netdev);
+	if (pl->netdev)
+		netif_carrier_off(pl->netdev);
 
 	/* Apply the link configuration to the MAC when starting. This allows
 	 * a fixed-link to start with the correct parameters, and also
@@ -1255,7 +1269,8 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl,
 		switch (pl->link_an_mode) {
 		case MLO_AN_PHY:
 			/* Silently mark the carrier down, and then trigger a resolve */
-			netif_carrier_off(pl->netdev);
+			if (pl->netdev)
+				netif_carrier_off(pl->netdev);
 			phylink_run_resolve(pl);
 			break;
 
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 67f35f07ac4b..0f6f65bb9d44 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -56,6 +56,7 @@ struct phylink_link_state {
 
 enum phylink_op_type {
 	PHYLINK_NETDEV = 0,
+	PHYLINK_DEV,
 };
 
 /**
-- 
cgit v1.2.3


From 1e91a2e5d827e643cbabad66d133f155a7fcb0de Mon Sep 17 00:00:00 2001
From: Ruslan Babayev <ruslan@babayev.com>
Date: Tue, 28 May 2019 16:02:32 -0700
Subject: i2c: acpi: export i2c_acpi_find_adapter_by_handle

This allows drivers to look up i2c adapters on ACPI-based systems, similar
to of_get_i2c_adapter_by_node() on DT-based systems.

Signed-off-by: Ruslan Babayev <ruslan@babayev.com>
Cc: xe-linux-external@cisco.com
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/i2c/i2c-core-acpi.c | 3 ++-
 include/linux/i2c.h         | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c
index 272800692088..964687534754 100644
--- a/drivers/i2c/i2c-core-acpi.c
+++ b/drivers/i2c/i2c-core-acpi.c
@@ -337,7 +337,7 @@ static int i2c_acpi_find_match_device(struct device *dev, void *data)
 	return ACPI_COMPANION(dev) == data;
 }
 
-static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
+struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
 {
 	struct device *dev;
 
@@ -345,6 +345,7 @@ static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
 			      i2c_acpi_find_match_adapter);
 	return dev ? i2c_verify_adapter(dev) : NULL;
 }
+EXPORT_SYMBOL_GPL(i2c_acpi_find_adapter_by_handle);
 
 static struct i2c_client *i2c_acpi_find_client_by_adev(struct acpi_device *adev)
 {
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 1308126fc384..e982b8913b73 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -14,6 +14,7 @@
 #ifndef _LINUX_I2C_H
 #define _LINUX_I2C_H
 
+#include <linux/acpi.h>		/* for acpi_handle */
 #include <linux/mod_devicetable.h>
 #include <linux/device.h>	/* for struct device */
 #include <linux/sched.h>	/* for completion */
@@ -981,6 +982,7 @@ bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
 u32 i2c_acpi_find_bus_speed(struct device *dev);
 struct i2c_client *i2c_acpi_new_device(struct device *dev, int index,
 				       struct i2c_board_info *info);
+struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle);
 #else
 static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
 					     struct acpi_resource_i2c_serialbus **i2c)
@@ -996,6 +998,10 @@ static inline struct i2c_client *i2c_acpi_new_device(struct device *dev,
 {
 	return NULL;
 }
+static inline struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle)
+{
+	return NULL;
+}
 #endif /* CONFIG_ACPI */
 
 #endif /* _LINUX_I2C_H */
-- 
cgit v1.2.3


From 9092a76d3cf8638467b09bbb4f409094349b2b53 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Wed, 29 May 2019 12:33:57 -0400
Subject: tcp: add backup TFO key infrastructure

We would like to be able to rotate TFO keys while minimizing the number of
client cookies that are rejected. Currently, we have only one key which can
be used to generate and validate cookies, thus if we simply replace this
key clients can easily have cookies rejected upon rotation.

We propose having the ability to have both a primary key and a backup key.
The primary key is used to generate as well as to validate cookies.
The backup is only used to validate cookies. Thus, keys can be rotated as:

1) generate new key
2) add new key as the backup key
3) swap the primary and backup key, thus setting the new key as the primary

We don't simply set the new key as the primary key and move the old key to
the backup slot because the ip may be behind a load balancer and we further
allow for the fact that all machines behind the load balancer will not be
updated simultaneously.

We make use of this infrastructure in subsequent patches.

Suggested-by: Igor Lubashev <ilubashe@akamai.com>
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h          |  41 ++++++++++-
 include/uapi/linux/snmp.h  |   1 +
 net/ipv4/proc.c            |   1 +
 net/ipv4/sysctl_net_ipv4.c |   2 +-
 net/ipv4/tcp.c             |   3 +-
 net/ipv4/tcp_fastopen.c    | 172 +++++++++++++++++++++++++++++++--------------
 6 files changed, 162 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 985aa5db570c..0083a14fb64f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1614,7 +1614,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp);
 void tcp_fastopen_destroy_cipher(struct sock *sk);
 void tcp_fastopen_ctx_destroy(struct net *net);
 int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
-			      void *key, unsigned int len);
+			      void *primary_key, void *backup_key,
+			      unsigned int len);
 void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct request_sock *req,
@@ -1625,11 +1626,14 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 			     struct tcp_fastopen_cookie *cookie);
 bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
 #define TCP_FASTOPEN_KEY_LENGTH 16
+#define TCP_FASTOPEN_KEY_MAX 2
+#define TCP_FASTOPEN_KEY_BUF_LENGTH \
+	(TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX)
 
 /* Fastopen key context */
 struct tcp_fastopen_context {
-	struct crypto_cipher	*tfm;
-	__u8			key[TCP_FASTOPEN_KEY_LENGTH];
+	struct crypto_cipher	*tfm[TCP_FASTOPEN_KEY_MAX];
+	__u8			key[TCP_FASTOPEN_KEY_BUF_LENGTH];
 	struct rcu_head		rcu;
 };
 
@@ -1639,6 +1643,37 @@ bool tcp_fastopen_active_should_disable(struct sock *sk);
 void tcp_fastopen_active_disable_ofo_check(struct sock *sk);
 void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired);
 
+/* Caller needs to wrap with rcu_read_(un)lock() */
+static inline
+struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk)
+{
+	struct tcp_fastopen_context *ctx;
+
+	ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);
+	if (!ctx)
+		ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
+	return ctx;
+}
+
+static inline
+bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc,
+			       const struct tcp_fastopen_cookie *orig)
+{
+	if (orig->len == TCP_FASTOPEN_COOKIE_SIZE &&
+	    orig->len == foc->len &&
+	    !memcmp(orig->val, foc->val, foc->len))
+		return true;
+	return false;
+}
+
+static inline
+int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx)
+{
+	if (ctx->tfm[1])
+		return 2;
+	return 1;
+}
+
 /* Latencies incurred by various limits for a sender. They are
  * chronograph-like stats that are mutually exclusive.
  */
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 86dc24a96c90..74904e9d1b72 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -283,6 +283,7 @@ enum
 	LINUX_MIB_TCPACKCOMPRESSED,		/* TCPAckCompressed */
 	LINUX_MIB_TCPZEROWINDOWDROP,		/* TCPZeroWindowDrop */
 	LINUX_MIB_TCPRCVQDROP,			/* TCPRcvQDrop */
+	LINUX_MIB_TCPFASTOPENPASSIVEALTKEY,	/* TCPFastOpenPassiveAltKey */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b613572c6616..4746f963c439 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -291,6 +291,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
 	SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
 	SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
+	SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 875867b64d6a..72dc8ca98d43 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -318,7 +318,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 		for (i = 0; i < ARRAY_SIZE(user_key); i++)
 			key[i] = cpu_to_le32(user_key[i]);
 
-		tcp_fastopen_reset_cipher(net, NULL, key,
+		tcp_fastopen_reset_cipher(net, NULL, key, NULL,
 					  TCP_FASTOPEN_KEY_LENGTH);
 	}
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 53d61ca3ac4b..bca51a351b0e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2798,7 +2798,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		if (copy_from_user(key, optval, optlen))
 			return -EFAULT;
 
-		return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
+		return tcp_fastopen_reset_cipher(net, sk, key, NULL,
+						 sizeof(key));
 	}
 	default:
 		/* fallthru */
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 3889ad28dd06..8e1580485c9e 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -30,14 +30,20 @@ void tcp_fastopen_init_key_once(struct net *net)
 	 * for a valid cookie, so this is an acceptable risk.
 	 */
 	get_random_bytes(key, sizeof(key));
-	tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key));
+	tcp_fastopen_reset_cipher(net, NULL, key, NULL, sizeof(key));
 }
 
 static void tcp_fastopen_ctx_free(struct rcu_head *head)
 {
 	struct tcp_fastopen_context *ctx =
 	    container_of(head, struct tcp_fastopen_context, rcu);
-	crypto_free_cipher(ctx->tfm);
+	int i;
+
+	/* We own ctx, thus no need to hold the Fastopen-lock */
+	for (i = 0; i < TCP_FASTOPEN_KEY_MAX; i++) {
+		if (ctx->tfm[i])
+			crypto_free_cipher(ctx->tfm[i]);
+	}
 	kfree(ctx);
 }
 
@@ -66,33 +72,54 @@ void tcp_fastopen_ctx_destroy(struct net *net)
 		call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
 }
 
+struct tcp_fastopen_context *tcp_fastopen_alloc_ctx(void *primary_key,
+						    void *backup_key,
+						    unsigned int len)
+{
+	struct tcp_fastopen_context *new_ctx;
+	void *key = primary_key;
+	int err, i;
+
+	new_ctx = kmalloc(sizeof(*new_ctx), GFP_KERNEL);
+	if (!new_ctx)
+		return ERR_PTR(-ENOMEM);
+	for (i = 0; i < TCP_FASTOPEN_KEY_MAX; i++)
+		new_ctx->tfm[i] = NULL;
+	for (i = 0; i < (backup_key ? 2 : 1); i++) {
+		new_ctx->tfm[i] = crypto_alloc_cipher("aes", 0, 0);
+		if (IS_ERR(new_ctx->tfm[i])) {
+			err = PTR_ERR(new_ctx->tfm[i]);
+			new_ctx->tfm[i] = NULL;
+			pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
+			goto out;
+		}
+		err = crypto_cipher_setkey(new_ctx->tfm[i], key, len);
+		if (err) {
+			pr_err("TCP: TFO cipher key error: %d\n", err);
+			goto out;
+		}
+		memcpy(&new_ctx->key[i * TCP_FASTOPEN_KEY_LENGTH], key, len);
+		key = backup_key;
+	}
+	return new_ctx;
+out:
+	tcp_fastopen_ctx_free(&new_ctx->rcu);
+	return ERR_PTR(err);
+}
+
 int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
-			      void *key, unsigned int len)
+			      void *primary_key, void *backup_key,
+			      unsigned int len)
 {
 	struct tcp_fastopen_context *ctx, *octx;
 	struct fastopen_queue *q;
-	int err;
+	int err = 0;
 
-	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
-	if (!ctx)
-		return -ENOMEM;
-	ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
-
-	if (IS_ERR(ctx->tfm)) {
-		err = PTR_ERR(ctx->tfm);
-error:		kfree(ctx);
-		pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
-		return err;
-	}
-	err = crypto_cipher_setkey(ctx->tfm, key, len);
-	if (err) {
-		pr_err("TCP: TFO cipher key error: %d\n", err);
-		crypto_free_cipher(ctx->tfm);
-		goto error;
+	ctx = tcp_fastopen_alloc_ctx(primary_key, backup_key, len);
+	if (IS_ERR(ctx)) {
+		err = PTR_ERR(ctx);
+		goto out;
 	}
-	memcpy(ctx->key, key, len);
-
-
 	spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
 	if (sk) {
 		q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
@@ -108,6 +135,7 @@ error:		kfree(ctx);
 
 	if (octx)
 		call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+out:
 	return err;
 }
 
@@ -151,25 +179,20 @@ static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
  *
  * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
  */
-static bool tcp_fastopen_cookie_gen(struct sock *sk,
+static void tcp_fastopen_cookie_gen(struct sock *sk,
 				    struct request_sock *req,
 				    struct sk_buff *syn,
 				    struct tcp_fastopen_cookie *foc)
 {
 	struct tcp_fastopen_context *ctx;
-	bool ok = false;
 
 	rcu_read_lock();
-	ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);
-	if (!ctx)
-		ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
+	ctx = tcp_fastopen_get_ctx(sk);
 	if (ctx)
-		ok = __tcp_fastopen_cookie_gen_cipher(req, syn, ctx->tfm, foc);
+		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->tfm[0], foc);
 	rcu_read_unlock();
-	return ok;
 }
 
-
 /* If an incoming SYN or SYNACK frame contains a payload and/or FIN,
  * queue this additional data / FIN.
  */
@@ -213,6 +236,35 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
 		tcp_fin(sk);
 }
 
+/* returns 0 - no key match, 1 for primary, 2 for backup */
+static int tcp_fastopen_cookie_gen_check(struct sock *sk,
+					 struct request_sock *req,
+					 struct sk_buff *syn,
+					 struct tcp_fastopen_cookie *orig,
+					 struct tcp_fastopen_cookie *valid_foc)
+{
+	struct tcp_fastopen_cookie search_foc = { .len = -1 };
+	struct tcp_fastopen_cookie *foc = valid_foc;
+	struct tcp_fastopen_context *ctx;
+	int i, ret = 0;
+
+	rcu_read_lock();
+	ctx = tcp_fastopen_get_ctx(sk);
+	if (!ctx)
+		goto out;
+	for (i = 0; i < tcp_fastopen_context_len(ctx); i++) {
+		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->tfm[i], foc);
+		if (tcp_fastopen_cookie_match(foc, orig)) {
+			ret = i + 1;
+			goto out;
+		}
+		foc = &search_foc;
+	}
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
 static struct sock *tcp_fastopen_create_child(struct sock *sk,
 					      struct sk_buff *skb,
 					      struct request_sock *req)
@@ -332,6 +384,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 	int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
 	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
 	struct sock *child;
+	int ret = 0;
 
 	if (foc->len == 0) /* Client requests a cookie */
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
@@ -347,31 +400,44 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 	    tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
 		goto fastopen;
 
-	if (foc->len >= 0 &&  /* Client presents or requests a cookie */
-	    tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) &&
-	    foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
-	    foc->len == valid_foc.len &&
-	    !memcmp(foc->val, valid_foc.val, foc->len)) {
-		/* Cookie is valid. Create a (full) child socket to accept
-		 * the data in SYN before returning a SYN-ACK to ack the
-		 * data. If we fail to create the socket, fall back and
-		 * ack the ISN only but includes the same cookie.
-		 *
-		 * Note: Data-less SYN with valid cookie is allowed to send
-		 * data in SYN_RECV state.
-		 */
+	if (foc->len == 0) {
+		/* Client requests a cookie. */
+		tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc);
+	} else if (foc->len > 0) {
+		ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc,
+						    &valid_foc);
+		if (!ret) {
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+		} else {
+			/* Cookie is valid. Create a (full) child socket to
+			 * accept the data in SYN before returning a SYN-ACK to
+			 * ack the data. If we fail to create the socket, fall
+			 * back and ack the ISN only but includes the same
+			 * cookie.
+			 *
+			 * Note: Data-less SYN with valid cookie is allowed to
+			 * send data in SYN_RECV state.
+			 */
 fastopen:
-		child = tcp_fastopen_create_child(sk, skb, req);
-		if (child) {
-			foc->len = -1;
+			child = tcp_fastopen_create_child(sk, skb, req);
+			if (child) {
+				if (ret == 2) {
+					valid_foc.exp = foc->exp;
+					*foc = valid_foc;
+					NET_INC_STATS(sock_net(sk),
+						      LINUX_MIB_TCPFASTOPENPASSIVEALTKEY);
+				} else {
+					foc->len = -1;
+				}
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPFASTOPENPASSIVE);
+				return child;
+			}
 			NET_INC_STATS(sock_net(sk),
-				      LINUX_MIB_TCPFASTOPENPASSIVE);
-			return child;
+				      LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
 		}
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
-	} else if (foc->len > 0) /* Client presents an invalid cookie */
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
-
+	}
 	valid_foc.exp = foc->exp;
 	*foc = valid_foc;
 	return NULL;
-- 
cgit v1.2.3


From c8b17be0b7a45d707fc202c11d257c25bc3952b8 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 29 May 2019 13:25:31 +0200
Subject: net: ipv4: add skbuff fraglist splitter

This patch adds the skbuff fraglist splitter. This API provides an
iterator to transform the fraglist into single skbuff objects. It
consists of:

* ip_fraglist_init(), that initializes the internal state of the
  fraglist splitter.
* ip_fraglist_prepare(), that restores the IPv4 header on the
  fragments.
* ip_fraglist_next(), that retrieves the fragment from the fraglist and
  updates the internal state of the splitter to point to the next
  fragment skbuff in the fraglist.

The ip_fraglist_iter object stores the internal state of the iterator.

This code has been extracted from ip_do_fragment(). Symbols are also
exported so that the bridge codepath can reuse this iterator to build
its own refragmentation routine on top of the existing codebase.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h     | 23 ++++++++++++++
 net/ipv4/ip_output.c | 88 ++++++++++++++++++++++++++++++++--------------------
 2 files changed, 78 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 2d3cce7c3e8a..be899677504b 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -165,6 +165,29 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		   int (*output)(struct net *, struct sock *, struct sk_buff *));
+
+struct ip_fraglist_iter {
+	struct sk_buff	*frag_list;
+	struct sk_buff	*frag;
+	struct iphdr	*iph;
+	int		offset;
+	unsigned int	hlen;
+};
+
+void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
+		      unsigned int hlen, struct ip_fraglist_iter *iter);
+void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter);
+
+static inline struct sk_buff *ip_fraglist_next(struct ip_fraglist_iter *iter)
+{
+	struct sk_buff *skb = iter->frag;
+
+	iter->frag = skb->next;
+	skb_mark_not_on_list(skb);
+
+	return skb;
+}
+
 void ip_send_check(struct iphdr *ip);
 int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index bfd0ca554977..d03eb4ae0dd4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -561,6 +561,54 @@ static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	return ip_do_fragment(net, sk, skb, output);
 }
 
+void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
+		      unsigned int hlen, struct ip_fraglist_iter *iter)
+{
+	unsigned int first_len = skb_pagelen(skb);
+
+	iter->frag_list = skb_shinfo(skb)->frag_list;
+	iter->frag = iter->frag_list;
+	skb_frag_list_init(skb);
+
+	iter->offset = 0;
+	iter->iph = iph;
+	iter->hlen = hlen;
+
+	skb->data_len = first_len - skb_headlen(skb);
+	skb->len = first_len;
+	iph->tot_len = htons(first_len);
+	iph->frag_off = htons(IP_MF);
+	ip_send_check(iph);
+}
+EXPORT_SYMBOL(ip_fraglist_init);
+
+void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
+{
+	unsigned int hlen = iter->hlen;
+	struct iphdr *iph = iter->iph;
+	struct sk_buff *frag;
+
+	frag = iter->frag;
+	frag->ip_summed = CHECKSUM_NONE;
+	skb_reset_transport_header(frag);
+	__skb_push(frag, hlen);
+	skb_reset_network_header(frag);
+	memcpy(skb_network_header(frag), iph, hlen);
+	iter->iph = ip_hdr(frag);
+	iph = iter->iph;
+	iph->tot_len = htons(frag->len);
+	ip_copy_metadata(frag, skb);
+	if (iter->offset == 0)
+		ip_options_fragment(frag);
+	iter->offset += skb->len - hlen;
+	iph->frag_off = htons(iter->offset >> 3);
+	if (frag->next)
+		iph->frag_off |= htons(IP_MF);
+	/* Ready, complete checksum */
+	ip_send_check(iph);
+}
+EXPORT_SYMBOL(ip_fraglist_prepare);
+
 /*
  *	This IP datagram is too large to be sent in one piece.  Break it up into
  *	smaller pieces (each of size equal to IP header plus
@@ -578,6 +626,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	int offset;
 	__be16 not_last_frag;
 	struct rtable *rt = skb_rtable(skb);
+	struct ip_fraglist_iter iter;
 	int err = 0;
 
 	/* for offloaded checksums cleanup checksum before fragmentation */
@@ -642,49 +691,22 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		}
 
 		/* Everything is OK. Generate! */
-
-		err = 0;
-		offset = 0;
-		frag = skb_shinfo(skb)->frag_list;
-		skb_frag_list_init(skb);
-		skb->data_len = first_len - skb_headlen(skb);
-		skb->len = first_len;
-		iph->tot_len = htons(first_len);
-		iph->frag_off = htons(IP_MF);
-		ip_send_check(iph);
+		ip_fraglist_init(skb, iph, hlen, &iter);
 
 		for (;;) {
 			/* Prepare header of the next frame,
 			 * before previous one went down. */
-			if (frag) {
-				frag->ip_summed = CHECKSUM_NONE;
-				skb_reset_transport_header(frag);
-				__skb_push(frag, hlen);
-				skb_reset_network_header(frag);
-				memcpy(skb_network_header(frag), iph, hlen);
-				iph = ip_hdr(frag);
-				iph->tot_len = htons(frag->len);
-				ip_copy_metadata(frag, skb);
-				if (offset == 0)
-					ip_options_fragment(frag);
-				offset += skb->len - hlen;
-				iph->frag_off = htons(offset>>3);
-				if (frag->next)
-					iph->frag_off |= htons(IP_MF);
-				/* Ready, complete checksum */
-				ip_send_check(iph);
-			}
+			if (iter.frag)
+				ip_fraglist_prepare(skb, &iter);
 
 			err = output(net, sk, skb);
 
 			if (!err)
 				IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
-			if (err || !frag)
+			if (err || !iter.frag)
 				break;
 
-			skb = frag;
-			frag = skb->next;
-			skb_mark_not_on_list(skb);
+			skb = ip_fraglist_next(&iter);
 		}
 
 		if (err == 0) {
@@ -692,7 +714,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 			return 0;
 		}
 
-		kfree_skb_list(frag);
+		kfree_skb_list(iter.frag_list);
 
 		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
 		return err;
-- 
cgit v1.2.3


From 0feca6190f88a1b7c9a9b9cdf41824e3ea4ba02c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 29 May 2019 13:25:32 +0200
Subject: net: ipv6: add skbuff fraglist splitter

This patch adds the skbuff fraglist split iterator. This API provides an
iterator to transform the fraglist into single skbuff objects, it
consists of:

* ip6_fraglist_init(), that initializes the internal state of the
  fraglist iterator.
* ip6_fraglist_prepare(), that restores the IPv6 header on the fragment.
* ip6_fraglist_next(), that retrieves the fragment from the fraglist and
  updates the internal state of the iterator to point to the next
  fragment in the fraglist.

The ip6_fraglist_iter object stores the internal state of the iterator.

This code has been extracted from ip6_fragment(). Symbols are also
exported to allow reusing this iterator from the bridge codepath to
build its own refragmentation routine by reusing the existing codebase.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h    |  25 ++++++++++
 net/ipv6/ip6_output.c | 132 +++++++++++++++++++++++++++++---------------------
 2 files changed, 102 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index daf80863d3a5..acefbc718abe 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -154,6 +154,31 @@ struct frag_hdr {
 #define	IP6_MF		0x0001
 #define	IP6_OFFSET	0xFFF8
 
+struct ip6_fraglist_iter {
+	struct ipv6hdr	*tmp_hdr;
+	struct sk_buff	*frag_list;
+	struct sk_buff	*frag;
+	int		offset;
+	unsigned int	hlen;
+	__be32		frag_id;
+	u8		nexthdr;
+};
+
+int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
+		      u8 nexthdr, __be32 frag_id,
+		      struct ip6_fraglist_iter *iter);
+void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter);
+
+static inline struct sk_buff *ip6_fraglist_next(struct ip6_fraglist_iter *iter)
+{
+	struct sk_buff *skb = iter->frag;
+
+	iter->frag = skb->next;
+	skb_mark_not_on_list(skb);
+
+	return skb;
+}
+
 #define IP6_REPLY_MARK(net, mark) \
 	((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0)
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index adef2236abe2..2567b22a888a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -592,6 +592,73 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	skb_copy_secmark(to, from);
 }
 
+int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
+		      u8 nexthdr, __be32 frag_id,
+		      struct ip6_fraglist_iter *iter)
+{
+	unsigned int first_len;
+	struct frag_hdr *fh;
+
+	/* BUILD HEADER */
+	*prevhdr = NEXTHDR_FRAGMENT;
+	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
+	if (!iter->tmp_hdr)
+		return -ENOMEM;
+
+	iter->frag_list = skb_shinfo(skb)->frag_list;
+	iter->frag = iter->frag_list;
+	skb_frag_list_init(skb);
+
+	iter->offset = 0;
+	iter->hlen = hlen;
+	iter->frag_id = frag_id;
+	iter->nexthdr = nexthdr;
+
+	__skb_pull(skb, hlen);
+	fh = __skb_push(skb, sizeof(struct frag_hdr));
+	__skb_push(skb, hlen);
+	skb_reset_network_header(skb);
+	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
+
+	fh->nexthdr = nexthdr;
+	fh->reserved = 0;
+	fh->frag_off = htons(IP6_MF);
+	fh->identification = frag_id;
+
+	first_len = skb_pagelen(skb);
+	skb->data_len = first_len - skb_headlen(skb);
+	skb->len = first_len;
+	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
+
+	return 0;
+}
+EXPORT_SYMBOL(ip6_fraglist_init);
+
+void ip6_fraglist_prepare(struct sk_buff *skb,
+			  struct ip6_fraglist_iter *iter)
+{
+	struct sk_buff *frag = iter->frag;
+	unsigned int hlen = iter->hlen;
+	struct frag_hdr *fh;
+
+	frag->ip_summed = CHECKSUM_NONE;
+	skb_reset_transport_header(frag);
+	fh = __skb_push(frag, sizeof(struct frag_hdr));
+	__skb_push(frag, hlen);
+	skb_reset_network_header(frag);
+	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
+	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
+	fh->nexthdr = iter->nexthdr;
+	fh->reserved = 0;
+	fh->frag_off = htons(iter->offset);
+	if (frag->next)
+		fh->frag_off |= htons(IP6_MF);
+	fh->identification = iter->frag_id;
+	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
+	ip6_copy_metadata(frag, skb);
+}
+EXPORT_SYMBOL(ip6_fraglist_prepare);
+
 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
 {
@@ -599,7 +666,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 				inet6_sk(skb->sk) : NULL;
-	struct ipv6hdr *tmp_hdr;
 	struct frag_hdr *fh;
 	unsigned int mtu, hlen, left, len, nexthdr_offset;
 	int hroom, troom;
@@ -651,6 +717,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
 	if (skb_has_frag_list(skb)) {
 		unsigned int first_len = skb_pagelen(skb);
+		struct ip6_fraglist_iter iter;
 		struct sk_buff *frag2;
 
 		if (first_len - hlen > mtu ||
@@ -678,74 +745,29 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 			skb->truesize -= frag->truesize;
 		}
 
-		err = 0;
-		offset = 0;
-		/* BUILD HEADER */
-
-		*prevhdr = NEXTHDR_FRAGMENT;
-		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
-		if (!tmp_hdr) {
-			err = -ENOMEM;
+		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+					&iter);
+		if (err < 0)
 			goto fail;
-		}
-		frag = skb_shinfo(skb)->frag_list;
-		skb_frag_list_init(skb);
-
-		__skb_pull(skb, hlen);
-		fh = __skb_push(skb, sizeof(struct frag_hdr));
-		__skb_push(skb, hlen);
-		skb_reset_network_header(skb);
-		memcpy(skb_network_header(skb), tmp_hdr, hlen);
-
-		fh->nexthdr = nexthdr;
-		fh->reserved = 0;
-		fh->frag_off = htons(IP6_MF);
-		fh->identification = frag_id;
-
-		first_len = skb_pagelen(skb);
-		skb->data_len = first_len - skb_headlen(skb);
-		skb->len = first_len;
-		ipv6_hdr(skb)->payload_len = htons(first_len -
-						   sizeof(struct ipv6hdr));
 
 		for (;;) {
 			/* Prepare header of the next frame,
 			 * before previous one went down. */
-			if (frag) {
-				frag->ip_summed = CHECKSUM_NONE;
-				skb_reset_transport_header(frag);
-				fh = __skb_push(frag, sizeof(struct frag_hdr));
-				__skb_push(frag, hlen);
-				skb_reset_network_header(frag);
-				memcpy(skb_network_header(frag), tmp_hdr,
-				       hlen);
-				offset += skb->len - hlen - sizeof(struct frag_hdr);
-				fh->nexthdr = nexthdr;
-				fh->reserved = 0;
-				fh->frag_off = htons(offset);
-				if (frag->next)
-					fh->frag_off |= htons(IP6_MF);
-				fh->identification = frag_id;
-				ipv6_hdr(frag)->payload_len =
-						htons(frag->len -
-						      sizeof(struct ipv6hdr));
-				ip6_copy_metadata(frag, skb);
-			}
+			if (iter.frag)
+				ip6_fraglist_prepare(skb, &iter);
 
 			err = output(net, sk, skb);
 			if (!err)
 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 					      IPSTATS_MIB_FRAGCREATES);
 
-			if (err || !frag)
+			if (err || !iter.frag)
 				break;
 
-			skb = frag;
-			frag = skb->next;
-			skb_mark_not_on_list(skb);
+			skb = ip6_fraglist_next(&iter);
 		}
 
-		kfree(tmp_hdr);
+		kfree(iter.tmp_hdr);
 
 		if (err == 0) {
 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
@@ -753,7 +775,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 			return 0;
 		}
 
-		kfree_skb_list(frag);
+		kfree_skb_list(iter.frag_list);
 
 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 			      IPSTATS_MIB_FRAGFAILS);
-- 
cgit v1.2.3


From 065ff79f8881e6267f4c29abb476d697eb87bfba Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 29 May 2019 13:25:33 +0200
Subject: net: ipv4: split skbuff into fragments transformer

This patch exposes a new API to refragment a skbuff. This allows you to
split either a linear skbuff or to force the refragmentation of an
existing fraglist using a different mtu. The API consists of:

* ip_frag_init(), that initializes the internal state of the transformer.
* ip_frag_next(), that allows you to fetch the next fragment. This function
  internally allocates the skbuff that represents the fragment, it pushes
  the IPv4 header, and it also copies the payload for each fragment.

The ip_frag_state object stores the internal state of the splitter.

This code has been extracted from ip_do_fragment(). Symbols are also
exported to allow reusing this iterator from the bridge codepath to
build its own refragmentation routine by reusing the existing codebase.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h     |  16 +++++
 net/ipv4/ip_output.c | 200 ++++++++++++++++++++++++++++-----------------------
 2 files changed, 128 insertions(+), 88 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index be899677504b..029cc3fd26bd 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -188,6 +188,22 @@ static inline struct sk_buff *ip_fraglist_next(struct ip_fraglist_iter *iter)
 	return skb;
 }
 
+struct ip_frag_state {
+	struct iphdr	*iph;
+	unsigned int	hlen;
+	unsigned int	ll_rs;
+	unsigned int	mtu;
+	unsigned int	left;
+	int		offset;
+	int		ptr;
+	__be16		not_last_frag;
+};
+
+void ip_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int ll_rs,
+		  unsigned int mtu, struct ip_frag_state *state);
+struct sk_buff *ip_frag_next(struct sk_buff *skb,
+			     struct ip_frag_state *state);
+
 void ip_send_check(struct iphdr *ip);
 int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d03eb4ae0dd4..c3f139843eca 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -609,6 +609,111 @@ void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
 }
 EXPORT_SYMBOL(ip_fraglist_prepare);
 
+void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
+		  unsigned int ll_rs, unsigned int mtu,
+		  struct ip_frag_state *state)
+{
+	struct iphdr *iph = ip_hdr(skb);
+
+	state->hlen = hlen;
+	state->ll_rs = ll_rs;
+	state->mtu = mtu;
+
+	state->left = skb->len - hlen;	/* Space per frame */
+	state->ptr = hlen;		/* Where to start from */
+
+	state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+	state->not_last_frag = iph->frag_off & htons(IP_MF);
+}
+EXPORT_SYMBOL(ip_frag_init);
+
+struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
+{
+	unsigned int len = state->left;
+	struct sk_buff *skb2;
+	struct iphdr *iph;
+
+	len = state->left;
+	/* IF: it doesn't fit, use 'mtu' - the data space left */
+	if (len > state->mtu)
+		len = state->mtu;
+	/* IF: we are not sending up to and including the packet end
+	   then align the next start on an eight byte boundary */
+	if (len < state->left)	{
+		len &= ~7;
+	}
+
+	/* Allocate buffer */
+	skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
+	if (!skb2)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 *	Set up data on packet
+	 */
+
+	ip_copy_metadata(skb2, skb);
+	skb_reserve(skb2, state->ll_rs);
+	skb_put(skb2, len + state->hlen);
+	skb_reset_network_header(skb2);
+	skb2->transport_header = skb2->network_header + state->hlen;
+
+	/*
+	 *	Charge the memory for the fragment to any owner
+	 *	it might possess
+	 */
+
+	if (skb->sk)
+		skb_set_owner_w(skb2, skb->sk);
+
+	/*
+	 *	Copy the packet header into the new buffer.
+	 */
+
+	skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);
+
+	/*
+	 *	Copy a block of the IP datagram.
+	 */
+	if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
+		BUG();
+	state->left -= len;
+
+	/*
+	 *	Fill in the new header fields.
+	 */
+	iph = ip_hdr(skb2);
+	iph->frag_off = htons((state->offset >> 3));
+
+	if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
+		iph->frag_off |= htons(IP_DF);
+
+	/* ANK: dirty, but effective trick. Upgrade options only if
+	 * the segment to be fragmented was THE FIRST (otherwise,
+	 * options are already fixed) and make it ONCE
+	 * on the initial skb, so that all the following fragments
+	 * will inherit fixed options.
+	 */
+	if (state->offset == 0)
+		ip_options_fragment(skb);
+
+	/*
+	 *	Added AC : If we are fragmenting a fragment that's not the
+	 *		   last fragment then keep MF on each bit
+	 */
+	if (state->left > 0 || state->not_last_frag)
+		iph->frag_off |= htons(IP_MF);
+	state->ptr += len;
+	state->offset += len;
+
+	iph->tot_len = htons(len + state->hlen);
+
+	ip_send_check(iph);
+
+	return skb2;
+}
+EXPORT_SYMBOL(ip_frag_next);
+
 /*
  *	This IP datagram is too large to be sent in one piece.  Break it up into
  *	smaller pieces (each of size equal to IP header plus
@@ -620,13 +725,11 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		   int (*output)(struct net *, struct sock *, struct sk_buff *))
 {
 	struct iphdr *iph;
-	int ptr;
 	struct sk_buff *skb2;
-	unsigned int mtu, hlen, left, len, ll_rs;
-	int offset;
-	__be16 not_last_frag;
 	struct rtable *rt = skb_rtable(skb);
+	unsigned int mtu, hlen, ll_rs;
 	struct ip_fraglist_iter iter;
+	struct ip_frag_state state;
 	int err = 0;
 
 	/* for offloaded checksums cleanup checksum before fragmentation */
@@ -730,105 +833,26 @@ slow_path_clean:
 	}
 
 slow_path:
-	iph = ip_hdr(skb);
-
-	left = skb->len - hlen;		/* Space per frame */
-	ptr = hlen;		/* Where to start from */
-
 	/*
 	 *	Fragment the datagram.
 	 */
 
-	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
-	not_last_frag = iph->frag_off & htons(IP_MF);
+	ip_frag_init(skb, hlen, ll_rs, mtu, &state);
 
 	/*
 	 *	Keep copying data until we run out.
 	 */
 
-	while (left > 0) {
-		len = left;
-		/* IF: it doesn't fit, use 'mtu' - the data space left */
-		if (len > mtu)
-			len = mtu;
-		/* IF: we are not sending up to and including the packet end
-		   then align the next start on an eight byte boundary */
-		if (len < left)	{
-			len &= ~7;
-		}
-
-		/* Allocate buffer */
-		skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
-		if (!skb2) {
-			err = -ENOMEM;
+	while (state.left > 0) {
+		skb2 = ip_frag_next(skb, &state);
+		if (IS_ERR(skb2)) {
+			err = PTR_ERR(skb2);
 			goto fail;
 		}
 
-		/*
-		 *	Set up data on packet
-		 */
-
-		ip_copy_metadata(skb2, skb);
-		skb_reserve(skb2, ll_rs);
-		skb_put(skb2, len + hlen);
-		skb_reset_network_header(skb2);
-		skb2->transport_header = skb2->network_header + hlen;
-
-		/*
-		 *	Charge the memory for the fragment to any owner
-		 *	it might possess
-		 */
-
-		if (skb->sk)
-			skb_set_owner_w(skb2, skb->sk);
-
-		/*
-		 *	Copy the packet header into the new buffer.
-		 */
-
-		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
-
-		/*
-		 *	Copy a block of the IP datagram.
-		 */
-		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
-			BUG();
-		left -= len;
-
-		/*
-		 *	Fill in the new header fields.
-		 */
-		iph = ip_hdr(skb2);
-		iph->frag_off = htons((offset >> 3));
-
-		if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
-			iph->frag_off |= htons(IP_DF);
-
-		/* ANK: dirty, but effective trick. Upgrade options only if
-		 * the segment to be fragmented was THE FIRST (otherwise,
-		 * options are already fixed) and make it ONCE
-		 * on the initial skb, so that all the following fragments
-		 * will inherit fixed options.
-		 */
-		if (offset == 0)
-			ip_options_fragment(skb);
-
-		/*
-		 *	Added AC : If we are fragmenting a fragment that's not the
-		 *		   last fragment then keep MF on each bit
-		 */
-		if (left > 0 || not_last_frag)
-			iph->frag_off |= htons(IP_MF);
-		ptr += len;
-		offset += len;
-
 		/*
 		 *	Put this fragment into the sending queue.
 		 */
-		iph->tot_len = htons(len + hlen);
-
-		ip_send_check(iph);
-
 		err = output(net, sk, skb2);
 		if (err)
 			goto fail;
-- 
cgit v1.2.3


From 8a6a1f17640198f7daa5cfcce9a74e3674ce3b00 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 29 May 2019 13:25:34 +0200
Subject: net: ipv6: split skbuff into fragments transformer

This patch exposes a new API to refragment a skbuff. This allows you to
split either a linear skbuff or to force the refragmentation of an
existing fraglist using a different mtu. The API consists of:

* ip6_frag_init(), that initializes the internal state of the transformer.
* ip6_frag_next(), that allows you to fetch the next fragment. This function
  internally allocates the skbuff that represents the fragment, it pushes
  the IPv6 header, and it also copies the payload for each fragment.

The ip6_frag_state object stores the internal state of the splitter.

This code has been extracted from ip6_fragment(). Symbols are also
exported to allow reusing this iterator from the bridge codepath to
build its own refragmentation routine by reusing the existing codebase.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h    |  19 ++++++
 net/ipv6/ip6_output.c | 183 +++++++++++++++++++++++++++++---------------------
 2 files changed, 126 insertions(+), 76 deletions(-)

(limited to 'include')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index acefbc718abe..21bb830e9679 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -179,6 +179,25 @@ static inline struct sk_buff *ip6_fraglist_next(struct ip6_fraglist_iter *iter)
 	return skb;
 }
 
+struct ip6_frag_state {
+	u8		*prevhdr;
+	unsigned int	hlen;
+	unsigned int	mtu;
+	unsigned int	left;
+	int		offset;
+	int		ptr;
+	int		hroom;
+	int		troom;
+	__be32		frag_id;
+	u8		nexthdr;
+};
+
+void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
+		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
+		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state);
+struct sk_buff *ip6_frag_next(struct sk_buff *skb,
+			      struct ip6_frag_state *state);
+
 #define IP6_REPLY_MARK(net, mark) \
 	((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0)
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 2567b22a888a..812a98b79ec6 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -659,6 +659,103 @@ void ip6_fraglist_prepare(struct sk_buff *skb,
 }
 EXPORT_SYMBOL(ip6_fraglist_prepare);
 
+void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
+		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
+		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
+{
+	state->prevhdr = prevhdr;
+	state->nexthdr = nexthdr;
+	state->frag_id = frag_id;
+
+	state->hlen = hlen;
+	state->mtu = mtu;
+
+	state->left = skb->len - hlen;	/* Space per frame */
+	state->ptr = hlen;		/* Where to start from */
+
+	state->hroom = hdr_room;
+	state->troom = needed_tailroom;
+
+	state->offset = 0;
+}
+EXPORT_SYMBOL(ip6_frag_init);
+
+struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
+{
+	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
+	struct sk_buff *frag;
+	struct frag_hdr *fh;
+	unsigned int len;
+
+	len = state->left;
+	/* IF: it doesn't fit, use 'mtu' - the data space left */
+	if (len > state->mtu)
+		len = state->mtu;
+	/* IF: we are not sending up to and including the packet end
+	   then align the next start on an eight byte boundary */
+	if (len < state->left)
+		len &= ~7;
+
+	/* Allocate buffer */
+	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
+			 state->hroom + state->troom, GFP_ATOMIC);
+	if (!frag)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 *	Set up data on packet
+	 */
+
+	ip6_copy_metadata(frag, skb);
+	skb_reserve(frag, state->hroom);
+	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
+	skb_reset_network_header(frag);
+	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
+	frag->transport_header = (frag->network_header + state->hlen +
+				  sizeof(struct frag_hdr));
+
+	/*
+	 *	Charge the memory for the fragment to any owner
+	 *	it might possess
+	 */
+	if (skb->sk)
+		skb_set_owner_w(frag, skb->sk);
+
+	/*
+	 *	Copy the packet header into the new buffer.
+	 */
+	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
+
+	fragnexthdr_offset = skb_network_header(frag);
+	fragnexthdr_offset += prevhdr - skb_network_header(skb);
+	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
+
+	/*
+	 *	Build fragment header.
+	 */
+	fh->nexthdr = state->nexthdr;
+	fh->reserved = 0;
+	fh->identification = state->frag_id;
+
+	/*
+	 *	Copy a block of the IP datagram.
+	 */
+	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
+			     len));
+	state->left -= len;
+
+	fh->frag_off = htons(state->offset);
+	if (state->left > 0)
+		fh->frag_off |= htons(IP6_MF);
+	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
+
+	state->ptr += len;
+	state->offset += len;
+
+	return frag;
+}
+EXPORT_SYMBOL(ip6_frag_next);
+
 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
 {
@@ -666,11 +763,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 				inet6_sk(skb->sk) : NULL;
-	struct frag_hdr *fh;
-	unsigned int mtu, hlen, left, len, nexthdr_offset;
-	int hroom, troom;
+	struct ip6_frag_state state;
+	unsigned int mtu, hlen, nexthdr_offset;
+	int hroom, err = 0;
 	__be32 frag_id;
-	int ptr, offset = 0, err = 0;
 	u8 *prevhdr, nexthdr = 0;
 
 	err = ip6_find_1stfragopt(skb, &prevhdr);
@@ -792,90 +888,25 @@ slow_path_clean:
 	}
 
 slow_path:
-	left = skb->len - hlen;		/* Space per frame */
-	ptr = hlen;			/* Where to start from */
-
 	/*
 	 *	Fragment the datagram.
 	 */
 
-	troom = rt->dst.dev->needed_tailroom;
+	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
+		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
+		      &state);
 
 	/*
 	 *	Keep copying data until we run out.
 	 */
-	while (left > 0)	{
-		u8 *fragnexthdr_offset;
-
-		len = left;
-		/* IF: it doesn't fit, use 'mtu' - the data space left */
-		if (len > mtu)
-			len = mtu;
-		/* IF: we are not sending up to and including the packet end
-		   then align the next start on an eight byte boundary */
-		if (len < left)	{
-			len &= ~7;
-		}
 
-		/* Allocate buffer */
-		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
-				 hroom + troom, GFP_ATOMIC);
-		if (!frag) {
-			err = -ENOMEM;
+	while (state.left > 0) {
+		frag = ip6_frag_next(skb, &state);
+		if (IS_ERR(frag)) {
+			err = PTR_ERR(frag);
 			goto fail;
 		}
 
-		/*
-		 *	Set up data on packet
-		 */
-
-		ip6_copy_metadata(frag, skb);
-		skb_reserve(frag, hroom);
-		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
-		skb_reset_network_header(frag);
-		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
-		frag->transport_header = (frag->network_header + hlen +
-					  sizeof(struct frag_hdr));
-
-		/*
-		 *	Charge the memory for the fragment to any owner
-		 *	it might possess
-		 */
-		if (skb->sk)
-			skb_set_owner_w(frag, skb->sk);
-
-		/*
-		 *	Copy the packet header into the new buffer.
-		 */
-		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
-
-		fragnexthdr_offset = skb_network_header(frag);
-		fragnexthdr_offset += prevhdr - skb_network_header(skb);
-		*fragnexthdr_offset = NEXTHDR_FRAGMENT;
-
-		/*
-		 *	Build fragment header.
-		 */
-		fh->nexthdr = nexthdr;
-		fh->reserved = 0;
-		fh->identification = frag_id;
-
-		/*
-		 *	Copy a block of the IP datagram.
-		 */
-		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
-				     len));
-		left -= len;
-
-		fh->frag_off = htons(offset);
-		if (left > 0)
-			fh->frag_off |= htons(IP6_MF);
-		ipv6_hdr(frag)->payload_len = htons(frag->len -
-						    sizeof(struct ipv6hdr));
-
-		ptr += len;
-		offset += len;
-
 		/*
 		 *	Put this fragment into the sending queue.
 		 */
-- 
cgit v1.2.3


From d035f19f59c5bca2fda2faa43b5e9fe09dfb7884 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 29 May 2019 13:25:36 +0200
Subject: netfilter: nf_conntrack: allow to register bridge support

This patch adds infrastructure to register and to unregister bridge
support for the conntrack module via nf_ct_bridge_register() and
nf_ct_bridge_unregister().

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netfilter/nf_conntrack.h        |  1 +
 include/net/netfilter/nf_conntrack_bridge.h | 13 ++++++
 net/netfilter/nf_conntrack_proto.c          | 61 +++++++++++++++++++++++++++--
 3 files changed, 72 insertions(+), 3 deletions(-)
 create mode 100644 include/net/netfilter/nf_conntrack_bridge.h

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index d2bc733a2ef1..5cb19ce454d1 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -49,6 +49,7 @@ union nf_conntrack_expect_proto {
 struct nf_conntrack_net {
 	unsigned int users4;
 	unsigned int users6;
+	unsigned int users_bridge;
 };
 
 #include <linux/types.h>
diff --git a/include/net/netfilter/nf_conntrack_bridge.h b/include/net/netfilter/nf_conntrack_bridge.h
new file mode 100644
index 000000000000..3be1642e04f7
--- /dev/null
+++ b/include/net/netfilter/nf_conntrack_bridge.h
@@ -0,0 +1,13 @@
+#ifndef NF_CONNTRACK_BRIDGE_
+#define NF_CONNTRACK_BRIDGE_
+
+struct nf_ct_bridge_info {
+	struct nf_hook_ops	*ops;
+	unsigned int		ops_size;
+	struct module		*me;
+};
+
+void nf_ct_bridge_register(struct nf_ct_bridge_info *info);
+void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info);
+
+#endif
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 37bb530d848f..3813cb551df9 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -16,6 +16,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
 #include <net/netfilter/nf_log.h>
 
 #include <linux/ip.h>
@@ -442,12 +443,14 @@ static int nf_ct_tcp_fixup(struct nf_conn *ct, void *_nfproto)
 	return 0;
 }
 
+static struct nf_ct_bridge_info *nf_ct_bridge_info;
+
 static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
 {
 	struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
-	bool fixup_needed = false;
+	bool fixup_needed = false, retry = true;
 	int err = 0;
-
+retry:
 	mutex_lock(&nf_ct_proto_mutex);
 
 	switch (nfproto) {
@@ -487,6 +490,32 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
 			fixup_needed = true;
 		break;
 #endif
+	case NFPROTO_BRIDGE:
+		if (!nf_ct_bridge_info) {
+			if (!retry) {
+				err = -EPROTO;
+				goto out_unlock;
+			}
+			mutex_unlock(&nf_ct_proto_mutex);
+			request_module("nf_conntrack_bridge");
+			retry = false;
+			goto retry;
+		}
+		if (!try_module_get(nf_ct_bridge_info->me)) {
+			err = -EPROTO;
+			goto out_unlock;
+		}
+		cnet->users_bridge++;
+		if (cnet->users_bridge > 1)
+			goto out_unlock;
+
+		err = nf_register_net_hooks(net, nf_ct_bridge_info->ops,
+					    nf_ct_bridge_info->ops_size);
+		if (err)
+			cnet->users_bridge = 0;
+		else
+			fixup_needed = true;
+		break;
 	default:
 		err = -EPROTO;
 		break;
@@ -519,8 +548,16 @@ static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
 						ARRAY_SIZE(ipv6_conntrack_ops));
 		break;
 #endif
+	case NFPROTO_BRIDGE:
+		if (!nf_ct_bridge_info)
+			break;
+		if (cnet->users_bridge && (--cnet->users_bridge == 0))
+			nf_unregister_net_hooks(net, nf_ct_bridge_info->ops,
+						nf_ct_bridge_info->ops_size);
+
+		module_put(nf_ct_bridge_info->me);
+		break;
 	}
-
 	mutex_unlock(&nf_ct_proto_mutex);
 }
 
@@ -560,6 +597,24 @@ void nf_ct_netns_put(struct net *net, uint8_t nfproto)
 }
 EXPORT_SYMBOL_GPL(nf_ct_netns_put);
 
+void nf_ct_bridge_register(struct nf_ct_bridge_info *info)
+{
+	WARN_ON(nf_ct_bridge_info);
+	mutex_lock(&nf_ct_proto_mutex);
+	nf_ct_bridge_info = info;
+	mutex_unlock(&nf_ct_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_ct_bridge_register);
+
+void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info)
+{
+	WARN_ON(!nf_ct_bridge_info);
+	mutex_lock(&nf_ct_proto_mutex);
+	nf_ct_bridge_info = NULL;
+	mutex_unlock(&nf_ct_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_ct_bridge_unregister);
+
 int nf_conntrack_proto_init(void)
 {
 	int ret;
-- 
cgit v1.2.3


From 3c171f496ef57774f8e5d509923372549734877f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 29 May 2019 13:25:37 +0200
Subject: netfilter: bridge: add connection tracking system

This patch adds basic connection tracking support for the bridge,
including initial IPv4 support.

This patch registers two hooks to deal with the bridge forwarding path,
one from the bridge prerouting hook to call nf_conntrack_in(); and
another from the bridge postrouting hook to confirm the entry.

The conntrack bridge prerouting hook defragments packets before passing
them to nf_conntrack_in() to look up for an existing entry, otherwise a
new entry is allocated and it is attached to the skbuff. The conntrack
bridge postrouting hook confirms new conntrack entries, ie. if this is
the first packet seen, then it adds the entry to the hashtable and (if
needed) it refragments the skbuff into the original fragments, leaving
the geometry as is if possible. Exceptions are linearized skbuffs, eg.
skbuffs that are passed up to nfqueue and conntrack helpers, as well as
cloned skbuff for the local delivery (eg. tcpdump), also in case of
bridge port flooding (cloned skbuff too).

The packet defragmentation is done through the ip_defrag() call.  This
forces us to save the bridge control buffer, reset the IP control buffer
area and then restore it after the call. This function also bumps the IP
fragmentation statistics; it would probably be desirable to have
independent statistics for the bridge defragmentation/refragmentation.
The maximum fragment length is stored in the control buffer and it is
used to refragment the skbuff from the postrouting path.

The new fraglist splitter and fragment transformer APIs are used to
implement the bridge refragmentation code. The br_ip_fragment() function
drops the packet in case the maximum fragment size seen is larger than
the output port MTU.

This patchset follows the principle that conntrack should not drop
packets, so users can do it through policy via invalid state matching.

Like br_netfilter, there is no refragmentation for packets that are
passed up for local delivery, ie. prerouting -> input path. There are
calls to nf_reset() in several spots in the stack that have been there
for a long time, eg. af_packet, which show that skbuff fraglist handling
from the netif_rx path is already supported.

The helpers are called from the postrouting hook, before confirmation,
from there we may see packet floods to bridge ports. Then, although
unlikely, this may result in exercising the helpers many times for each
clone. It would be good to explore how to pass all the packets in a list
to the conntrack hook so that this handling is done only once in this case.

Thanks to Florian Westphal for handing me over an initial patchset
version to add support for conntrack bridge.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netfilter/nf_conntrack_bridge.h |   7 +
 include/net/netfilter/nf_conntrack_core.h   |   3 +
 net/bridge/br_device.c                      |   1 +
 net/bridge/br_private.h                     |   1 +
 net/bridge/netfilter/Kconfig                |  14 ++
 net/bridge/netfilter/Makefile               |   3 +
 net/bridge/netfilter/nf_conntrack_bridge.c  | 378 ++++++++++++++++++++++++++++
 net/netfilter/nf_conntrack_proto.c          |   7 +-
 8 files changed, 410 insertions(+), 4 deletions(-)
 create mode 100644 net/bridge/netfilter/nf_conntrack_bridge.c

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_bridge.h b/include/net/netfilter/nf_conntrack_bridge.h
index 3be1642e04f7..9a5514d5bc51 100644
--- a/include/net/netfilter/nf_conntrack_bridge.h
+++ b/include/net/netfilter/nf_conntrack_bridge.h
@@ -10,4 +10,11 @@ struct nf_ct_bridge_info {
 void nf_ct_bridge_register(struct nf_ct_bridge_info *info);
 void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info);
 
+struct nf_ct_bridge_frag_data {
+	char	mac[ETH_HLEN];
+	bool	vlan_present;
+	u16	vlan_tci;
+	__be16	vlan_proto;
+};
+
 #endif
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index ae41e92251dd..de10faf2ce91 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -64,6 +64,9 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb)
 	return ret;
 }
 
+unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
+			struct nf_conn *ct, enum ip_conntrack_info ctinfo);
+
 void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 		 const struct nf_conntrack_l4proto *proto);
 
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 013323b6dbe4..693aefad7f8a 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -56,6 +56,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	br_switchdev_frame_unmark(skb);
 	BR_INPUT_SKB_CB(skb)->brdev = dev;
+	BR_INPUT_SKB_CB(skb)->frag_max_size = 0;
 
 	skb_reset_mac_header(skb);
 	eth = eth_hdr(skb);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 334a8c496b50..68561741e827 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -425,6 +425,7 @@ struct net_bridge {
 struct br_input_skb_cb {
 	struct net_device *brdev;
 
+	u16 frag_max_size;
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	u8 igmp;
 	u8 mrouters_only:1;
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index c3ad90c43801..f4fb0b9b927d 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -19,6 +19,20 @@ config NF_LOG_BRIDGE
 	tristate "Bridge packet logging"
 	select NF_LOG_COMMON
 
+config NF_CONNTRACK_BRIDGE
+	tristate "IPv4/IPV6 bridge connection tracking support"
+	depends on NF_CONNTRACK
+	default n
+	help
+	  Connection tracking keeps a record of what packets have passed
+	  through your machine, in order to figure out how they are related
+	  into connections. This is used to enhance packet filtering via
+	  stateful policies. Enable this if you want native tracking from
+	  the bridge. This provides a replacement for the `br_netfilter'
+	  infrastructure.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 endif # NF_TABLES_BRIDGE
 
 menuconfig BRIDGE_NF_EBTABLES
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 9b868861f21a..9d7767322a64 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -5,6 +5,9 @@
 
 obj-$(CONFIG_NFT_BRIDGE_REJECT)  += nft_reject_bridge.o
 
+# connection tracking
+obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o
+
 # packet logging
 obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o
 
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
new file mode 100644
index 000000000000..2571528ed582
--- /dev/null
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -0,0 +1,378 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netfilter/nf_tables.h>
+
+#include "../br_private.h"
+
+/* Best effort variant of ip_do_fragment which preserves geometry, unless skbuff
+ * has been linearized or cloned.
+ */
+static int nf_br_ip_fragment(struct net *net, struct sock *sk,
+			     struct sk_buff *skb,
+			     struct nf_ct_bridge_frag_data *data,
+			     int (*output)(struct net *, struct sock *sk,
+					   const struct nf_ct_bridge_frag_data *data,
+					   struct sk_buff *))
+{
+	int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+	unsigned int hlen, ll_rs, mtu;
+	struct ip_frag_state state;
+	struct iphdr *iph;
+	int err;
+
+	/* for offloaded checksums cleanup checksum before fragmentation */
+	if (skb->ip_summed == CHECKSUM_PARTIAL &&
+	    (err = skb_checksum_help(skb)))
+		goto blackhole;
+
+	iph = ip_hdr(skb);
+
+	/*
+	 *	Setup starting values
+	 */
+
+	hlen = iph->ihl * 4;
+	frag_max_size -= hlen;
+	ll_rs = LL_RESERVED_SPACE(skb->dev);
+	mtu = skb->dev->mtu;
+
+	if (skb_has_frag_list(skb)) {
+		unsigned int first_len = skb_pagelen(skb);
+		struct ip_fraglist_iter iter;
+		struct sk_buff *frag;
+
+		if (first_len - hlen > mtu ||
+		    skb_headroom(skb) < ll_rs)
+			goto blackhole;
+
+		if (skb_cloned(skb))
+			goto slow_path;
+
+		skb_walk_frags(skb, frag) {
+			if (frag->len > mtu ||
+			    skb_headroom(frag) < hlen + ll_rs)
+				goto blackhole;
+
+			if (skb_shared(frag))
+				goto slow_path;
+		}
+
+		ip_fraglist_init(skb, iph, hlen, &iter);
+
+		for (;;) {
+			if (iter.frag)
+				ip_fraglist_prepare(skb, &iter);
+
+			err = output(net, sk, data, skb);
+			if (err || !iter.frag)
+				break;
+
+			skb = ip_fraglist_next(&iter);
+		}
+		return err;
+	}
+slow_path:
+	/* This is a linearized skbuff, the original geometry is lost for us.
+	 * This may also be a clone skbuff, we could preserve the geometry for
+	 * the copies but probably not worth the effort.
+	 */
+	ip_frag_init(skb, hlen, ll_rs, frag_max_size, &state);
+
+	while (state.left > 0) {
+		struct sk_buff *skb2;
+
+		skb2 = ip_frag_next(skb, &state);
+		if (IS_ERR(skb2)) {
+			err = PTR_ERR(skb2);
+			goto blackhole;
+		}
+
+		err = output(net, sk, data, skb2);
+		if (err)
+			goto blackhole;
+	}
+	consume_skb(skb);
+	return err;
+
+blackhole:
+	kfree_skb(skb);
+	return 0;
+}
+
+/* ip_defrag() expects IPCB() in place. */
+static void br_skb_cb_save(struct sk_buff *skb, struct br_input_skb_cb *cb,
+			   size_t inet_skb_parm_size)
+{
+	memcpy(cb, skb->cb, sizeof(*cb));
+	memset(skb->cb, 0, inet_skb_parm_size);
+}
+
+static void br_skb_cb_restore(struct sk_buff *skb,
+			      const struct br_input_skb_cb *cb,
+			      u16 fragsz)
+{
+	memcpy(skb->cb, cb, sizeof(*cb));
+	BR_INPUT_SKB_CB(skb)->frag_max_size = fragsz;
+}
+
+static unsigned int nf_ct_br_defrag4(struct sk_buff *skb,
+				     const struct nf_hook_state *state)
+{
+	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+	enum ip_conntrack_info ctinfo;
+	struct br_input_skb_cb cb;
+	const struct nf_conn *ct;
+	int err;
+
+	if (!ip_is_fragment(ip_hdr(skb)))
+		return NF_ACCEPT;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct)
+		zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+	br_skb_cb_save(skb, &cb, sizeof(struct inet_skb_parm));
+	local_bh_disable();
+	err = ip_defrag(state->net, skb,
+			IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+	local_bh_enable();
+	if (!err) {
+		br_skb_cb_restore(skb, &cb, IPCB(skb)->frag_max_size);
+		skb->ignore_df = 1;
+		return NF_ACCEPT;
+	}
+
+	return NF_STOLEN;
+}
+
+static int nf_ct_br_ip_check(const struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	int nhoff, len;
+
+	nhoff = skb_network_offset(skb);
+	iph = ip_hdr(skb);
+	if (iph->ihl < 5 ||
+	    iph->version != 4)
+		return -1;
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < nhoff + len ||
+	    len < (iph->ihl * 4))
+		return -1;
+
+	return 0;
+}
+
+static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
+				     const struct nf_hook_state *state)
+{
+	struct nf_hook_state bridge_state = *state;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	u32 len;
+	int ret;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if ((ct && !nf_ct_is_template(ct)) ||
+	    ctinfo == IP_CT_UNTRACKED)
+		return NF_ACCEPT;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+			return NF_ACCEPT;
+
+		len = ntohs(ip_hdr(skb)->tot_len);
+		if (pskb_trim_rcsum(skb, len))
+			return NF_ACCEPT;
+
+		if (nf_ct_br_ip_check(skb))
+			return NF_ACCEPT;
+
+		bridge_state.pf = NFPROTO_IPV4;
+		ret = nf_ct_br_defrag4(skb, &bridge_state);
+		break;
+	case htons(ETH_P_IPV6):
+		/* fall through */
+	default:
+		nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+		return NF_ACCEPT;
+	}
+
+	if (ret != NF_ACCEPT)
+		return ret;
+
+	return nf_conntrack_in(skb, &bridge_state);
+}
+
+static void nf_ct_bridge_frag_save(struct sk_buff *skb,
+				   struct nf_ct_bridge_frag_data *data)
+{
+	if (skb_vlan_tag_present(skb)) {
+		data->vlan_present = true;
+		data->vlan_tci = skb->vlan_tci;
+		data->vlan_proto = skb->vlan_proto;
+	} else {
+		data->vlan_present = false;
+	}
+	skb_copy_from_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN);
+}
+
+static unsigned int
+nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state,
+		    int (*output)(struct net *, struct sock *sk,
+				  const struct nf_ct_bridge_frag_data *data,
+				  struct sk_buff *))
+{
+	struct nf_ct_bridge_frag_data data;
+
+	if (!BR_INPUT_SKB_CB(skb)->frag_max_size)
+		return NF_ACCEPT;
+
+	nf_ct_bridge_frag_save(skb, &data);
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		nf_br_ip_fragment(state->net, state->sk, skb, &data, output);
+		break;
+	case htons(ETH_P_IPV6):
+		return NF_ACCEPT;
+	default:
+		WARN_ON_ONCE(1);
+		return NF_DROP;
+	}
+
+	return NF_STOLEN;
+}
+
+/* Actually only slow path refragmentation needs this. */
+static int nf_ct_bridge_frag_restore(struct sk_buff *skb,
+				     const struct nf_ct_bridge_frag_data *data)
+{
+	int err;
+
+	err = skb_cow_head(skb, ETH_HLEN);
+	if (err) {
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+	if (data->vlan_present)
+		__vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci);
+
+	skb_copy_to_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN);
+	skb_reset_mac_header(skb);
+
+	return 0;
+}
+
+static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk,
+				    const struct nf_ct_bridge_frag_data *data,
+				    struct sk_buff *skb)
+{
+	int err;
+
+	err = nf_ct_bridge_frag_restore(skb, data);
+	if (err < 0)
+		return err;
+
+	return br_dev_queue_push_xmit(net, sk, skb);
+}
+
+static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	int protoff;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+		return nf_conntrack_confirm(skb);
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		protoff = skb_network_offset(skb) + ip_hdrlen(skb);
+		break;
+	case htons(ETH_P_IPV6): {
+		 unsigned char pnum = ipv6_hdr(skb)->nexthdr;
+		__be16 frag_off;
+
+		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
+					   &frag_off);
+		if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
+			return nf_conntrack_confirm(skb);
+		}
+		break;
+	default:
+		return NF_ACCEPT;
+	}
+	return nf_confirm(skb, protoff, ct, ctinfo);
+}
+
+static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb,
+				      const struct nf_hook_state *state)
+{
+	int ret;
+
+	ret = nf_ct_bridge_confirm(skb);
+	if (ret != NF_ACCEPT)
+		return ret;
+
+	return nf_ct_bridge_refrag(skb, state, nf_ct_bridge_refrag_post);
+}
+
+static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = {
+	{
+		.hook		= nf_ct_bridge_pre,
+		.pf		= NFPROTO_BRIDGE,
+		.hooknum	= NF_BR_PRE_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK,
+	},
+	{
+		.hook		= nf_ct_bridge_post,
+		.pf		= NFPROTO_BRIDGE,
+		.hooknum	= NF_BR_POST_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
+	},
+};
+
+static struct nf_ct_bridge_info bridge_info = {
+	.ops		= nf_ct_bridge_hook_ops,
+	.ops_size	= ARRAY_SIZE(nf_ct_bridge_hook_ops),
+	.me		= THIS_MODULE,
+};
+
+static int __init nf_conntrack_l3proto_bridge_init(void)
+{
+	nf_ct_bridge_register(&bridge_info);
+
+	return 0;
+}
+
+static void __exit nf_conntrack_l3proto_bridge_fini(void)
+{
+	nf_ct_bridge_unregister(&bridge_info);
+}
+
+module_init(nf_conntrack_l3proto_bridge_init);
+module_exit(nf_conntrack_l3proto_bridge_fini);
+
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_BRIDGE));
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 3813cb551df9..7e2e8b8d6ebe 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -121,10 +121,8 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
 };
 EXPORT_SYMBOL_GPL(nf_ct_l4proto_find);
 
-static unsigned int nf_confirm(struct sk_buff *skb,
-			       unsigned int protoff,
-			       struct nf_conn *ct,
-			       enum ip_conntrack_info ctinfo)
+unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
+			struct nf_conn *ct, enum ip_conntrack_info ctinfo)
 {
 	const struct nf_conn_help *help;
 
@@ -155,6 +153,7 @@ static unsigned int nf_confirm(struct sk_buff *skb,
 	/* We've seen it coming out the other side: confirm it */
 	return nf_conntrack_confirm(skb);
 }
+EXPORT_SYMBOL_GPL(nf_confirm);
 
 static unsigned int ipv4_confirm(void *priv,
 				 struct sk_buff *skb,
-- 
cgit v1.2.3


From 764dd163ac922f8683b5bcd3007251ce7b26cd33 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 29 May 2019 13:25:38 +0200
Subject: netfilter: nf_conntrack_bridge: add support for IPv6

br_defrag() and br_fragment() indirections are added in case that IPv6
support comes as a module, to avoid pulling unnecessary dependencies in.

The new fraglist iterator and fragment transformer APIs are used to
implement the refragmentation code.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6.h             |  50 ++++++++++++
 net/bridge/netfilter/nf_conntrack_bridge.c |  59 +++++++++++++-
 net/ipv6/netfilter.c                       | 123 +++++++++++++++++++++++++++++
 3 files changed, 230 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 12113e502656..a21b8c9623ee 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -19,6 +19,7 @@ struct ip6_rt_info {
 };
 
 struct nf_queue_entry;
+struct nf_ct_bridge_frag_data;
 
 /*
  * Hook functions for ipv6 to allow xt_* modules to be built-in even
@@ -39,6 +40,15 @@ struct nf_ipv6_ops {
 	int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb,
 			int (*output)(struct net *, struct sock *, struct sk_buff *));
 	int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry);
+#if IS_MODULE(CONFIG_IPV6)
+	int (*br_defrag)(struct net *net, struct sk_buff *skb, u32 user);
+	int (*br_fragment)(struct net *net, struct sock *sk,
+			   struct sk_buff *skb,
+			   struct nf_ct_bridge_frag_data *data,
+			   int (*output)(struct net *, struct sock *sk,
+					 const struct nf_ct_bridge_frag_data *data,
+					 struct sk_buff *));
+#endif
 };
 
 #ifdef CONFIG_NETFILTER
@@ -86,6 +96,46 @@ static inline int nf_ip6_route(struct net *net, struct dst_entry **dst,
 #endif
 }
 
+static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb,
+				    u32 user)
+{
+#if IS_MODULE(CONFIG_IPV6)
+	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
+
+	if (!v6_ops)
+		return 1;
+
+	return v6_ops->br_defrag(net, skb, user);
+#else
+	return nf_ct_frag6_gather(net, skb, user);
+#endif
+}
+
+int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+		    struct nf_ct_bridge_frag_data *data,
+		    int (*output)(struct net *, struct sock *sk,
+				  const struct nf_ct_bridge_frag_data *data,
+				  struct sk_buff *));
+
+static inline int nf_br_ip6_fragment(struct net *net, struct sock *sk,
+				     struct sk_buff *skb,
+				     struct nf_ct_bridge_frag_data *data,
+				     int (*output)(struct net *, struct sock *sk,
+						   const struct nf_ct_bridge_frag_data *data,
+						   struct sk_buff *))
+{
+#if IS_MODULE(CONFIG_IPV6)
+	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
+
+	if (!v6_ops)
+		return 1;
+
+	return v6_ops->br_fragment(net, sk, skb, data, output);
+#else
+	return br_ip6_fragment(net, sk, skb, data, output);
+#endif
+}
+
 int ip6_route_me_harder(struct net *net, struct sk_buff *skb);
 
 static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb)
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index 2571528ed582..b675cd7c1a82 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -163,6 +163,31 @@ static unsigned int nf_ct_br_defrag4(struct sk_buff *skb,
 	return NF_STOLEN;
 }
 
+static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
+				     const struct nf_hook_state *state)
+{
+	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+	enum ip_conntrack_info ctinfo;
+	struct br_input_skb_cb cb;
+	const struct nf_conn *ct;
+	int err;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct)
+		zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+	br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm));
+
+	err = nf_ipv6_br_defrag(state->net, skb,
+				IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+	/* queued */
+	if (err == -EINPROGRESS)
+		return NF_STOLEN;
+
+	br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size);
+	return err == 0 ? NF_ACCEPT : NF_DROP;
+}
+
 static int nf_ct_br_ip_check(const struct sk_buff *skb)
 {
 	const struct iphdr *iph;
@@ -177,6 +202,23 @@ static int nf_ct_br_ip_check(const struct sk_buff *skb)
 	len = ntohs(iph->tot_len);
 	if (skb->len < nhoff + len ||
 	    len < (iph->ihl * 4))
+                return -1;
+
+	return 0;
+}
+
+static int nf_ct_br_ipv6_check(const struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr;
+	int nhoff, len;
+
+	nhoff = skb_network_offset(skb);
+	hdr = ipv6_hdr(skb);
+	if (hdr->version != 6)
+		return -1;
+
+	len = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr) + nhoff;
+	if (skb->len < len)
 		return -1;
 
 	return 0;
@@ -212,7 +254,19 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
 		ret = nf_ct_br_defrag4(skb, &bridge_state);
 		break;
 	case htons(ETH_P_IPV6):
-		/* fall through */
+		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+			return NF_ACCEPT;
+
+		len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+		if (pskb_trim_rcsum(skb, len))
+			return NF_ACCEPT;
+
+		if (nf_ct_br_ipv6_check(skb))
+			return NF_ACCEPT;
+
+		bridge_state.pf = NFPROTO_IPV6;
+		ret = nf_ct_br_defrag6(skb, &bridge_state);
+		break;
 	default:
 		nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
 		return NF_ACCEPT;
@@ -254,7 +308,8 @@ nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state,
 		nf_br_ip_fragment(state->net, state->sk, skb, &data, output);
 		break;
 	case htons(ETH_P_IPV6):
-		return NF_ACCEPT;
+		nf_br_ip6_fragment(state->net, state->sk, skb, &data, output);
+		break;
 	default:
 		WARN_ON_ONCE(1);
 		return NF_DROP;
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 1240ccd57f39..c6665382acb5 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -16,6 +16,9 @@
 #include <net/ip6_route.h>
 #include <net/xfrm.h>
 #include <net/netfilter/nf_queue.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include "../bridge/br_private.h"
 
 int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
 {
@@ -109,6 +112,122 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst,
 }
 EXPORT_SYMBOL_GPL(__nf_ip6_route);
 
+int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+		    struct nf_ct_bridge_frag_data *data,
+		    int (*output)(struct net *, struct sock *sk,
+				  const struct nf_ct_bridge_frag_data *data,
+				  struct sk_buff *))
+{
+	int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+	struct ip6_frag_state state;
+	u8 *prevhdr, nexthdr = 0;
+	unsigned int mtu, hlen;
+	int hroom, err = 0;
+	__be32 frag_id;
+
+	err = ip6_find_1stfragopt(skb, &prevhdr);
+	if (err < 0)
+		goto blackhole;
+	hlen = err;
+	nexthdr = *prevhdr;
+
+	mtu = skb->dev->mtu;
+	if (frag_max_size > mtu ||
+	    frag_max_size < IPV6_MIN_MTU)
+		goto blackhole;
+
+	mtu = frag_max_size;
+	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
+		goto blackhole;
+	mtu -= hlen + sizeof(struct frag_hdr);
+
+	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
+				    &ipv6_hdr(skb)->saddr);
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL &&
+	    (err = skb_checksum_help(skb)))
+		goto blackhole;
+
+	hroom = LL_RESERVED_SPACE(skb->dev);
+	if (skb_has_frag_list(skb)) {
+		unsigned int first_len = skb_pagelen(skb);
+		struct ip6_fraglist_iter iter;
+		struct sk_buff *frag2;
+
+		if (first_len - hlen > mtu ||
+		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
+			goto blackhole;
+
+		if (skb_cloned(skb))
+			goto slow_path;
+
+		skb_walk_frags(skb, frag2) {
+			if (frag2->len > mtu ||
+			    skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr)))
+				goto blackhole;
+
+			/* Partially cloned skb? */
+			if (skb_shared(frag2))
+				goto slow_path;
+		}
+
+		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+					&iter);
+		if (err < 0)
+			goto blackhole;
+
+		for (;;) {
+			/* Prepare header of the next frame,
+			 * before previous one went down.
+			 */
+			if (iter.frag)
+				ip6_fraglist_prepare(skb, &iter);
+
+			err = output(net, sk, data, skb);
+			if (err || !iter.frag)
+				break;
+
+			skb = ip6_fraglist_next(&iter);
+		}
+
+		kfree(iter.tmp_hdr);
+		if (!err)
+			return 0;
+
+		kfree_skb_list(iter.frag_list);
+		return err;
+	}
+slow_path:
+	/* This is a linearized skbuff, the original geometry is lost for us.
+	 * This may also be a clone skbuff, we could preserve the geometry for
+	 * the copies but probably not worth the effort.
+	 */
+	ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom,
+		      LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id,
+		      &state);
+
+	while (state.left > 0) {
+		struct sk_buff *skb2;
+
+		skb2 = ip6_frag_next(skb, &state);
+		if (IS_ERR(skb2)) {
+			err = PTR_ERR(skb2);
+			goto blackhole;
+		}
+
+		err = output(net, sk, data, skb2);
+		if (err)
+			goto blackhole;
+	}
+	consume_skb(skb);
+	return err;
+
+blackhole:
+	kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(br_ip6_fragment);
+
 static const struct nf_ipv6_ops ipv6ops = {
 #if IS_MODULE(CONFIG_IPV6)
 	.chk_addr		= ipv6_chk_addr,
@@ -119,6 +238,10 @@ static const struct nf_ipv6_ops ipv6ops = {
 	.route_input		= ip6_route_input,
 	.fragment		= ip6_fragment,
 	.reroute		= nf_ip6_reroute,
+#if IS_MODULE(CONFIG_NF_CONNTRACK_BRIDGE)
+	.br_defrag		= nf_ct_frag6_gather,
+	.br_fragment		= br_ip6_fragment,
+#endif
 };
 
 int __init ipv6_netfilter_init(void)
-- 
cgit v1.2.3


From c3e933a5b8c19145d14e207e0ecf220f1d6cfda1 Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@redhat.com>
Date: Wed, 29 May 2019 17:39:41 +0200
Subject: sctp: deduplicate identical skb_checksum_ops

The same skb_checksum_ops struct is defined twice in two different places,
leading to code duplication. Declare it as a global variable into a common
header instead of allocating it on the stack on each function call.
bloat-o-meter reports a slight code shrink.

add/remove: 1/1 grow/shrink: 0/10 up/down: 128/-1282 (-1154)
Function                                     old     new   delta
sctp_csum_ops                                  -     128    +128
crc32c_csum_ops                               16       -     -16
sctp_rcv                                    6616    6583     -33
sctp_packet_pack                            4542    4504     -38
nf_conntrack_sctp_packet                    4980    4926     -54
execute_masked_set_action                   6453    6389     -64
tcf_csum_sctp                                575     428    -147
sctp_gso_segment                            1292    1126    -166
sctp_csum_check                              579     412    -167
sctp_snat_handler                            957     772    -185
sctp_dnat_handler                           1321    1132    -189
l4proto_manip_pkt                           2536    2313    -223
Total: Before=359297613, After=359296459, chg -0.00%

Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/checksum.h | 12 +++++++-----
 net/sctp/offload.c          |  7 +------
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h
index 1c6e6c0766ca..1b74af2477d1 100644
--- a/include/net/sctp/checksum.h
+++ b/include/net/sctp/checksum.h
@@ -58,19 +58,21 @@ static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2,
 						   (__force __u32)csum2, len);
 }
 
+static const struct skb_checksum_ops sctp_csum_ops = {
+	.update  = sctp_csum_update,
+	.combine = sctp_csum_combine,
+};
+
 static inline __le32 sctp_compute_cksum(const struct sk_buff *skb,
 					unsigned int offset)
 {
 	struct sctphdr *sh = (struct sctphdr *)(skb->data + offset);
-	const struct skb_checksum_ops ops = {
-		.update  = sctp_csum_update,
-		.combine = sctp_csum_combine,
-	};
 	__le32 old = sh->checksum;
 	__wsum new;
 
 	sh->checksum = 0;
-	new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0, &ops);
+	new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0,
+			      &sctp_csum_ops);
 	sh->checksum = old;
 
 	return cpu_to_le32((__force __u32)new);
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index edfcf16e704c..dac46dfadab5 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -103,11 +103,6 @@ static const struct net_offload sctp6_offload = {
 	},
 };
 
-static const struct skb_checksum_ops crc32c_csum_ops = {
-	.update  = sctp_csum_update,
-	.combine = sctp_csum_combine,
-};
-
 int __init sctp_offload_init(void)
 {
 	int ret;
@@ -120,7 +115,7 @@ int __init sctp_offload_init(void)
 	if (ret)
 		goto ipv4;
 
-	crc32c_csum_stub = &crc32c_csum_ops;
+	crc32c_csum_stub = &sctp_csum_ops;
 	return ret;
 
 ipv4:
-- 
cgit v1.2.3


From 07b0928918c694c845a387cc16256a8b63ced4fc Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 30 May 2019 15:09:15 +0200
Subject: net: phy: enable interrupts when PHY is attached already

This patch is a step towards allowing PHY drivers to handle more
interrupt sources than just link change. E.g. several PHYs have
built-in temperature monitoring and can raise an interrupt if a
temperature threshold is exceeded. We may be interested in such
interrupts also if the phylib state machine isn't started.
Therefore move enabling interrupts to phy_request_interrupt().

v2:
- patch added to series

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c        | 36 ++++++++++++++++++++++--------------
 drivers/net/phy/phy_device.c |  2 +-
 include/linux/phy.h          |  1 +
 3 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index e8885429293a..4ba71dc3aee7 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -799,10 +799,10 @@ static int phy_enable_interrupts(struct phy_device *phydev)
 }
 
 /**
- * phy_request_interrupt - request interrupt for a PHY device
+ * phy_request_interrupt - request and enable interrupt for a PHY device
  * @phydev: target phy_device struct
  *
- * Description: Request the interrupt for the given PHY.
+ * Description: Request and enable the interrupt for the given PHY.
  *   If this fails, then we set irq to PHY_POLL.
  *   This should only be called with a valid IRQ number.
  */
@@ -817,10 +817,30 @@ void phy_request_interrupt(struct phy_device *phydev)
 		phydev_warn(phydev, "Error %d requesting IRQ %d, falling back to polling\n",
 			    err, phydev->irq);
 		phydev->irq = PHY_POLL;
+	} else {
+		if (phy_enable_interrupts(phydev)) {
+			phydev_warn(phydev, "Can't enable interrupt, falling back to polling\n");
+			phy_free_interrupt(phydev);
+			phydev->irq = PHY_POLL;
+		}
 	}
 }
 EXPORT_SYMBOL(phy_request_interrupt);
 
+/**
+ * phy_free_interrupt - disable and free interrupt for a PHY device
+ * @phydev: target phy_device struct
+ *
+ * Description: Disable and free the interrupt for the given PHY.
+ *   This should only be called with a valid IRQ number.
+ */
+void phy_free_interrupt(struct phy_device *phydev)
+{
+	phy_disable_interrupts(phydev);
+	free_irq(phydev->irq, phydev);
+}
+EXPORT_SYMBOL(phy_free_interrupt);
+
 /**
  * phy_stop - Bring down the PHY link, and stop checking the status
  * @phydev: target phy_device struct
@@ -835,9 +855,6 @@ void phy_stop(struct phy_device *phydev)
 
 	mutex_lock(&phydev->lock);
 
-	if (phy_interrupt_is_valid(phydev))
-		phy_disable_interrupts(phydev);
-
 	phydev->state = PHY_HALTED;
 
 	mutex_unlock(&phydev->lock);
@@ -864,8 +881,6 @@ EXPORT_SYMBOL(phy_stop);
  */
 void phy_start(struct phy_device *phydev)
 {
-	int err;
-
 	mutex_lock(&phydev->lock);
 
 	if (phydev->state != PHY_READY && phydev->state != PHY_HALTED) {
@@ -877,13 +892,6 @@ void phy_start(struct phy_device *phydev)
 	/* if phy was suspended, bring the physical link up again */
 	__phy_resume(phydev);
 
-	/* make sure interrupts are enabled for the PHY */
-	if (phy_interrupt_is_valid(phydev)) {
-		err = phy_enable_interrupts(phydev);
-		if (err < 0)
-			goto out;
-	}
-
 	phydev->state = PHY_UP;
 
 	phy_start_machine(phydev);
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 8b4fc3b4f269..2c879ba01f35 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1016,7 +1016,7 @@ void phy_disconnect(struct phy_device *phydev)
 		phy_stop(phydev);
 
 	if (phy_interrupt_is_valid(phydev))
-		free_irq(phydev->irq, phydev);
+		phy_free_interrupt(phydev);
 
 	phydev->adjust_link = NULL;
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 7180b1d1e5e3..72e1196f9799 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1147,6 +1147,7 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev,
 			      const struct ethtool_link_ksettings *cmd);
 int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd);
 void phy_request_interrupt(struct phy_device *phydev);
+void phy_free_interrupt(struct phy_device *phydev);
 void phy_print_status(struct phy_device *phydev);
 int phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
 void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
-- 
cgit v1.2.3


From 49644e68f472c6480e015253fa4d7448c6cfa2aa Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 30 May 2019 15:10:06 +0200
Subject: net: phy: add callback for custom interrupt handler to struct
 phy_driver

Currently, the phylib interrupt handler handles link change events only.
However, PHY drivers may want to use other interrupt sources too,
e.g. to report temperature monitoring events. Therefore add a callback
to struct phy_driver allowing PHY drivers to implement a custom
interrupt handler.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Suggested-by: Russell King - ARM Linux admin <linux@armlinux.org.uk>
Acked-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 9 +++++++--
 include/linux/phy.h   | 3 +++
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 4ba71dc3aee7..c6b0010a6d20 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -772,8 +772,13 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat)
 	if (phydev->drv->did_interrupt && !phydev->drv->did_interrupt(phydev))
 		return IRQ_NONE;
 
-	/* reschedule state queue work to run as soon as possible */
-	phy_trigger_machine(phydev);
+	if (phydev->drv->handle_interrupt) {
+		if (phydev->drv->handle_interrupt(phydev))
+			goto phy_err;
+	} else {
+		/* reschedule state queue work to run as soon as possible */
+		phy_trigger_machine(phydev);
+	}
 
 	if (phy_clear_interrupt(phydev))
 		goto phy_err;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 72e1196f9799..16cd33915496 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -537,6 +537,9 @@ struct phy_driver {
 	 */
 	int (*did_interrupt)(struct phy_device *phydev);
 
+	/* Override default interrupt handling */
+	int (*handle_interrupt)(struct phy_device *phydev);
+
 	/* Clears up any memory if needed */
 	void (*remove)(struct phy_device *phydev);
 
-- 
cgit v1.2.3


From 97b33bdf9bddb6bebc2e87148df3e30aa7a13b2d Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 30 May 2019 15:11:06 +0200
Subject: net: phy: export phy_queue_state_machine

We face the issue that link change interrupt and link status may be
reported by different PHY layers. As a result the link change
interrupt may occur before the link status changes.
Export phy_queue_state_machine to allow PHY drivers to specify a
delay between link status change interrupt and link status check.

v2:
- change jiffies parameter type to unsigned long

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Suggested-by: Russell King <rmk+kernel@armlinux.org.uk>
Acked-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 8 +++++---
 include/linux/phy.h   | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index c6b0010a6d20..84671d868a80 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -29,6 +29,8 @@
 #include <linux/uaccess.h>
 #include <linux/atomic.h>
 
+#define PHY_STATE_TIME	HZ
+
 #define PHY_STATE_STR(_state)			\
 	case PHY_##_state:			\
 		return __stringify(_state);	\
@@ -478,12 +480,12 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd)
 }
 EXPORT_SYMBOL(phy_mii_ioctl);
 
-static void phy_queue_state_machine(struct phy_device *phydev,
-				    unsigned int secs)
+void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies)
 {
 	mod_delayed_work(system_power_efficient_wq, &phydev->state_queue,
-			 secs * HZ);
+			 jiffies);
 }
+EXPORT_SYMBOL(phy_queue_state_machine);
 
 static void phy_trigger_machine(struct phy_device *phydev)
 {
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 16cd33915496..dc4b51060ebc 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -188,7 +188,6 @@ static inline const char *phy_modes(phy_interface_t interface)
 
 
 #define PHY_INIT_TIMEOUT	100000
-#define PHY_STATE_TIME		1
 #define PHY_FORCE_TIMEOUT	10
 
 #define PHY_MAX_ADDR	32
@@ -1140,6 +1139,7 @@ int phy_driver_register(struct phy_driver *new_driver, struct module *owner);
 int phy_drivers_register(struct phy_driver *new_driver, int n,
 			 struct module *owner);
 void phy_state_machine(struct work_struct *work);
+void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies);
 void phy_mac_interrupt(struct phy_device *phydev);
 void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
-- 
cgit v1.2.3


From 9c3c0c2048149d946d7f3ebdcbe70e2946750bfb Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 18 Apr 2019 22:43:36 +0200
Subject: isdn: remove isdn4linux

With all isdn4linux hardware drivers gone, this is only a wrapper around
CAPI to support old user space. However, from looking at the mailing
list, it seems that the last time anyone asked about it was in 2014,
when the upgrade from a linux-2.4 installation failed, and mISDN was
suggested as a replacement.

The largest public ISDN network (Deutsche Telekom) was supposed to be
shut down in 2018, which must have drastically reduced the number of legacy
installations.

When we last discussed removing i4l in 2016, Karsten Keil suggested
revisiting this in 2018. I guess this is overdue.

Link: http://listserv.isdn4linux.de/pipermail/isdn4linux/2014-October/006165.html
Link: https://patchwork.kernel.org/patch/8484861/#17900371
Link: https://listserv.isdn4linux.de/pipermail/isdn4linux/2019-April/thread.html
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 Documentation/isdn/INTERFACE        |  759 -------
 Documentation/isdn/INTERFACE.fax    |  163 --
 Documentation/isdn/README           |  599 ------
 Documentation/isdn/README.FAQ       |   26 -
 Documentation/isdn/README.audio     |  138 --
 Documentation/isdn/README.concap    |  259 ---
 Documentation/isdn/README.diversion |  127 --
 Documentation/isdn/README.fax       |   45 -
 Documentation/isdn/README.hfc-pci   |   41 -
 Documentation/isdn/README.syncppp   |   58 -
 Documentation/isdn/README.x25       |  184 --
 Documentation/isdn/syncPPP.FAQ      |  224 ---
 Documentation/process/changes.rst   |   16 +-
 MAINTAINERS                         |    2 -
 drivers/isdn/Kconfig                |   26 -
 drivers/isdn/Makefile               |    2 -
 drivers/isdn/capi/Kconfig           |    9 -
 drivers/isdn/capi/capidrv.c         | 2525 -----------------------
 drivers/isdn/capi/capidrv.h         |  140 --
 drivers/isdn/divert/Makefile        |   10 -
 drivers/isdn/divert/divert_init.c   |   82 -
 drivers/isdn/divert/divert_procfs.c |  336 ----
 drivers/isdn/divert/isdn_divert.c   |  846 --------
 drivers/isdn/divert/isdn_divert.h   |  132 --
 drivers/isdn/i4l/Kconfig            |  127 --
 drivers/isdn/i4l/Makefile           |   14 -
 drivers/isdn/i4l/isdn_audio.c       |  711 -------
 drivers/isdn/i4l/isdn_audio.h       |   44 -
 drivers/isdn/i4l/isdn_bsdcomp.c     |  930 ---------
 drivers/isdn/i4l/isdn_common.c      | 2368 ----------------------
 drivers/isdn/i4l/isdn_common.h      |   47 -
 drivers/isdn/i4l/isdn_concap.c      |   99 -
 drivers/isdn/i4l/isdn_concap.h      |   11 -
 drivers/isdn/i4l/isdn_net.c         | 3198 -----------------------------
 drivers/isdn/i4l/isdn_net.h         |  151 --
 drivers/isdn/i4l/isdn_ppp.c         | 3046 ----------------------------
 drivers/isdn/i4l/isdn_ppp.h         |   41 -
 drivers/isdn/i4l/isdn_tty.c         | 3756 -----------------------------------
 drivers/isdn/i4l/isdn_tty.h         |  120 --
 drivers/isdn/i4l/isdn_ttyfax.c      | 1123 -----------
 drivers/isdn/i4l/isdn_ttyfax.h      |   17 -
 drivers/isdn/i4l/isdn_v110.c        |  625 ------
 drivers/isdn/i4l/isdn_v110.h        |   29 -
 drivers/isdn/i4l/isdn_x25iface.c    |  332 ----
 drivers/isdn/i4l/isdn_x25iface.h    |   30 -
 drivers/isdn/isdnloop/Makefile      |    6 -
 drivers/isdn/isdnloop/isdnloop.c    | 1528 --------------
 drivers/isdn/isdnloop/isdnloop.h    |  112 --
 include/linux/concap.h              |  112 --
 include/linux/isdn.h                |  473 -----
 include/linux/isdn_divertif.h       |   35 -
 include/linux/isdn_ppp.h            |  194 --
 include/linux/isdnif.h              |  505 -----
 include/linux/wanrouter.h           |   11 -
 include/uapi/linux/isdn.h           |  144 --
 include/uapi/linux/isdn_divertif.h  |   31 -
 include/uapi/linux/isdn_ppp.h       |   68 -
 include/uapi/linux/isdnif.h         |   57 -
 include/uapi/linux/wanrouter.h      |   18 -
 59 files changed, 2 insertions(+), 26860 deletions(-)
 delete mode 100644 Documentation/isdn/INTERFACE
 delete mode 100644 Documentation/isdn/INTERFACE.fax
 delete mode 100644 Documentation/isdn/README
 delete mode 100644 Documentation/isdn/README.FAQ
 delete mode 100644 Documentation/isdn/README.audio
 delete mode 100644 Documentation/isdn/README.concap
 delete mode 100644 Documentation/isdn/README.diversion
 delete mode 100644 Documentation/isdn/README.fax
 delete mode 100644 Documentation/isdn/README.hfc-pci
 delete mode 100644 Documentation/isdn/README.syncppp
 delete mode 100644 Documentation/isdn/README.x25
 delete mode 100644 Documentation/isdn/syncPPP.FAQ
 delete mode 100644 drivers/isdn/capi/capidrv.c
 delete mode 100644 drivers/isdn/capi/capidrv.h
 delete mode 100644 drivers/isdn/divert/Makefile
 delete mode 100644 drivers/isdn/divert/divert_init.c
 delete mode 100644 drivers/isdn/divert/divert_procfs.c
 delete mode 100644 drivers/isdn/divert/isdn_divert.c
 delete mode 100644 drivers/isdn/divert/isdn_divert.h
 delete mode 100644 drivers/isdn/i4l/Kconfig
 delete mode 100644 drivers/isdn/i4l/isdn_audio.c
 delete mode 100644 drivers/isdn/i4l/isdn_audio.h
 delete mode 100644 drivers/isdn/i4l/isdn_bsdcomp.c
 delete mode 100644 drivers/isdn/i4l/isdn_common.c
 delete mode 100644 drivers/isdn/i4l/isdn_common.h
 delete mode 100644 drivers/isdn/i4l/isdn_concap.c
 delete mode 100644 drivers/isdn/i4l/isdn_concap.h
 delete mode 100644 drivers/isdn/i4l/isdn_net.c
 delete mode 100644 drivers/isdn/i4l/isdn_net.h
 delete mode 100644 drivers/isdn/i4l/isdn_ppp.c
 delete mode 100644 drivers/isdn/i4l/isdn_ppp.h
 delete mode 100644 drivers/isdn/i4l/isdn_tty.c
 delete mode 100644 drivers/isdn/i4l/isdn_tty.h
 delete mode 100644 drivers/isdn/i4l/isdn_ttyfax.c
 delete mode 100644 drivers/isdn/i4l/isdn_ttyfax.h
 delete mode 100644 drivers/isdn/i4l/isdn_v110.c
 delete mode 100644 drivers/isdn/i4l/isdn_v110.h
 delete mode 100644 drivers/isdn/i4l/isdn_x25iface.c
 delete mode 100644 drivers/isdn/i4l/isdn_x25iface.h
 delete mode 100644 drivers/isdn/isdnloop/Makefile
 delete mode 100644 drivers/isdn/isdnloop/isdnloop.c
 delete mode 100644 drivers/isdn/isdnloop/isdnloop.h
 delete mode 100644 include/linux/concap.h
 delete mode 100644 include/linux/isdn.h
 delete mode 100644 include/linux/isdn_divertif.h
 delete mode 100644 include/linux/isdn_ppp.h
 delete mode 100644 include/linux/isdnif.h
 delete mode 100644 include/linux/wanrouter.h
 delete mode 100644 include/uapi/linux/isdn.h
 delete mode 100644 include/uapi/linux/isdn_divertif.h
 delete mode 100644 include/uapi/linux/isdn_ppp.h
 delete mode 100644 include/uapi/linux/isdnif.h
 delete mode 100644 include/uapi/linux/wanrouter.h

(limited to 'include')

diff --git a/Documentation/isdn/INTERFACE b/Documentation/isdn/INTERFACE
deleted file mode 100644
index 5df17e5b25c8..000000000000
--- a/Documentation/isdn/INTERFACE
+++ /dev/null
@@ -1,759 +0,0 @@
-$Id: INTERFACE,v 1.15.8.2 2001/03/13 16:17:07 kai Exp $
-
-Description of the Interface between Linklevel and Hardwarelevel
-  of isdn4linux:
-
-
-  The Communication between Linklevel (LL) and Hardwarelevel (HL)
-  is based on the struct isdn_if (defined in isdnif.h).
-
-  An HL-driver can register itself at LL by calling the function
-  register_isdn() with a pointer to that struct. Prior to that, it has
-  to preset some of the fields of isdn_if. The LL sets the rest of
-  the fields. All further communication is done via callbacks using
-  the function-pointers defined in isdn_if.
-
-  Changes/Version numbering:
-
-  During development of the ISDN subsystem, several changes have been
-  made to the interface. Before it went into kernel, the package
-  had a unique version number. The last version, distributed separately
-  was 0.7.4. When the subsystem went into kernel, every functional unit
-  got a separate version number. These numbers are shown at initialization,
-  separated by slashes:
-
-     c.c/t.t/n.n/p.p/a.a/v.v
-
-  where
-
-   c.c is the revision of the common code.
-   t.t is the revision of the tty related code.
-   n.n is the revision of the network related code.
-   p.p is the revision of the ppp related code.
-   a.a is the revision of the audio related code.
-   v.v is the revision of the V.110 related code.
-
-  Changes in this document are marked with '***CHANGEx' where x representing
-  the version number. If that number starts with 0, it refers to the old,
-  separately distributed package. If it starts with one of the letters
-  above, it refers to the revision of the corresponding module. 
-  ***CHANGEIx refers to the revision number of the isdnif.h  
-
-1. Description of the fields of isdn_if:
-
-  int channels;
-
-    This field has to be set by the HL-driver to the number of channels
-    supported prior to calling register_isdn(). Upon return of the call,
-    the LL puts an id there, which has to be used by the HL-driver when
-    invoking the other callbacks.
-
-  int maxbufsize;
-
-    ***CHANGE0.6: New since this version.
-
-    Also to be preset by the HL-driver. With this value the HL-driver
-    tells the LL the maximum size of a data-packet it will accept. 
-
-  unsigned long features;
-
-    To be preset by the HL-driver. Using this field, the HL-driver
-    announces the features supported. At the moment this is limited to
-    report the supported layer2 and layer3-protocols. For setting this
-    field the constants ISDN_FEATURE..., declared in isdnif.h have to be
-    used.
-
-    ***CHANGE0.7.1: The line type (1TR6, EDSS1) has to be set.
-
-  unsigned short hl_hdrlen;
-
-    ***CHANGE0.7.4: New field.
-
-    To be preset by the HL-driver, if it supports sk_buff's. The driver
-    should put here the amount of additional space needed in sk_buff's for
-    its internal purposes. Drivers not supporting sk_buff's should 
-    initialize this field to 0.
-
-  void (*rcvcallb_skb)(int, int, struct sk_buff *)
-
-    ***CHANGE0.7.4: New field.
-
-    This field will be set by LL. The HL-driver delivers received data-
-    packets by calling this function. Upon calling, the HL-driver must
-    already have its private data pulled off the head of the sk_buff.
-
-    Parameter:
-      int              driver-Id
-      int              Channel-number locally to the driver. (starting with 0)
-      struct sk_buff * Pointer to sk_buff, containing received data.
-
-  int (*statcallb)(isdn_ctrl*);
-
-    This field will be set by LL. This function has to be called by the
-    HL-driver for signaling status-changes or other events to the LL.
-
-    Parameter:
-      isdn_ctrl*
-
-      The struct isdn_ctrl also defined in isdn_if. The exact meanings of its
-      fields are described together with the descriptions of the possible
-      events. Here is only a short description of the fields:
-
-        driver  = driver Id.
-        command = event-type. (one of the constants ISDN_STAT_...)
-        arg     = depends on event-type.
-        num     = depends on event-type.
-
-    Returnvalue:
-      0 on success, else -1
-
-  int (*command)(isdn_ctrl*);
-
-    This field has to be preset by the HL-driver. It points to a function,
-    to be called by LL to perform functions like dialing, B-channel
-    setup, etc. The exact meaning of the parameters is described with the
-    descriptions of the possible commands.
-
-    Parameter:
-      isdn_ctrl*
-        driver  = driver-Id
-        command = command to perform. (one of the constants ISDN_CMD_...)
-        arg     = depends on command.
-        num     = depends on command.
-    
-    Returnvalue:
-      >=0 on success, else error-code (-ENODEV etc.)
-
-  int (*writebuf_skb)(int, int, int, struct sk_buff *)
-
-    ***CHANGE0.7.4: New field.
-    ***CHANGEI.1.21: New field.
-
-    This field has to be preset by the HL-driver. The given function will
-    be called by the LL for delivering data to be send via B-Channel.
-
- 
-    Parameter:
-      int              driver-Id ***CHANGE0.7.4: New parameter.
-      int              channel-number locally to the HL-driver. (starts with 0)
-      int	       ack ***ChangeI1.21: New parameter
-		       If this is !0, the driver has to signal the delivery
-		       by sending an ISDN_STAT_BSENT. If this is 0, the driver
-		       MUST NOT send an ISDN_STAT_BSENT.
-      struct sk_buff * Pointer to sk_buff containing data to be send via
-                       B-channel.
-
-    Returnvalue:
-      Length of data accepted on success, else error-code (-EINVAL on
-      oversized packets etc.)
-
-  int (*writecmd)(u_char*, int, int, int, int);
-
-    This field has to be preset by the HL-driver. The given function will be
-    called to perform write-requests on /dev/isdnctrl (i.e. sending commands
-    to the card) The data-format is hardware-specific. This function is
-    intended for debugging only. It is not necessary for normal operation
-    and never will be called by the tty-emulation- or network-code. If
-    this function is not supported, the driver has to set NULL here.
-
-    Parameter:
-      u_char* pointer to data.
-      int     length of data.
-      int     flag: 0 = call from within kernel-space. (HL-driver must use
-                        memcpy, may NOT use schedule())
-                    1 = call from user-space. (HL-driver must use
-                        memcpy_fromfs, use of schedule() allowed)
-      int     driver-Id.
-      int     channel-number locally to the HL-driver. (starts with 0)
-
-***CHANGEI1.14: The driver-Id and channel-number are new since this revision.
-
-    Returnvalue:
-      Length of data accepted on success, else error-code (-EINVAL etc.)
-
-  int (*readstat)(u_char*, int, int, int, int);
-
-    This field has to be preset by the HL-driver. The given function will be
-    called to perform read-requests on /dev/isdnctrl (i.e. reading replies
-    from the card) The data-format is hardware-specific. This function is
-    intended for debugging only. It is not necessary for normal operation
-    and never will be called by the tty-emulation- or network-code. If
-    this function is not supported, the driver has to set NULL here.
-
-    Parameter:
-      u_char* pointer to data.
-      int     length of data.
-      int     flag: 0 = call from within kernel-space. (HL-driver must use
-                        memcpy, may NOT use schedule())
-                    1 = call from user-space. (HL-driver must use
-                        memcpy_fromfs, use of schedule() allowed)
-      int     driver-Id.
-      int     channel-number locally to the HL-driver. (starts with 0)
-
-***CHANGEI1.14: The driver-Id and channel-number are new since this revision.
-
-    Returnvalue:
-      Length of data on success, else error-code (-EINVAL etc.)
-
-  char id[20];
-       ***CHANGE0.7: New since this version.
-
-   This string has to be preset by the HL-driver. Its purpose is for
-   identification of the driver by the user. Eg.: it is shown in the
-   status-info of /dev/isdninfo. Furthermore it is used as Id for binding
-   net-interfaces to a specific channel. If a string of length zero is
-   given, upon return, isdn4linux will replace it by a generic name. (line0,
-   line1 etc.) It is recommended to make this string configurable during
-   module-load-time. (copy a global variable to this string.) For doing that,
-   modules 1.2.8 or newer are necessary.
-
-2. Description of the commands, a HL-driver has to support:
-
-   All commands will be performed by calling the function command() described
-   above from within the LL. The field command of the struct-parameter will
-   contain the desired command, the field driver is always set to the
-   appropriate driver-Id.
-
-   Until now, the following commands are defined:
-
-***CHANGEI1.34: The parameter "num" has been replaced by a union "parm" containing
-                the old "num" and a new setup_type struct used for ISDN_CMD_DIAL
-                and ISDN_STAT_ICALL callback.
-
-   ISDN_CMD_IOCTL:
-
-     This command is intended for performing ioctl-calls for configuring
-     hardware or similar purposes (setting port-addresses, loading firmware
-     etc.) For this purpose, in the LL all ioctl-calls with an argument
-     >= IIOCDRVCTL (0x100) will be handed transparently to this
-     function after subtracting 0x100 and placing the result in arg.
-     Example:
-       If a userlevel-program calls ioctl(0x101,...) the function gets
-       called with the field command set to 1.
-
-     Parameter:
-       driver   = driver-Id.
-       command  = ISDN_CMD_IOCTL
-       arg      = Original ioctl-cmd - IIOCDRVCTL
-       parm.num = first bytes filled with (unsigned long)arg
-   
-     Returnvalue:
-       Depending on driver.
-
-  
-  ISDN_CMD_DIAL:
-
-    This command is used to tell the HL-driver it should dial a given
-    number.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_DIAL
-      arg         = channel-number locally to the driver. (starting with 0)
-      
-      parm.setup.phone  = An ASCII-String containing the number to dial.
-      parm.setup.eazmsn = An ASCII-Sting containing the own EAZ or MSN.
-      parm.setup.si1    = The Service-Indicator.
-      parm.setup.si2    = Additional Service-Indicator.
-
-                    If the Line has been designed as SPV (a special german
-                    feature, meaning semi-leased-line) the phone has to
-                    start with an "S".
-      ***CHANGE0.6: In previous versions the EAZ has been given in the
-                    highbyte of arg.
-    ***CHANGE0.7.1: New since this version: ServiceIndicator and AddInfo.
-
-  ISDN_CMD_ACCEPTD:
-
-    With this command, the HL-driver is told to accept a D-Channel-setup.
-    (Response to an incoming call)
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_ACCEPTD
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_CMD_ACCEPTB:
-
-    With this command, the HL-driver is told to perform a B-Channel-setup.
-    (after establishing D-Channel-Connection)
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_ACCEPTB
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_CMD_HANGUP:
-
-    With this command, the HL-driver is told to hangup (B-Channel if
-    established first, then D-Channel). This command is also used for
-    actively rejecting an incoming call.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_HANGUP
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_CMD_CLREAZ:
-
-    With this command, the HL-driver is told not to signal incoming
-    calls to the LL.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_CLREAZ
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_CMD_SETEAZ:
-
-    With this command, the HL-driver is told to signal incoming calls for
-    the given EAZs/MSNs to the LL.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETEAZ
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing the desired EAZ's/MSN's
-                    (comma-separated). If an empty String is given, the
-                    HL-driver should respond to ALL incoming calls,
-                    regardless of the destination-address.
-      ***CHANGE0.6: New since this version the "empty-string"-feature.
-
-  ISDN_CMD_GETEAZ: (currently unused)
-
-    With this command, the HL-driver is told to report the current setting
-    given with ISDN_CMD_SETEAZ.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_GETEAZ
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing the current EAZ's/MSN's
-
-  ISDN_CMD_SETSIL: (currently unused)
-
-    With this command, the HL-driver is told to signal only incoming
-    calls with the given Service-Indicators.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETSIL
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing the desired Service-Indicators.
-
-  ISDN_CMD_GETSIL: (currently unused)
-
-    With this command, the HL-driver is told to return the current
-    Service-Indicators it will respond to.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETSIL
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing the current Service-Indicators.
-
-  ISDN_CMD_SETL2:
-
-    With this command, the HL-driver is told to select the given Layer-2-
-    protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or
-    ISDN_CMD_ACCEPTD.
-
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETL2
-      arg         = channel-number locally to the driver. (starting with 0)
-                    logical or'ed with (protocol-Id << 8)
-                    protocol-Id is one of the constants ISDN_PROTO_L2...
-      parm        = unused.
-
-  ISDN_CMD_GETL2: (currently unused)
-
-    With this command, the HL-driver is told to return the current
-    setting of the Layer-2-protocol.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_GETL2
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-    Returnvalue:
-      current protocol-Id (one of the constants ISDN_L2_PROTO)
-
-  ISDN_CMD_SETL3:
-
-    With this command, the HL-driver is told to select the given Layer-3-
-    protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or
-    ISDN_CMD_ACCEPTD.
-
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_SETL3
-      arg         = channel-number locally to the driver. (starting with 0)
-                    logical or'ed with (protocol-Id << 8)
-                    protocol-Id is one of the constants ISDN_PROTO_L3...
-      parm.fax    = Pointer to T30_s fax struct. (fax usage only)
-
-  ISDN_CMD_GETL2: (currently unused)
-
-    With this command, the HL-driver is told to return the current
-    setting of the Layer-3-protocol.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_GETL3
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-    Returnvalue:
-      current protocol-Id (one of the constants ISDN_L3_PROTO)
-
-  ISDN_CMD_PROCEED: 
-
-    With this command, the HL-driver is told to proceed with a incoming call.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_PROCEED
-      arg         = channel-number locally to the driver. (starting with 0)
-      setup.eazmsn= empty string or string send as uus1 in DSS1 with 
-                    PROCEED message
-
-  ISDN_CMD_ALERT: 
-
-    With this command, the HL-driver is told to alert a proceeding call.
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_ALERT
-      arg         = channel-number locally to the driver. (starting with 0)
-      setup.eazmsn= empty string or string send as uus1 in DSS1 with 
-                    ALERT message
-
-  ISDN_CMD_REDIR: 
-
-    With this command, the HL-driver is told to redirect a call in proceeding
-    or alerting state.  
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_REDIR
-      arg         = channel-number locally to the driver. (starting with 0)
-      setup.eazmsn= empty string or string send as uus1 in DSS1 protocol
-      setup.screen= screening indicator
-      setup.phone = redirected to party number
-
-  ISDN_CMD_PROT_IO:
-
-    With this call, the LL-driver invokes protocol specific features through
-    the LL.
-    The call is not implicitely bound to a connection.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_CMD_PROT_IO
-      arg         = The lower 8 Bits define the addressed protocol as defined
-                    in ISDN_PTYPE..., the upper bits are used to differentiate
-                    the protocol specific CMD.  
-      
-      para        = protocol and function specific. See isdnif.h for detail.
-
-
-  ISDN_CMD_FAXCMD:
-
-    With this command the HL-driver receives a fax sub-command.
-    For details refer to INTERFACE.fax
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_CMD_FAXCMD
-      arg         = channel-number locally to the driver. (starting with 0)
-      parm        = unused.
-
-
-3. Description of the events to be signaled by the HL-driver to the LL.
-
-  All status-changes are signaled via calling the previously described
-  function statcallb(). The field command of the struct isdn_cmd has
-  to be set by the HL-driver with the appropriate Status-Id (event-number).
-  The field arg has to be set to the channel-number (locally to the driver,
-  starting with 0) to which this event applies. (Exception: STAVAIL-event)
-
-  Until now, the following Status-Ids are defined:
-
-  ISDN_STAT_AVAIL:
-
-    With this call, the HL-driver signals the availability of new data
-    for readstat(). Used only for debugging-purposes, see description
-    of readstat().
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_STAVAIL
-      arg         = length of available data.
-      parm        = unused.
-
-  ISDN_STAT_ICALL:
-  ISDN_STAT_ICALLW:
-
-    With this call, the HL-driver signals an incoming call to the LL.
-    If ICALLW is signalled the incoming call is a waiting call without
-    a available B-chan.
-
-    Parameter:
-      driver            = driver-Id
-      command           = ISDN_STAT_ICALL
-      arg               = channel-number, locally to the driver. (starting with 0)
-      para.setup.phone  = Callernumber.
-      para.setup.eazmsn = CalledNumber.
-      para.setup.si1    = Service Indicator.
-      para.setup.si2    = Additional Service Indicator.
-      para.setup.plan   = octet 3 from Calling party number Information Element.
-      para.setup.screen = octet 3a from Calling party number Information Element.
-
-    Return:
-      0           = No device matching this call.
-      1           = At least one device matching this call (RING on ttyI).
-                    HL-driver may send ALERTING on the D-channel in this case.
-      2           = Call will be rejected.
-      3           = Incoming called party number is currently incomplete.
-                    Additional digits are required. 
-                    Used for signalling with PtP connections.
-      4	          = Call will be held in a proceeding state 
-                    (HL driver sends PROCEEDING)
-                    Used when a user space prog needs time to interpret a call
-		    para.setup.eazmsn may be filled with an uus1 message of
-		    30 octets maximum. Empty string if no uus. 
-      5           = Call will be actively deflected to another party
-                    Only available in DSS1/EURO protocol
-		    para.setup.phone must be set to destination party number
-		    para.setup.eazmsn may be filled with an uus1 message of
-		    30 octets maximum. Empty string if no uus. 
-      -1          = An error happened. (Invalid parameters for example.)
-  The keypad support now is included in the dial command.	        
-
-
-  ISDN_STAT_RUN:
-
-    With this call, the HL-driver signals availability of the ISDN-card.
-    (after initializing, loading firmware)
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_RUN
-      arg         = unused.
-      parm        = unused.
-
-  ISDN_STAT_STOP:
-
-    With this call, the HL-driver signals unavailability of the ISDN-card.
-    (before unloading, while resetting/reconfiguring the card)
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_STOP
-      arg         = unused.
-      parm        = unused.
-
-  ISDN_STAT_DCONN:
-
-   With this call, the HL-driver signals the successful establishment of
-   a D-Channel-connection. (Response to ISDN_CMD_ACCEPTD or ISDN_CMD_DIAL)
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_DCONN
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_STAT_BCONN:
-
-   With this call, the HL-driver signals the successful establishment of
-   a B-Channel-connection. (Response to ISDN_CMD_ACCEPTB or because the
-   remote-station has initiated establishment)
-
-   The HL driver should call this when the logical l2/l3 protocol 
-   connection on top of the physical B-channel is established.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_BCONN
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.num    = ASCII-String, containing type of connection (for analog
-		    modem only). This will be appended to the CONNECT message
-		    e.g. 14400/V.32bis
-
-  ISDN_STAT_DHUP:
-
-   With this call, the HL-driver signals the shutdown of a
-   D-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP,
-   or caused by a remote-hangup or if the remote-station has actively
-   rejected a call.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_DHUP
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_STAT_BHUP:
-
-   With this call, the HL-driver signals the shutdown of a
-   B-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP,
-   or caused by a remote-hangup.
-
-   The HL driver should call this as soon as the logical l2/l3 protocol 
-   connection on top of the physical B-channel is released.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_BHUP
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_STAT_CINF:
-
-   With this call, the HL-driver delivers charge-unit information to the
-   LL.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_CINF
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.num    = ASCII string containing charge-units (digits only).
-
-  ISDN_STAT_LOAD: (currently unused)
-
-  ISDN_STAT_UNLOAD:
-
-   With this call, the HL-driver signals that it will be unloaded now. This
-   tells the LL to release all corresponding data-structures.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_UNLOAD
-      arg         = unused.
-      parm        = unused.
-
-  ISDN_STAT_BSENT:
-
-    With this call the HL-driver signals the delivery of a data-packet.
-    This callback is used by the network-interfaces only, tty-Emulation
-    does not need this call.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_BSENT
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.length = ***CHANGEI.1.21: New field.
-		    the driver has to set this to the original length
-		    of the skb at the time of receiving it from the linklevel.
-
-  ISDN_STAT_NODCH:
-
-    With this call, the driver has to respond to a prior ISDN_CMD_DIAL, if
-    no D-Channel is available.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_NODCH
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
-  ISDN_STAT_ADDCH: 
-
-    This call is for HL-drivers, which are unable to check card-type
-    or numbers of supported channels before they have loaded any firmware
-    using ioctl. Those HL-drivers simply set the channel-parameter to a
-    minimum channel-number when registering, and later if they know
-    the real amount, perform this call, allocating additional channels.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_ADDCH
-      arg         = number of channels to be added.
-      parm        = unused.
-
-  ISDN_STAT_CAUSE:
-
-    With this call, the HL-driver delivers CAUSE-messages to the LL.
-    Currently the LL does not use these messages. Their contents are simply
-    logged via kernel-messages. Therefore, currently the format of the
-    messages is completely free. However they should be printable.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_CAUSE
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.num    = ASCII string containing CAUSE-message.
-
-  ISDN_STAT_DISPLAY:
-
-    With this call, the HL-driver delivers DISPLAY-messages to the LL.
-    Currently the LL does not use these messages.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_DISPLAY
-      arg         = channel-number, locally to the driver. (starting with 0)
-      para.display= string containing DISPLAY-message.
-
-  ISDN_STAT_PROT:
-
-    With this call, the HL-driver delivers protocol specific infos to the LL.
-    The call is not implicitly bound to a connection.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_PROT
-      arg         = The lower 8 Bits define the addressed protocol as defined
-                    in ISDN_PTYPE..., the upper bits are used to differentiate
-                    the protocol specific STAT.  
-      
-      para        = protocol and function specific. See isdnif.h for detail.
-
-  ISDN_STAT_DISCH:
-
-    With this call, the HL-driver signals the LL to disable or enable the
-    use of supplied channel and driver.
-    The call may be used to reduce the available number of B-channels after
-    loading the driver. The LL has to ignore a disabled channel when searching
-    for free channels. The HL driver itself never delivers STAT callbacks for
-    disabled channels. 	    
-    The LL returns a nonzero code if the operation was not successful or the
-    selected channel is actually regarded as busy.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_DISCH
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.num[0] = 0 if channel shall be disabled, else enabled.
-
-  ISDN_STAT_L1ERR:
-
-    ***CHANGEI1.21 new status message.
-    A signal can be sent to the linklevel if a Layer1-error results in
-    packet-loss on receive or send. The field errcode of the cmd.parm
-    union describes the error more precisely.
-
-    Parameter:
-      driver      = driver-Id
-      command     = ISDN_STAT_L1ERR
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm.errcode= ISDN_STAT_L1ERR_SEND:     Packet lost while sending.
-		    ISDN_STAT_L1ERR_RECV:     Packet lost while receiving.
-  ISDN_STAT_FAXIND:
-
-    With this call the HL-driver signals a fax sub-command to the LL.
-    For details refer to INTERFACE.fax
-
-    Parameter:
-      driver      = driver-Id.
-      command     = ISDN_STAT_FAXIND
-      arg         = channel-number, locally to the driver. (starting with 0)
-      parm        = unused.
-
diff --git a/Documentation/isdn/INTERFACE.fax b/Documentation/isdn/INTERFACE.fax
deleted file mode 100644
index 9c8c6d914ec7..000000000000
--- a/Documentation/isdn/INTERFACE.fax
+++ /dev/null
@@ -1,163 +0,0 @@
-$Id: INTERFACE.fax,v 1.2 2000/08/06 09:22:50 armin Exp $
-
-
-Description of the fax-subinterface between linklevel and hardwarelevel of 
-  isdn4linux. 
-
-  The communication between linklevel (LL) and hardwarelevel (HL) for fax
-  is based on the struct T30_s (defined in isdnif.h).
-  This struct is allocated in the LL.  
-  In order to use fax, the LL provides the pointer to this struct with the 
-  command ISDN_CMD_SETL3 (parm.fax). This pointer expires in case of hangup 
-  and when a new channel to a new connection is assigned. 
-
-
-Data handling:
-  In send-mode the HL-driver has to handle the <DLE> codes and the bit-order 
-  conversion by itself. 
-  In receive-mode the LL-driver takes care of the bit-order conversion
-  (specified by +FBOR)
-
-Structure T30_s description:
-
-  This structure stores the values (set by AT-commands), the remote-
-  capability-values and the command-codes between LL and HL.
-
-  If the HL-driver receives ISDN_CMD_FAXCMD, all needed information
-  is in this struct set by the LL.
-  To signal information to the LL, the HL-driver has to set the 
-  parameters and use ISDN_STAT_FAXIND.
-  (Please refer to INTERFACE)
-
-Structure T30_s:
-
-  All members are 8-bit unsigned (__u8)
-
-  -  resolution     
-  -  rate
-  -  width
-  -  length
-  -  compression
-  -  ecm
-  -  binary
-  -  scantime
-  -  id[]
-  Local faxmachine's parameters, set by +FDIS, +FDCS, +FLID, ...
-
-  -  r_resolution
-  -  r_rate
-  -  r_width
-  -  r_length
-  -  r_compression
-  -  r_ecm
-  -  r_binary
-  -  r_scantime
-  -  r_id[]
-  Remote faxmachine's parameters. To be set by HL-driver.
-
-  -  phase      
-  Defines the actual state of fax connection. Set by HL or LL
-  depending on progress and type of connection.
-  If the phase changes because of an AT command, the LL driver
-  changes this value. Otherwise the HL-driver takes care of it, but
-  only necessary on call establishment (from IDLE to PHASE_A).
-  (one of the constants ISDN_FAX_PHASE_[IDLE,A,B,C,D,E])
-
-  -  direction
-  Defines outgoing/send or incoming/receive connection.
-  (ISDN_TTY_FAX_CONN_[IN,OUT])
-
-  -  code
-  Commands from LL to HL; possible constants : 
-      ISDN_TTY_FAX_DR        signals +FDR command to HL
-
-      ISDN_TTY_FAX_DT        signals +FDT command to HL 
-
-      ISDN_TTY_FAX_ET        signals +FET command to HL
-
-
-  Other than that the "code" is set with the hangup-code value at
-  the end of connection for the +FHNG message.
-        
-  -  r_code 
-  Commands from HL to LL; possible constants :
-      ISDN_TTY_FAX_CFR       output of +FCFR message. 
-
-      ISDN_TTY_FAX_RID       output of remote ID set in r_id[]
-                             (+FCSI/+FTSI on send/receive)
-
-      ISDN_TTY_FAX_DCS       output of +FDCS and CONNECT message,
-                             switching to phase C.
-
-      ISDN_TTY_FAX_ET        signals end of data,
-                             switching to phase D.
-
-      ISDN_TTY_FAX_FCON      signals the established, outgoing connection,
-                             switching to phase B.
-
-      ISDN_TTY_FAX_FCON_I    signals the established, incoming connection,
-                             switching to phase B.
-
-      ISDN_TTY_FAX_DIS       output of +FDIS message and values.
-
-      ISDN_TTY_FAX_SENT      signals that all data has been sent 
-                             and <DLE><ETX> is acknowledged,
-                             OK message will be sent.
-
-      ISDN_TTY_FAX_PTS       signals a msg-confirmation (page sent successfully),
-                             depending on fet value:
-                             0: output OK message (more pages follow)
-                             1: switching to phase B (next document)
-
-      ISDN_TTY_FAX_TRAIN_OK  output of +FDCS and OK message (for receive mode).
-
-      ISDN_TTY_FAX_EOP       signals end of data in receive mode,
-                             switching to phase D.
-
-      ISDN_TTY_FAX_HNG       output of the +FHNG and value set by code and
-                             OK message, switching to phase E.
-
-
-  -  badlin
-  Value of +FBADLIN  
-
-  -  badmul
-  Value of +FBADMUL
-
-  -  bor
-  Value of +FBOR
-
-  -  fet
-  Value of +FET command in send-mode.
-  Set by HL in receive-mode for +FET message.
-
-  -  pollid[]  
-  ID-string, set by +FCIG
-
-  -  cq
-  Value of +FCQ
-
-  -  cr
-  Value of +FCR
-
-  -  ctcrty
-  Value of +FCTCRTY
-
-  -  minsp
-  Value of +FMINSP
-
-  -  phcto
-  Value of +FPHCTO
-
-  -  rel
-  Value of +FREL
-
-  -  nbc
-  Value of +FNBC (0,1)
-  (+FNBC is not a known class 2 fax command, I added this to change the
-   automatic "best capabilities" connection in the eicon HL-driver)
-
-  
-Armin
-mac@melware.de
-
diff --git a/Documentation/isdn/README b/Documentation/isdn/README
deleted file mode 100644
index 74bd2bdb455b..000000000000
--- a/Documentation/isdn/README
+++ /dev/null
@@ -1,599 +0,0 @@
-README for the ISDN-subsystem
-
-1. Preface
-
-  1.1 Introduction
-
-  This README describes how to set up and how to use the different parts
-  of the ISDN-subsystem.
-
-  For using the ISDN-subsystem, some additional userlevel programs are
-  necessary. Those programs and some contributed utilities are available
-  at
-
-   ftp.isdn4linux.de
-
-   /pub/isdn4linux/isdn4k-utils-<VersionNumber>.tar.gz
-
-
-  We also have set up a mailing-list:
-
-   The isdn4linux-project originates in Germany, and therefore, for historical
-   reasons, the mailing-list's primary language is German. However, mails
-   written in English have been welcome all the time.
-
-   to subscribe: write an email to majordomo@listserv.isdn4linux.de,
-   Subject irrelevant, in the message body:
-   subscribe isdn4linux <your_email_address>
-
-   To write to the mailing-list, write to isdn4linux@listserv.isdn4linux.de
-
-   This mailinglist is bidirectionally gated to the newsgroup
-
-     de.alt.comm.isdn4linux
-
-  There is also a well maintained FAQ in English available at
-     https://www.mhessler.de/i4lfaq/
-  It can be viewed online, or downloaded in sgml/text/html format.
-  The FAQ can also be viewed online at
-     https://www.isdn4linux.de/faq/i4lfaq.html
-  or downloaded from
-     ftp://ftp.isdn4linux.de/pub/isdn4linux/FAQ/
-
-  1.2 Technical details
-
-  In the following Text, the terms MSN and EAZ are used.
-
-  MSN is the abbreviation for (M)ultiple(S)ubscriber(N)umber, and applies
-  to Euro(EDSS1)-type lines. Usually it is simply the phone number.
-
-  EAZ is the abbreviation of (E)ndgeraete(A)uswahl(Z)iffer and
-  applies to German 1TR6-type lines. This is a one-digit string,
-  simply appended to the base phone number
-
-  The internal handling is nearly identical, so replace the appropriate
-  term to that one, which applies to your local ISDN-environment.
-
-  When the link-level-module isdn.o is loaded, it supports up to 16
-  low-level-modules with up to 64 channels. (The number 64 is arbitrarily
-  chosen and can be configured at compile-time --ISDN_MAX in isdn.h).
-  A low-level-driver can register itself through an interface (which is
-  defined in isdnif.h) and gets assigned a slot.
-  The following char-devices are made available for each channel:
-
-  A raw-control-device with the following functions:
-     write: raw D-channel-messages (format: depends on driver).
-     read:  raw D-channel-messages (format: depends on driver).
-     ioctl: depends on driver, i.e. for the ICN-driver, the base-address of
-            the ports and the shared memory on the card can be set and read
-            also the boot-code and the protocol software can be loaded into
-            the card.
-
-   O N L Y !!!  for debugging (no locking against other devices):
-   One raw-data-device with the following functions:
-     write: data to B-channel.
-     read:  data from B-channel.
-
-   In addition the following devices are made available:
-
-   128 tty-devices (64 cuix and 64 ttyIx) with integrated modem-emulator:
-   The functionality is almost the same as that of a serial device
-   (the line-discs are handled by the kernel), which lets you run
-   SLIP, CSLIP and asynchronous PPP through the devices. We have tested
-   Seyon, minicom, CSLIP (uri-dip) PPP, mgetty, XCept and Hylafax. 
-
-   The modem-emulation supports the following:
-           1.3.1 Commands:
-
-               ATA      Answer incoming call.
-               ATD<No.> Dial, the number may contain:
-                        [0-9] and [,#.*WPT-S]
-                        the latter are ignored until 'S'.
-                        The 'S' must precede the number, if
-                        the line is a SPV (German 1TR6).
-               ATE0     Echo off.
-               ATE1     Echo on (default).
-               ATH      Hang-up.
-               ATH1     Off hook (ignored).
-               ATH0     Hang-up.
-               ATI      Return "ISDN for Linux...".
-               ATI0        "
-               ATI1        "
-               ATI2     Report of last connection.
-               ATO      On line (data mode).
-               ATQ0     Enable result codes (default).
-               ATQ1     Disable result codes.
-               ATSx=y   Set register x to y.
-               ATSx?    Show contents of register x.
-               ATV0     Numeric responses.
-               ATV1     English responses (default).
-               ATZ      Load registers and EAZ/MSN from Profile.
-               AT&Bx    Set Send-Packet-size to x (max. 4000)
-                        The real packet-size may be limited by the
-                        low-level-driver used. e.g. the HiSax-Module-
-                        limit is 2000. You will get NO Error-Message,
-                        if you set it to higher values, because at the
-                        time of giving this command the corresponding
-                        driver may not be selected (see "Automatic
-                        Assignment") however the size of outgoing packets
-                        will be limited correctly.
-               AT&D0    Ignore DTR
-               AT&D2    DTR-low-edge: Hang up and return to
-                        command mode (default).
-               AT&D3    Same as AT&D2 but also resets all registers.
-               AT&Ex    Set the EAZ/MSN for this channel to x.
-               AT&F     Reset all registers and profile to "factory-defaults"
-               AT&Lx    Set list of phone numbers to listen on.  x is a
-                        list of wildcard patterns separated by semicolon.
-                        If this is set, it has precedence over the MSN set
-                        by AT&E.
-               AT&Rx    Select V.110 bitrate adaption.
-                        This command enables V.110 protocol with 9600 baud
-                        (x=9600), 19200 baud (x=19200) or 38400 baud
-                        (x=38400). A value of x=0 disables V.110 switching
-                        back to default X.75. This command sets the following
-                        Registers:
-                          Reg 14 (Layer-2 protocol):
-                            x = 0:     0
-                            x = 9600:  7
-                            x = 19200: 8
-                            x = 38400: 9
-                          Reg 18.2 = 1
-                          Reg 19 (Additional Service Indicator):
-                            x = 0:       0
-                            x = 9600:  197
-                            x = 19200: 199
-                            x = 38400: 198
-                          Note on value in Reg 19:
-                            There is _NO_ common convention for 38400 baud.
-                            The value 198 is chosen arbitrarily. Users
-                            _MUST_ negotiate this value before establishing
-                            a connection.
-               AT&Sx    Set window-size (x = 1..8) (not yet implemented)
-               AT&V     Show all settings.
-               AT&W0    Write registers and EAZ/MSN to profile. See also
-                        iprofd (5.c in this README).
-               AT&X0    BTX-mode and T.70-mode off (default)
-               AT&X1    BTX-mode on. (S13.1=1, S13.5=0 S14=0, S16=7, S18=7, S19=0)
-               AT&X2    T.70-mode on. (S13.1=1, S13.5=1, S14=0, S16=7, S18=7, S19=0)
-               AT+Rx    Resume a suspended call with CallID x (x = 1,2,3...)
-               AT+Sx    Suspend a call with CallID x (x = 1,2,3...)
-
-           For voice-mode commands refer to README.audio
-
-           1.3.2 Escape sequence:
-               During a connection, the emulation reacts just like
-               a normal modem to the escape sequence <DELAY>+++<DELAY>.
-               (The escape character - default '+' - can be set in the
-               register 2).
-               The DELAY must at least be 1.5 seconds long and delay
-               between the escape characters must not exceed 0.5 seconds.
-
-           1.3.3 Registers:
-
-              Nr.  Default  Description
-              0    0        Answer on ring number.
-                            (no auto-answer if S0=0).
-              1    0        Count of rings.
-              2    43       Escape character.
-                            (a value >= 128 disables the escape sequence).
-              3    13       Carriage return character (ASCII).
-              4    10       Line feed character (ASCII).
-              5    8        Backspace character (ASCII).
-              6    3        Delay in seconds before dialing.
-              7    60       Wait for carrier.
-              8    2        Pause time for comma (ignored)
-              9    6        Carrier detect time (ignored)
-             10    7        Carrier loss to disconnect time (ignored).
-             11    70       Touch tone timing (ignored).
-             12    69       Bit coded register:
-                            Bit 0:    0 = Suppress response messages.
-                                      1 = Show response messages.
-                            Bit 1:    0 = English response messages.
-                                      1 = Numeric response messages.
-                            Bit 2:    0 = Echo off.
-                                      1 = Echo on.
-                            Bit 3     0 = DCD always on.
-                                      1 = DCD follows carrier.
-                            Bit 4     0 = CTS follows RTS
-                                      1 = Ignore RTS, CTS always on.
-                            Bit 5     0 = return to command mode on DTR low.
-                                      1 = Same as 0 but also resets all
-                                          registers.
-                                      See also register 13, bit 2
-                            Bit 6     0 = DSR always on.
-                                      1 = DSR only on if channel is available.
-                            Bit 7     0 = Cisco-PPP-flag-hack off (default).
-                                      1 = Cisco-PPP-flag-hack on.
-             13   0         Bit coded register:
-                            Bit 0:    0 = Use delayed tty-send-algorithm
-                                      1 = Direct tty-send.
-                            Bit 1:    0 = T.70 protocol (Only for BTX!) off
-                                      1 = T.70 protocol (Only for BTX!) on
-                            Bit 2:    0 = Don't hangup on DTR low.
-                                      1 = Hangup on DTR low.
-                            Bit 3:    0 = Standard response messages
-                                      1 = Extended response messages
-                            Bit 4:    0 = CALLER NUMBER before every RING.
-                                      1 = CALLER NUMBER after first RING.
-                            Bit 5:    0 = T.70 extended protocol off
-                                      1 = T.70 extended protocol on
-                            Bit 6:    0 = Special RUNG Message off
-                                      1 = Special RUNG Message on
-                                          "RUNG" is delivered on a ttyI, if
-                                          an incoming call happened (RING) and
-                                          the remote party hung up before any
-                                          local ATA was given.
-			    Bit 7:    0 = Don't show display messages from net
-                                      1 = Show display messages from net
-				          (S12 Bit 1 must be 0 too)      
-             14   0         Layer-2 protocol:
-                                      0 = X75/LAPB with I-frames
-                                      1 = X75/LAPB with UI-frames
-                                      2 = X75/LAPB with BUI-frames
-                                      3 = HDLC
-                                      4 = Transparent (audio)
-                                      7 = V.110, 9600 baud
-                                      8 = V.110, 19200 baud
-                                      9 = V.110, 38400 baud
-                                     10 = Analog Modem (only if hardware supports this)
-                                     11 = Fax G3 (only if hardware supports this)
-             15   0         Layer-3 protocol:
-                                      0 = transparent
-                                      1 = transparent with audio features (e.g. DSP)
-                                      2 = Fax G3 Class 2 commands (S14 has to be set to 11)
-                                      3 = Fax G3 Class 1 commands (S14 has to be set to 11)
-             16   250       Send-Packet-size/16
-             17   8         Window-size (not yet implemented)
-             18   4         Bit coded register, Service-Octet-1 to accept,
-                            or to be used on dialout:
-                            Bit 0:    Service 1 (audio) when set.
-                            Bit 1:    Service 5 (BTX) when set.
-                            Bit 2:    Service 7 (data) when set.
-                            Note: It is possible to set more than one
-                                  bit. In this case, on incoming calls
-                                  the selected services are accepted,
-                                  and if the service is "audio", the
-                                  Layer-2-protocol is automatically
-                                  changed to 4 regardless of the setting
-                                  of register 14. On outgoing calls,
-                                  the most significant 1-bit is chosen to
-                                  select the outgoing service octet.
-             19   0         Service-Octet-2
-             20   0         Bit coded register (readonly)
-                            Service-Octet-1 of last call.
-                            Bit mapping is the same as register 18
-             21   0         Bit coded register (readonly)
-                            Set on incoming call (during RING) to
-                            octet 3 of calling party number IE (Numbering plan)
-                            See section 4.5.10 of ITU Q.931
-             22   0         Bit coded register (readonly)
-                            Set on incoming call (during RING) to
-                            octet 3a of calling party number IE (Screening info)
-                            See section 4.5.10 of ITU Q.931
-             23   0         Bit coded register:
-                            Bit 0:    0 = Add CPN to RING message off
-                                      1 = Add CPN to RING message on
-                            Bit 1:    0 = Add CPN to FCON message off
-                                      1 = Add CPN to FCON message on
-                            Bit 2:    0 = Add CDN to RING/FCON message off
-                                      1 = Add CDN to RING/FCON message on
-
-  Last but not least a (at the moment fairly primitive) device to request
-  the line-status (/dev/isdninfo) is made available.
-
-  Automatic assignment of devices to lines:
-
-  All inactive physical lines are listening to all EAZs for incoming
-  calls and are NOT assigned to a specific tty or network interface.
-  When an incoming call is detected, the driver looks first for a network
-  interface and then for an opened tty which:
-
-  1. is configured for the same EAZ.
-  2. has the same protocol settings for the B-channel.
-  3. (only for network interfaces if the security flag is set)
-     contains the caller number in its access list.
-  4. Either the channel is not bound exclusively to another Net-interface, or
-     it is bound AND the other checks apply to exactly this interface.
-     (For usage of the bind-features, refer to the isdnctrl-man-page)
-
-  Only when a matching interface or tty is found is the call accepted
-  and the "connection" between the low-level-layer and the link-level-layer
-  is established and kept until the end of the connection.
-  In all other cases no connection is established. Isdn4linux can be
-  configured to either do NOTHING in this case (which is useful, if
-  other, external devices with the same EAZ/MSN are connected to the bus)
-  or to reject the call actively. (isdnctrl busreject ...)
-
-  For an outgoing call, the inactive physical lines are searched.
-  The call is placed on the first physical line, which supports the
-  requested protocols for the B-channel. If a net-interface, however
-  is pre-bound to a channel, this channel is used directly.
-
-  This makes it possible to configure several network interfaces and ttys
-  for one EAZ, if the network interfaces are set to secure operation.
-  If an incoming call matches one network interface, it gets connected to it.
-  If another incoming call for the same EAZ arrives, which does not match
-  a network interface, the first tty gets a "RING" and so on.
-
-2 System prerequisites:
-
-  ATTENTION!
-
-  Always use the latest module utilities. The current version is
-  named in Documentation/Changes. Some old versions of insmod
-  are not capable of setting the driver-Ids correctly.
-
-3. Lowlevel-driver configuration.
-
-   Configuration depends on how the drivers are built. See the
-   README.<yourDriver> for information on driver-specific setup.
-
-4. Device-inodes
-
-   The major and minor numbers and their names are described in
-   Documentation/admin-guide/devices.rst. The major numbers are:
-
-     43 for the ISDN-tty's.
-     44 for the ISDN-callout-tty's.
-     45 for control/info/debug devices.
-
-5. Application
-
-   a) For some card-types, firmware has to be loaded into the cards, before
-      proceeding with device-independent setup. See README.<yourDriver>
-      for how to do that.
-
-   b) If you only intend to use ttys, you are nearly ready now.
-
-   c) If you want to have really permanent "Modem"-settings on disk, you
-      can start the daemon iprofd. Give it a path to a file at the command-
-      line. It will store the profile-settings in this file every time
-      an AT&W0 is performed on any ISDN-tty. If the file already exists,
-      all profiles are initialized from this file. If you want to unload
-      any of the modules, kill iprofd first.
-
-   d) For networking, continue: Create an interface:
-       isdnctrl addif isdn0
-
-   e) Set the EAZ (or MSN for Euro-ISDN):
-       isdnctrl eaz isdn0 2
-
-     (For 1TR6 a single digit is allowed, for Euro-ISDN the number is your
-      real MSN e.g.: Phone-Number)
-
-   f) Set the number for outgoing calls on the interface:
-       isdnctrl addphone isdn0 out 1234567
-       ... (this can be executed more than once, all assigned numbers are
-            tried in order)
-      and the number(s) for incoming calls:
-       isdnctrl addphone isdn0 in 1234567
-
-   g) Set the timeout for hang-up:
-       isdnctrl huptimeout isdn0 <timeout_in_seconds>
-
-   h) additionally you may activate charge-hang-up (= Hang up before
-      next charge-info, this only works, if your isdn-provider transmits
-      the charge-info during and after the connection):
-       isdnctrl chargehup isdn0 on
-
-   i) Set the dial mode of the interface:
-       isdnctrl dialmode isdn0 auto
-      "off" means that you (or the system) cannot make any connection
-        (neither incoming nor outgoing connections are possible). Use
-        this if you want to be sure that no connections will be made.
-      "auto" means that the interface is in auto-dial mode, and will
-        attempt to make a connection whenever a network data packet needs
-        the interface's link. Note that this can cause unexpected dialouts,
-        and lead to a high phone bill! Some daemons or other pc's that use
-        this interface can cause this.
-        Incoming connections are also possible.
-      "manual" is a dial mode created to prevent the unexpected dialouts.
-        In this mode, the interface will never make any connections on its
-        own. You must explicitly initiate a connection with "isdnctrl dial
-        isdn0". However, after an idle time of no traffic as configured for
-	the huptimeout value with isdnctrl, the connection _will_ be ended.
-	If you don't want any automatic hangup, set the huptimeout value to 0.
-        "manual" is the default.
-
-   j) Setup the interface with ifconfig as usual, and set a route to it.
-
-   k) (optional) If you run X11 and have Tcl/Tk-wish version 4.0, you can use
-     the script tools/tcltk/isdnmon. You can add actions for line-status
-     changes. See the comments at the beginning of the script for how to
-     do that. There are other tty-based tools in the tools-subdirectory
-     contributed by Michael Knigge (imon), Volker Götz (imontty) and
-     Andreas Kool (isdnmon).
-
-   l) For initial testing, you can set the verbose-level to 2 (default: 0).
-      Then all incoming calls are logged, even if they are not addressed
-      to one of the configured net-interfaces:
-      isdnctrl verbose 2
-
-  Now you are ready! A ping to the set address should now result in an
-  automatic dial-out (look at syslog kernel-messages).
-  The phone numbers and EAZs can be assigned at any time with isdnctrl.
-  You can add as many interfaces as you like with addif following the
-  directions above. Of course, there may be some limitations. But we have
-  tested as many as 20 interfaces without any problem. However, if you
-  don't give an interface name to addif, the  kernel will assign a name
-  which starts with "eth". The number of "eth"-interfaces is limited by
-  the kernel.
-
-5. Additional options for isdnctrl:
-
-   "isdnctrl secure <InterfaceName> on"
-   Only incoming calls, for which the caller-id is listed in the access
-   list of the interface are accepted. You can add caller-id's with the
-   command "isdnctrl addphone <InterfaceName> in <caller-id>"
-   Euro-ISDN does not transmit the leading '0' of the caller-id for an
-   incoming call, therefore you should configure it accordingly.
-   If the real number for the dialout e.g. is "09311234567" the number
-   to configure here is "9311234567". The pattern-match function
-   works similarly to the shell mechanism.
-
-     ?     one arbitrary digit
-     *     zero or arbitrarily many digits
-     [123] one of the digits in the list
-     [1-5] one digit between '1' and '5'
-           a '^' as the first character in a list inverts the list
-
-
-   "isdnctrl secure <InterfaceName> off"
-   Switch off secure operation (default).
-
-   "isdnctrl ihup <InterfaceName> [on|off]"
-   Switch the hang-up-timer for incoming calls on or off.
-
-   "isdnctrl eaz <InterfaceName>"
-   Returns the EAZ of an interface.
-
-   "isdnctrl delphone <InterfaceName> in|out <number>"
-   Deletes a number from one of the access-lists of the interface.
-
-   "isdnctrl delif <InterfaceName>"
-   Removes the interface (and possible slaves) from the kernel.
-   (You have to unregister it with "ifconfig <InterfaceName> down" before).
-
-   "isdnctrl callback <InterfaceName> [on|off]"
-   Switches an interface to callback-mode. In this mode, an incoming call
-   will be rejected and after this the remote-station will be called. If
-   you test this feature by using ping, some routers will re-dial very
-   quickly, so that the callback from isdn4linux may not be recognized.
-   In this case use ping with the option -i <sec> to increase the interval
-   between echo-packets.
-
-   "isdnctrl cbdelay <InterfaceName> [seconds]"
-   Sets the delay (default 5 sec) between an incoming call and start of
-   dialing when callback is enabled.
-
-   "isdnctrl cbhup <InterfaceName> [on|off]"
-   This enables (default) or disables an active hangup (reject) when getting an
-   incoming call for an interface which is configured for callback.
-
-   "isdnctrl encap <InterfaceName> <EncapType>"
-   Selects the type of packet-encapsulation. The encapsulation can be changed
-   only while an interface is down.
-
-   At the moment the following values are supported:
-
-   rawip    (Default) Selects raw-IP-encapsulation. This means, MAC-headers
-            are stripped off.
-   ip       IP with type-field. Same as IP but the type-field of the MAC-header
-            is preserved.
-   x25iface X.25 interface encapsulation (first byte semantics as defined in
-            ../networking/x25-iface.txt). Use this for running the linux
-            X.25 network protocol stack (AF_X25 sockets) on top of isdn.
-   cisco-h  A special-mode for communicating with a Cisco, which is configured
-            to do "hdlc"
-   ethernet No stripping. Packets are sent with full MAC-header.
-            The Ethernet-address of the interface is faked, from its
-            IP-address: fc:fc:i1:i2:i3:i4, where i1-4 are the IP-addr.-values.
-   syncppp  Synchronous PPP
-
-   uihdlc   HDLC with UI-frame-header (for use with DOS ISPA, option -h1)
-
-
-   NOTE:    x25iface encapsulation is currently experimental. Please
-            read README.x25 for further details
-
-
-   Watching packets, using standard-tcpdump will fail for all encapsulations
-   except ethernet because tcpdump does not know how to handle packets
-   without MAC-header. A patch for tcpdump is included in the utility-package
-   mentioned above.
-
-   "isdnctrl l2_prot <InterfaceName> <L2-ProtocolName>"
-   Selects a layer-2-protocol.
-   (With the ICN-driver and the HiSax-driver, "x75i" and "hdlc" is available.
-   With other drivers, "x75ui", "x75bui", "x25dte", "x25dce" may be
-   possible too. See README.x25 for x25 related l2 protocols.)
-
-   "isdnctrl l3_prot <InterfaceName> <L3-ProtocolName>"
-   The same for layer-3. (At the moment only "trans" is allowed)
-
-   "isdnctrl list <InterfaceName>"
-   Shows all parameters of an interface and the charge-info.
-   Try "all" as the interface name.
-
-   "isdnctrl hangup <InterfaceName>"
-   Forces hangup of an interface.
-
-   "isdnctrl bind <InterfaceName> <DriverId>,<ChannelNumber> [exclusive]"
-   If you are using more than one ISDN card, it is sometimes necessary to
-   dial out using a specific card or even preserve a specific channel for
-   dialout of a specific net-interface. This can be done with the above
-   command. Replace <DriverId> by whatever you assigned while loading the
-   module. The <ChannelNumber> is counted from zero. The upper limit
-   depends on the card used. At the moment no card supports more than
-   2 channels, so the upper limit is one.
-
-   "isdnctrl unbind <InterfaceName>"
-   unbinds a previously bound interface.
-
-   "isdnctrl busreject <DriverId> on|off"
-   If switched on, isdn4linux replies with a REJECT to incoming calls that
-   it cannot match to any configured interface.
-   If switched off, nothing happens in this case.
-   You normally should NOT enable this feature, if the ISDN adapter is not
-   the only device connected to the S0-bus. Otherwise it could happen that
-   isdn4linux rejects an incoming call, which belongs to another device on
-   the bus.
-
-   "isdnctrl addslave <InterfaceName> <SlaveName>"
-   Creates a slave interface for channel-bundling. Slave interfaces are
-   not seen by the kernel, but their ISDN-part can be configured with
-   isdnctrl as usual. (Phone numbers, EAZ/MSN, timeouts etc.) If more
-   than two channels are to be bundled, feel free to create as many as you
-   want. InterfaceName must be a real interface, NOT a slave. Slave interfaces
-   start dialing, if the master interface resp. the previous slave interface
-   has a load of more than 7000 cps. They hangup if the load goes under 7000
-   cps, according to their "huptimeout"-parameter.
-
-   "isdnctrl sdelay <InterfaceName> secs."
-   This sets the minimum time an Interface has to be fully loaded, until
-   it sends a dial-request to its slave.
-
-   "isdnctrl dial <InterfaceName>"
-   Forces an interface to start dialing even if no packets are to be
-   transferred.
-
-   "isdnctrl mapping <DriverId> MSN0,MSN1,MSN2,...MSN9"
-   This installs a mapping table for EAZ<->MSN-mapping for a single line.
-   Missing MSN's have to be given as "-" or can be omitted, if at the end
-   of the commandline.
-   With this command, it's now possible to have an interface listening to
-   mixed 1TR6- and Euro-Type lines. In this case, the interface has to be
-   configured to a 1TR6-type EAZ (one digit). The mapping is also valid
-   for tty-emulation. Seen from the interface/tty-level the mapping
-   CAN be used, however it's possible to use single tty's/interfaces with
-   real MSN's (more digits) also, in which case the mapping will be ignored.
-   Here is an example:
-
-   You have a 1TR6-type line with base-nr. 1234567 and a Euro-line with
-   MSN's 987654, 987655 and 987656. The DriverId for the Euro-line is "EURO".
-
-   isdnctrl mapping EURO -,987654,987655,987656,-,987655
-   ...
-   isdnctrl eaz isdn0 1      # listen on 12345671(1tr6) and 987654(euro)
-   ...
-   isdnctrl eaz isdn1 4      # listen on 12345674(1tr6) only.
-   ...
-   isdnctrl eaz isdn2 987654 # listen on 987654(euro) only.
-
-   Same scheme is used with AT&E...  at the tty's.
-
-6. If you want to write a new low-level-driver, you are welcome.
-   The interface to the link-level-module is described in the file INTERFACE.
-   If the interface should be expanded for any reason, don't do it
-   on your own, send me a mail containing the proposed changes and
-   some reasoning about them.
-   If other drivers will not be affected, I will include the changes
-   in the next release.
-   For developers only, there is a second mailing-list. Write to me
-   (fritz@isdn4linux.de), if you want to join that list.
-
-Have fun!
-
- -Fritz
-
diff --git a/Documentation/isdn/README.FAQ b/Documentation/isdn/README.FAQ
deleted file mode 100644
index e5dd1addacdd..000000000000
--- a/Documentation/isdn/README.FAQ
+++ /dev/null
@@ -1,26 +0,0 @@
-
-The FAQ for isdn4linux
-======================
-
-Please note that there is a big FAQ available in the isdn4k-utils.
-You find it in:
- isdn4k-utils/FAQ/i4lfaq.sgml
-
-In case you just want to see the FAQ online, or download the newest version,
-you can have a look at my website:
-https://www.mhessler.de/i4lfaq/ (view + download)
-or:
-https://www.isdn4linux.de/faq/4lfaq.html (view)
-
-As the extension tells, the FAQ is in SGML format, and you can convert it
-into text/html/... format by using the sgml2txt/sgml2html/... tools.
-Alternatively, you can also do a 'configure; make all' in the FAQ directory.
-
-
-Please have a look at the FAQ before posting anything in the Mailinglist,
-or the newsgroup!
-
-
-Matthias Hessler
-hessler@isdn4linux.de
-
diff --git a/Documentation/isdn/README.audio b/Documentation/isdn/README.audio
deleted file mode 100644
index 8ebca19290d9..000000000000
--- a/Documentation/isdn/README.audio
+++ /dev/null
@@ -1,138 +0,0 @@
-$Id: README.audio,v 1.8 1999/07/11 17:17:29 armin Exp $
-
-ISDN subsystem for Linux.
-  Description of audio mode.
-
-When enabled during kernel configuration, the tty emulator of the ISDN
-subsystem is capable of a reduced set of commands to support audio.
-This document describes the commands supported and the format of
-audio data.
-
-Commands for enabling/disabling audio mode:
-
-        AT+FCLASS=8      Enable audio mode.
-                         This affects the following registers:
-                           S18: Bits 0 and 2 are set.
-                           S16: Set to 48 and any further change to
-                                larger values is blocked.
-        AT+FCLASS=0      Disable audio mode.
-                         Register 18 is set to 4.
-        AT+FCLASS=?      Show possible modes.
-        AT+FCLASS?       Report current mode (0 or 8).
-
-Commands supported in audio mode:
-
-All audio mode commands have one of the following forms:
-
-        AT+Vxx?          Show current setting.
-        AT+Vxx=?         Show possible settings.
-        AT+Vxx=v         Set simple parameter.
-        AT+Vxx=v,v ...   Set complex parameter.
-
-where xx is a two-character code and v are alphanumerical parameters.
-The following commands are supported:
-
-        AT+VNH=x         Auto hangup setting. NO EFFECT, supported
-                         for compatibility only.
-        AT+VNH?          Always reporting "1"
-        AT+VNH=?         Always reporting "1"
-
-        AT+VIP           Reset all audio parameters.
-
-        AT+VLS=x         Line select. x is one of the following:
-                           0 = No device.
-                           2 = Phone line.
-        AT+VLS=?         Always reporting "0,2"
-        AT+VLS?          Show current line.
-
-        AT+VRX           Start recording. Emulator responds with
-                         CONNECT and starts sending audio data to
-                         the application. See below for data format
-
-        AT+VSD=x,y       Set silence-detection parameters.
-                         Possible parameters:
-                           x = 0 ... 31  sensitivity threshold level.
-                                         (default 0 , deactivated)
-                           y = 0 ... 255 range of interval in units
-                                         of 0.1 second. (default 70)
-        AT+VSD=?         Report possible parameters.
-        AT+VSD?          Show current parameters.
-
-        AT+VDD=x,y       Set DTMF-detection parameters.
-                         Only possible if online and during this connection.
-                         Possible parameters:
-                           x = 0 ... 15  sensitivity threshold level.
-                                         (default 0 , I4L soft-decode)
-                                         (1-15 soft-decode off, hardware on)
-                           y = 0 ... 255 tone duration in units of 5ms.
-                                         Not for I4L soft decode (default 8, 40ms)
-        AT+VDD=?         Report possible parameters.
-        AT+VDD?          Show current parameters.
-
-        AT+VSM=x         Select audio data format.
-                         Possible parameters:
-                           2 = ADPCM-2
-                           3 = ADPCM-3
-                           4 = ADPCM-4
-                           5 = aLAW
-                           6 = uLAW
-        AT+VSM=?         Show possible audio formats.
-
-        AT+VTX           Start audio playback. Emulator responds
-                         with CONNECT and starts sending audio data
-                         received from the application via phone line.
-General behavior and description of data formats/protocol.
-    when a connection is made:
-
-      On incoming calls, if the application responds to a RING
-      with ATA, depending on the calling service, the emulator
-      responds with either CONNECT (data call) or VCON (voice call).
-      
-      On outgoing voice calls, the emulator responds with VCON
-      upon connection setup.
-
-  Audio recording.
-
-    When receiving audio data, a kind of bisync protocol is used.
-    Upon AT+VRX command, the emulator responds with CONNECT, and
-    starts sending audio data to the application. There are several
-    escape sequences defined, all using DLE (0x10) as Escape char:
-
-    <DLE><ETX>              End of audio data. (i.e. caused by a
-                            hangup of the remote side) Emulator stops
-                            recording, responding with VCON.
-    <DLE><DC4>		    Abort recording, (sent by appl.) Emulator
-			    stops recording, sends DLE,ETX.
-    <DLE><DLE>              Escape sequence for DLE in data stream.
-    <DLE>0                  Touchtone "0" received.
-         ...
-    <DLE>9                  Touchtone "9" received.
-    <DLE>#                  Touchtone "#" received.
-    <DLE>*                  Touchtone "*" received.
-    <DLE>A                  Touchtone "A" received.
-    <DLE>B                  Touchtone "B" received.
-    <DLE>C                  Touchtone "C" received.
-    <DLE>D                  Touchtone "D" received.
-
-    <DLE>q                  quiet. Silence detected after non-silence.
-    <DLE>s                  silence. Silence detected from the
-                            start of recording.
-
-    Currently unsupported DLE sequences:
-
-    <DLE>c                  FAX calling tone received.
-    <DLE>b                  busy tone received.
-
-  Audio playback.
-
-    When sending audio data, upon AT+VTX command, emulator responds with
-    CONNECT, and starts transferring data from application to the phone line.
-    The same DLE sequences apply to this mode.
-
-  Full-Duplex-Audio:
-
-    When _both_ commands for recording and playback are given in _one_
-    AT-command-line (i.e.: "AT+VTX+VRX"), full-duplex-mode is selected.
-	In this mode, the only way to stop recording is sending <DLE><DC4>
-    and the only way to stop playback is to send <DLE><ETX>.
-
diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap
deleted file mode 100644
index a76d74845a4c..000000000000
--- a/Documentation/isdn/README.concap
+++ /dev/null
@@ -1,259 +0,0 @@
-Description of the "concap" encapsulation protocol interface
-============================================================
-
-The "concap" interface is intended to be used by network device
-drivers that need to process an encapsulation protocol. 
-It is assumed that the protocol interacts with a linux network device by
-- data transmission
-- connection control (establish, release)
-Thus, the mnemonic: "CONnection CONtrolling eNCAPsulation Protocol".
-
-This is currently only used inside the isdn subsystem. But it might
-also be useful to other kinds of network devices. Thus, if you want
-to suggest changes that improve usability or performance of the
-interface, please let me know. I'm willing to include them in future
-releases (even if I needed to adapt the current isdn code to the
-changed interface).
-
-
-Why is this useful?
-===================
-
-The encapsulation protocols used on top of WAN connections or permanent
-point-to-point links are frequently chosen upon bilateral agreement.
-Thus, a device driver for a certain type of hardware must support
-several different encapsulation protocols at once.
-
-The isdn device driver did already support several different
-encapsulation protocols. The encapsulation protocol is configured by a
-user space utility (isdnctrl). The isdn network interface code then
-uses several case statements which select appropriate actions
-depending on the currently configured encapsulation protocol.
-
-In contrast, LAN network interfaces always used a single encapsulation
-protocol which is unique to the hardware type of the interface. The LAN
-encapsulation is usually done by just sticking a header on the data. Thus,
-traditional linux network device drivers used to process the
-encapsulation protocol directly (usually by just providing a hard_header()
-method in the device structure) using some hardware type specific support
-functions. This is simple, direct and efficient. But it doesn't fit all
-the requirements for complex WAN encapsulations. 
-
-
-   The configurability of the encapsulation protocol to be used
-   makes isdn network interfaces more flexible, but also much more
-   complex than traditional lan network interfaces.
-
-
-Many Encapsulation protocols used on top of WAN connections will not just
-stick a header on the data. They also might need to set up or release
-the WAN connection. They also might want to send other data for their
-private purpose over the wire, e.g. ppp does a lot of link level
-negotiation before the first piece of user data can be transmitted.
-Such encapsulation protocols for WAN devices are typically more complex
-than encapsulation protocols for lan devices. Thus, network interface
-code for typical WAN devices also tends to be more complex.
-
-
-In order to support Linux' x25 PLP implementation on top of
-isdn network interfaces I could have introduced yet another branch to
-the various case statements inside drivers/isdn/isdn_net.c.
-This would have made isdn_net.c even more complex. In addition, it would
-have made isdn_net.c harder to maintain. Thus, by identifying an abstract
-interface between the network interface code and the encapsulation
-protocol, complexity could be reduced and maintainability could be
-increased.
-
-
-Likewise, a similar encapsulation protocol will frequently be needed by
-several different interfaces of even different hardware type, e.g. the
-synchronous ppp implementation used by the isdn driver and the
-asynchronous ppp implementation used by the ppp driver have a lot of
-similar code in them. By cleanly separating the encapsulation protocol
-from the hardware specific interface stuff such code could be shared
-better in future.
-
-
-When operating over dial-up-connections (e.g. telephone lines via modem,
-non-permanent virtual circuits of wide area networks, ISDN) many
-encapsulation protocols will need to control the connection. Therefore,
-some basic connection control primitives are supported. The type and
-semantics of the connection (i.e the ISO layer where connection service
-is provided) is outside our scope and might be different depending on
-the encapsulation protocol used, e.g. for a ppp module using our service
-on top of a modem connection a connect_request will result in dialing
-a (somewhere else configured) remote phone number. For an X25-interface
-module (LAPB semantics, as defined in Documentation/networking/x25-iface.txt)
-a connect_request will ask for establishing a reliable lapb
-datalink connection.
-
-
-The encapsulation protocol currently provides the following
-service primitives to the network device.
-
-- create a new encapsulation protocol instance
-- delete encapsulation protocol instance and free all its resources
-- initialize (open) the encapsulation protocol instance for use.
-- deactivate (close) an encapsulation protocol instance.
-- process (xmit) data handed down by upper protocol layer
-- receive data from lower (hardware) layer
-- process connect indication from lower (hardware) layer
-- process disconnect indication from lower (hardware) layer
-
-
-The network interface driver accesses those primitives via callbacks
-provided by the encapsulation protocol instance within a
-struct concap_proto_ops.
-
-struct concap_proto_ops{
-
-	/* create a new encapsulation protocol instance of same type */
-	struct concap_proto *  (*proto_new) (void);
-
-	/* delete encapsulation protocol instance and free all its resources.
-	   cprot may no longer be referenced after calling this */
-	void (*proto_del)(struct concap_proto *cprot);
-
-	/* initialize the protocol's data. To be called at interface startup
-	   or when the device driver resets the interface. All services of the
-	   encapsulation protocol may be used after this*/
-	int (*restart)(struct concap_proto *cprot, 
-		       struct net_device *ndev,
-		       struct concap_device_ops *dops);
-
-	/* deactivate an encapsulation protocol instance. The encapsulation
-	   protocol may not call any *dops methods after this. */
-	int (*close)(struct concap_proto *cprot);
-
-	/* process a frame handed down to us by upper layer */
-	int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb);
-
-	/* to be called for each data entity received from lower layer*/ 
-	int (*data_ind)(struct concap_proto *cprot, struct sk_buff *skb);
-
-	/* to be called when a connection was set up/down.
-	   Protocols that don't process these primitives might fill in
-	   dummy methods here */
-	int (*connect_ind)(struct concap_proto *cprot);
-	int (*disconn_ind)(struct concap_proto *cprot);
-};
-
-
-The data structures are defined in the header file include/linux/concap.h.
-
-
-A Network interface using encapsulation protocols must also provide
-some service primitives to the encapsulation protocol:
-
-- request data being submitted by lower layer (device hardware) 
-- request a connection being set up by lower layer 
-- request a connection being released by lower layer
-
-The encapsulation protocol accesses those primitives via callbacks
-provided by the network interface within a struct concap_device_ops.
-
-struct concap_device_ops{
-
-	/* to request data be submitted by device */ 
-	int (*data_req)(struct concap_proto *, struct sk_buff *);
-
-	/* Control methods must be set to NULL by devices which do not
-	   support connection control. */
-	/* to request a connection be set up */ 
-	int (*connect_req)(struct concap_proto *);
-
-	/* to request a connection be released */
-	int (*disconn_req)(struct concap_proto *);	
-};
-
-The network interface does not explicitly provide a receive service
-because the encapsulation protocol directly calls netif_rx(). 
-
-
-
-
-An encapsulation protocol itself is actually the
-struct concap_proto{
-	struct net_device *net_dev;		/* net device using our service  */
-	struct concap_device_ops *dops; /* callbacks provided by device */
- 	struct concap_proto_ops  *pops; /* callbacks provided by us */
-	int flags;
-	void *proto_data;               /* protocol specific private data, to
-					   be accessed via *pops methods only*/
-	/*
-	  :
-	  whatever 
-	  :
-	  */
-};
-
-Most of this is filled in when the device requests the protocol to 
-be reset (opened). The network interface must provide the net_dev and
-dops pointers. Other concap_proto members should be considered private
-data that are only accessed by the pops callback functions. Likewise,
-a concap proto should access the network device's private data
-only by means of the callbacks referred to by the dops pointer.
-
-
-A possible extended device structure which uses the connection controlling
-encapsulation services could look like this:
-
-struct concap_device{
-	struct net_device net_dev;
-	struct my_priv  /* device->local stuff */
-			/* the my_priv struct might contain a 
-			   struct concap_device_ops *dops;
-	                   to provide the device specific callbacks
-			*/
-	struct concap_proto *cprot;        /* callbacks provided by protocol */
-};
-
-
-
-Misc Thoughts
-=============
-
-The concept of the concap proto might help to reuse protocol code and
-reduce the complexity of certain network interface implementations.
-The trade off is that it introduces yet another procedure call layer
-when processing the protocol. This has of course some impact on
-performance. However, typically the concap interface will be used by
-devices attached to slow lines (like telephone, isdn, leased synchronous
-lines). For such slow lines, the overhead is probably negligible.
-This might no longer hold for certain high speed WAN links (like
-ATM).
-
-
-If general linux network interfaces explicitly supported concap
-protocols (e.g. by a member struct concap_proto* in struct net_device)
-then the interface of the service function could be changed
-by passing a pointer of type (struct net_device*) instead of
-type (struct concap_proto*). Doing so would make many of the service
-functions compatible to network device support functions.
-
-e.g. instead of the concap protocol's service function
-
-  int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb);
-
-we could have
-
-  int (*encap_and_xmit)(struct net_device *ndev, struct sk_buff *skb);
-
-As this is compatible to the dev->hard_start_xmit() method, the device
-driver could directly register the concap protocol's encap_and_xmit()
-function as its hard_start_xmit() method. This would eliminate one
-procedure call layer.
-
-
-The device's data request function could also be defined as
- 
-  int (*data_req)(struct net_device *ndev, struct sk_buff *skb);
-
-This might even allow for some protocol stacking. And the network
-interface might even register the same data_req() function directly
-as its hard_start_xmit() method when a zero layer encapsulation
-protocol is configured. Thus, eliminating the performance penalty
-of the concap interface when a trivial concap protocol is used.
-Nevertheless, the device remains able to support encapsulation
-protocol configuration.
-
diff --git a/Documentation/isdn/README.diversion b/Documentation/isdn/README.diversion
deleted file mode 100644
index bddcd5fb86ff..000000000000
--- a/Documentation/isdn/README.diversion
+++ /dev/null
@@ -1,127 +0,0 @@
-The isdn diversion services are a supporting module working together with
-the isdn4linux and the HiSax module for passive cards. 
-Active cards, TAs and cards using a own or other driver than the HiSax 
-module need to be adapted to the HL<->LL interface described in a separate 
-document. The diversion services may be used with all cards supported by 
-the HiSax driver.
-The diversion kernel interface and controlling tool divertctrl were written
-by Werner Cornelius (werner@isdn4linux.de or werner@titro.de) under the
-GNU General Public License.
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-Table of contents
-=================
-
-1. Features of the i4l diversion services 
-   (Or what can the i4l diversion services do for me)
-
-2. Required hard- and software
-
-3. Compiling, installing and loading/unloading the module  
-   Tracing calling and diversion information 
-
-4. Tracing calling and diversion information
- 
-5. Format of the divert device ASCII output
- 
-
-1. Features of the i4l diversion services 
-   (Or what can the i4l diversion services do for me)
-
-   The i4l diversion services offers call forwarding and logging normally 
-   only supported by isdn phones. Incoming calls may be diverted 
-   unconditionally (CFU), when not reachable (CFNR) or on busy condition 
-   (CFB). 
-   The diversions may be invoked statically in the providers exchange
-   as normally done by isdn phones. In this case all incoming calls
-   with a special (or all) service identifiers are forwarded if the 
-   forwarding reason is met. Activated static services may also be 
-   interrogated (queried).
-   The i4l diversion services additionally offers a dynamic version of
-   call forwarding which is not preprogrammed inside the providers exchange
-   but dynamically activated by i4l.
-   In this case all incoming calls are checked by rules that may be
-   compared to the mechanism of ipfwadm or ipchains. If a given rule matches
-   the checking process is finished and the rule matching will be applied
-   to the call.
-   The rules include primary and secondary service identifiers, called 
-   number and subaddress, callers number and subaddress and whether the rule
-   matches to all filtered calls or only those when all B-channel resources
-   are exhausted.
-   Actions that may be invoked by a rule are ignore, proceed, reject, 
-   direct divert or delayed divert of a call.
-   All incoming calls matching a rule except the ignore rule a reported and
-   logged as ASCII via the proc filesystem (/proc/net/isdn/divert). If proceed
-   is selected the call will be held in a proceeding state (without ringing)
-   for a certain amount of time to let an external program or client decide
-   how to handle the call. 
-            
-
-2. Required hard- and software
-   
-   For using the i4l diversion services the isdn line must be of a EURO/DSS1
-   type. Additionally the i4l services only work together with the HiSax 
-   driver for passive isdn cards. All HiSax supported cards may be used for
-   the diversion purposes.
-   The static diversion services require the provider having static services
-   CFU, CFNR, CFB activated on an MSN-line. The static services may not be 
-   used on a point-to-point connection. Further the static services are only
-   available in some countries (for example germany). Countries requiring the 
-   keypad protocol for activating static diversions (like the netherlands) are
-   not supported but may use the tty devices for this purpose.
-   The dynamic diversion services may be used in all countries if the provider
-   enables the feature CF (call forwarding). This should work on both MSN- and
-   point-to-point lines.
-   To add and delete rules the additional divertctrl program is needed. This
-   program is part of the isdn4kutils package.   
-
-3. Compiling, installing and loading/unloading the module  
-   Tracing calling and diversion information 
-
-
-   To compile the i4l code with diversion support you need to say yes to the 
-   DSS1 diversion services when selecting the i4l options in the kernel 
-   config (menuconfig or config).
-   After having properly activated a make modules and make modules_install all
-   required modules will be correctly installed in the needed modules dirs.
-   As the diversion services are currently not included in the scripts of most
-   standard distributions you will have to add a "insmod dss1_divert" after
-   having loaded the global isdn module.
-   The module can be loaded without any command line parameters.
-   If the module is actually loaded and active may be checked with a 
-   "cat /proc/modules" or "ls /proc/net/isdn/divert". The divert file is 
-   dynamically created by the diversion module and removed when the module is
-   unloaded.
-
-
-4. Tracing calling and diversion information
- 
-   You also may put a "cat /proc/net/isdn/divert" in the background with the
-   output redirected to a file. Then all actions of the module are logged.
-   The divert file in the proc system may be opened more than once, so in 
-   conjunction with inetd and a small remote client on other machines inside
-   your network incoming calls and reactions by the module may be shown on 
-   every listening machine. 
-   If a call is reported as proceeding an external program or client may 
-   specify during a certain amount of time (normally 4 to 10 seconds) what
-   to do with that call.      
-   To unload the module all open files to the device in the proc system must
-   be closed. Otherwise the module (and isdn.o) may not be unloaded. 
-
-5. Format of the divert device ASCII output
- 
-   To be done later
-
diff --git a/Documentation/isdn/README.fax b/Documentation/isdn/README.fax
deleted file mode 100644
index 5314958a8a6e..000000000000
--- a/Documentation/isdn/README.fax
+++ /dev/null
@@ -1,45 +0,0 @@
-
-Fax with isdn4linux
-===================
-
-When enabled during kernel configuration, the tty emulator
-of the ISDN subsystem is capable of the Fax Class 2 commands.
-
-This only makes sense under the following conditions :
-
-- You need the commands as dummy, because you are using
-  hylafax (with patch) for AVM capi.
-- You want to use the fax capabilities of your isdn-card.
-  (supported cards are listed below)
-
-
-NOTE: This implementation does *not* support fax with passive
-      ISDN-cards (known as softfax). The low-level driver of
-      the ISDN-card and/or the card itself must support this.
-
-
-Supported ISDN-Cards
---------------------
-
-Eicon DIVA Server BRI/PCI
-	- full support with both B-channels.
-
-Eicon DIVA Server 4BRI/PCI
-	- full support with all B-channels.
-
-Eicon DIVA Server PRI/PCI
-	- full support on amount of B-channels
-		depending on DSPs on board.
-
-
-
-The command set is known as Class 2 (not Class 2.0) and
-can be activated by AT+FCLASS=2
-
-
-The interface between the link-level-module and the hardware-level driver
-is described in the files INTERFACE.fax and INTERFACE.
-
-Armin
-mac@melware.de
-
diff --git a/Documentation/isdn/README.hfc-pci b/Documentation/isdn/README.hfc-pci
deleted file mode 100644
index e8a4ef0226e8..000000000000
--- a/Documentation/isdn/README.hfc-pci
+++ /dev/null
@@ -1,41 +0,0 @@
-The driver for the HFC-PCI and HFC-PCI-A chips from CCD may be used
-for many OEM cards using this chips.
-Additionally the driver has a special feature which makes it possible
-to read the echo-channel of the isdn bus. So all frames in both directions
-may be logged.
-When the echo logging feature is used the number of available B-channels
-for a HFC-PCI card is reduced to 1. Of course this is only relevant to
-the card, not to the isdn line.
-To activate the echo mode the following ioctls must be entered:
-
-hisaxctrl <driver/cardname> 10 1
-
-This reduces the available channels to 1. There must not be open connections
-through this card when entering the command.
-And then:
-
-hisaxctrl <driver/cardname> 12 1
-
-This enables the echo mode. If Hex logging is activated the isdnctrlx 
-devices show a output with a line beginning of HEX: for the providers
-exchange and ECHO: for isdn devices sending to the provider.
-
-If more than one HFC-PCI cards are installed, a specific card may be selected
-at the hisax module load command line. Supply the load command with the desired
-IO-address of the desired card. 
-Example:
-There tree cards installed in your machine at IO-base addresses 0xd000, 0xd400 
-and 0xdc00
-If you want to use the card at 0xd400 standalone you should supply the insmod
-or depmod with type=35 io=0xd400.
-If you want to use all three cards, but the order needs to be at 0xdc00,0xd400,
-0xd000 you may give the parameters type=35,35,35 io=0xdc00,0xd400,0xd00 
-Then the desired card will be the initialised in the desired order.
-If the io parameter is used the io addresses of all used cards should be 
-supplied else the parameter is assumed 0 and a auto search for a free card is
-invoked which may not give the wanted result. 
-
-Comments and reports to werner@isdn4linux.de or werner@isdn-development.de
-
-
-
diff --git a/Documentation/isdn/README.syncppp b/Documentation/isdn/README.syncppp
deleted file mode 100644
index 27d260095cce..000000000000
--- a/Documentation/isdn/README.syncppp
+++ /dev/null
@@ -1,58 +0,0 @@
-Some additional information for setting up a syncPPP
-connection using network interfaces.
----------------------------------------------------------------
-
-You need one thing beside the isdn4linux package:
-
-  a patched pppd .. (I called it ipppd to show the difference)
-
-Compiling isdn4linux with sync PPP:
------------------------------------
-To compile isdn4linux with the sync PPP part, you have
-to answer the appropriate question when doing a "make config"
-Don't forget to load the slhc.o
-module before the isdn.o module, if VJ-compression support
-is not compiled into your kernel. (e.g if you have no PPP or
-CSLIP in the kernel)
-
-Using isdn4linux with sync PPP:
--------------------------------
-Sync PPP is just another encapsulation for isdn4linux. The
-name to enable sync PPP encapsulation is 'syncppp' .. e.g:
-
-  /sbin/isdnctrl encap ippp0 syncppp
-
-The name of the interface is here 'ippp0'. You need 
-one interface with the name 'ippp0' to saturate the
-ipppd, which checks the ppp version via this interface.
-Currently, all devices must have the name ipppX where
-'X' is a decimal value.
-
-To set up a PPP connection you need the ipppd .. You must start 
-the ipppd once after installing the modules. The ipppd 
-communicates with the isdn4linux link-level driver using the
-/dev/ippp0 to /dev/ippp15 devices. One ipppd can handle
-all devices at once. If you want to use two PPP connections
-at the same time, you have to connect the ipppd to two
-devices .. and so on. 
-I've implemented one additional option for the ipppd:
- 'useifip' will get (if set to not 0.0.0.0) the IP address 
- for the negotiation from the attached network-interface. 
-(also: ipppd will try to negotiate pointopoint IP as remote IP)
-You must disable BSD-compression, this implementation can't
-handle compressed packets.
-
-Check the etc/rc.isdn.syncppp in the isdn4kernel-util package
-for an example setup script.
-
-To use the MPPP stuff, you must configure a slave device
-with isdn4linux. Now call the ipppd with the '+mp' option.
-To increase the number of links, you must use the
-'addlink' option of the isdnctrl tool. (rc.isdn.syncppp.MPPP is
-an example script)
-
-enjoy it,
-    michael
-     
-
-
diff --git a/Documentation/isdn/README.x25 b/Documentation/isdn/README.x25
deleted file mode 100644
index e561a77c4e22..000000000000
--- a/Documentation/isdn/README.x25
+++ /dev/null
@@ -1,184 +0,0 @@
-  
-X.25 support within isdn4linux
-==============================
-
-This is alpha/beta test code. Use it completely at your own risk.
-As new versions appear, the stuff described here might suddenly change
-or become invalid without notice.
-
-Keep in mind:
-
-You are using several new parts of the 2.2.x kernel series which
-have not been tested in a large scale. Therefore, you might encounter
-more bugs as usual.
-
-- If you connect to an X.25 neighbour not operated by yourself, ASK the
-  other side first. Be prepared that bugs in the protocol implementation
-  might result in problems.
-
-- This implementation has never wiped out my whole hard disk yet. But as
-  this is experimental code, don't blame me if that happened to you.
-  Backing up important data will never harm.
-
-- Monitor your isdn connections while using this software. This should
-  prevent you from undesired phone bills in case of driver problems.
-  
- 
-
-
-How to configure the kernel
-===========================
- 
-The ITU-T (former CCITT) X.25 network protocol layer has been implemented
-in the Linux source tree since version 2.1.16. The isdn subsystem might be 
-useful to run X.25 on top of ISDN. If you want to try it, select
-
-   "CCITT X.25 Packet Layer"
-
-from the networking options as well as
-
-   "ISDN Support" and "X.25 PLP on Top of ISDN"
-
-from the ISDN subsystem options when you configure your kernel for
-compilation. You currently also need to enable
-"Prompt for development and/or incomplete code/drivers" from the
-"Code maturity level options" menu. For the x25trace utility to work
-you also need to enable "Packet socket".
-
-For local testing it is also recommended to enable the isdnloop driver
-from the isdn subsystem's configuration menu.
-
-For testing, it is recommended that all isdn drivers and the X.25 PLP
-protocol are compiled as loadable modules. Like this, you can recover
-from certain errors by simply unloading and reloading the modules.
-
-
-
-What's it for? How to use it?
-=============================
-
-X.25 on top of isdn might be useful with two different scenarios:
-
-- You might want to access a public X.25 data network from your Linux box.
-  You can use i4l if you were physically connected to the X.25 switch
-  by an ISDN B-channel (leased line as well as dial up connection should
-  work).
-
-  This corresponds to ITU-T recommendation X.31 Case A (circuit-mode
-  access to PSPDN [packet switched public data network]).
-
-  NOTE: X.31 also covers a Case B (access to PSPDN via virtual
-  circuit / packet mode service). The latter mode (which in theory
-  also allows using the D-channel) is not supported by isdn4linux.
-  It should however be possible to establish such packet mode connections
-  with certain active isdn cards provided that the firmware supports X.31
-  and the driver exports this functionality to the user. Currently, 
-  the AVM B1 driver is the only driver which does so. (It should be
-  possible to access D-channel X.31 with active AVM cards using the
-  CAPI interface of the AVM-B1 driver).
-
-- Or you might want to operate certain ISDN teleservices on your linux
-  box. A lot of those teleservices run on top of the ISO-8208
-  (DTE-DTE mode) network layer protocol. ISO-8208 is essentially the
-  same as ITU-T X.25.
-
-  Popular candidates of such teleservices are EUROfile transfer or any
-  teleservice applying ITU-T recommendation T.90.
-
-To use the X.25 protocol on top of isdn, just create an isdn network
-interface as usual, configure your own and/or peer's ISDN numbers,
-and choose x25iface encapsulation by
-
-   isdnctrl encap <iface-name> x25iface.
-
-Once encap is set like this, the device can be used by the X.25 packet layer.
-
-All the stuff needed for X.25 is implemented inside the isdn link
-level (mainly isdn_net.c and some new source files). Thus, it should
-work with every existing HL driver. I was able to successfully open X.25
-connections on top of the isdnloop driver and the hisax driver.
-"x25iface"-encapsulation bypasses demand dialing. Dialing will be
-initiated when the upper (X.25 packet) layer requests the lapb datalink to
-be established. But hangup timeout is still active. Whenever a hangup
-occurs, all existing X.25 connections on that link will be cleared
-It is recommended to use sufficiently large hangup-timeouts for the
-isdn interfaces.
-
-
-In order to set up a conforming protocol stack you also need to
-specify the proper l2_prot parameter:
-
-To operate in ISO-8208  X.25 DTE-DTE mode, use
-
-   isdnctrl l2_prot <iface-name> x75i
-
-To access an X.25 network switch via isdn (your linux box is the DTE), use
-
-   isdnctrl l2_prot <iface-name> x25dte
-
-To mimic an X.25 network switch (DCE side of the connection), use
-
-   isdnctrl l2_prot <iface-name> x25dce
-
-However, x25dte or x25dce is currently not supported by any real HL
-level driver. The main difference between x75i and x25dte/dce is that
-x25d[tc]e uses fixed lap_b addresses. With x75i, the side which
-initiates the isdn connection uses the DTE's lap_b address while the
-called side used the DCE's lap_b address. Thus, l2_prot x75i might
-probably work if you access a public X.25 network as long as the
-corresponding isdn connection is set up by you. At least one test
-was successful to connect via isdn4linux to an X.25 switch using this
-trick. At the switch side, a terminal adapter X.21 was used to connect
-it to the isdn.
-
-
-How to set up a test installation?
-==================================
-
-To test X.25 on top of isdn, you need to get
-
-- a recent version of the "isdnctrl" program that supports setting the new
-  X.25 specific parameters.
-
-- the x25-utils-2.X package from 
-  ftp://ftp.hes.iki.fi/pub/ham/linux/ax25/x25utils-*
-  (don't confuse the x25-utils with the ax25-utils)
-
-- an application program that uses linux PF_X25 sockets (some are
-  contained in the x25-util package).
-
-Before compiling the user level utilities make sure that the compiler/
-preprocessor will fetch the proper kernel header files of this kernel
-source tree. Either make /usr/include/linux a symbolic link pointing to 
-this kernel's include/linux directory or set the appropriate compiler flags.
-
-When all drivers and interfaces are loaded and configured you need to
-ifconfig the network interfaces up and add X.25-routes to them. Use
-the usual ifconfig tool.
-
-ifconfig <iface-name> up
-
-But a special x25route tool (distributed with the x25-util package)
-is needed to set up X.25 routes. I.e. 
-
-x25route add 01 <iface-name>
-
-will cause all x.25 connections to the destination X.25-address
-"01" to be routed to your created isdn network interface.
-
-There are currently no real X.25 applications available. However, for
-tests, the x25-utils package contains a modified version of telnet
-and telnetd that uses X.25 sockets instead of tcp/ip sockets. You can
-use those for your first tests. Furthermore, you might check
-ftp://ftp.hamburg.pop.de/pub/LOCAL/linux/i4l-eft/ which contains some
-alpha-test implementation ("eftp4linux") of the EUROfile transfer
-protocol.
-
-The scripts distributed with the eftp4linux test releases might also
-provide useful examples for setting up X.25 on top of isdn.
-
-The x25-utility package also contains an x25trace tool that can be
-used to monitor X.25 packets received by the network interfaces.
-The /proc/net/x25* files also contain useful information. 
-
-- Henner
diff --git a/Documentation/isdn/syncPPP.FAQ b/Documentation/isdn/syncPPP.FAQ
deleted file mode 100644
index 3257a4bc0786..000000000000
--- a/Documentation/isdn/syncPPP.FAQ
+++ /dev/null
@@ -1,224 +0,0 @@
-simple isdn4linux PPP FAQ .. to be continued .. not 'debugged' 
--------------------------------------------------------------------
-
-Q01: what's pppd, ipppd, syncPPP, asyncPPP ??
-Q02: error message "this system lacks PPP support"
-Q03: strange information using 'ifconfig'
-Q04: MPPP?? What's that and how can I use it ...
-Q05: I tried MPPP but it doesn't work 
-Q06: can I use asynchronous PPP encapsulation with network devices
-Q07: A SunISDN machine can't connect to my i4l system
-Q08: I wanna talk to several machines, which need different configs
-Q09: Starting the ipppd, I get only error messages from i4l
-Q10: I wanna use dynamic IP address assignment 
-Q11: I can't connect. How can I check where the problem is.
-Q12: How can I reduce login delay? 
-
--------------------------------------------------------------------
-
-Q01: pppd, ipppd, syncPPP, asyncPPP .. what is that ?
-   what should I use?
-A: The pppd is for asynchronous PPP .. asynchronous means
-   here, the framing is character based. (e.g when
-   using ttyI* or tty* devices)
-
-   The ipppd handles PPP packets coming in HDLC
-   frames (bit based protocol) ... The PPP driver
-   in isdn4linux pushes all IP packets direct
-   to the network layer and all PPP protocol
-   frames to the /dev/ippp* device. 
-   So, the ipppd is a simple external network
-   protocol handler.
-
-   If you login into a remote machine using the
-   /dev/ttyI* devices and then enable PPP on the
-   remote terminal server -> use the 'old' pppd
-
-   If your remote side immediately starts to send
-   frames ... you probably connect to a 
-   syncPPP machine .. use the network device part
-   of isdn4linux with the 'syncppp' encapsulation
-   and make sure, that the ipppd is running and 
-   connected to at least one /dev/ippp*. Check the 
-   isdn4linux manual on how to configure a network device.
-
---
-
-Q02: when I start the ipppd .. I only get the
-   error message "this system lacks PPP support"
-A: check that at least the device 'ippp0' exists.
-   (you can check this e.g with the program 'ifconfig')
-   The ipppd NEEDS this device under THIS name .. 
-   If this device doesn't exists, use:
-	isdnctrl addif ippp0
-	isdnctrl encap ippp0 syncppp
-	... (see isdn4linux doc for more) ...
-A: Maybe you have compiled the ipppd with another
-   kernel source tree than the kernel you currently
-   run ... 
-
---
-
-Q03: when I list the netdevices with ifconfig I see, that
-   my ISDN interface has a HWaddr and IRQ=0 and Base 
-   address = 0 
-A: The device is a fake ethernet device .. ignore IRQ and baseaddr
-   You need the HWaddr only for ethernet encapsulation.
-   
---
-
-Q04: MPPP?? What's that and how can I use it ...
-
-A: MPPP or MP or MPP (Warning: MP is also an 
-   acronym for 'Multi Processor') stands for
-   Multi Point to Point and means bundling
-   of several channels to one logical stream.
-   To enable MPPP negotiation you must call the
-   ipppd with the '+mp' option. 
-   You must also configure a slave device for
-   every additional channel. (see the i4l manual
-   for more)
-   To use channel bundling you must first activate
-   the 'master' or initial call. Now you can add 
-   the slave channels with the command:
-       isdnctrl addlink <device>
-   e.g:
-       isdnctrl addlink ippp0
-   This is different from other encapsulations of
-   isdn4linux! With syncPPP, there is no automatic
-   activation of slave devices.
-
---
-
-Q05: I tried MPPP but it doesn't work .. the ipppd
-   writes in the debug log something like:
-   .. rcvd [0][proto=0x3d] c0 00 00 00 80 fd 01 01 00 0a ...
-   .. sent [0][LCP ProtRej id=0x2 00 3d c0 00 00 00 80 fd 01 ...
-
-A: you forgot to compile MPPP/RFC1717 support into the
-   ISDN Subsystem. Recompile with this option enabled.
-
---
-
-Q06: can I use asynchronous PPP encapsulation
-   over the network interface of isdn4linux ..
-
-A: No .. that's not possible .. Use the standard
-   PPP package over the /dev/ttyI* devices. You
-   must not use the ipppd for this.
-   
---
-
-Q07: A SunISDN machine tries to connect my i4l system,
-   which doesn't work.
-   Checking the debug log I just saw garbage like:
-!![ ... fill in the line ... ]!!
-
-A: The Sun tries to talk asynchronous PPP ... i4l
-   can't understand this ... try to use the ttyI*
-   devices with the standard PPP/pppd package
-
-A: (from Alexanter Strauss: )
-!![ ... fill in mail ]!!
-
---
-
-Q08: I wanna talk to remote machines, which need
-   a different configuration. The only way
-   I found to do this is to kill the ipppd and
-   start a new one with another config to connect
-   to the second machine. 
-
-A: you must bind a network interface explicitly to
-   an ippp device, where you can connect a (for this
-   interface) individually configured ipppd.
-
---
-
-Q09: When I start the ipppd I only get error messages
-   from the i4l driver .. 
-
-A: When starting, the ipppd calls functions which may 
-   trigger a network packet. (e.g gethostbyname()).
-   Without the ipppd (at this moment, it is not
-   fully started) we can't handle this network request.
-   Try to configure hostnames necessary for the ipppd
-   in your local /etc/hosts file or in a way, that
-   your system can resolve it without using an
-   isdn/ippp network-interface.
-
---
-
-Q10: I wanna use dynamic IP address assignment ... How 
-   must I configure the network device.
-
-A: At least you must have a route which forwards
-   a packet to the ippp network-interface to trigger
-   the dial-on-demand.
-   A default route to the ippp-interface will work.
-   Now you must choose a dummy IP address for your
-   interface.
-   If for some reason you can't set the default
-   route to the ippp interface, you may take any 
-   address of the subnet from which you expect your
-   dynamic IP number and set a 'network route' for
-   this subnet to the ippp interface.
-   To allow overriding of the dummy address you
-   must call the ipppd with the 'ipcp-accept-local' option.
-
-A: You must know, how the ipppd gets the addresses it wanna
-   configure. If you don't give any option, the ipppd
-   tries to negotiate the local host address!
-   With the option 'noipdefault' it requests an address
-   from the remote machine. With 'useifip' it gets the
-   addresses from the net interface. Or you set the address
-   on the option line with the <a.b.c.d:e.f.g.h> option.
-   Note: the IP address of the remote machine must be configured
-   locally or the remote machine must send it in an IPCP request.
-   If your side doesn't know the IP address after negotiation, it
-   closes the connection!
-   You must allow overriding of address with the 'ipcp-accept-*'
-   options, if you have set your own or the remote address 
-   explicitly.
-
-A: Maybe you try these options .. e.g:   
-
-    /sbin/ipppd :$REMOTE noipdefault /dev/ippp0
-
-   where REMOTE must be the address of the remote machine (the
-   machine, which gives you your address)
-
---
-
-Q11: I can't connect. How can I check where the problem is.
-
-A: A good help log is the debug output from the ipppd...
-   Check whether you can find there:
-   - only a few LCP-conf-req SENT messages (less then 10)
-     and then a Term-REQ:
-     -> check whether your ISDN card is well configured
-        it seems, that your machine doesn't dial
-        (IRQ,IO,Proto, etc problems)
-        Configure your ISDN card to print debug messages and
-        check the /dev/isdnctrl output next time. There
-        you can see, whether there is activity on the card/line.
-   - there are at least a few RECV messages in the log:
-     -> fine: your card is dialing and your remote machine
-        tries to talk with you. Maybe only a missing 
-        authentication. Check your ipppd configuration again.
-   - the ipppd exits for some reason:
-     -> not good ... check /var/adm/syslog and /var/adm/daemon.
-        Could be a bug in the ipppd.
-
---
-
-Q12: How can I reduce login delay?
-
-A: Log a login session ('debug' log) and check which options 
-  your remote side rejects. Next time configure your ipppd
-  to not negotiate these options. Another 'side effect' is, that
-  this increases redundancy. (e.g your remote side is buggy and
-  rejects options in a wrong way).
-
-
-
diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index 18735dc460a0..111636ad1bad 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -23,8 +23,8 @@ running, the suggested command should tell you.
 
 Again, keep in mind that this list assumes you are already functionally
 running a Linux kernel.  Also, not all tools are necessary on all
-systems; obviously, if you don't have any ISDN hardware, for example,
-you probably needn't concern yourself with isdn4k-utils.
+systems; obviously, if you don't have any PC Card hardware, for example,
+you probably needn't concern yourself with pcmciautils.
 
 ====================== ===============  ========================================
         Program        Minimal version       Command to check the version
@@ -45,7 +45,6 @@ btrfs-progs            0.18             btrfsck
 pcmciautils            004              pccardctl -V
 quota-tools            3.09             quota -V
 PPP                    2.4.0            pppd --version
-isdn4k-utils           3.1pre1          isdnctrl 2>&1|grep version
 nfs-utils              1.0.5            showmount --version
 procps                 3.2.0            ps --version
 oprofile               0.9              oprofiled --version
@@ -279,12 +278,6 @@ which can be made by::
 
 as root.
 
-Isdn4k-utils
-------------
-
-Due to changes in the length of the phone number field, isdn4k-utils
-needs to be recompiled or (preferably) upgraded.
-
 NFS-utils
 ---------
 
@@ -448,11 +441,6 @@ PPP
 
 - <ftp://ftp.samba.org/pub/ppp/>
 
-Isdn4k-utils
-------------
-
-- <ftp://ftp.isdn4linux.de/pub/isdn4linux/utils/>
-
 NFS-utils
 ---------
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 0c55b0fedbe2..3a761e680296 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8371,9 +8371,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kkeil/isdn-2.6.git
 S:	Maintained
 F:	Documentation/isdn/
 F:	drivers/isdn/
-F:	include/linux/isdn.h
 F:	include/linux/isdn/
-F:	include/uapi/linux/isdn.h
 F:	include/uapi/linux/isdn/
 
 IT87 HARDWARE MONITORING DRIVER
diff --git a/drivers/isdn/Kconfig b/drivers/isdn/Kconfig
index 1ca4d70d198a..6e3bf833c67e 100644
--- a/drivers/isdn/Kconfig
+++ b/drivers/isdn/Kconfig
@@ -21,27 +21,6 @@ menuconfig ISDN
 
 if ISDN
 
-menuconfig ISDN_I4L
-	tristate "Old ISDN4Linux (deprecated)"
-	depends on TTY
-	---help---
-	  This driver allows you to use an ISDN adapter for networking
-	  connections and as dialin/out device.  The isdn-tty's have a built
-	  in AT-compatible modem emulator.  Network devices support autodial,
-	  channel-bundling, callback and caller-authentication without having
-	  a daemon running.  A reduced T.70 protocol is supported with tty's
-	  suitable for German BTX.  On D-Channel, the protocols EDSS1
-	  (Euro-ISDN) and 1TR6 (German style) are supported.  See
-	  <file:Documentation/isdn/README> for more information.
-
-	  ISDN support in the linux kernel is moving towards a new API,
-	  called CAPI (Common ISDN Application Programming Interface).
-	  Therefore the old ISDN4Linux layer will eventually become obsolete.
-	  It is still available, though, for use with adapters that are not
-	  supported by the new CAPI subsystem yet.
-
-source "drivers/isdn/i4l/Kconfig"
-
 menuconfig ISDN_CAPI
 	tristate "CAPI 2.0 subsystem"
 	help
@@ -71,9 +50,4 @@ source "drivers/isdn/hysdn/Kconfig"
 
 source "drivers/isdn/mISDN/Kconfig"
 
-config ISDN_HDLC
-	tristate
-	select CRC_CCITT
-	select BITREVERSE
-
 endif # ISDN
diff --git a/drivers/isdn/Makefile b/drivers/isdn/Makefile
index 7487f0bbe855..379b4a03c321 100644
--- a/drivers/isdn/Makefile
+++ b/drivers/isdn/Makefile
@@ -7,7 +7,5 @@ obj-$(CONFIG_ISDN_I4L)			+= i4l/
 obj-$(CONFIG_ISDN_CAPI)			+= capi/
 obj-$(CONFIG_MISDN)			+= mISDN/
 obj-$(CONFIG_ISDN)			+= hardware/
-obj-$(CONFIG_ISDN_DIVERSION)		+= divert/
-obj-$(CONFIG_ISDN_DRV_LOOP)		+= isdnloop/
 obj-$(CONFIG_HYSDN)			+= hysdn/
 obj-$(CONFIG_ISDN_DRV_GIGASET)		+= gigaset/
diff --git a/drivers/isdn/capi/Kconfig b/drivers/isdn/capi/Kconfig
index abaadce376c5..089dbee18f36 100644
--- a/drivers/isdn/capi/Kconfig
+++ b/drivers/isdn/capi/Kconfig
@@ -27,15 +27,6 @@ config ISDN_CAPI_MIDDLEWARE
 	  device.  If you want to use pppd with pppdcapiplugin to dial up to
 	  your ISP, say Y here.
 
-config ISDN_CAPI_CAPIDRV
-	tristate "CAPI2.0 capidrv interface support"
-	depends on ISDN_I4L
-	help
-	  This option provides the glue code to hook up CAPI driven cards to
-	  the legacy isdn4linux link layer.  If you have a card which is
-	  supported by a CAPI driver, but still want to use old features like
-	  ippp interfaces or ttyI emulation, say Y/M here.
-
 config ISDN_CAPI_CAPIDRV_VERBOSE
 	bool "Verbose reason code reporting"
 	depends on ISDN_CAPI_CAPIDRV
diff --git a/drivers/isdn/capi/capidrv.c b/drivers/isdn/capi/capidrv.c
deleted file mode 100644
index e8949f3dcae1..000000000000
--- a/drivers/isdn/capi/capidrv.c
+++ /dev/null
@@ -1,2525 +0,0 @@
-/* $Id: capidrv.c,v 1.1.2.2 2004/01/12 23:17:24 keil Exp $
- *
- * ISDN4Linux Driver, using capi20 interface (kernelcapi)
- *
- * Copyright 1997 by Carsten Paeth <calle@calle.de>
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/major.h>
-#include <linux/slab.h>
-#include <linux/fcntl.h>
-#include <linux/fs.h>
-#include <linux/signal.h>
-#include <linux/mm.h>
-#include <linux/timer.h>
-#include <linux/wait.h>
-#include <linux/skbuff.h>
-#include <linux/isdn.h>
-#include <linux/isdnif.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/capi.h>
-#include <linux/kernelcapi.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/moduleparam.h>
-
-#include <linux/isdn/capiutil.h>
-#include <linux/isdn/capicmd.h>
-#include "capidrv.h"
-
-static int debugmode = 0;
-
-MODULE_DESCRIPTION("CAPI4Linux: Interface to ISDN4Linux");
-MODULE_AUTHOR("Carsten Paeth");
-MODULE_LICENSE("GPL");
-module_param(debugmode, uint, S_IRUGO | S_IWUSR);
-
-/* -------- type definitions ----------------------------------------- */
-
-
-struct capidrv_contr {
-
-	struct capidrv_contr *next;
-	struct module *owner;
-	u32 contrnr;
-	char name[20];
-
-	/*
-	 * for isdn4linux
-	 */
-	isdn_if interface;
-	int myid;
-
-	/*
-	 * LISTEN state
-	 */
-	int state;
-	u32 cipmask;
-	u32 cipmask2;
-	struct timer_list listentimer;
-
-	/*
-	 * ID of capi message sent
-	 */
-	u16 msgid;
-
-	/*
-	 * B-Channels
-	 */
-	int nbchan;
-	struct capidrv_bchan {
-		struct capidrv_contr *contr;
-		u8 msn[ISDN_MSNLEN];
-		int l2;
-		int l3;
-		u8 num[ISDN_MSNLEN];
-		u8 mynum[ISDN_MSNLEN];
-		int si1;
-		int si2;
-		int incoming;
-		int disconnecting;
-		struct capidrv_plci {
-			struct capidrv_plci *next;
-			u32 plci;
-			u32 ncci;	/* ncci for CONNECT_ACTIVE_IND */
-			u16 msgid;	/* to identfy CONNECT_CONF */
-			int chan;
-			int state;
-			int leasedline;
-			struct capidrv_ncci {
-				struct capidrv_ncci *next;
-				struct capidrv_plci *plcip;
-				u32 ncci;
-				u16 msgid;	/* to identfy CONNECT_B3_CONF */
-				int chan;
-				int state;
-				int oldstate;
-				/* */
-				u16 datahandle;
-				struct ncci_datahandle_queue {
-					struct ncci_datahandle_queue *next;
-					u16                         datahandle;
-					int                           len;
-				} *ackqueue;
-			} *ncci_list;
-		} *plcip;
-		struct capidrv_ncci *nccip;
-	} *bchans;
-
-	struct capidrv_plci *plci_list;
-
-	/* for q931 data */
-	u8  q931_buf[4096];
-	u8 *q931_read;
-	u8 *q931_write;
-	u8 *q931_end;
-};
-
-
-struct capidrv_data {
-	struct capi20_appl ap;
-	int ncontr;
-	struct capidrv_contr *contr_list;
-};
-
-typedef struct capidrv_plci capidrv_plci;
-typedef struct capidrv_ncci capidrv_ncci;
-typedef struct capidrv_contr capidrv_contr;
-typedef struct capidrv_data capidrv_data;
-typedef struct capidrv_bchan capidrv_bchan;
-
-/* -------- data definitions ----------------------------------------- */
-
-static capidrv_data global;
-static DEFINE_SPINLOCK(global_lock);
-
-static void handle_dtrace_data(capidrv_contr *card,
-			       int send, int level2, u8 *data, u16 len);
-
-/* -------- convert functions ---------------------------------------- */
-
-static inline u32 b1prot(int l2, int l3)
-{
-	switch (l2) {
-	case ISDN_PROTO_L2_X75I:
-	case ISDN_PROTO_L2_X75UI:
-	case ISDN_PROTO_L2_X75BUI:
-		return 0;
-	case ISDN_PROTO_L2_HDLC:
-	default:
-		return 0;
-	case ISDN_PROTO_L2_TRANS:
-		return 1;
-	case ISDN_PROTO_L2_V11096:
-	case ISDN_PROTO_L2_V11019:
-	case ISDN_PROTO_L2_V11038:
-		return 2;
-	case ISDN_PROTO_L2_FAX:
-		return 4;
-	case ISDN_PROTO_L2_MODEM:
-		return 8;
-	}
-}
-
-static inline u32 b2prot(int l2, int l3)
-{
-	switch (l2) {
-	case ISDN_PROTO_L2_X75I:
-	case ISDN_PROTO_L2_X75UI:
-	case ISDN_PROTO_L2_X75BUI:
-	default:
-		return 0;
-	case ISDN_PROTO_L2_HDLC:
-	case ISDN_PROTO_L2_TRANS:
-	case ISDN_PROTO_L2_V11096:
-	case ISDN_PROTO_L2_V11019:
-	case ISDN_PROTO_L2_V11038:
-	case ISDN_PROTO_L2_MODEM:
-		return 1;
-	case ISDN_PROTO_L2_FAX:
-		return 4;
-	}
-}
-
-static inline u32 b3prot(int l2, int l3)
-{
-	switch (l2) {
-	case ISDN_PROTO_L2_X75I:
-	case ISDN_PROTO_L2_X75UI:
-	case ISDN_PROTO_L2_X75BUI:
-	case ISDN_PROTO_L2_HDLC:
-	case ISDN_PROTO_L2_TRANS:
-	case ISDN_PROTO_L2_V11096:
-	case ISDN_PROTO_L2_V11019:
-	case ISDN_PROTO_L2_V11038:
-	case ISDN_PROTO_L2_MODEM:
-	default:
-		return 0;
-	case ISDN_PROTO_L2_FAX:
-		return 4;
-	}
-}
-
-static _cstruct b1config_async_v110(u16 rate)
-{
-	/* CAPI-Spec "B1 Configuration" */
-	static unsigned char buf[9];
-	buf[0] = 8; /* len */
-	/* maximum bitrate */
-	buf[1] = rate & 0xff; buf[2] = (rate >> 8) & 0xff;
-	buf[3] = 8; buf[4] = 0; /* 8 bits per character */
-	buf[5] = 0; buf[6] = 0; /* parity none */
-	buf[7] = 0; buf[8] = 0; /* 1 stop bit */
-	return buf;
-}
-
-static _cstruct b1config(int l2, int l3)
-{
-	switch (l2) {
-	case ISDN_PROTO_L2_X75I:
-	case ISDN_PROTO_L2_X75UI:
-	case ISDN_PROTO_L2_X75BUI:
-	case ISDN_PROTO_L2_HDLC:
-	case ISDN_PROTO_L2_TRANS:
-	default:
-		return NULL;
-	case ISDN_PROTO_L2_V11096:
-		return b1config_async_v110(9600);
-	case ISDN_PROTO_L2_V11019:
-		return b1config_async_v110(19200);
-	case ISDN_PROTO_L2_V11038:
-		return b1config_async_v110(38400);
-	}
-}
-
-static inline u16 si2cip(u8 si1, u8 si2)
-{
-	static const u8 cip[17][5] =
-		{
-			/*  0  1  2  3  4  */
-			{0, 0, 0, 0, 0},	/*0 */
-			{16, 16, 4, 26, 16},	/*1 */
-			{17, 17, 17, 4, 4},	/*2 */
-			{2, 2, 2, 2, 2},	/*3 */
-			{18, 18, 18, 18, 18},	/*4 */
-			{2, 2, 2, 2, 2},	/*5 */
-			{0, 0, 0, 0, 0},	/*6 */
-			{2, 2, 2, 2, 2},	/*7 */
-			{2, 2, 2, 2, 2},	/*8 */
-			{21, 21, 21, 21, 21},	/*9 */
-			{19, 19, 19, 19, 19},	/*10 */
-			{0, 0, 0, 0, 0},	/*11 */
-			{0, 0, 0, 0, 0},	/*12 */
-			{0, 0, 0, 0, 0},	/*13 */
-			{0, 0, 0, 0, 0},	/*14 */
-			{22, 22, 22, 22, 22},	/*15 */
-			{27, 27, 27, 28, 27}	/*16 */
-		};
-	if (si1 > 16)
-		si1 = 0;
-	if (si2 > 4)
-		si2 = 0;
-
-	return (u16) cip[si1][si2];
-}
-
-static inline u8 cip2si1(u16 cipval)
-{
-	static const u8 si[32] =
-		{7, 1, 7, 7, 1, 1, 7, 7,	/*0-7 */
-		 7, 1, 0, 0, 0, 0, 0, 0,	/*8-15 */
-		 1, 2, 4, 10, 9, 9, 15, 7,	/*16-23 */
-		 7, 7, 1, 16, 16, 0, 0, 0};	/*24-31 */
-
-	if (cipval > 31)
-		cipval = 0;	/* .... */
-	return si[cipval];
-}
-
-static inline u8 cip2si2(u16 cipval)
-{
-	static const u8 si[32] =
-		{0, 0, 0, 0, 2, 3, 0, 0,	/*0-7 */
-		 0, 3, 0, 0, 0, 0, 0, 0,	/*8-15 */
-		 1, 2, 0, 0, 9, 0, 0, 0,	/*16-23 */
-		 0, 0, 3, 2, 3, 0, 0, 0};	/*24-31 */
-
-	if (cipval > 31)
-		cipval = 0;	/* .... */
-	return si[cipval];
-}
-
-
-/* -------- controller management ------------------------------------- */
-
-static inline capidrv_contr *findcontrbydriverid(int driverid)
-{
-	unsigned long flags;
-	capidrv_contr *p;
-
-	spin_lock_irqsave(&global_lock, flags);
-	for (p = global.contr_list; p; p = p->next)
-		if (p->myid == driverid)
-			break;
-	spin_unlock_irqrestore(&global_lock, flags);
-	return p;
-}
-
-static capidrv_contr *findcontrbynumber(u32 contr)
-{
-	unsigned long flags;
-	capidrv_contr *p = global.contr_list;
-
-	spin_lock_irqsave(&global_lock, flags);
-	for (p = global.contr_list; p; p = p->next)
-		if (p->contrnr == contr)
-			break;
-	spin_unlock_irqrestore(&global_lock, flags);
-	return p;
-}
-
-
-/* -------- plci management ------------------------------------------ */
-
-static capidrv_plci *new_plci(capidrv_contr *card, int chan)
-{
-	capidrv_plci *plcip;
-
-	plcip = kzalloc(sizeof(capidrv_plci), GFP_ATOMIC);
-
-	if (plcip == NULL)
-		return NULL;
-
-	plcip->state = ST_PLCI_NONE;
-	plcip->plci = 0;
-	plcip->msgid = 0;
-	plcip->chan = chan;
-	plcip->next = card->plci_list;
-	card->plci_list = plcip;
-	card->bchans[chan].plcip = plcip;
-
-	return plcip;
-}
-
-static capidrv_plci *find_plci_by_plci(capidrv_contr *card, u32 plci)
-{
-	capidrv_plci *p;
-	for (p = card->plci_list; p; p = p->next)
-		if (p->plci == plci)
-			return p;
-	return NULL;
-}
-
-static capidrv_plci *find_plci_by_msgid(capidrv_contr *card, u16 msgid)
-{
-	capidrv_plci *p;
-	for (p = card->plci_list; p; p = p->next)
-		if (p->msgid == msgid)
-			return p;
-	return NULL;
-}
-
-static capidrv_plci *find_plci_by_ncci(capidrv_contr *card, u32 ncci)
-{
-	capidrv_plci *p;
-	for (p = card->plci_list; p; p = p->next)
-		if (p->plci == (ncci & 0xffff))
-			return p;
-	return NULL;
-}
-
-static void free_plci(capidrv_contr *card, capidrv_plci *plcip)
-{
-	capidrv_plci **pp;
-
-	for (pp = &card->plci_list; *pp; pp = &(*pp)->next) {
-		if (*pp == plcip) {
-			*pp = (*pp)->next;
-			card->bchans[plcip->chan].plcip = NULL;
-			card->bchans[plcip->chan].disconnecting = 0;
-			card->bchans[plcip->chan].incoming = 0;
-			kfree(plcip);
-			return;
-		}
-	}
-	printk(KERN_ERR "capidrv-%d: free_plci %p (0x%x) not found, Huh?\n",
-	       card->contrnr, plcip, plcip->plci);
-}
-
-/* -------- ncci management ------------------------------------------ */
-
-static inline capidrv_ncci *new_ncci(capidrv_contr *card,
-				     capidrv_plci *plcip,
-				     u32 ncci)
-{
-	capidrv_ncci *nccip;
-
-	nccip = kzalloc(sizeof(capidrv_ncci), GFP_ATOMIC);
-
-	if (nccip == NULL)
-		return NULL;
-
-	nccip->ncci = ncci;
-	nccip->state = ST_NCCI_NONE;
-	nccip->plcip = plcip;
-	nccip->chan = plcip->chan;
-	nccip->datahandle = 0;
-
-	nccip->next = plcip->ncci_list;
-	plcip->ncci_list = nccip;
-
-	card->bchans[plcip->chan].nccip = nccip;
-
-	return nccip;
-}
-
-static inline capidrv_ncci *find_ncci(capidrv_contr *card, u32 ncci)
-{
-	capidrv_plci *plcip;
-	capidrv_ncci *p;
-
-	if ((plcip = find_plci_by_ncci(card, ncci)) == NULL)
-		return NULL;
-
-	for (p = plcip->ncci_list; p; p = p->next)
-		if (p->ncci == ncci)
-			return p;
-	return NULL;
-}
-
-static inline capidrv_ncci *find_ncci_by_msgid(capidrv_contr *card,
-					       u32 ncci, u16 msgid)
-{
-	capidrv_plci *plcip;
-	capidrv_ncci *p;
-
-	if ((plcip = find_plci_by_ncci(card, ncci)) == NULL)
-		return NULL;
-
-	for (p = plcip->ncci_list; p; p = p->next)
-		if (p->msgid == msgid)
-			return p;
-	return NULL;
-}
-
-static void free_ncci(capidrv_contr *card, struct capidrv_ncci *nccip)
-{
-	struct capidrv_ncci **pp;
-
-	for (pp = &(nccip->plcip->ncci_list); *pp; pp = &(*pp)->next) {
-		if (*pp == nccip) {
-			*pp = (*pp)->next;
-			break;
-		}
-	}
-	card->bchans[nccip->chan].nccip = NULL;
-	kfree(nccip);
-}
-
-static int capidrv_add_ack(struct capidrv_ncci *nccip,
-			   u16 datahandle, int len)
-{
-	struct ncci_datahandle_queue *n, **pp;
-
-	n = kmalloc(sizeof(struct ncci_datahandle_queue), GFP_ATOMIC);
-	if (!n) {
-		printk(KERN_ERR "capidrv: kmalloc ncci_datahandle failed\n");
-		return -1;
-	}
-	n->next = NULL;
-	n->datahandle = datahandle;
-	n->len = len;
-	for (pp = &nccip->ackqueue; *pp; pp = &(*pp)->next);
-	*pp = n;
-	return 0;
-}
-
-static int capidrv_del_ack(struct capidrv_ncci *nccip, u16 datahandle)
-{
-	struct ncci_datahandle_queue **pp, *p;
-	int len;
-
-	for (pp = &nccip->ackqueue; *pp; pp = &(*pp)->next) {
-		if ((*pp)->datahandle == datahandle) {
-			p = *pp;
-			len = p->len;
-			*pp = (*pp)->next;
-			kfree(p);
-			return len;
-		}
-	}
-	return -1;
-}
-
-/* -------- convert and send capi message ---------------------------- */
-
-static void send_message(capidrv_contr *card, _cmsg *cmsg)
-{
-	struct sk_buff *skb;
-	size_t len;
-
-	if (capi_cmsg2message(cmsg, cmsg->buf)) {
-		printk(KERN_ERR "capidrv::send_message: parser failure\n");
-		return;
-	}
-	len = CAPIMSG_LEN(cmsg->buf);
-	skb = alloc_skb(len, GFP_ATOMIC);
-	if (!skb) {
-		printk(KERN_ERR "capidrv::send_message: can't allocate mem\n");
-		return;
-	}
-	skb_put_data(skb, cmsg->buf, len);
-	if (capi20_put_message(&global.ap, skb) != CAPI_NOERROR)
-		kfree_skb(skb);
-}
-
-/* -------- state machine -------------------------------------------- */
-
-struct listenstatechange {
-	int actstate;
-	int nextstate;
-	int event;
-};
-
-static struct listenstatechange listentable[] =
-{
-	{ST_LISTEN_NONE, ST_LISTEN_WAIT_CONF, EV_LISTEN_REQ},
-	{ST_LISTEN_ACTIVE, ST_LISTEN_ACTIVE_WAIT_CONF, EV_LISTEN_REQ},
-	{ST_LISTEN_WAIT_CONF, ST_LISTEN_NONE, EV_LISTEN_CONF_ERROR},
-	{ST_LISTEN_ACTIVE_WAIT_CONF, ST_LISTEN_ACTIVE, EV_LISTEN_CONF_ERROR},
-	{ST_LISTEN_WAIT_CONF, ST_LISTEN_NONE, EV_LISTEN_CONF_EMPTY},
-	{ST_LISTEN_ACTIVE_WAIT_CONF, ST_LISTEN_NONE, EV_LISTEN_CONF_EMPTY},
-	{ST_LISTEN_WAIT_CONF, ST_LISTEN_ACTIVE, EV_LISTEN_CONF_OK},
-	{ST_LISTEN_ACTIVE_WAIT_CONF, ST_LISTEN_ACTIVE, EV_LISTEN_CONF_OK},
-	{},
-};
-
-static void listen_change_state(capidrv_contr *card, int event)
-{
-	struct listenstatechange *p = listentable;
-	while (p->event) {
-		if (card->state == p->actstate && p->event == event) {
-			if (debugmode)
-				printk(KERN_DEBUG "capidrv-%d: listen_change_state %d -> %d\n",
-				       card->contrnr, card->state, p->nextstate);
-			card->state = p->nextstate;
-			return;
-		}
-		p++;
-	}
-	printk(KERN_ERR "capidrv-%d: listen_change_state state=%d event=%d ????\n",
-	       card->contrnr, card->state, event);
-
-}
-
-/* ------------------------------------------------------------------ */
-
-static void p0(capidrv_contr *card, capidrv_plci *plci)
-{
-	isdn_ctrl cmd;
-
-	card->bchans[plci->chan].contr = NULL;
-	cmd.command = ISDN_STAT_DHUP;
-	cmd.driver = card->myid;
-	cmd.arg = plci->chan;
-	card->interface.statcallb(&cmd);
-	free_plci(card, plci);
-}
-
-/* ------------------------------------------------------------------ */
-
-struct plcistatechange {
-	int actstate;
-	int nextstate;
-	int event;
-	void (*changefunc)(capidrv_contr *card, capidrv_plci *plci);
-};
-
-static struct plcistatechange plcitable[] =
-{
-	/* P-0 */
-	{ST_PLCI_NONE, ST_PLCI_OUTGOING, EV_PLCI_CONNECT_REQ, NULL},
-	{ST_PLCI_NONE, ST_PLCI_ALLOCATED, EV_PLCI_FACILITY_IND_UP, NULL},
-	{ST_PLCI_NONE, ST_PLCI_INCOMING, EV_PLCI_CONNECT_IND, NULL},
-	{ST_PLCI_NONE, ST_PLCI_RESUMEING, EV_PLCI_RESUME_REQ, NULL},
-	/* P-0.1 */
-	{ST_PLCI_OUTGOING, ST_PLCI_NONE, EV_PLCI_CONNECT_CONF_ERROR, p0},
-	{ST_PLCI_OUTGOING, ST_PLCI_ALLOCATED, EV_PLCI_CONNECT_CONF_OK, NULL},
-	/* P-1 */
-	{ST_PLCI_ALLOCATED, ST_PLCI_ACTIVE, EV_PLCI_CONNECT_ACTIVE_IND, NULL},
-	{ST_PLCI_ALLOCATED, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL},
-	{ST_PLCI_ALLOCATED, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL},
-	{ST_PLCI_ALLOCATED, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL},
-	/* P-ACT */
-	{ST_PLCI_ACTIVE, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL},
-	{ST_PLCI_ACTIVE, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL},
-	{ST_PLCI_ACTIVE, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL},
-	{ST_PLCI_ACTIVE, ST_PLCI_HELD, EV_PLCI_HOLD_IND, NULL},
-	{ST_PLCI_ACTIVE, ST_PLCI_DISCONNECTING, EV_PLCI_SUSPEND_IND, NULL},
-	/* P-2 */
-	{ST_PLCI_INCOMING, ST_PLCI_DISCONNECTING, EV_PLCI_CONNECT_REJECT, NULL},
-	{ST_PLCI_INCOMING, ST_PLCI_FACILITY_IND, EV_PLCI_FACILITY_IND_UP, NULL},
-	{ST_PLCI_INCOMING, ST_PLCI_ACCEPTING, EV_PLCI_CONNECT_RESP, NULL},
-	{ST_PLCI_INCOMING, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL},
-	{ST_PLCI_INCOMING, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL},
-	{ST_PLCI_INCOMING, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL},
-	{ST_PLCI_INCOMING, ST_PLCI_DISCONNECTING, EV_PLCI_CD_IND, NULL},
-	/* P-3 */
-	{ST_PLCI_FACILITY_IND, ST_PLCI_DISCONNECTING, EV_PLCI_CONNECT_REJECT, NULL},
-	{ST_PLCI_FACILITY_IND, ST_PLCI_ACCEPTING, EV_PLCI_CONNECT_ACTIVE_IND, NULL},
-	{ST_PLCI_FACILITY_IND, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL},
-	{ST_PLCI_FACILITY_IND, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL},
-	{ST_PLCI_FACILITY_IND, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL},
-	/* P-4 */
-	{ST_PLCI_ACCEPTING, ST_PLCI_ACTIVE, EV_PLCI_CONNECT_ACTIVE_IND, NULL},
-	{ST_PLCI_ACCEPTING, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL},
-	{ST_PLCI_ACCEPTING, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL},
-	{ST_PLCI_ACCEPTING, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL},
-	/* P-5 */
-	{ST_PLCI_DISCONNECTING, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL},
-	/* P-6 */
-	{ST_PLCI_DISCONNECTED, ST_PLCI_NONE, EV_PLCI_DISCONNECT_RESP, p0},
-	/* P-0.Res */
-	{ST_PLCI_RESUMEING, ST_PLCI_NONE, EV_PLCI_RESUME_CONF_ERROR, p0},
-	{ST_PLCI_RESUMEING, ST_PLCI_RESUME, EV_PLCI_RESUME_CONF_OK, NULL},
-	/* P-RES */
-	{ST_PLCI_RESUME, ST_PLCI_ACTIVE, EV_PLCI_RESUME_IND, NULL},
-	/* P-HELD */
-	{ST_PLCI_HELD, ST_PLCI_ACTIVE, EV_PLCI_RETRIEVE_IND, NULL},
-	{},
-};
-
-static void plci_change_state(capidrv_contr *card, capidrv_plci *plci, int event)
-{
-	struct plcistatechange *p = plcitable;
-	while (p->event) {
-		if (plci->state == p->actstate && p->event == event) {
-			if (debugmode)
-				printk(KERN_DEBUG "capidrv-%d: plci_change_state:0x%x %d -> %d\n",
-				       card->contrnr, plci->plci, plci->state, p->nextstate);
-			plci->state = p->nextstate;
-			if (p->changefunc)
-				p->changefunc(card, plci);
-			return;
-		}
-		p++;
-	}
-	printk(KERN_ERR "capidrv-%d: plci_change_state:0x%x state=%d event=%d ????\n",
-	       card->contrnr, plci->plci, plci->state, event);
-}
-
-/* ------------------------------------------------------------------ */
-
-static _cmsg cmsg;
-
-static void n0(capidrv_contr *card, capidrv_ncci *ncci)
-{
-	isdn_ctrl cmd;
-
-	capi_fill_DISCONNECT_REQ(&cmsg,
-				 global.ap.applid,
-				 card->msgid++,
-				 ncci->plcip->plci,
-				 NULL,	/* BChannelinformation */
-				 NULL,	/* Keypadfacility */
-				 NULL,	/* Useruserdata */   /* $$$$ */
-				 NULL	/* Facilitydataarray */
-		);
-	plci_change_state(card, ncci->plcip, EV_PLCI_DISCONNECT_REQ);
-	send_message(card, &cmsg);
-
-	cmd.command = ISDN_STAT_BHUP;
-	cmd.driver = card->myid;
-	cmd.arg = ncci->chan;
-	card->interface.statcallb(&cmd);
-	free_ncci(card, ncci);
-}
-
-/* ------------------------------------------------------------------ */
-
-struct nccistatechange {
-	int actstate;
-	int nextstate;
-	int event;
-	void (*changefunc)(capidrv_contr *card, capidrv_ncci *ncci);
-};
-
-static struct nccistatechange nccitable[] =
-{
-	/* N-0 */
-	{ST_NCCI_NONE, ST_NCCI_OUTGOING, EV_NCCI_CONNECT_B3_REQ, NULL},
-	{ST_NCCI_NONE, ST_NCCI_INCOMING, EV_NCCI_CONNECT_B3_IND, NULL},
-	/* N-0.1 */
-	{ST_NCCI_OUTGOING, ST_NCCI_ALLOCATED, EV_NCCI_CONNECT_B3_CONF_OK, NULL},
-	{ST_NCCI_OUTGOING, ST_NCCI_NONE, EV_NCCI_CONNECT_B3_CONF_ERROR, n0},
-	/* N-1 */
-	{ST_NCCI_INCOMING, ST_NCCI_DISCONNECTING, EV_NCCI_CONNECT_B3_REJECT, NULL},
-	{ST_NCCI_INCOMING, ST_NCCI_ALLOCATED, EV_NCCI_CONNECT_B3_RESP, NULL},
-	{ST_NCCI_INCOMING, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL},
-	{ST_NCCI_INCOMING, ST_NCCI_DISCONNECTING, EV_NCCI_DISCONNECT_B3_REQ, NULL},
-	/* N-2 */
-	{ST_NCCI_ALLOCATED, ST_NCCI_ACTIVE, EV_NCCI_CONNECT_B3_ACTIVE_IND, NULL},
-	{ST_NCCI_ALLOCATED, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL},
-	{ST_NCCI_ALLOCATED, ST_NCCI_DISCONNECTING, EV_NCCI_DISCONNECT_B3_REQ, NULL},
-	/* N-ACT */
-	{ST_NCCI_ACTIVE, ST_NCCI_ACTIVE, EV_NCCI_RESET_B3_IND, NULL},
-	{ST_NCCI_ACTIVE, ST_NCCI_RESETING, EV_NCCI_RESET_B3_REQ, NULL},
-	{ST_NCCI_ACTIVE, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL},
-	{ST_NCCI_ACTIVE, ST_NCCI_DISCONNECTING, EV_NCCI_DISCONNECT_B3_REQ, NULL},
-	/* N-3 */
-	{ST_NCCI_RESETING, ST_NCCI_ACTIVE, EV_NCCI_RESET_B3_IND, NULL},
-	{ST_NCCI_RESETING, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL},
-	{ST_NCCI_RESETING, ST_NCCI_DISCONNECTING, EV_NCCI_DISCONNECT_B3_REQ, NULL},
-	/* N-4 */
-	{ST_NCCI_DISCONNECTING, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL},
-	{ST_NCCI_DISCONNECTING, ST_NCCI_PREVIOUS, EV_NCCI_DISCONNECT_B3_CONF_ERROR, NULL},
-	/* N-5 */
-	{ST_NCCI_DISCONNECTED, ST_NCCI_NONE, EV_NCCI_DISCONNECT_B3_RESP, n0},
-	{},
-};
-
-static void ncci_change_state(capidrv_contr *card, capidrv_ncci *ncci, int event)
-{
-	struct nccistatechange *p = nccitable;
-	while (p->event) {
-		if (ncci->state == p->actstate && p->event == event) {
-			if (debugmode)
-				printk(KERN_DEBUG "capidrv-%d: ncci_change_state:0x%x %d -> %d\n",
-				       card->contrnr, ncci->ncci, ncci->state, p->nextstate);
-			if (p->nextstate == ST_NCCI_PREVIOUS) {
-				ncci->state = ncci->oldstate;
-				ncci->oldstate = p->actstate;
-			} else {
-				ncci->oldstate = p->actstate;
-				ncci->state = p->nextstate;
-			}
-			if (p->changefunc)
-				p->changefunc(card, ncci);
-			return;
-		}
-		p++;
-	}
-	printk(KERN_ERR "capidrv-%d: ncci_change_state:0x%x state=%d event=%d ????\n",
-	       card->contrnr, ncci->ncci, ncci->state, event);
-}
-
-/* ------------------------------------------------------------------- */
-
-static inline int new_bchan(capidrv_contr *card)
-{
-	int i;
-	for (i = 0; i < card->nbchan; i++) {
-		if (card->bchans[i].plcip == NULL) {
-			card->bchans[i].disconnecting = 0;
-			return i;
-		}
-	}
-	return -1;
-}
-
-/* ------------------------------------------------------------------- */
-static char *capi_info2str(u16 reason)
-{
-#ifndef CONFIG_ISDN_CAPI_CAPIDRV_VERBOSE
-	return "..";
-#else
-	switch (reason) {
-
-/*-- informative values (corresponding message was processed) -----*/
-	case 0x0001:
-		return "NCPI not supported by current protocol, NCPI ignored";
-	case 0x0002:
-		return "Flags not supported by current protocol, flags ignored";
-	case 0x0003:
-		return "Alert already sent by another application";
-
-/*-- error information concerning CAPI_REGISTER -----*/
-	case 0x1001:
-		return "Too many applications";
-	case 0x1002:
-		return "Logical block size too small, must be at least 128 Bytes";
-	case 0x1003:
-		return "Buffer exceeds 64 kByte";
-	case 0x1004:
-		return "Message buffer size too small, must be at least 1024 Bytes";
-	case 0x1005:
-		return "Max. number of logical connections not supported";
-	case 0x1006:
-		return "Reserved";
-	case 0x1007:
-		return "The message could not be accepted because of an internal busy condition";
-	case 0x1008:
-		return "OS resource error (no memory ?)";
-	case 0x1009:
-		return "CAPI not installed";
-	case 0x100A:
-		return "Controller does not support external equipment";
-	case 0x100B:
-		return "Controller does only support external equipment";
-
-/*-- error information concerning message exchange functions -----*/
-	case 0x1101:
-		return "Illegal application number";
-	case 0x1102:
-		return "Illegal command or subcommand or message length less than 12 bytes";
-	case 0x1103:
-		return "The message could not be accepted because of a queue full condition !! The error code does not imply that CAPI cannot receive messages directed to another controller, PLCI or NCCI";
-	case 0x1104:
-		return "Queue is empty";
-	case 0x1105:
-		return "Queue overflow, a message was lost !! This indicates a configuration error. The only recovery from this error is to perform a CAPI_RELEASE";
-	case 0x1106:
-		return "Unknown notification parameter";
-	case 0x1107:
-		return "The Message could not be accepted because of an internal busy condition";
-	case 0x1108:
-		return "OS Resource error (no memory ?)";
-	case 0x1109:
-		return "CAPI not installed";
-	case 0x110A:
-		return "Controller does not support external equipment";
-	case 0x110B:
-		return "Controller does only support external equipment";
-
-/*-- error information concerning resource / coding problems -----*/
-	case 0x2001:
-		return "Message not supported in current state";
-	case 0x2002:
-		return "Illegal Controller / PLCI / NCCI";
-	case 0x2003:
-		return "Out of PLCI";
-	case 0x2004:
-		return "Out of NCCI";
-	case 0x2005:
-		return "Out of LISTEN";
-	case 0x2006:
-		return "Out of FAX resources (protocol T.30)";
-	case 0x2007:
-		return "Illegal message parameter coding";
-
-/*-- error information concerning requested services  -----*/
-	case 0x3001:
-		return "B1 protocol not supported";
-	case 0x3002:
-		return "B2 protocol not supported";
-	case 0x3003:
-		return "B3 protocol not supported";
-	case 0x3004:
-		return "B1 protocol parameter not supported";
-	case 0x3005:
-		return "B2 protocol parameter not supported";
-	case 0x3006:
-		return "B3 protocol parameter not supported";
-	case 0x3007:
-		return "B protocol combination not supported";
-	case 0x3008:
-		return "NCPI not supported";
-	case 0x3009:
-		return "CIP Value unknown";
-	case 0x300A:
-		return "Flags not supported (reserved bits)";
-	case 0x300B:
-		return "Facility not supported";
-	case 0x300C:
-		return "Data length not supported by current protocol";
-	case 0x300D:
-		return "Reset procedure not supported by current protocol";
-
-/*-- informations about the clearing of a physical connection -----*/
-	case 0x3301:
-		return "Protocol error layer 1 (broken line or B-channel removed by signalling protocol)";
-	case 0x3302:
-		return "Protocol error layer 2";
-	case 0x3303:
-		return "Protocol error layer 3";
-	case 0x3304:
-		return "Another application got that call";
-/*-- T.30 specific reasons -----*/
-	case 0x3311:
-		return "Connecting not successful (remote station is no FAX G3 machine)";
-	case 0x3312:
-		return "Connecting not successful (training error)";
-	case 0x3313:
-		return "Disconnected before transfer (remote station does not support transfer mode, e.g. resolution)";
-	case 0x3314:
-		return "Disconnected during transfer (remote abort)";
-	case 0x3315:
-		return "Disconnected during transfer (remote procedure error, e.g. unsuccessful repetition of T.30 commands)";
-	case 0x3316:
-		return "Disconnected during transfer (local tx data underrun)";
-	case 0x3317:
-		return "Disconnected during transfer (local rx data overflow)";
-	case 0x3318:
-		return "Disconnected during transfer (local abort)";
-	case 0x3319:
-		return "Illegal parameter coding (e.g. SFF coding error)";
-
-/*-- disconnect causes from the network according to ETS 300 102-1/Q.931 -----*/
-	case 0x3481: return "Unallocated (unassigned) number";
-	case 0x3482: return "No route to specified transit network";
-	case 0x3483: return "No route to destination";
-	case 0x3486: return "Channel unacceptable";
-	case 0x3487:
-		return "Call awarded and being delivered in an established channel";
-	case 0x3490: return "Normal call clearing";
-	case 0x3491: return "User busy";
-	case 0x3492: return "No user responding";
-	case 0x3493: return "No answer from user (user alerted)";
-	case 0x3495: return "Call rejected";
-	case 0x3496: return "Number changed";
-	case 0x349A: return "Non-selected user clearing";
-	case 0x349B: return "Destination out of order";
-	case 0x349C: return "Invalid number format";
-	case 0x349D: return "Facility rejected";
-	case 0x349E: return "Response to STATUS ENQUIRY";
-	case 0x349F: return "Normal, unspecified";
-	case 0x34A2: return "No circuit / channel available";
-	case 0x34A6: return "Network out of order";
-	case 0x34A9: return "Temporary failure";
-	case 0x34AA: return "Switching equipment congestion";
-	case 0x34AB: return "Access information discarded";
-	case 0x34AC: return "Requested circuit / channel not available";
-	case 0x34AF: return "Resources unavailable, unspecified";
-	case 0x34B1: return "Quality of service unavailable";
-	case 0x34B2: return "Requested facility not subscribed";
-	case 0x34B9: return "Bearer capability not authorized";
-	case 0x34BA: return "Bearer capability not presently available";
-	case 0x34BF: return "Service or option not available, unspecified";
-	case 0x34C1: return "Bearer capability not implemented";
-	case 0x34C2: return "Channel type not implemented";
-	case 0x34C5: return "Requested facility not implemented";
-	case 0x34C6: return "Only restricted digital information bearer capability is available";
-	case 0x34CF: return "Service or option not implemented, unspecified";
-	case 0x34D1: return "Invalid call reference value";
-	case 0x34D2: return "Identified channel does not exist";
-	case 0x34D3: return "A suspended call exists, but this call identity does not";
-	case 0x34D4: return "Call identity in use";
-	case 0x34D5: return "No call suspended";
-	case 0x34D6: return "Call having the requested call identity has been cleared";
-	case 0x34D8: return "Incompatible destination";
-	case 0x34DB: return "Invalid transit network selection";
-	case 0x34DF: return "Invalid message, unspecified";
-	case 0x34E0: return "Mandatory information element is missing";
-	case 0x34E1: return "Message type non-existent or not implemented";
-	case 0x34E2: return "Message not compatible with call state or message type non-existent or not implemented";
-	case 0x34E3: return "Information element non-existent or not implemented";
-	case 0x34E4: return "Invalid information element contents";
-	case 0x34E5: return "Message not compatible with call state";
-	case 0x34E6: return "Recovery on timer expiry";
-	case 0x34EF: return "Protocol error, unspecified";
-	case 0x34FF: return "Interworking, unspecified";
-
-	default: return "No additional information";
-	}
-#endif
-}
-
-static void handle_controller(_cmsg *cmsg)
-{
-	capidrv_contr *card = findcontrbynumber(cmsg->adr.adrController & 0x7f);
-
-	if (!card) {
-		printk(KERN_ERR "capidrv: %s from unknown controller 0x%x\n",
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrController & 0x7f);
-		return;
-	}
-	switch (CAPICMD(cmsg->Command, cmsg->Subcommand)) {
-
-	case CAPI_LISTEN_CONF:	/* Controller */
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: listenconf Info=0x%4x (%s) cipmask=0x%x\n",
-			       card->contrnr, cmsg->Info, capi_info2str(cmsg->Info), card->cipmask);
-		if (cmsg->Info) {
-			listen_change_state(card, EV_LISTEN_CONF_ERROR);
-		} else if (card->cipmask == 0) {
-			listen_change_state(card, EV_LISTEN_CONF_EMPTY);
-		} else {
-			listen_change_state(card, EV_LISTEN_CONF_OK);
-		}
-		break;
-
-	case CAPI_MANUFACTURER_IND:	/* Controller */
-		if (cmsg->ManuID == 0x214D5641
-		    && cmsg->Class == 0
-		    && cmsg->Function == 1) {
-			u8  *data = cmsg->ManuData + 3;
-			u16  len = cmsg->ManuData[0];
-			u16 layer;
-			int direction;
-			if (len == 255) {
-				len = (cmsg->ManuData[1] | (cmsg->ManuData[2] << 8));
-				data += 2;
-			}
-			len -= 2;
-			layer = ((*(data - 1)) << 8) | *(data - 2);
-			if (layer & 0x300)
-				direction = (layer & 0x200) ? 0 : 1;
-			else direction = (layer & 0x800) ? 0 : 1;
-			if (layer & 0x0C00) {
-				if ((layer & 0xff) == 0x80) {
-					handle_dtrace_data(card, direction, 1, data, len);
-					break;
-				}
-			} else if ((layer & 0xff) < 0x80) {
-				handle_dtrace_data(card, direction, 0, data, len);
-				break;
-			}
-			printk(KERN_INFO "capidrv-%d: %s from controller 0x%x layer 0x%x, ignored\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->adr.adrController, layer);
-			break;
-		}
-		goto ignored;
-	case CAPI_MANUFACTURER_CONF:	/* Controller */
-		if (cmsg->ManuID == 0x214D5641) {
-			char *s = NULL;
-			switch (cmsg->Class) {
-			case 0: break;
-			case 1: s = "unknown class"; break;
-			case 2: s = "unknown function"; break;
-			default: s = "unknown error"; break;
-			}
-			if (s)
-				printk(KERN_INFO "capidrv-%d: %s from controller 0x%x function %d: %s\n",
-				       card->contrnr,
-				       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-				       cmsg->adr.adrController,
-				       cmsg->Function, s);
-			break;
-		}
-		goto ignored;
-	case CAPI_FACILITY_IND:	/* Controller/plci/ncci */
-		goto ignored;
-	case CAPI_FACILITY_CONF:	/* Controller/plci/ncci */
-		goto ignored;
-	case CAPI_INFO_IND:	/* Controller/plci */
-		goto ignored;
-	case CAPI_INFO_CONF:	/* Controller/plci */
-		goto ignored;
-
-	default:
-		printk(KERN_ERR "capidrv-%d: got %s from controller 0x%x ???",
-		       card->contrnr,
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrController);
-	}
-	return;
-
-ignored:
-	printk(KERN_INFO "capidrv-%d: %s from controller 0x%x ignored\n",
-	       card->contrnr,
-	       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-	       cmsg->adr.adrController);
-}
-
-static void handle_incoming_call(capidrv_contr *card, _cmsg *cmsg)
-{
-	capidrv_plci *plcip;
-	capidrv_bchan *bchan;
-	isdn_ctrl cmd;
-	int chan;
-
-	if ((chan = new_bchan(card)) == -1) {
-		printk(KERN_ERR "capidrv-%d: incoming call on not existing bchan ?\n", card->contrnr);
-		return;
-	}
-	bchan = &card->bchans[chan];
-	if ((plcip = new_plci(card, chan)) == NULL) {
-		printk(KERN_ERR "capidrv-%d: incoming call: no memory, sorry.\n", card->contrnr);
-		return;
-	}
-	bchan->incoming = 1;
-	plcip->plci = cmsg->adr.adrPLCI;
-	plci_change_state(card, plcip, EV_PLCI_CONNECT_IND);
-
-	cmd.command = ISDN_STAT_ICALL;
-	cmd.driver = card->myid;
-	cmd.arg = chan;
-	memset(&cmd.parm.setup, 0, sizeof(cmd.parm.setup));
-	strncpy(cmd.parm.setup.phone,
-		cmsg->CallingPartyNumber + 3,
-		cmsg->CallingPartyNumber[0] - 2);
-	strncpy(cmd.parm.setup.eazmsn,
-		cmsg->CalledPartyNumber + 2,
-		cmsg->CalledPartyNumber[0] - 1);
-	cmd.parm.setup.si1 = cip2si1(cmsg->CIPValue);
-	cmd.parm.setup.si2 = cip2si2(cmsg->CIPValue);
-	cmd.parm.setup.plan = cmsg->CallingPartyNumber[1];
-	cmd.parm.setup.screen = cmsg->CallingPartyNumber[2];
-
-	printk(KERN_INFO "capidrv-%d: incoming call %s,%d,%d,%s\n",
-	       card->contrnr,
-	       cmd.parm.setup.phone,
-	       cmd.parm.setup.si1,
-	       cmd.parm.setup.si2,
-	       cmd.parm.setup.eazmsn);
-
-	if (cmd.parm.setup.si1 == 1 && cmd.parm.setup.si2 != 0) {
-		printk(KERN_INFO "capidrv-%d: patching si2=%d to 0 for VBOX\n",
-		       card->contrnr,
-		       cmd.parm.setup.si2);
-		cmd.parm.setup.si2 = 0;
-	}
-
-	switch (card->interface.statcallb(&cmd)) {
-	case 0:
-	case 3:
-		/* No device matching this call.
-		 * and isdn_common.c has send a HANGUP command
-		 * which is ignored in state ST_PLCI_INCOMING,
-		 * so we send RESP to ignore the call
-		 */
-		capi_cmsg_answer(cmsg);
-		cmsg->Reject = 1;	/* ignore */
-		plci_change_state(card, plcip, EV_PLCI_CONNECT_REJECT);
-		send_message(card, cmsg);
-		printk(KERN_INFO "capidrv-%d: incoming call %s,%d,%d,%s ignored\n",
-		       card->contrnr,
-		       cmd.parm.setup.phone,
-		       cmd.parm.setup.si1,
-		       cmd.parm.setup.si2,
-		       cmd.parm.setup.eazmsn);
-		break;
-	case 1:
-		/* At least one device matching this call (RING on ttyI)
-		 * HL-driver may send ALERTING on the D-channel in this
-		 * case.
-		 * really means: RING on ttyI or a net interface
-		 * accepted this call already.
-		 *
-		 * If the call was accepted, state has already changed,
-		 * and CONNECT_RESP already sent.
-		 */
-		if (plcip->state == ST_PLCI_INCOMING) {
-			printk(KERN_INFO "capidrv-%d: incoming call %s,%d,%d,%s tty alerting\n",
-			       card->contrnr,
-			       cmd.parm.setup.phone,
-			       cmd.parm.setup.si1,
-			       cmd.parm.setup.si2,
-			       cmd.parm.setup.eazmsn);
-			capi_fill_ALERT_REQ(cmsg,
-					    global.ap.applid,
-					    card->msgid++,
-					    plcip->plci,	/* adr */
-					    NULL,/* BChannelinformation */
-					    NULL,/* Keypadfacility */
-					    NULL,/* Useruserdata */
-					    NULL /* Facilitydataarray */
-				);
-			plcip->msgid = cmsg->Messagenumber;
-			send_message(card, cmsg);
-		} else {
-			printk(KERN_INFO "capidrv-%d: incoming call %s,%d,%d,%s on netdev\n",
-			       card->contrnr,
-			       cmd.parm.setup.phone,
-			       cmd.parm.setup.si1,
-			       cmd.parm.setup.si2,
-			       cmd.parm.setup.eazmsn);
-		}
-		break;
-
-	case 2:		/* Call will be rejected. */
-		capi_cmsg_answer(cmsg);
-		cmsg->Reject = 2;	/* reject call, normal call clearing */
-		plci_change_state(card, plcip, EV_PLCI_CONNECT_REJECT);
-		send_message(card, cmsg);
-		break;
-
-	default:
-		/* An error happened. (Invalid parameters for example.) */
-		capi_cmsg_answer(cmsg);
-		cmsg->Reject = 8;	/* reject call,
-					   destination out of order */
-		plci_change_state(card, plcip, EV_PLCI_CONNECT_REJECT);
-		send_message(card, cmsg);
-		break;
-	}
-	return;
-}
-
-static void handle_plci(_cmsg *cmsg)
-{
-	capidrv_contr *card = findcontrbynumber(cmsg->adr.adrController & 0x7f);
-	capidrv_plci *plcip;
-	isdn_ctrl cmd;
-	_cdebbuf *cdb;
-
-	if (!card) {
-		printk(KERN_ERR "capidrv: %s from unknown controller 0x%x\n",
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrController & 0x7f);
-		return;
-	}
-	switch (CAPICMD(cmsg->Command, cmsg->Subcommand)) {
-
-	case CAPI_DISCONNECT_IND:	/* plci */
-		if (cmsg->Reason) {
-			printk(KERN_INFO "capidrv-%d: %s reason 0x%x (%s) for plci 0x%x\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->Reason, capi_info2str(cmsg->Reason), cmsg->adr.adrPLCI);
-		}
-		if (!(plcip = find_plci_by_plci(card, cmsg->adr.adrPLCI))) {
-			capi_cmsg_answer(cmsg);
-			send_message(card, cmsg);
-			goto notfound;
-		}
-		card->bchans[plcip->chan].disconnecting = 1;
-		plci_change_state(card, plcip, EV_PLCI_DISCONNECT_IND);
-		capi_cmsg_answer(cmsg);
-		plci_change_state(card, plcip, EV_PLCI_DISCONNECT_RESP);
-		send_message(card, cmsg);
-		break;
-
-	case CAPI_DISCONNECT_CONF:	/* plci */
-		if (cmsg->Info) {
-			printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for plci 0x%x\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->Info, capi_info2str(cmsg->Info),
-			       cmsg->adr.adrPLCI);
-		}
-		if (!(plcip = find_plci_by_plci(card, cmsg->adr.adrPLCI)))
-			goto notfound;
-
-		card->bchans[plcip->chan].disconnecting = 1;
-		break;
-
-	case CAPI_ALERT_CONF:	/* plci */
-		if (cmsg->Info) {
-			printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for plci 0x%x\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->Info, capi_info2str(cmsg->Info),
-			       cmsg->adr.adrPLCI);
-		}
-		break;
-
-	case CAPI_CONNECT_IND:	/* plci */
-		handle_incoming_call(card, cmsg);
-		break;
-
-	case CAPI_CONNECT_CONF:	/* plci */
-		if (cmsg->Info) {
-			printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for plci 0x%x\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->Info, capi_info2str(cmsg->Info),
-			       cmsg->adr.adrPLCI);
-		}
-		if (!(plcip = find_plci_by_msgid(card, cmsg->Messagenumber)))
-			goto notfound;
-
-		plcip->plci = cmsg->adr.adrPLCI;
-		if (cmsg->Info) {
-			plci_change_state(card, plcip, EV_PLCI_CONNECT_CONF_ERROR);
-		} else {
-			plci_change_state(card, plcip, EV_PLCI_CONNECT_CONF_OK);
-		}
-		break;
-
-	case CAPI_CONNECT_ACTIVE_IND:	/* plci */
-
-		if (!(plcip = find_plci_by_plci(card, cmsg->adr.adrPLCI)))
-			goto notfound;
-
-		if (card->bchans[plcip->chan].incoming) {
-			capi_cmsg_answer(cmsg);
-			plci_change_state(card, plcip, EV_PLCI_CONNECT_ACTIVE_IND);
-			send_message(card, cmsg);
-		} else {
-			capidrv_ncci *nccip;
-			capi_cmsg_answer(cmsg);
-			send_message(card, cmsg);
-
-			nccip = new_ncci(card, plcip, cmsg->adr.adrPLCI);
-
-			if (!nccip) {
-				printk(KERN_ERR "capidrv-%d: no mem for ncci, sorry\n", card->contrnr);
-				break;	/* $$$$ */
-			}
-			capi_fill_CONNECT_B3_REQ(cmsg,
-						 global.ap.applid,
-						 card->msgid++,
-						 plcip->plci,	/* adr */
-						 NULL	/* NCPI */
-				);
-			nccip->msgid = cmsg->Messagenumber;
-			plci_change_state(card, plcip,
-					  EV_PLCI_CONNECT_ACTIVE_IND);
-			ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_REQ);
-			send_message(card, cmsg);
-			cmd.command = ISDN_STAT_DCONN;
-			cmd.driver = card->myid;
-			cmd.arg = plcip->chan;
-			card->interface.statcallb(&cmd);
-		}
-		break;
-
-	case CAPI_INFO_IND:	/* Controller/plci */
-
-		if (!(plcip = find_plci_by_plci(card, cmsg->adr.adrPLCI)))
-			goto notfound;
-
-		if (cmsg->InfoNumber == 0x4000) {
-			if (cmsg->InfoElement[0] == 4) {
-				cmd.command = ISDN_STAT_CINF;
-				cmd.driver = card->myid;
-				cmd.arg = plcip->chan;
-				sprintf(cmd.parm.num, "%lu",
-					(unsigned long)
-					((u32) cmsg->InfoElement[1]
-					 | ((u32) (cmsg->InfoElement[2]) << 8)
-					 | ((u32) (cmsg->InfoElement[3]) << 16)
-					 | ((u32) (cmsg->InfoElement[4]) << 24)));
-				card->interface.statcallb(&cmd);
-				break;
-			}
-		}
-		cdb = capi_cmsg2str(cmsg);
-		if (cdb) {
-			printk(KERN_WARNING "capidrv-%d: %s\n",
-			       card->contrnr, cdb->buf);
-			cdebbuf_free(cdb);
-		} else
-			printk(KERN_WARNING "capidrv-%d: CAPI_INFO_IND InfoNumber %x not handled\n",
-			       card->contrnr, cmsg->InfoNumber);
-
-		break;
-
-	case CAPI_CONNECT_ACTIVE_CONF:		/* plci */
-		goto ignored;
-	case CAPI_SELECT_B_PROTOCOL_CONF:	/* plci */
-		goto ignored;
-	case CAPI_FACILITY_IND:	/* Controller/plci/ncci */
-		goto ignored;
-	case CAPI_FACILITY_CONF:	/* Controller/plci/ncci */
-		goto ignored;
-
-	case CAPI_INFO_CONF:	/* Controller/plci */
-		goto ignored;
-
-	default:
-		printk(KERN_ERR "capidrv-%d: got %s for plci 0x%x ???",
-		       card->contrnr,
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrPLCI);
-	}
-	return;
-ignored:
-	printk(KERN_INFO "capidrv-%d: %s for plci 0x%x ignored\n",
-	       card->contrnr,
-	       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-	       cmsg->adr.adrPLCI);
-	return;
-notfound:
-	printk(KERN_ERR "capidrv-%d: %s: plci 0x%x not found\n",
-	       card->contrnr,
-	       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-	       cmsg->adr.adrPLCI);
-	return;
-}
-
-static void handle_ncci(_cmsg *cmsg)
-{
-	capidrv_contr *card = findcontrbynumber(cmsg->adr.adrController & 0x7f);
-	capidrv_plci *plcip;
-	capidrv_ncci *nccip;
-	isdn_ctrl cmd;
-	int len;
-
-	if (!card) {
-		printk(KERN_ERR "capidrv: %s from unknown controller 0x%x\n",
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrController & 0x7f);
-		return;
-	}
-	switch (CAPICMD(cmsg->Command, cmsg->Subcommand)) {
-
-	case CAPI_CONNECT_B3_ACTIVE_IND:	/* ncci */
-		if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI)))
-			goto notfound;
-
-		capi_cmsg_answer(cmsg);
-		ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_ACTIVE_IND);
-		send_message(card, cmsg);
-
-		cmd.command = ISDN_STAT_BCONN;
-		cmd.driver = card->myid;
-		cmd.arg = nccip->chan;
-		card->interface.statcallb(&cmd);
-
-		printk(KERN_INFO "capidrv-%d: chan %d up with ncci 0x%x\n",
-		       card->contrnr, nccip->chan, nccip->ncci);
-		break;
-
-	case CAPI_CONNECT_B3_ACTIVE_CONF:	/* ncci */
-		goto ignored;
-
-	case CAPI_CONNECT_B3_IND:	/* ncci */
-
-		plcip = find_plci_by_ncci(card, cmsg->adr.adrNCCI);
-		if (plcip) {
-			nccip = new_ncci(card, plcip, cmsg->adr.adrNCCI);
-			if (nccip) {
-				ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_IND);
-				capi_fill_CONNECT_B3_RESP(cmsg,
-							  global.ap.applid,
-							  card->msgid++,
-							  nccip->ncci,	/* adr */
-							  0,	/* Reject */
-							  NULL	/* NCPI */
-					);
-				ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_RESP);
-				send_message(card, cmsg);
-				break;
-			}
-			printk(KERN_ERR "capidrv-%d: no mem for ncci, sorry\n",							card->contrnr);
-		} else {
-			printk(KERN_ERR "capidrv-%d: %s: plci for ncci 0x%x not found\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->adr.adrNCCI);
-		}
-		capi_fill_CONNECT_B3_RESP(cmsg,
-					  global.ap.applid,
-					  card->msgid++,
-					  cmsg->adr.adrNCCI,
-					  2,	/* Reject */
-					  NULL	/* NCPI */
-			);
-		send_message(card, cmsg);
-		break;
-
-	case CAPI_CONNECT_B3_CONF:	/* ncci */
-
-		if (!(nccip = find_ncci_by_msgid(card,
-						 cmsg->adr.adrNCCI,
-						 cmsg->Messagenumber)))
-			goto notfound;
-
-		nccip->ncci = cmsg->adr.adrNCCI;
-		if (cmsg->Info) {
-			printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for ncci 0x%x\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->Info, capi_info2str(cmsg->Info),
-			       cmsg->adr.adrNCCI);
-		}
-
-		if (cmsg->Info)
-			ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_CONF_ERROR);
-		else
-			ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_CONF_OK);
-		break;
-
-	case CAPI_CONNECT_B3_T90_ACTIVE_IND:	/* ncci */
-		capi_cmsg_answer(cmsg);
-		send_message(card, cmsg);
-		break;
-
-	case CAPI_DATA_B3_IND:	/* ncci */
-		/* handled in handle_data() */
-		goto ignored;
-
-	case CAPI_DATA_B3_CONF:	/* ncci */
-		if (cmsg->Info) {
-			printk(KERN_WARNING "CAPI_DATA_B3_CONF: Info %x - %s\n",
-			       cmsg->Info, capi_info2str(cmsg->Info));
-		}
-		if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI)))
-			goto notfound;
-
-		len = capidrv_del_ack(nccip, cmsg->DataHandle);
-		if (len < 0)
-			break;
-		cmd.command = ISDN_STAT_BSENT;
-		cmd.driver = card->myid;
-		cmd.arg = nccip->chan;
-		cmd.parm.length = len;
-		card->interface.statcallb(&cmd);
-		break;
-
-	case CAPI_DISCONNECT_B3_IND:	/* ncci */
-		if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI)))
-			goto notfound;
-
-		card->bchans[nccip->chan].disconnecting = 1;
-		ncci_change_state(card, nccip, EV_NCCI_DISCONNECT_B3_IND);
-		capi_cmsg_answer(cmsg);
-		ncci_change_state(card, nccip, EV_NCCI_DISCONNECT_B3_RESP);
-		send_message(card, cmsg);
-		break;
-
-	case CAPI_DISCONNECT_B3_CONF:	/* ncci */
-		if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI)))
-			goto notfound;
-		if (cmsg->Info) {
-			printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for ncci 0x%x\n",
-			       card->contrnr,
-			       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-			       cmsg->Info, capi_info2str(cmsg->Info),
-			       cmsg->adr.adrNCCI);
-			ncci_change_state(card, nccip, EV_NCCI_DISCONNECT_B3_CONF_ERROR);
-		}
-		break;
-
-	case CAPI_RESET_B3_IND:	/* ncci */
-		if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI)))
-			goto notfound;
-		ncci_change_state(card, nccip, EV_NCCI_RESET_B3_IND);
-		capi_cmsg_answer(cmsg);
-		send_message(card, cmsg);
-		break;
-
-	case CAPI_RESET_B3_CONF:	/* ncci */
-		goto ignored;	/* $$$$ */
-
-	case CAPI_FACILITY_IND:	/* Controller/plci/ncci */
-		goto ignored;
-	case CAPI_FACILITY_CONF:	/* Controller/plci/ncci */
-		goto ignored;
-
-	default:
-		printk(KERN_ERR "capidrv-%d: got %s for ncci 0x%x ???",
-		       card->contrnr,
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrNCCI);
-	}
-	return;
-ignored:
-	printk(KERN_INFO "capidrv-%d: %s for ncci 0x%x ignored\n",
-	       card->contrnr,
-	       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-	       cmsg->adr.adrNCCI);
-	return;
-notfound:
-	printk(KERN_ERR "capidrv-%d: %s: ncci 0x%x not found\n",
-	       card->contrnr,
-	       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-	       cmsg->adr.adrNCCI);
-}
-
-
-static void handle_data(_cmsg *cmsg, struct sk_buff *skb)
-{
-	capidrv_contr *card = findcontrbynumber(cmsg->adr.adrController & 0x7f);
-	capidrv_ncci *nccip;
-
-	if (!card) {
-		printk(KERN_ERR "capidrv: %s from unknown controller 0x%x\n",
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrController & 0x7f);
-		kfree_skb(skb);
-		return;
-	}
-	if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI))) {
-		printk(KERN_ERR "capidrv-%d: %s: ncci 0x%x not found\n",
-		       card->contrnr,
-		       capi_cmd2str(cmsg->Command, cmsg->Subcommand),
-		       cmsg->adr.adrNCCI);
-		kfree_skb(skb);
-		return;
-	}
-	(void) skb_pull(skb, CAPIMSG_LEN(skb->data));
-	card->interface.rcvcallb_skb(card->myid, nccip->chan, skb);
-	capi_cmsg_answer(cmsg);
-	send_message(card, cmsg);
-}
-
-static _cmsg s_cmsg;
-
-static void capidrv_recv_message(struct capi20_appl *ap, struct sk_buff *skb)
-{
-	if (capi_message2cmsg(&s_cmsg, skb->data)) {
-		printk(KERN_ERR "capidrv: applid=%d: received invalid message\n",
-		       ap->applid);
-		kfree_skb(skb);
-		return;
-	}
-	if (debugmode > 3) {
-		_cdebbuf *cdb = capi_cmsg2str(&s_cmsg);
-
-		if (cdb) {
-			printk(KERN_DEBUG "%s: applid=%d %s\n", __func__,
-			       ap->applid, cdb->buf);
-			cdebbuf_free(cdb);
-		} else
-			printk(KERN_DEBUG "%s: applid=%d %s not traced\n",
-			       __func__, ap->applid,
-			       capi_cmd2str(s_cmsg.Command, s_cmsg.Subcommand));
-	}
-	if (s_cmsg.Command == CAPI_DATA_B3
-	    && s_cmsg.Subcommand == CAPI_IND) {
-		handle_data(&s_cmsg, skb);
-		return;
-	}
-	if ((s_cmsg.adr.adrController & 0xffffff00) == 0)
-		handle_controller(&s_cmsg);
-	else if ((s_cmsg.adr.adrPLCI & 0xffff0000) == 0)
-		handle_plci(&s_cmsg);
-	else
-		handle_ncci(&s_cmsg);
-	/*
-	 * data of skb used in s_cmsg,
-	 * free data when s_cmsg is not used again
-	 * thanks to Lars Heete <hel@admin.de>
-	 */
-	kfree_skb(skb);
-}
-
-/* ------------------------------------------------------------------- */
-
-#define PUTBYTE_TO_STATUS(card, byte)				\
-	do {							\
-		*(card)->q931_write++ = (byte);			\
-		if ((card)->q931_write > (card)->q931_end)	\
-			(card)->q931_write = (card)->q931_buf;	\
-	} while (0)
-
-static void handle_dtrace_data(capidrv_contr *card,
-			       int send, int level2, u8 *data, u16 len)
-{
-	u8 *p, *end;
-	isdn_ctrl cmd;
-
-	if (!len) {
-		printk(KERN_DEBUG "capidrv-%d: avmb1_q931_data: len == %d\n",
-		       card->contrnr, len);
-		return;
-	}
-
-	if (level2) {
-		PUTBYTE_TO_STATUS(card, 'D');
-		PUTBYTE_TO_STATUS(card, '2');
-		PUTBYTE_TO_STATUS(card, send ? '>' : '<');
-		PUTBYTE_TO_STATUS(card, ':');
-	} else {
-		PUTBYTE_TO_STATUS(card, 'D');
-		PUTBYTE_TO_STATUS(card, '3');
-		PUTBYTE_TO_STATUS(card, send ? '>' : '<');
-		PUTBYTE_TO_STATUS(card, ':');
-	}
-
-	for (p = data, end = data + len; p < end; p++) {
-		PUTBYTE_TO_STATUS(card, ' ');
-		PUTBYTE_TO_STATUS(card, hex_asc_hi(*p));
-		PUTBYTE_TO_STATUS(card, hex_asc_lo(*p));
-	}
-	PUTBYTE_TO_STATUS(card, '\n');
-
-	cmd.command = ISDN_STAT_STAVAIL;
-	cmd.driver = card->myid;
-	cmd.arg = len * 3 + 5;
-	card->interface.statcallb(&cmd);
-}
-
-/* ------------------------------------------------------------------- */
-
-static _cmsg cmdcmsg;
-
-static int capidrv_ioctl(isdn_ctrl *c, capidrv_contr *card)
-{
-	switch (c->arg) {
-	case 1:
-		debugmode = (int)(*((unsigned int *)c->parm.num));
-		printk(KERN_DEBUG "capidrv-%d: debugmode=%d\n",
-		       card->contrnr, debugmode);
-		return 0;
-	default:
-		printk(KERN_DEBUG "capidrv-%d: capidrv_ioctl(%ld) called ??\n",
-		       card->contrnr, c->arg);
-		return -EINVAL;
-	}
-	return -EINVAL;
-}
-
-/*
- * Handle leased lines (CAPI-Bundling)
- */
-
-struct internal_bchannelinfo {
-	unsigned short channelalloc;
-	unsigned short operation;
-	unsigned char  cmask[31];
-};
-
-static int decodeFVteln(char *teln, unsigned long *bmaskp, int *activep)
-{
-	unsigned long bmask = 0;
-	int active = !0;
-	char *s;
-	int i;
-
-	if (strncmp(teln, "FV:", 3) != 0)
-		return 1;
-	s = teln + 3;
-	while (*s && *s == ' ') s++;
-	if (!*s) return -2;
-	if (*s == 'p' || *s == 'P') {
-		active = 0;
-		s++;
-	}
-	if (*s == 'a' || *s == 'A') {
-		active = !0;
-		s++;
-	}
-	while (*s) {
-		int digit1 = 0;
-		int digit2 = 0;
-		char *endp;
-
-		digit1 = simple_strtoul(s, &endp, 10);
-		if (s == endp)
-			return -3;
-		s = endp;
-
-		if (digit1 <= 0 || digit1 > 30) return -4;
-		if (*s == 0 || *s == ',' || *s == ' ') {
-			bmask |= (1 << digit1);
-			digit1 = 0;
-			if (*s) s++;
-			continue;
-		}
-		if (*s != '-') return -5;
-		s++;
-
-		digit2 = simple_strtoul(s, &endp, 10);
-		if (s == endp)
-			return -3;
-		s = endp;
-
-		if (digit2 <= 0 || digit2 > 30) return -4;
-		if (*s == 0 || *s == ',' || *s == ' ') {
-			if (digit1 > digit2)
-				for (i = digit2; i <= digit1; i++)
-					bmask |= (1 << i);
-			else
-				for (i = digit1; i <= digit2; i++)
-					bmask |= (1 << i);
-			digit1 = digit2 = 0;
-			if (*s) s++;
-			continue;
-		}
-		return -6;
-	}
-	if (activep) *activep = active;
-	if (bmaskp) *bmaskp = bmask;
-	return 0;
-}
-
-static int FVteln2capi20(char *teln, u8 AdditionalInfo[1 + 2 + 2 + 31])
-{
-	unsigned long bmask;
-	int active;
-	int rc, i;
-
-	rc = decodeFVteln(teln, &bmask, &active);
-	if (rc) return rc;
-	/* Length */
-	AdditionalInfo[0] = 2 + 2 + 31;
-	/* Channel: 3 => use channel allocation */
-	AdditionalInfo[1] = 3; AdditionalInfo[2] = 0;
-	/* Operation: 0 => DTE mode, 1 => DCE mode */
-	if (active) {
-		AdditionalInfo[3] = 0; AdditionalInfo[4] = 0;
-	} else {
-		AdditionalInfo[3] = 1; AdditionalInfo[4] = 0;
-	}
-	/* Channel mask array */
-	AdditionalInfo[5] = 0; /* no D-Channel */
-	for (i = 1; i <= 30; i++)
-		AdditionalInfo[5 + i] = (bmask & (1 << i)) ? 0xff : 0;
-	return 0;
-}
-
-static int capidrv_command(isdn_ctrl *c, capidrv_contr *card)
-{
-	isdn_ctrl cmd;
-	struct capidrv_bchan *bchan;
-	struct capidrv_plci *plcip;
-	u8 AdditionalInfo[1 + 2 + 2 + 31];
-	int rc, isleasedline = 0;
-
-	if (c->command == ISDN_CMD_IOCTL)
-		return capidrv_ioctl(c, card);
-
-	switch (c->command) {
-	case ISDN_CMD_DIAL: {
-		u8 calling[ISDN_MSNLEN + 3];
-		u8 called[ISDN_MSNLEN + 2];
-
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: ISDN_CMD_DIAL(ch=%ld,\"%s,%d,%d,%s\")\n",
-			       card->contrnr,
-			       c->arg,
-			       c->parm.setup.phone,
-			       c->parm.setup.si1,
-			       c->parm.setup.si2,
-			       c->parm.setup.eazmsn);
-
-		bchan = &card->bchans[c->arg % card->nbchan];
-
-		if (bchan->plcip) {
-			printk(KERN_ERR "capidrv-%d: dail ch=%ld,\"%s,%d,%d,%s\" in use (plci=0x%x)\n",
-			       card->contrnr,
-			       c->arg,
-			       c->parm.setup.phone,
-			       c->parm.setup.si1,
-			       c->parm.setup.si2,
-			       c->parm.setup.eazmsn,
-			       bchan->plcip->plci);
-			return 0;
-		}
-		bchan->si1 = c->parm.setup.si1;
-		bchan->si2 = c->parm.setup.si2;
-
-		strncpy(bchan->num, c->parm.setup.phone, sizeof(bchan->num));
-		strncpy(bchan->mynum, c->parm.setup.eazmsn, sizeof(bchan->mynum));
-		rc = FVteln2capi20(bchan->num, AdditionalInfo);
-		isleasedline = (rc == 0);
-		if (rc < 0)
-			printk(KERN_ERR "capidrv-%d: WARNING: invalid leased linedefinition \"%s\"\n", card->contrnr, bchan->num);
-
-		if (isleasedline) {
-			calling[0] = 0;
-			called[0] = 0;
-			if (debugmode)
-				printk(KERN_DEBUG "capidrv-%d: connecting leased line\n", card->contrnr);
-		} else {
-			calling[0] = strlen(bchan->mynum) + 2;
-			calling[1] = 0;
-			calling[2] = 0x80;
-			strncpy(calling + 3, bchan->mynum, ISDN_MSNLEN);
-			called[0] = strlen(bchan->num) + 1;
-			called[1] = 0x80;
-			strncpy(called + 2, bchan->num, ISDN_MSNLEN);
-		}
-
-		capi_fill_CONNECT_REQ(&cmdcmsg,
-				      global.ap.applid,
-				      card->msgid++,
-				      card->contrnr,	/* adr */
-				      si2cip(bchan->si1, bchan->si2),	/* cipvalue */
-				      called,	/* CalledPartyNumber */
-				      calling,	/* CallingPartyNumber */
-				      NULL,	/* CalledPartySubaddress */
-				      NULL,	/* CallingPartySubaddress */
-				      b1prot(bchan->l2, bchan->l3),	/* B1protocol */
-				      b2prot(bchan->l2, bchan->l3),	/* B2protocol */
-				      b3prot(bchan->l2, bchan->l3),	/* B3protocol */
-				      b1config(bchan->l2, bchan->l3),	/* B1configuration */
-				      NULL,	/* B2configuration */
-				      NULL,	/* B3configuration */
-				      NULL,	/* BC */
-				      NULL,	/* LLC */
-				      NULL,	/* HLC */
-				      /* BChannelinformation */
-				      isleasedline ? AdditionalInfo : NULL,
-				      NULL,	/* Keypadfacility */
-				      NULL,	/* Useruserdata */
-				      NULL	/* Facilitydataarray */
-			);
-		if ((plcip = new_plci(card, (c->arg % card->nbchan))) == NULL) {
-			cmd.command = ISDN_STAT_DHUP;
-			cmd.driver = card->myid;
-			cmd.arg = (c->arg % card->nbchan);
-			card->interface.statcallb(&cmd);
-			return -1;
-		}
-		plcip->msgid = cmdcmsg.Messagenumber;
-		plcip->leasedline = isleasedline;
-		plci_change_state(card, plcip, EV_PLCI_CONNECT_REQ);
-		send_message(card, &cmdcmsg);
-		return 0;
-	}
-
-	case ISDN_CMD_ACCEPTD:
-
-		bchan = &card->bchans[c->arg % card->nbchan];
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: ISDN_CMD_ACCEPTD(ch=%ld) l2=%d l3=%d\n",
-			       card->contrnr,
-			       c->arg, bchan->l2, bchan->l3);
-
-		capi_fill_CONNECT_RESP(&cmdcmsg,
-				       global.ap.applid,
-				       card->msgid++,
-				       bchan->plcip->plci,	/* adr */
-				       0,	/* Reject */
-				       b1prot(bchan->l2, bchan->l3),	/* B1protocol */
-				       b2prot(bchan->l2, bchan->l3),	/* B2protocol */
-				       b3prot(bchan->l2, bchan->l3),	/* B3protocol */
-				       b1config(bchan->l2, bchan->l3),	/* B1configuration */
-				       NULL,	/* B2configuration */
-				       NULL,	/* B3configuration */
-				       NULL,	/* ConnectedNumber */
-				       NULL,	/* ConnectedSubaddress */
-				       NULL,	/* LLC */
-				       NULL,	/* BChannelinformation */
-				       NULL,	/* Keypadfacility */
-				       NULL,	/* Useruserdata */
-				       NULL	/* Facilitydataarray */
-			);
-		if (capi_cmsg2message(&cmdcmsg, cmdcmsg.buf)) {
-			printk(KERN_ERR "capidrv-%d: capidrv_command: parser failure\n",
-			       card->contrnr);
-			return -EINVAL;
-		}
-		plci_change_state(card, bchan->plcip, EV_PLCI_CONNECT_RESP);
-		send_message(card, &cmdcmsg);
-		return 0;
-
-	case ISDN_CMD_ACCEPTB:
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: ISDN_CMD_ACCEPTB(ch=%ld)\n",
-			       card->contrnr,
-			       c->arg);
-		return -ENOSYS;
-
-	case ISDN_CMD_HANGUP:
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: ISDN_CMD_HANGUP(ch=%ld)\n",
-			       card->contrnr,
-			       c->arg);
-		bchan = &card->bchans[c->arg % card->nbchan];
-
-		if (bchan->disconnecting) {
-			if (debugmode)
-				printk(KERN_DEBUG "capidrv-%d: chan %ld already disconnecting ...\n",
-				       card->contrnr,
-				       c->arg);
-			return 0;
-		}
-		if (bchan->nccip) {
-			bchan->disconnecting = 1;
-			capi_fill_DISCONNECT_B3_REQ(&cmdcmsg,
-						    global.ap.applid,
-						    card->msgid++,
-						    bchan->nccip->ncci,
-						    NULL	/* NCPI */
-				);
-			ncci_change_state(card, bchan->nccip, EV_NCCI_DISCONNECT_B3_REQ);
-			send_message(card, &cmdcmsg);
-			return 0;
-		} else if (bchan->plcip) {
-			if (bchan->plcip->state == ST_PLCI_INCOMING) {
-				/*
-				 * just ignore, we a called from
-				 * isdn_status_callback(),
-				 * which will return 0 or 2, this is handled
-				 * by the CONNECT_IND handler
-				 */
-				bchan->disconnecting = 1;
-				return 0;
-			} else if (bchan->plcip->plci) {
-				bchan->disconnecting = 1;
-				capi_fill_DISCONNECT_REQ(&cmdcmsg,
-							 global.ap.applid,
-							 card->msgid++,
-							 bchan->plcip->plci,
-							 NULL,	/* BChannelinformation */
-							 NULL,	/* Keypadfacility */
-							 NULL,	/* Useruserdata */
-							 NULL	/* Facilitydataarray */
-					);
-				plci_change_state(card, bchan->plcip, EV_PLCI_DISCONNECT_REQ);
-				send_message(card, &cmdcmsg);
-				return 0;
-			} else {
-				printk(KERN_ERR "capidrv-%d: chan %ld disconnect request while waiting for CONNECT_CONF\n",
-				       card->contrnr,
-				       c->arg);
-				return -EINVAL;
-			}
-		}
-		printk(KERN_ERR "capidrv-%d: chan %ld disconnect request on free channel\n",
-		       card->contrnr,
-		       c->arg);
-		return -EINVAL;
-/* ready */
-
-	case ISDN_CMD_SETL2:
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: set L2 on chan %ld to %ld\n",
-			       card->contrnr,
-			       (c->arg & 0xff), (c->arg >> 8));
-		bchan = &card->bchans[(c->arg & 0xff) % card->nbchan];
-		bchan->l2 = (c->arg >> 8);
-		return 0;
-
-	case ISDN_CMD_SETL3:
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: set L3 on chan %ld to %ld\n",
-			       card->contrnr,
-			       (c->arg & 0xff), (c->arg >> 8));
-		bchan = &card->bchans[(c->arg & 0xff) % card->nbchan];
-		bchan->l3 = (c->arg >> 8);
-		return 0;
-
-	case ISDN_CMD_SETEAZ:
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: set EAZ \"%s\" on chan %ld\n",
-			       card->contrnr,
-			       c->parm.num, c->arg);
-		bchan = &card->bchans[c->arg % card->nbchan];
-		strncpy(bchan->msn, c->parm.num, ISDN_MSNLEN);
-		return 0;
-
-	case ISDN_CMD_CLREAZ:
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: clearing EAZ on chan %ld\n",
-			       card->contrnr, c->arg);
-		bchan = &card->bchans[c->arg % card->nbchan];
-		bchan->msn[0] = 0;
-		return 0;
-
-	default:
-		printk(KERN_ERR "capidrv-%d: ISDN_CMD_%d, Huh?\n",
-		       card->contrnr, c->command);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static int if_command(isdn_ctrl *c)
-{
-	capidrv_contr *card = findcontrbydriverid(c->driver);
-
-	if (card)
-		return capidrv_command(c, card);
-
-	printk(KERN_ERR
-	       "capidrv: if_command %d called with invalid driverId %d!\n",
-	       c->command, c->driver);
-	return -ENODEV;
-}
-
-static _cmsg sendcmsg;
-
-static int if_sendbuf(int id, int channel, int doack, struct sk_buff *skb)
-{
-	capidrv_contr *card = findcontrbydriverid(id);
-	capidrv_bchan *bchan;
-	capidrv_ncci *nccip;
-	int len = skb->len;
-	int msglen;
-	u16 errcode;
-	u16 datahandle;
-	u32 data;
-
-	if (!card) {
-		printk(KERN_ERR "capidrv: if_sendbuf called with invalid driverId %d!\n",
-		       id);
-		return 0;
-	}
-	if (debugmode > 4)
-		printk(KERN_DEBUG "capidrv-%d: sendbuf len=%d skb=%p doack=%d\n",
-		       card->contrnr, len, skb, doack);
-	bchan = &card->bchans[channel % card->nbchan];
-	nccip = bchan->nccip;
-	if (!nccip || nccip->state != ST_NCCI_ACTIVE) {
-		printk(KERN_ERR "capidrv-%d: if_sendbuf: %s:%d: chan not up!\n",
-		       card->contrnr, card->name, channel);
-		return 0;
-	}
-	datahandle = nccip->datahandle;
-
-	/*
-	 * Here we copy pointer skb->data into the 32-bit 'Data' field.
-	 * The 'Data' field is not used in practice in linux kernel
-	 * (neither in 32 or 64 bit), but should have some value,
-	 * since a CAPI message trace will display it.
-	 *
-	 * The correct value in the 32 bit case is the address of the
-	 * data, in 64 bit it makes no sense, we use 0 there.
-	 */
-
-#ifdef CONFIG_64BIT
-	data = 0;
-#else
-	data = (unsigned long) skb->data;
-#endif
-
-	capi_fill_DATA_B3_REQ(&sendcmsg, global.ap.applid, card->msgid++,
-			      nccip->ncci,	/* adr */
-			      data,		/* Data */
-			      skb->len,		/* DataLength */
-			      datahandle,	/* DataHandle */
-			      0	/* Flags */
-		);
-
-	if (capidrv_add_ack(nccip, datahandle, doack ? (int)skb->len : -1) < 0)
-		return 0;
-
-	if (capi_cmsg2message(&sendcmsg, sendcmsg.buf)) {
-		printk(KERN_ERR "capidrv-%d: if_sendbuf: parser failure\n",
-		       card->contrnr);
-		return -EINVAL;
-	}
-	msglen = CAPIMSG_LEN(sendcmsg.buf);
-	if (skb_headroom(skb) < msglen) {
-		struct sk_buff *nskb = skb_realloc_headroom(skb, msglen);
-		if (!nskb) {
-			printk(KERN_ERR "capidrv-%d: if_sendbuf: no memory\n",
-			       card->contrnr);
-			(void)capidrv_del_ack(nccip, datahandle);
-			return 0;
-		}
-		printk(KERN_DEBUG "capidrv-%d: only %d bytes headroom, need %d\n",
-		       card->contrnr, skb_headroom(skb), msglen);
-		memcpy(skb_push(nskb, msglen), sendcmsg.buf, msglen);
-		errcode = capi20_put_message(&global.ap, nskb);
-		if (errcode == CAPI_NOERROR) {
-			dev_kfree_skb(skb);
-			nccip->datahandle++;
-			return len;
-		}
-		if (debugmode > 3)
-			printk(KERN_DEBUG "capidrv-%d: sendbuf putmsg ret(%x) - %s\n",
-			       card->contrnr, errcode, capi_info2str(errcode));
-		(void)capidrv_del_ack(nccip, datahandle);
-		dev_kfree_skb(nskb);
-		return errcode == CAPI_SENDQUEUEFULL ? 0 : -1;
-	} else {
-		memcpy(skb_push(skb, msglen), sendcmsg.buf, msglen);
-		errcode = capi20_put_message(&global.ap, skb);
-		if (errcode == CAPI_NOERROR) {
-			nccip->datahandle++;
-			return len;
-		}
-		if (debugmode > 3)
-			printk(KERN_DEBUG "capidrv-%d: sendbuf putmsg ret(%x) - %s\n",
-			       card->contrnr, errcode, capi_info2str(errcode));
-		skb_pull(skb, msglen);
-		(void)capidrv_del_ack(nccip, datahandle);
-		return errcode == CAPI_SENDQUEUEFULL ? 0 : -1;
-	}
-}
-
-static int if_readstat(u8 __user *buf, int len, int id, int channel)
-{
-	capidrv_contr *card = findcontrbydriverid(id);
-	int count;
-	u8 __user *p;
-
-	if (!card) {
-		printk(KERN_ERR "capidrv: if_readstat called with invalid driverId %d!\n",
-		       id);
-		return -ENODEV;
-	}
-
-	for (p = buf, count = 0; count < len; p++, count++) {
-		if (put_user(*card->q931_read++, p))
-			return -EFAULT;
-		if (card->q931_read > card->q931_end)
-			card->q931_read = card->q931_buf;
-	}
-	return count;
-
-}
-
-static void enable_dchannel_trace(capidrv_contr *card)
-{
-	u8 manufacturer[CAPI_MANUFACTURER_LEN];
-	capi_version version;
-	u16 contr = card->contrnr;
-	u16 errcode;
-	u16 avmversion[3];
-
-	errcode = capi20_get_manufacturer(contr, manufacturer);
-	if (errcode != CAPI_NOERROR) {
-		printk(KERN_ERR "%s: can't get manufacturer (0x%x)\n",
-		       card->name, errcode);
-		return;
-	}
-	if (strstr(manufacturer, "AVM") == NULL) {
-		printk(KERN_ERR "%s: not from AVM, no d-channel trace possible (%s)\n",
-		       card->name, manufacturer);
-		return;
-	}
-	errcode = capi20_get_version(contr, &version);
-	if (errcode != CAPI_NOERROR) {
-		printk(KERN_ERR "%s: can't get version (0x%x)\n",
-		       card->name, errcode);
-		return;
-	}
-	avmversion[0] = (version.majormanuversion >> 4) & 0x0f;
-	avmversion[1] = (version.majormanuversion << 4) & 0xf0;
-	avmversion[1] |= (version.minormanuversion >> 4) & 0x0f;
-	avmversion[2] |= version.minormanuversion & 0x0f;
-
-	if (avmversion[0] > 3 || (avmversion[0] == 3 && avmversion[1] > 5)) {
-		printk(KERN_INFO "%s: D2 trace enabled\n", card->name);
-		capi_fill_MANUFACTURER_REQ(&cmdcmsg, global.ap.applid,
-					   card->msgid++,
-					   contr,
-					   0x214D5641,  /* ManuID */
-					   0,           /* Class */
-					   1,           /* Function */
-					   (_cstruct)"\004\200\014\000\000");
-	} else {
-		printk(KERN_INFO "%s: D3 trace enabled\n", card->name);
-		capi_fill_MANUFACTURER_REQ(&cmdcmsg, global.ap.applid,
-					   card->msgid++,
-					   contr,
-					   0x214D5641,  /* ManuID */
-					   0,           /* Class */
-					   1,           /* Function */
-					   (_cstruct)"\004\002\003\000\000");
-	}
-	send_message(card, &cmdcmsg);
-}
-
-
-static void send_listen(capidrv_contr *card)
-{
-	capi_fill_LISTEN_REQ(&cmdcmsg, global.ap.applid,
-			     card->msgid++,
-			     card->contrnr, /* controller */
-			     1 << 6,	/* Infomask */
-			     card->cipmask,
-			     card->cipmask2,
-			     NULL, NULL);
-	listen_change_state(card, EV_LISTEN_REQ);
-	send_message(card, &cmdcmsg);
-}
-
-static void listentimerfunc(struct timer_list *t)
-{
-	capidrv_contr *card = from_timer(card, t, listentimer);
-	if (card->state != ST_LISTEN_NONE && card->state != ST_LISTEN_ACTIVE)
-		printk(KERN_ERR "%s: controller dead ??\n", card->name);
-	send_listen(card);
-	mod_timer(&card->listentimer, jiffies + 60 * HZ);
-}
-
-
-static int capidrv_addcontr(u16 contr, struct capi_profile *profp)
-{
-	capidrv_contr *card;
-	unsigned long flags;
-	isdn_ctrl cmd;
-	char id[20];
-	int i;
-
-	sprintf(id, "capidrv-%d", contr);
-	if (!try_module_get(THIS_MODULE)) {
-		printk(KERN_WARNING "capidrv: (%s) Could not reserve module\n", id);
-		return -1;
-	}
-	if (!(card = kzalloc(sizeof(capidrv_contr), GFP_ATOMIC))) {
-		printk(KERN_WARNING
-		       "capidrv: (%s) Could not allocate contr-struct.\n", id);
-		return -1;
-	}
-	card->owner = THIS_MODULE;
-	timer_setup(&card->listentimer, listentimerfunc, 0);
-	strcpy(card->name, id);
-	card->contrnr = contr;
-	card->nbchan = profp->nbchannel;
-	card->bchans = kmalloc_array(card->nbchan, sizeof(capidrv_bchan),
-				     GFP_ATOMIC);
-	if (!card->bchans) {
-		printk(KERN_WARNING
-		       "capidrv: (%s) Could not allocate bchan-structs.\n", id);
-		module_put(card->owner);
-		kfree(card);
-		return -1;
-	}
-	card->interface.channels = profp->nbchannel;
-	card->interface.maxbufsize = 2048;
-	card->interface.command = if_command;
-	card->interface.writebuf_skb = if_sendbuf;
-	card->interface.writecmd = NULL;
-	card->interface.readstat = if_readstat;
-	card->interface.features =
-		ISDN_FEATURE_L2_HDLC |
-		ISDN_FEATURE_L2_TRANS |
-		ISDN_FEATURE_L3_TRANS |
-		ISDN_FEATURE_P_UNKNOWN |
-		ISDN_FEATURE_L2_X75I |
-		ISDN_FEATURE_L2_X75UI |
-		ISDN_FEATURE_L2_X75BUI;
-	if (profp->support1 & (1 << 2))
-		card->interface.features |=
-			ISDN_FEATURE_L2_V11096 |
-			ISDN_FEATURE_L2_V11019 |
-			ISDN_FEATURE_L2_V11038;
-	if (profp->support1 & (1 << 8))
-		card->interface.features |= ISDN_FEATURE_L2_MODEM;
-	card->interface.hl_hdrlen = 22; /* len of DATA_B3_REQ */
-	strncpy(card->interface.id, id, sizeof(card->interface.id) - 1);
-
-
-	card->q931_read = card->q931_buf;
-	card->q931_write = card->q931_buf;
-	card->q931_end = card->q931_buf + sizeof(card->q931_buf) - 1;
-
-	if (!register_isdn(&card->interface)) {
-		printk(KERN_ERR "capidrv: Unable to register contr %s\n", id);
-		kfree(card->bchans);
-		module_put(card->owner);
-		kfree(card);
-		return -1;
-	}
-	card->myid = card->interface.channels;
-	memset(card->bchans, 0, sizeof(capidrv_bchan) * card->nbchan);
-	for (i = 0; i < card->nbchan; i++) {
-		card->bchans[i].contr = card;
-	}
-
-	spin_lock_irqsave(&global_lock, flags);
-	card->next = global.contr_list;
-	global.contr_list = card;
-	global.ncontr++;
-	spin_unlock_irqrestore(&global_lock, flags);
-
-	cmd.command = ISDN_STAT_RUN;
-	cmd.driver = card->myid;
-	card->interface.statcallb(&cmd);
-
-	card->cipmask = 0x1FFF03FF;	/* any */
-	card->cipmask2 = 0;
-
-	send_listen(card);
-	mod_timer(&card->listentimer, jiffies + 60 * HZ);
-
-	printk(KERN_INFO "%s: now up (%d B channels)\n",
-	       card->name, card->nbchan);
-
-	enable_dchannel_trace(card);
-
-	return 0;
-}
-
-static int capidrv_delcontr(u16 contr)
-{
-	capidrv_contr **pp, *card;
-	unsigned long flags;
-	isdn_ctrl cmd;
-
-	spin_lock_irqsave(&global_lock, flags);
-	for (card = global.contr_list; card; card = card->next) {
-		if (card->contrnr == contr)
-			break;
-	}
-	if (!card) {
-		spin_unlock_irqrestore(&global_lock, flags);
-		printk(KERN_ERR "capidrv: delcontr: no contr %u\n", contr);
-		return -1;
-	}
-
-	/* FIXME: maybe a race condition the card should be removed
-	 * here from global list /kkeil
-	 */
-	spin_unlock_irqrestore(&global_lock, flags);
-
-	del_timer(&card->listentimer);
-
-	if (debugmode)
-		printk(KERN_DEBUG "capidrv-%d: id=%d unloading\n",
-		       card->contrnr, card->myid);
-
-	cmd.command = ISDN_STAT_STOP;
-	cmd.driver = card->myid;
-	card->interface.statcallb(&cmd);
-
-	while (card->nbchan) {
-
-		cmd.command = ISDN_STAT_DISCH;
-		cmd.driver = card->myid;
-		cmd.arg = card->nbchan - 1;
-		cmd.parm.num[0] = 0;
-		if (debugmode)
-			printk(KERN_DEBUG "capidrv-%d: id=%d disable chan=%ld\n",
-			       card->contrnr, card->myid, cmd.arg);
-		card->interface.statcallb(&cmd);
-
-		if (card->bchans[card->nbchan - 1].nccip)
-			free_ncci(card, card->bchans[card->nbchan - 1].nccip);
-		if (card->bchans[card->nbchan - 1].plcip)
-			free_plci(card, card->bchans[card->nbchan - 1].plcip);
-		if (card->plci_list)
-			printk(KERN_ERR "capidrv: bug in free_plci()\n");
-		card->nbchan--;
-	}
-	kfree(card->bchans);
-	card->bchans = NULL;
-
-	if (debugmode)
-		printk(KERN_DEBUG "capidrv-%d: id=%d isdn unload\n",
-		       card->contrnr, card->myid);
-
-	cmd.command = ISDN_STAT_UNLOAD;
-	cmd.driver = card->myid;
-	card->interface.statcallb(&cmd);
-
-	if (debugmode)
-		printk(KERN_DEBUG "capidrv-%d: id=%d remove contr from list\n",
-		       card->contrnr, card->myid);
-
-	spin_lock_irqsave(&global_lock, flags);
-	for (pp = &global.contr_list; *pp; pp = &(*pp)->next) {
-		if (*pp == card) {
-			*pp = (*pp)->next;
-			card->next = NULL;
-			global.ncontr--;
-			break;
-		}
-	}
-	spin_unlock_irqrestore(&global_lock, flags);
-
-	module_put(card->owner);
-	printk(KERN_INFO "%s: now down.\n", card->name);
-	kfree(card);
-	return 0;
-}
-
-
-static int
-lower_callback(struct notifier_block *nb, unsigned long val, void *v)
-{
-	capi_profile profile;
-	u32 contr = (long)v;
-
-	switch (val) {
-	case CAPICTR_UP:
-		printk(KERN_INFO "capidrv: controller %hu up\n", contr);
-		if (capi20_get_profile(contr, &profile) == CAPI_NOERROR)
-			(void) capidrv_addcontr(contr, &profile);
-		break;
-	case CAPICTR_DOWN:
-		printk(KERN_INFO "capidrv: controller %hu down\n", contr);
-		(void) capidrv_delcontr(contr);
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-/*
- * /proc/capi/capidrv:
- * nrecvctlpkt nrecvdatapkt nsendctlpkt nsenddatapkt
- */
-static int __maybe_unused capidrv_proc_show(struct seq_file *m, void *v)
-{
-	seq_printf(m, "%lu %lu %lu %lu\n",
-		   global.ap.nrecvctlpkt,
-		   global.ap.nrecvdatapkt,
-		   global.ap.nsentctlpkt,
-		   global.ap.nsentdatapkt);
-	return 0;
-}
-
-static void __init proc_init(void)
-{
-	proc_create_single("capi/capidrv", 0, NULL, capidrv_proc_show);
-}
-
-static void __exit proc_exit(void)
-{
-	remove_proc_entry("capi/capidrv", NULL);
-}
-
-static struct notifier_block capictr_nb = {
-	.notifier_call = lower_callback,
-};
-
-static int __init capidrv_init(void)
-{
-	capi_profile profile;
-	u32 ncontr, contr;
-	u16 errcode;
-
-	global.ap.rparam.level3cnt = -2;  /* number of bchannels twice */
-	global.ap.rparam.datablkcnt = 16;
-	global.ap.rparam.datablklen = 2048;
-
-	global.ap.recv_message = capidrv_recv_message;
-	errcode = capi20_register(&global.ap);
-	if (errcode) {
-		return -EIO;
-	}
-
-	register_capictr_notifier(&capictr_nb);
-
-	errcode = capi20_get_profile(0, &profile);
-	if (errcode != CAPI_NOERROR) {
-		unregister_capictr_notifier(&capictr_nb);
-		capi20_release(&global.ap);
-		return -EIO;
-	}
-
-	ncontr = profile.ncontroller;
-	for (contr = 1; contr <= ncontr; contr++) {
-		errcode = capi20_get_profile(contr, &profile);
-		if (errcode != CAPI_NOERROR)
-			continue;
-		(void) capidrv_addcontr(contr, &profile);
-	}
-	proc_init();
-
-	return 0;
-}
-
-static void __exit capidrv_exit(void)
-{
-	unregister_capictr_notifier(&capictr_nb);
-	capi20_release(&global.ap);
-
-	proc_exit();
-}
-
-module_init(capidrv_init);
-module_exit(capidrv_exit);
diff --git a/drivers/isdn/capi/capidrv.h b/drivers/isdn/capi/capidrv.h
deleted file mode 100644
index 4466b2e0176d..000000000000
--- a/drivers/isdn/capi/capidrv.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/* $Id: capidrv.h,v 1.2.8.2 2001/09/23 22:24:33 kai Exp $
- *
- * ISDN4Linux Driver, using capi20 interface (kernelcapi)
- *
- * Copyright 1997 by Carsten Paeth <calle@calle.de>
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef __CAPIDRV_H__
-#define __CAPIDRV_H__
-
-/*
- * LISTEN state machine
- */
-#define ST_LISTEN_NONE			0	/* L-0 */
-#define ST_LISTEN_WAIT_CONF		1	/* L-0.1 */
-#define ST_LISTEN_ACTIVE		2	/* L-1 */
-#define ST_LISTEN_ACTIVE_WAIT_CONF	3	/* L-1.1 */
-
-
-#define EV_LISTEN_REQ			1	/* L-0 -> L-0.1
-						   L-1 -> L-1.1 */
-#define EV_LISTEN_CONF_ERROR		2	/* L-0.1 -> L-0
-						   L-1.1 -> L-1 */
-#define EV_LISTEN_CONF_EMPTY		3	/* L-0.1 -> L-0
-						   L-1.1 -> L-0 */
-#define EV_LISTEN_CONF_OK		4	/* L-0.1 -> L-1
-						   L-1.1 -> L.1 */
-
-/*
- * per plci state machine
- */
-#define ST_PLCI_NONE			0	/* P-0 */
-#define ST_PLCI_OUTGOING		1	/* P-0.1 */
-#define ST_PLCI_ALLOCATED		2	/* P-1 */
-#define ST_PLCI_ACTIVE			3	/* P-ACT */
-#define ST_PLCI_INCOMING		4	/* P-2 */
-#define ST_PLCI_FACILITY_IND		5	/* P-3 */
-#define ST_PLCI_ACCEPTING		6	/* P-4 */
-#define ST_PLCI_DISCONNECTING		7	/* P-5 */
-#define ST_PLCI_DISCONNECTED		8	/* P-6 */
-#define ST_PLCI_RESUMEING		9	/* P-0.Res */
-#define ST_PLCI_RESUME			10	/* P-Res */
-#define ST_PLCI_HELD			11	/* P-HELD */
-
-#define EV_PLCI_CONNECT_REQ		1	/* P-0 -> P-0.1
-						 */
-#define EV_PLCI_CONNECT_CONF_ERROR	2	/* P-0.1 -> P-0
-						 */
-#define EV_PLCI_CONNECT_CONF_OK		3	/* P-0.1 -> P-1
-						 */
-#define EV_PLCI_FACILITY_IND_UP		4	/* P-0 -> P-1
-						 */
-#define EV_PLCI_CONNECT_IND		5	/* P-0 -> P-2
-						 */
-#define EV_PLCI_CONNECT_ACTIVE_IND	6	/* P-1 -> P-ACT
-						 */
-#define EV_PLCI_CONNECT_REJECT		7	/* P-2 -> P-5
-						   P-3 -> P-5
-						*/
-#define EV_PLCI_DISCONNECT_REQ		8	/* P-1 -> P-5
-						   P-2 -> P-5
-						   P-3 -> P-5
-						   P-4 -> P-5
-						   P-ACT -> P-5
-						   P-Res -> P-5 (*)
-						   P-HELD -> P-5 (*)
-						*/
-#define EV_PLCI_DISCONNECT_IND		9	/* P-1 -> P-6
-						   P-2 -> P-6
-						   P-3 -> P-6
-						   P-4 -> P-6
-						   P-5 -> P-6
-						   P-ACT -> P-6
-						   P-Res -> P-6 (*)
-						   P-HELD -> P-6 (*)
-						*/
-#define EV_PLCI_FACILITY_IND_DOWN	10	/* P-0.1 -> P-5
-						   P-1 -> P-5
-						   P-ACT -> P-5
-						   P-2 -> P-5
-						   P-3 -> P-5
-						   P-4 -> P-5
-						*/
-#define EV_PLCI_DISCONNECT_RESP		11	/* P-6 -> P-0
-						 */
-#define EV_PLCI_CONNECT_RESP		12	/* P-6 -> P-0
-						 */
-
-#define EV_PLCI_RESUME_REQ		13	/* P-0 -> P-0.Res
-						 */
-#define EV_PLCI_RESUME_CONF_OK		14	/* P-0.Res -> P-Res
-						 */
-#define EV_PLCI_RESUME_CONF_ERROR	15	/* P-0.Res -> P-0
-						 */
-#define EV_PLCI_RESUME_IND		16	/* P-Res -> P-ACT
-						 */
-#define EV_PLCI_HOLD_IND		17	/* P-ACT -> P-HELD
-						 */
-#define EV_PLCI_RETRIEVE_IND		18	/* P-HELD -> P-ACT
-						 */
-#define EV_PLCI_SUSPEND_IND		19	/* P-ACT -> P-5
-						 */
-#define EV_PLCI_CD_IND			20	/* P-2 -> P-5
-						 */
-
-/*
- * per ncci state machine
- */
-#define ST_NCCI_PREVIOUS			-1
-#define ST_NCCI_NONE				0	/* N-0 */
-#define ST_NCCI_OUTGOING			1	/* N-0.1 */
-#define ST_NCCI_INCOMING			2	/* N-1 */
-#define ST_NCCI_ALLOCATED			3	/* N-2 */
-#define ST_NCCI_ACTIVE				4	/* N-ACT */
-#define ST_NCCI_RESETING			5	/* N-3 */
-#define ST_NCCI_DISCONNECTING			6	/* N-4 */
-#define ST_NCCI_DISCONNECTED			7	/* N-5 */
-
-#define EV_NCCI_CONNECT_B3_REQ			1	/* N-0 -> N-0.1 */
-#define EV_NCCI_CONNECT_B3_IND			2	/* N-0 -> N.1 */
-#define EV_NCCI_CONNECT_B3_CONF_OK		3	/* N-0.1 -> N.2 */
-#define EV_NCCI_CONNECT_B3_CONF_ERROR		4	/* N-0.1 -> N.0 */
-#define EV_NCCI_CONNECT_B3_REJECT		5	/* N-1 -> N-4 */
-#define EV_NCCI_CONNECT_B3_RESP			6	/* N-1 -> N-2 */
-#define EV_NCCI_CONNECT_B3_ACTIVE_IND		7	/* N-2 -> N-ACT */
-#define EV_NCCI_RESET_B3_REQ			8	/* N-ACT -> N-3 */
-#define EV_NCCI_RESET_B3_IND			9	/* N-3 -> N-ACT */
-#define EV_NCCI_DISCONNECT_B3_IND		10	/* N-4 -> N.5 */
-#define EV_NCCI_DISCONNECT_B3_CONF_ERROR	11	/* N-4 -> previous */
-#define EV_NCCI_DISCONNECT_B3_REQ		12	/* N-1 -> N-4
-							   N-2 -> N-4
-							   N-3 -> N-4
-							   N-ACT -> N-4 */
-#define EV_NCCI_DISCONNECT_B3_RESP		13	/* N-5 -> N-0 */
-
-#endif				/* __CAPIDRV_H__ */
diff --git a/drivers/isdn/divert/Makefile b/drivers/isdn/divert/Makefile
deleted file mode 100644
index 07684fe53537..000000000000
--- a/drivers/isdn/divert/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-# Makefile for the dss1_divert ISDN module
-
-# Each configuration option enables a list of files.
-
-obj-$(CONFIG_ISDN_DIVERSION)	+= dss1_divert.o
-
-# Multipart objects.
-
-dss1_divert-y			:= isdn_divert.o divert_procfs.o divert_init.o
diff --git a/drivers/isdn/divert/divert_init.c b/drivers/isdn/divert/divert_init.c
deleted file mode 100644
index 267dede13bfd..000000000000
--- a/drivers/isdn/divert/divert_init.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/* $Id divert_init.c,v 1.5.6.2 2001/01/24 22:18:17 kai Exp $
- *
- * Module init for DSS1 diversion services for i4l.
- *
- * Copyright 1999       by Werner Cornelius (werner@isdn4linux.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-
-#include "isdn_divert.h"
-
-MODULE_DESCRIPTION("ISDN4Linux: Call diversion support");
-MODULE_AUTHOR("Werner Cornelius");
-MODULE_LICENSE("GPL");
-
-/****************************************/
-/* structure containing interface to hl */
-/****************************************/
-isdn_divert_if divert_if = {
-	DIVERT_IF_MAGIC,	/* magic value */
-	DIVERT_CMD_REG,		/* register cmd */
-	ll_callback,		/* callback routine from ll */
-	NULL,			/* command still not specified */
-	NULL,			/* drv_to_name */
-	NULL,			/* name_to_drv */
-};
-
-/*************************/
-/* Module interface code */
-/* no cmd line parms     */
-/*************************/
-static int __init divert_init(void)
-{
-	int i;
-
-	if (divert_dev_init()) {
-		printk(KERN_WARNING "dss1_divert: cannot install device, not loaded\n");
-		return (-EIO);
-	}
-	if ((i = DIVERT_REG_NAME(&divert_if)) != DIVERT_NO_ERR) {
-		divert_dev_deinit();
-		printk(KERN_WARNING "dss1_divert: error %d registering module, not loaded\n", i);
-		return (-EIO);
-	}
-	printk(KERN_INFO "dss1_divert module successfully installed\n");
-	return (0);
-}
-
-/**********************/
-/* Module deinit code */
-/**********************/
-static void __exit divert_exit(void)
-{
-	unsigned long flags;
-	int i;
-
-	spin_lock_irqsave(&divert_lock, flags);
-	divert_if.cmd = DIVERT_CMD_REL; /* release */
-	if ((i = DIVERT_REG_NAME(&divert_if)) != DIVERT_NO_ERR) {
-		printk(KERN_WARNING "dss1_divert: error %d releasing module\n", i);
-		spin_unlock_irqrestore(&divert_lock, flags);
-		return;
-	}
-	if (divert_dev_deinit()) {
-		printk(KERN_WARNING "dss1_divert: device busy, remove cancelled\n");
-		spin_unlock_irqrestore(&divert_lock, flags);
-		return;
-	}
-	spin_unlock_irqrestore(&divert_lock, flags);
-	deleterule(-1); /* delete all rules and free mem */
-	deleteprocs();
-	printk(KERN_INFO "dss1_divert module successfully removed \n");
-}
-
-module_init(divert_init);
-module_exit(divert_exit);
diff --git a/drivers/isdn/divert/divert_procfs.c b/drivers/isdn/divert/divert_procfs.c
deleted file mode 100644
index 342585e04fd3..000000000000
--- a/drivers/isdn/divert/divert_procfs.c
+++ /dev/null
@@ -1,336 +0,0 @@
-/* $Id: divert_procfs.c,v 1.11.6.2 2001/09/23 22:24:36 kai Exp $
- *
- * Filesystem handling for the diversion supplementary services.
- *
- * Copyright 1998       by Werner Cornelius (werner@isdn4linux.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/module.h>
-#include <linux/poll.h>
-#include <linux/slab.h>
-#ifdef CONFIG_PROC_FS
-#include <linux/proc_fs.h>
-#else
-#include <linux/fs.h>
-#endif
-#include <linux/sched.h>
-#include <linux/isdnif.h>
-#include <net/net_namespace.h>
-#include <linux/mutex.h>
-#include "isdn_divert.h"
-
-
-/*********************************/
-/* Variables for interface queue */
-/*********************************/
-ulong if_used = 0;		/* number of interface users */
-static DEFINE_MUTEX(isdn_divert_mutex);
-static struct divert_info *divert_info_head = NULL;	/* head of queue */
-static struct divert_info *divert_info_tail = NULL;	/* pointer to last entry */
-static DEFINE_SPINLOCK(divert_info_lock);/* lock for queue */
-static wait_queue_head_t rd_queue;
-
-/*********************************/
-/* put an info buffer into queue */
-/*********************************/
-void
-put_info_buffer(char *cp)
-{
-	struct divert_info *ib;
-	unsigned long flags;
-
-	if (if_used <= 0)
-		return;
-	if (!cp)
-		return;
-	if (!*cp)
-		return;
-	if (!(ib = kmalloc(sizeof(struct divert_info) + strlen(cp), GFP_ATOMIC)))
-		return;	/* no memory */
-	strcpy(ib->info_start, cp);	/* set output string */
-	ib->next = NULL;
-	spin_lock_irqsave(&divert_info_lock, flags);
-	ib->usage_cnt = if_used;
-	if (!divert_info_head)
-		divert_info_head = ib;	/* new head */
-	else
-		divert_info_tail->next = ib;	/* follows existing messages */
-	divert_info_tail = ib;	/* new tail */
-
-	/* delete old entrys */
-	while (divert_info_head->next) {
-		if ((divert_info_head->usage_cnt <= 0) &&
-		    (divert_info_head->next->usage_cnt <= 0)) {
-			ib = divert_info_head;
-			divert_info_head = divert_info_head->next;
-			kfree(ib);
-		} else
-			break;
-	}			/* divert_info_head->next */
-	spin_unlock_irqrestore(&divert_info_lock, flags);
-	wake_up_interruptible(&(rd_queue));
-}				/* put_info_buffer */
-
-#ifdef CONFIG_PROC_FS
-
-/**********************************/
-/* deflection device read routine */
-/**********************************/
-static ssize_t
-isdn_divert_read(struct file *file, char __user *buf, size_t count, loff_t *off)
-{
-	struct divert_info *inf;
-	int len;
-
-	if (!(inf = *((struct divert_info **) file->private_data))) {
-		if (file->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-		wait_event_interruptible(rd_queue, (inf =
-			*((struct divert_info **) file->private_data)));
-	}
-	if (!inf)
-		return (0);
-
-	inf->usage_cnt--;	/* new usage count */
-	file->private_data = &inf->next;	/* next structure */
-	if ((len = strlen(inf->info_start)) <= count) {
-		if (copy_to_user(buf, inf->info_start, len))
-			return -EFAULT;
-		*off += len;
-		return (len);
-	}
-	return (0);
-}				/* isdn_divert_read */
-
-/**********************************/
-/* deflection device write routine */
-/**********************************/
-static ssize_t
-isdn_divert_write(struct file *file, const char __user *buf, size_t count, loff_t *off)
-{
-	return (-ENODEV);
-}				/* isdn_divert_write */
-
-
-/***************************************/
-/* select routines for various kernels */
-/***************************************/
-static __poll_t
-isdn_divert_poll(struct file *file, poll_table *wait)
-{
-	__poll_t mask = 0;
-
-	poll_wait(file, &(rd_queue), wait);
-	/* mask = EPOLLOUT | EPOLLWRNORM; */
-	if (*((struct divert_info **) file->private_data)) {
-		mask |= EPOLLIN | EPOLLRDNORM;
-	}
-	return mask;
-}				/* isdn_divert_poll */
-
-/****************/
-/* Open routine */
-/****************/
-static int
-isdn_divert_open(struct inode *ino, struct file *filep)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&divert_info_lock, flags);
-	if_used++;
-	if (divert_info_head)
-		filep->private_data = &(divert_info_tail->next);
-	else
-		filep->private_data = &divert_info_head;
-	spin_unlock_irqrestore(&divert_info_lock, flags);
-	/*  start_divert(); */
-	return nonseekable_open(ino, filep);
-}				/* isdn_divert_open */
-
-/*******************/
-/* close routine   */
-/*******************/
-static int
-isdn_divert_close(struct inode *ino, struct file *filep)
-{
-	struct divert_info *inf;
-	unsigned long flags;
-
-	spin_lock_irqsave(&divert_info_lock, flags);
-	if_used--;
-	inf = *((struct divert_info **) filep->private_data);
-	while (inf) {
-		inf->usage_cnt--;
-		inf = inf->next;
-	}
-	if (if_used <= 0)
-		while (divert_info_head) {
-			inf = divert_info_head;
-			divert_info_head = divert_info_head->next;
-			kfree(inf);
-		}
-	spin_unlock_irqrestore(&divert_info_lock, flags);
-	return (0);
-}				/* isdn_divert_close */
-
-/*********/
-/* IOCTL */
-/*********/
-static int isdn_divert_ioctl_unlocked(struct file *file, uint cmd, ulong arg)
-{
-	divert_ioctl dioctl;
-	int i;
-	unsigned long flags;
-	divert_rule *rulep;
-	char *cp;
-
-	if (copy_from_user(&dioctl, (void __user *) arg, sizeof(dioctl)))
-		return -EFAULT;
-
-	switch (cmd) {
-	case IIOCGETVER:
-		dioctl.drv_version = DIVERT_IIOC_VERSION;	/* set version */
-		break;
-
-	case IIOCGETDRV:
-		if ((dioctl.getid.drvid = divert_if.name_to_drv(dioctl.getid.drvnam)) < 0)
-			return (-EINVAL);
-		break;
-
-	case IIOCGETNAM:
-		cp = divert_if.drv_to_name(dioctl.getid.drvid);
-		if (!cp)
-			return (-EINVAL);
-		if (!*cp)
-			return (-EINVAL);
-		strcpy(dioctl.getid.drvnam, cp);
-		break;
-
-	case IIOCGETRULE:
-		if (!(rulep = getruleptr(dioctl.getsetrule.ruleidx)))
-			return (-EINVAL);
-		dioctl.getsetrule.rule = *rulep;	/* copy data */
-		break;
-
-	case IIOCMODRULE:
-		if (!(rulep = getruleptr(dioctl.getsetrule.ruleidx)))
-			return (-EINVAL);
-		spin_lock_irqsave(&divert_lock, flags);
-		*rulep = dioctl.getsetrule.rule;	/* copy data */
-		spin_unlock_irqrestore(&divert_lock, flags);
-		return (0);	/* no copy required */
-		break;
-
-	case IIOCINSRULE:
-		return (insertrule(dioctl.getsetrule.ruleidx, &dioctl.getsetrule.rule));
-		break;
-
-	case IIOCDELRULE:
-		return (deleterule(dioctl.getsetrule.ruleidx));
-		break;
-
-	case IIOCDODFACT:
-		return (deflect_extern_action(dioctl.fwd_ctrl.subcmd,
-					      dioctl.fwd_ctrl.callid,
-					      dioctl.fwd_ctrl.to_nr));
-
-	case IIOCDOCFACT:
-	case IIOCDOCFDIS:
-	case IIOCDOCFINT:
-		if (!divert_if.drv_to_name(dioctl.cf_ctrl.drvid))
-			return (-EINVAL);	/* invalid driver */
-		if (strnlen(dioctl.cf_ctrl.msn, sizeof(dioctl.cf_ctrl.msn)) ==
-		    sizeof(dioctl.cf_ctrl.msn))
-			return -EINVAL;
-		if (strnlen(dioctl.cf_ctrl.fwd_nr, sizeof(dioctl.cf_ctrl.fwd_nr)) ==
-		    sizeof(dioctl.cf_ctrl.fwd_nr))
-			return -EINVAL;
-		if ((i = cf_command(dioctl.cf_ctrl.drvid,
-				    (cmd == IIOCDOCFACT) ? 1 : (cmd == IIOCDOCFDIS) ? 0 : 2,
-				    dioctl.cf_ctrl.cfproc,
-				    dioctl.cf_ctrl.msn,
-				    dioctl.cf_ctrl.service,
-				    dioctl.cf_ctrl.fwd_nr,
-				    &dioctl.cf_ctrl.procid)))
-			return (i);
-		break;
-
-	default:
-		return (-EINVAL);
-	}			/* switch cmd */
-	return copy_to_user((void __user *)arg, &dioctl, sizeof(dioctl)) ? -EFAULT : 0;
-}				/* isdn_divert_ioctl */
-
-static long isdn_divert_ioctl(struct file *file, uint cmd, ulong arg)
-{
-	long ret;
-
-	mutex_lock(&isdn_divert_mutex);
-	ret = isdn_divert_ioctl_unlocked(file, cmd, arg);
-	mutex_unlock(&isdn_divert_mutex);
-
-	return ret;
-}
-
-static const struct file_operations isdn_fops =
-{
-	.owner          = THIS_MODULE,
-	.llseek         = no_llseek,
-	.read           = isdn_divert_read,
-	.write          = isdn_divert_write,
-	.poll           = isdn_divert_poll,
-	.unlocked_ioctl = isdn_divert_ioctl,
-	.open           = isdn_divert_open,
-	.release        = isdn_divert_close,
-};
-
-/****************************/
-/* isdn subdir in /proc/net */
-/****************************/
-static struct proc_dir_entry *isdn_proc_entry = NULL;
-static struct proc_dir_entry *isdn_divert_entry = NULL;
-#endif	/* CONFIG_PROC_FS */
-
-/***************************************************************************/
-/* divert_dev_init must be called before the proc filesystem may be used   */
-/***************************************************************************/
-int
-divert_dev_init(void)
-{
-
-	init_waitqueue_head(&rd_queue);
-
-#ifdef CONFIG_PROC_FS
-	isdn_proc_entry = proc_mkdir("isdn", init_net.proc_net);
-	if (!isdn_proc_entry)
-		return (-1);
-	isdn_divert_entry = proc_create("divert", S_IFREG | S_IRUGO,
-					isdn_proc_entry, &isdn_fops);
-	if (!isdn_divert_entry) {
-		remove_proc_entry("isdn", init_net.proc_net);
-		return (-1);
-	}
-#endif	/* CONFIG_PROC_FS */
-
-	return (0);
-}				/* divert_dev_init */
-
-/***************************************************************************/
-/* divert_dev_deinit must be called before leaving isdn when included as   */
-/* a module.                                                               */
-/***************************************************************************/
-int
-divert_dev_deinit(void)
-{
-
-#ifdef CONFIG_PROC_FS
-	remove_proc_entry("divert", isdn_proc_entry);
-	remove_proc_entry("isdn", init_net.proc_net);
-#endif	/* CONFIG_PROC_FS */
-
-	return (0);
-}				/* divert_dev_deinit */
diff --git a/drivers/isdn/divert/isdn_divert.c b/drivers/isdn/divert/isdn_divert.c
deleted file mode 100644
index 5620fd2c6009..000000000000
--- a/drivers/isdn/divert/isdn_divert.c
+++ /dev/null
@@ -1,846 +0,0 @@
-/* $Id: isdn_divert.c,v 1.6.6.3 2001/09/23 22:24:36 kai Exp $
- *
- * DSS1 main diversion supplementary handling for i4l.
- *
- * Copyright 1999       by Werner Cornelius (werner@isdn4linux.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/proc_fs.h>
-#include <linux/slab.h>
-#include <linux/timer.h>
-#include <linux/jiffies.h>
-
-#include "isdn_divert.h"
-
-/**********************************/
-/* structure keeping calling info */
-/**********************************/
-struct call_struc {
-	isdn_ctrl ics; /* delivered setup + driver parameters */
-	ulong divert_id; /* Id delivered to user */
-	unsigned char akt_state; /* actual state */
-	char deflect_dest[35]; /* deflection destination */
-	struct timer_list timer; /* timer control structure */
-	char info[90]; /* device info output */
-	struct call_struc *next; /* pointer to next entry */
-	struct call_struc *prev;
-};
-
-
-/********************************************/
-/* structure keeping deflection table entry */
-/********************************************/
-struct deflect_struc {
-	struct deflect_struc *next, *prev;
-	divert_rule rule; /* used rule */
-};
-
-
-/*****************************************/
-/* variables for main diversion services */
-/*****************************************/
-/* diversion/deflection processes */
-static struct call_struc *divert_head = NULL; /* head of remembered entrys */
-static ulong next_id = 1; /* next info id */
-static struct deflect_struc *table_head = NULL;
-static struct deflect_struc *table_tail = NULL;
-static unsigned char extern_wait_max = 4; /* maximum wait in s for external process */
-
-DEFINE_SPINLOCK(divert_lock);
-
-/***************************/
-/* timer callback function */
-/***************************/
-static void deflect_timer_expire(struct timer_list *t)
-{
-	unsigned long flags;
-	struct call_struc *cs = from_timer(cs, t, timer);
-
-	spin_lock_irqsave(&divert_lock, flags);
-	del_timer(&cs->timer); /* delete active timer */
-	spin_unlock_irqrestore(&divert_lock, flags);
-
-	switch (cs->akt_state) {
-	case DEFLECT_PROCEED:
-		cs->ics.command = ISDN_CMD_HANGUP; /* cancel action */
-		divert_if.ll_cmd(&cs->ics);
-		spin_lock_irqsave(&divert_lock, flags);
-		cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */
-		cs->timer.expires = jiffies + (HZ * AUTODEL_TIME);
-		add_timer(&cs->timer);
-		spin_unlock_irqrestore(&divert_lock, flags);
-		break;
-
-	case DEFLECT_ALERT:
-		cs->ics.command = ISDN_CMD_REDIR; /* protocol */
-		strlcpy(cs->ics.parm.setup.phone, cs->deflect_dest, sizeof(cs->ics.parm.setup.phone));
-		strcpy(cs->ics.parm.setup.eazmsn, "Testtext delayed");
-		divert_if.ll_cmd(&cs->ics);
-		spin_lock_irqsave(&divert_lock, flags);
-		cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */
-		cs->timer.expires = jiffies + (HZ * AUTODEL_TIME);
-		add_timer(&cs->timer);
-		spin_unlock_irqrestore(&divert_lock, flags);
-		break;
-
-	case DEFLECT_AUTODEL:
-	default:
-		spin_lock_irqsave(&divert_lock, flags);
-		if (cs->prev)
-			cs->prev->next = cs->next; /* forward link */
-		else
-			divert_head = cs->next;
-		if (cs->next)
-			cs->next->prev = cs->prev; /* back link */
-		spin_unlock_irqrestore(&divert_lock, flags);
-		kfree(cs);
-		return;
-
-	} /* switch */
-} /* deflect_timer_func */
-
-
-/*****************************************/
-/* handle call forwarding de/activations */
-/* 0 = deact, 1 = act, 2 = interrogate   */
-/*****************************************/
-int cf_command(int drvid, int mode,
-	       u_char proc, char *msn,
-	       u_char service, char *fwd_nr, ulong *procid)
-{
-	unsigned long flags;
-	int retval, msnlen;
-	int fwd_len;
-	char *p, *ielenp, tmp[60];
-	struct call_struc *cs;
-
-	if (strchr(msn, '.')) return (-EINVAL); /* subaddress not allowed in msn */
-	if ((proc & 0x7F) > 2) return (-EINVAL);
-	proc &= 3;
-	p = tmp;
-	*p++ = 0x30; /* enumeration */
-	ielenp = p++; /* remember total length position */
-	*p++ = 0xa; /* proc tag */
-	*p++ = 1;   /* length */
-	*p++ = proc & 0x7F; /* procedure to de/activate/interrogate */
-	*p++ = 0xa; /* service tag */
-	*p++ = 1;   /* length */
-	*p++ = service; /* service to handle */
-
-	if (mode == 1) {
-		if (!*fwd_nr) return (-EINVAL); /* destination missing */
-		if (strchr(fwd_nr, '.')) return (-EINVAL); /* subaddress not allowed */
-		fwd_len = strlen(fwd_nr);
-		*p++ = 0x30; /* number enumeration */
-		*p++ = fwd_len + 2; /* complete forward to len */
-		*p++ = 0x80; /* fwd to nr */
-		*p++ = fwd_len; /* length of number */
-		strcpy(p, fwd_nr); /* copy number */
-		p += fwd_len; /* pointer beyond fwd */
-	} /* activate */
-
-	msnlen = strlen(msn);
-	*p++ = 0x80; /* msn number */
-	if (msnlen > 1) {
-		*p++ = msnlen; /* length */
-		strcpy(p, msn);
-		p += msnlen;
-	} else
-		*p++ = 0;
-
-	*ielenp = p - ielenp - 1; /* set total IE length */
-
-	/* allocate mem for information struct */
-	if (!(cs = kmalloc(sizeof(struct call_struc), GFP_ATOMIC)))
-		return (-ENOMEM); /* no memory */
-	timer_setup(&cs->timer, deflect_timer_expire, 0);
-	cs->info[0] = '\0';
-	cs->ics.driver = drvid;
-	cs->ics.command = ISDN_CMD_PROT_IO; /* protocol specific io */
-	cs->ics.arg = DSS1_CMD_INVOKE; /* invoke supplementary service */
-	cs->ics.parm.dss1_io.proc = (mode == 1) ? 7 : (mode == 2) ? 11 : 8; /* operation */
-	cs->ics.parm.dss1_io.timeout = 4000; /* from ETS 300 207-1 */
-	cs->ics.parm.dss1_io.datalen = p - tmp; /* total len */
-	cs->ics.parm.dss1_io.data = tmp; /* start of buffer */
-
-	spin_lock_irqsave(&divert_lock, flags);
-	cs->ics.parm.dss1_io.ll_id = next_id++; /* id for callback */
-	spin_unlock_irqrestore(&divert_lock, flags);
-	*procid = cs->ics.parm.dss1_io.ll_id;
-
-	sprintf(cs->info, "%d 0x%lx %s%s 0 %s %02x %d%s%s\n",
-		(!mode) ? DIVERT_DEACTIVATE : (mode == 1) ? DIVERT_ACTIVATE : DIVERT_REPORT,
-		cs->ics.parm.dss1_io.ll_id,
-		(mode != 2) ? "" : "0 ",
-		divert_if.drv_to_name(cs->ics.driver),
-		msn,
-		service & 0xFF,
-		proc,
-		(mode != 1) ? "" : " 0 ",
-		(mode != 1) ? "" : fwd_nr);
-
-	retval = divert_if.ll_cmd(&cs->ics); /* execute command */
-
-	if (!retval) {
-		cs->prev = NULL;
-		spin_lock_irqsave(&divert_lock, flags);
-		cs->next = divert_head;
-		divert_head = cs;
-		spin_unlock_irqrestore(&divert_lock, flags);
-	} else
-		kfree(cs);
-	return (retval);
-} /* cf_command */
-
-
-/****************************************/
-/* handle a external deflection command */
-/****************************************/
-int deflect_extern_action(u_char cmd, ulong callid, char *to_nr)
-{
-	struct call_struc *cs;
-	isdn_ctrl ic;
-	unsigned long flags;
-	int i;
-
-	if ((cmd & 0x7F) > 2) return (-EINVAL); /* invalid command */
-	cs = divert_head; /* start of parameter list */
-	while (cs) {
-		if (cs->divert_id == callid) break; /* found */
-		cs = cs->next;
-	} /* search entry */
-	if (!cs) return (-EINVAL); /* invalid callid */
-
-	ic.driver = cs->ics.driver;
-	ic.arg = cs->ics.arg;
-	i = -EINVAL;
-	if (cs->akt_state == DEFLECT_AUTODEL) return (i); /* no valid call */
-	switch (cmd & 0x7F) {
-	case 0: /* hangup */
-		del_timer(&cs->timer);
-		ic.command = ISDN_CMD_HANGUP;
-		i = divert_if.ll_cmd(&ic);
-		spin_lock_irqsave(&divert_lock, flags);
-		cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */
-		cs->timer.expires = jiffies + (HZ * AUTODEL_TIME);
-		add_timer(&cs->timer);
-		spin_unlock_irqrestore(&divert_lock, flags);
-		break;
-
-	case 1: /* alert */
-		if (cs->akt_state == DEFLECT_ALERT) return (0);
-		cmd &= 0x7F; /* never wait */
-		del_timer(&cs->timer);
-		ic.command = ISDN_CMD_ALERT;
-		if ((i = divert_if.ll_cmd(&ic))) {
-			spin_lock_irqsave(&divert_lock, flags);
-			cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */
-			cs->timer.expires = jiffies + (HZ * AUTODEL_TIME);
-			add_timer(&cs->timer);
-			spin_unlock_irqrestore(&divert_lock, flags);
-		} else
-			cs->akt_state = DEFLECT_ALERT;
-		break;
-
-	case 2: /* redir */
-		del_timer(&cs->timer);
-		strlcpy(cs->ics.parm.setup.phone, to_nr, sizeof(cs->ics.parm.setup.phone));
-		strcpy(cs->ics.parm.setup.eazmsn, "Testtext manual");
-		ic.command = ISDN_CMD_REDIR;
-		if ((i = divert_if.ll_cmd(&ic))) {
-			spin_lock_irqsave(&divert_lock, flags);
-			cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */
-			cs->timer.expires = jiffies + (HZ * AUTODEL_TIME);
-			add_timer(&cs->timer);
-			spin_unlock_irqrestore(&divert_lock, flags);
-		} else
-			cs->akt_state = DEFLECT_ALERT;
-		break;
-
-	} /* switch */
-	return (i);
-} /* deflect_extern_action */
-
-/********************************/
-/* insert a new rule before idx */
-/********************************/
-int insertrule(int idx, divert_rule *newrule)
-{
-	struct deflect_struc *ds, *ds1 = NULL;
-	unsigned long flags;
-
-	if (!(ds = kmalloc(sizeof(struct deflect_struc), GFP_KERNEL)))
-		return (-ENOMEM); /* no memory */
-
-	ds->rule = *newrule; /* set rule */
-
-	spin_lock_irqsave(&divert_lock, flags);
-
-	if (idx >= 0) {
-		ds1 = table_head;
-		while ((ds1) && (idx > 0))
-		{ idx--;
-			ds1 = ds1->next;
-		}
-		if (!ds1) idx = -1;
-	}
-
-	if (idx < 0) {
-		ds->prev = table_tail; /* previous entry */
-		ds->next = NULL; /* end of chain */
-		if (ds->prev)
-			ds->prev->next = ds; /* last forward */
-		else
-			table_head = ds; /* is first entry */
-		table_tail = ds; /* end of queue */
-	} else {
-		ds->next = ds1; /* next entry */
-		ds->prev = ds1->prev; /* prev entry */
-		ds1->prev = ds; /* backward chain old element */
-		if (!ds->prev)
-			table_head = ds; /* first element */
-	}
-
-	spin_unlock_irqrestore(&divert_lock, flags);
-	return (0);
-} /* insertrule */
-
-/***********************************/
-/* delete the rule at position idx */
-/***********************************/
-int deleterule(int idx)
-{
-	struct deflect_struc *ds, *ds1;
-	unsigned long flags;
-
-	if (idx < 0) {
-		spin_lock_irqsave(&divert_lock, flags);
-		ds = table_head;
-		table_head = NULL;
-		table_tail = NULL;
-		spin_unlock_irqrestore(&divert_lock, flags);
-		while (ds) {
-			ds1 = ds;
-			ds = ds->next;
-			kfree(ds1);
-		}
-		return (0);
-	}
-
-	spin_lock_irqsave(&divert_lock, flags);
-	ds = table_head;
-
-	while ((ds) && (idx > 0)) {
-		idx--;
-		ds = ds->next;
-	}
-
-	if (!ds) {
-		spin_unlock_irqrestore(&divert_lock, flags);
-		return (-EINVAL);
-	}
-
-	if (ds->next)
-		ds->next->prev = ds->prev; /* backward chain */
-	else
-		table_tail = ds->prev; /* end of chain */
-
-	if (ds->prev)
-		ds->prev->next = ds->next; /* forward chain */
-	else
-		table_head = ds->next; /* start of chain */
-
-	spin_unlock_irqrestore(&divert_lock, flags);
-	kfree(ds);
-	return (0);
-} /* deleterule */
-
-/*******************************************/
-/* get a pointer to a specific rule number */
-/*******************************************/
-divert_rule *getruleptr(int idx)
-{
-	struct deflect_struc *ds = table_head;
-
-	if (idx < 0) return (NULL);
-	while ((ds) && (idx >= 0)) {
-		if (!(idx--)) {
-			return (&ds->rule);
-			break;
-		}
-		ds = ds->next;
-	}
-	return (NULL);
-} /* getruleptr */
-
-/*************************************************/
-/* called from common module on an incoming call */
-/*************************************************/
-static int isdn_divert_icall(isdn_ctrl *ic)
-{
-	int retval = 0;
-	unsigned long flags;
-	struct call_struc *cs = NULL;
-	struct deflect_struc *dv;
-	char *p, *p1;
-	u_char accept;
-
-	/* first check the internal deflection table */
-	for (dv = table_head; dv; dv = dv->next) {
-		/* scan table */
-		if (((dv->rule.callopt == 1) && (ic->command == ISDN_STAT_ICALLW)) ||
-		    ((dv->rule.callopt == 2) && (ic->command == ISDN_STAT_ICALL)))
-			continue; /* call option check */
-		if (!(dv->rule.drvid & (1L << ic->driver)))
-			continue; /* driver not matching */
-		if ((dv->rule.si1) && (dv->rule.si1 != ic->parm.setup.si1))
-			continue; /* si1 not matching */
-		if ((dv->rule.si2) && (dv->rule.si2 != ic->parm.setup.si2))
-			continue; /* si2 not matching */
-
-		p = dv->rule.my_msn;
-		p1 = ic->parm.setup.eazmsn;
-		accept = 0;
-		while (*p) {
-			/* complete compare */
-			if (*p == '-') {
-				accept = 1; /* call accepted */
-				break;
-			}
-			if (*p++ != *p1++)
-				break; /* not accepted */
-			if ((!*p) && (!*p1))
-				accept = 1;
-		} /* complete compare */
-		if (!accept) continue; /* not accepted */
-
-		if ((strcmp(dv->rule.caller, "0")) ||
-		    (ic->parm.setup.phone[0])) {
-			p = dv->rule.caller;
-			p1 = ic->parm.setup.phone;
-			accept = 0;
-			while (*p) {
-				/* complete compare */
-				if (*p == '-') {
-					accept = 1; /* call accepted */
-					break;
-				}
-				if (*p++ != *p1++)
-					break; /* not accepted */
-				if ((!*p) && (!*p1))
-					accept = 1;
-			} /* complete compare */
-			if (!accept) continue; /* not accepted */
-		}
-
-		switch (dv->rule.action) {
-		case DEFLECT_IGNORE:
-			return 0;
-
-		case DEFLECT_ALERT:
-		case DEFLECT_PROCEED:
-		case DEFLECT_REPORT:
-		case DEFLECT_REJECT:
-			if (dv->rule.action == DEFLECT_PROCEED)
-				if ((!if_used) || ((!extern_wait_max) && (!dv->rule.waittime)))
-					return (0); /* no external deflection needed */
-			if (!(cs = kmalloc(sizeof(struct call_struc), GFP_ATOMIC)))
-				return (0); /* no memory */
-			timer_setup(&cs->timer, deflect_timer_expire, 0);
-			cs->info[0] = '\0';
-
-			cs->ics = *ic; /* copy incoming data */
-			if (!cs->ics.parm.setup.phone[0]) strcpy(cs->ics.parm.setup.phone, "0");
-			if (!cs->ics.parm.setup.eazmsn[0]) strcpy(cs->ics.parm.setup.eazmsn, "0");
-			cs->ics.parm.setup.screen = dv->rule.screen;
-			if (dv->rule.waittime)
-				cs->timer.expires = jiffies + (HZ * dv->rule.waittime);
-			else if (dv->rule.action == DEFLECT_PROCEED)
-				cs->timer.expires = jiffies + (HZ * extern_wait_max);
-			else
-				cs->timer.expires = 0;
-			cs->akt_state = dv->rule.action;
-			spin_lock_irqsave(&divert_lock, flags);
-			cs->divert_id = next_id++; /* new sequence number */
-			spin_unlock_irqrestore(&divert_lock, flags);
-			cs->prev = NULL;
-			if (cs->akt_state == DEFLECT_ALERT) {
-				strcpy(cs->deflect_dest, dv->rule.to_nr);
-				if (!cs->timer.expires) {
-					strcpy(ic->parm.setup.eazmsn,
-					       "Testtext direct");
-					ic->parm.setup.screen = dv->rule.screen;
-					strlcpy(ic->parm.setup.phone, dv->rule.to_nr, sizeof(ic->parm.setup.phone));
-					cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */
-					cs->timer.expires = jiffies + (HZ * AUTODEL_TIME);
-					retval = 5;
-				} else
-					retval = 1; /* alerting */
-			} else {
-				cs->deflect_dest[0] = '\0';
-				retval = 4; /* only proceed */
-			}
-			snprintf(cs->info, sizeof(cs->info),
-				 "%d 0x%lx %s %s %s %s 0x%x 0x%x %d %d %s\n",
-				 cs->akt_state,
-				 cs->divert_id,
-				 divert_if.drv_to_name(cs->ics.driver),
-				 (ic->command == ISDN_STAT_ICALLW) ? "1" : "0",
-				 cs->ics.parm.setup.phone,
-				 cs->ics.parm.setup.eazmsn,
-				 cs->ics.parm.setup.si1,
-				 cs->ics.parm.setup.si2,
-				 cs->ics.parm.setup.screen,
-				 dv->rule.waittime,
-				 cs->deflect_dest);
-			if ((dv->rule.action == DEFLECT_REPORT) ||
-			    (dv->rule.action == DEFLECT_REJECT)) {
-				put_info_buffer(cs->info);
-				kfree(cs); /* remove */
-				return ((dv->rule.action == DEFLECT_REPORT) ? 0 : 2); /* nothing to do */
-			}
-			break;
-
-		default:
-			return 0; /* ignore call */
-		} /* switch action */
-		break; /* will break the 'for' looping */
-	} /* scan_table */
-
-	if (cs) {
-		cs->prev = NULL;
-		spin_lock_irqsave(&divert_lock, flags);
-		cs->next = divert_head;
-		divert_head = cs;
-		if (cs->timer.expires) add_timer(&cs->timer);
-		spin_unlock_irqrestore(&divert_lock, flags);
-
-		put_info_buffer(cs->info);
-		return (retval);
-	} else
-		return (0);
-} /* isdn_divert_icall */
-
-
-void deleteprocs(void)
-{
-	struct call_struc *cs, *cs1;
-	unsigned long flags;
-
-	spin_lock_irqsave(&divert_lock, flags);
-	cs = divert_head;
-	divert_head = NULL;
-	while (cs) {
-		del_timer(&cs->timer);
-		cs1 = cs;
-		cs = cs->next;
-		kfree(cs1);
-	}
-	spin_unlock_irqrestore(&divert_lock, flags);
-} /* deleteprocs */
-
-/****************************************************/
-/* put a address including address type into buffer */
-/****************************************************/
-static int put_address(char *st, u_char *p, int len)
-{
-	u_char retval = 0;
-	u_char adr_typ = 0; /* network standard */
-
-	if (len < 2) return (retval);
-	if (*p == 0xA1) {
-		retval = *(++p) + 2; /* total length */
-		if (retval > len) return (0); /* too short */
-		len = retval - 2; /* remaining length */
-		if (len < 3) return (0);
-		if ((*(++p) != 0x0A) || (*(++p) != 1)) return (0);
-		adr_typ = *(++p);
-		len -= 3;
-		p++;
-		if (len < 2) return (0);
-		if (*p++ != 0x12) return (0);
-		if (*p > len) return (0); /* check number length */
-		len = *p++;
-	} else if (*p == 0x80) {
-		retval = *(++p) + 2; /* total length */
-		if (retval > len) return (0);
-		len = retval - 2;
-		p++;
-	} else
-		return (0); /* invalid address information */
-
-	sprintf(st, "%d ", adr_typ);
-	st += strlen(st);
-	if (!len)
-		*st++ = '-';
-	else
-		while (len--)
-			*st++ = *p++;
-	*st = '\0';
-	return (retval);
-} /* put_address */
-
-/*************************************/
-/* report a successful interrogation */
-/*************************************/
-static int interrogate_success(isdn_ctrl *ic, struct call_struc *cs)
-{
-	char *src = ic->parm.dss1_io.data;
-	int restlen = ic->parm.dss1_io.datalen;
-	int cnt = 1;
-	u_char n, n1;
-	char st[90], *p, *stp;
-
-	if (restlen < 2) return (-100); /* frame too short */
-	if (*src++ != 0x30) return (-101);
-	if ((n = *src++) > 0x81) return (-102); /* invalid length field */
-	restlen -= 2; /* remaining bytes */
-	if (n == 0x80) {
-		if (restlen < 2) return (-103);
-		if ((*(src + restlen - 1)) || (*(src + restlen - 2))) return (-104);
-		restlen -= 2;
-	} else if (n == 0x81) {
-		n = *src++;
-		restlen--;
-		if (n > restlen) return (-105);
-		restlen = n;
-	} else if (n > restlen)
-		return (-106);
-	else
-		restlen = n; /* standard format */
-	if (restlen < 3) return (-107); /* no procedure */
-	if ((*src++ != 2) || (*src++ != 1) || (*src++ != 0x0B)) return (-108);
-	restlen -= 3;
-	if (restlen < 2) return (-109); /* list missing */
-	if (*src == 0x31) {
-		src++;
-		if ((n = *src++) > 0x81) return (-110); /* invalid length field */
-		restlen -= 2; /* remaining bytes */
-		if (n == 0x80) {
-			if (restlen < 2) return (-111);
-			if ((*(src + restlen - 1)) || (*(src + restlen - 2))) return (-112);
-			restlen -= 2;
-		} else if (n == 0x81) {
-			n = *src++;
-			restlen--;
-			if (n > restlen) return (-113);
-			restlen = n;
-		} else if (n > restlen)
-			return (-114);
-		else
-			restlen = n; /* standard format */
-	} /* result list header */
-
-	while (restlen >= 2) {
-		stp = st;
-		sprintf(stp, "%d 0x%lx %d %s ", DIVERT_REPORT, ic->parm.dss1_io.ll_id,
-			cnt++, divert_if.drv_to_name(ic->driver));
-		stp += strlen(stp);
-		if (*src++ != 0x30) return (-115); /* invalid enum */
-		n = *src++;
-		restlen -= 2;
-		if (n > restlen) return (-116); /* enum length wrong */
-		restlen -= n;
-		p = src; /* one entry */
-		src += n;
-		if (!(n1 = put_address(stp, p, n & 0xFF))) continue;
-		stp += strlen(stp);
-		p += n1;
-		n -= n1;
-		if (n < 6) continue; /* no service and proc */
-		if ((*p++ != 0x0A) || (*p++ != 1)) continue;
-		sprintf(stp, " 0x%02x ", (*p++) & 0xFF);
-		stp += strlen(stp);
-		if ((*p++ != 0x0A) || (*p++ != 1)) continue;
-		sprintf(stp, "%d ", (*p++) & 0xFF);
-		stp += strlen(stp);
-		n -= 6;
-		if (n > 2) {
-			if (*p++ != 0x30) continue;
-			if (*p > (n - 2)) continue;
-			n = *p++;
-			if (!(n1 = put_address(stp, p, n & 0xFF))) continue;
-			stp += strlen(stp);
-		}
-		sprintf(stp, "\n");
-		put_info_buffer(st);
-	} /* while restlen */
-	if (restlen) return (-117);
-	return (0);
-} /* interrogate_success */
-
-/*********************************************/
-/* callback for protocol specific extensions */
-/*********************************************/
-static int prot_stat_callback(isdn_ctrl *ic)
-{
-	struct call_struc *cs, *cs1;
-	int i;
-	unsigned long flags;
-
-	cs = divert_head; /* start of list */
-	cs1 = NULL;
-	while (cs) {
-		if (ic->driver == cs->ics.driver) {
-			switch (cs->ics.arg) {
-			case DSS1_CMD_INVOKE:
-				if ((cs->ics.parm.dss1_io.ll_id == ic->parm.dss1_io.ll_id) &&
-				    (cs->ics.parm.dss1_io.hl_id == ic->parm.dss1_io.hl_id)) {
-					switch (ic->arg) {
-					case DSS1_STAT_INVOKE_ERR:
-						sprintf(cs->info, "128 0x%lx 0x%x\n",
-							ic->parm.dss1_io.ll_id,
-							ic->parm.dss1_io.timeout);
-						put_info_buffer(cs->info);
-						break;
-
-					case DSS1_STAT_INVOKE_RES:
-						switch (cs->ics.parm.dss1_io.proc) {
-						case  7:
-						case  8:
-							put_info_buffer(cs->info);
-							break;
-
-						case  11:
-							i = interrogate_success(ic, cs);
-							if (i)
-								sprintf(cs->info, "%d 0x%lx %d\n", DIVERT_REPORT,
-									ic->parm.dss1_io.ll_id, i);
-							put_info_buffer(cs->info);
-							break;
-
-						default:
-							printk(KERN_WARNING "dss1_divert: unknown proc %d\n", cs->ics.parm.dss1_io.proc);
-							break;
-						}
-
-						break;
-
-					default:
-						printk(KERN_WARNING "dss1_divert unknown invoke answer %lx\n", ic->arg);
-						break;
-					}
-					cs1 = cs; /* remember structure */
-					cs = NULL;
-					continue; /* abort search */
-				} /* id found */
-				break;
-
-			case DSS1_CMD_INVOKE_ABORT:
-				printk(KERN_WARNING "dss1_divert unhandled invoke abort\n");
-				break;
-
-			default:
-				printk(KERN_WARNING "dss1_divert unknown cmd 0x%lx\n", cs->ics.arg);
-				break;
-			} /* switch ics.arg */
-			cs = cs->next;
-		} /* driver ok */
-	}
-
-	if (!cs1) {
-		printk(KERN_WARNING "dss1_divert unhandled process\n");
-		return (0);
-	}
-
-	if (cs1->ics.driver == -1) {
-		spin_lock_irqsave(&divert_lock, flags);
-		del_timer(&cs1->timer);
-		if (cs1->prev)
-			cs1->prev->next = cs1->next; /* forward link */
-		else
-			divert_head = cs1->next;
-		if (cs1->next)
-			cs1->next->prev = cs1->prev; /* back link */
-		spin_unlock_irqrestore(&divert_lock, flags);
-		kfree(cs1);
-	}
-
-	return (0);
-} /* prot_stat_callback */
-
-
-/***************************/
-/* status callback from HL */
-/***************************/
-static int isdn_divert_stat_callback(isdn_ctrl *ic)
-{
-	struct call_struc *cs, *cs1;
-	unsigned long flags;
-	int retval;
-
-	retval = -1;
-	cs = divert_head; /* start of list */
-	while (cs) {
-		if ((ic->driver == cs->ics.driver) &&
-		    (ic->arg == cs->ics.arg)) {
-			switch (ic->command) {
-			case ISDN_STAT_DHUP:
-				sprintf(cs->info, "129 0x%lx\n", cs->divert_id);
-				del_timer(&cs->timer);
-				cs->ics.driver = -1;
-				break;
-
-			case ISDN_STAT_CAUSE:
-				sprintf(cs->info, "130 0x%lx %s\n", cs->divert_id, ic->parm.num);
-				break;
-
-			case ISDN_STAT_REDIR:
-				sprintf(cs->info, "131 0x%lx\n", cs->divert_id);
-				del_timer(&cs->timer);
-				cs->ics.driver = -1;
-				break;
-
-			default:
-				sprintf(cs->info, "999 0x%lx 0x%x\n", cs->divert_id, (int)(ic->command));
-				break;
-			}
-			put_info_buffer(cs->info);
-			retval = 0;
-		}
-		cs1 = cs;
-		cs = cs->next;
-		if (cs1->ics.driver == -1) {
-			spin_lock_irqsave(&divert_lock, flags);
-			if (cs1->prev)
-				cs1->prev->next = cs1->next; /* forward link */
-			else
-				divert_head = cs1->next;
-			if (cs1->next)
-				cs1->next->prev = cs1->prev; /* back link */
-			spin_unlock_irqrestore(&divert_lock, flags);
-			kfree(cs1);
-		}
-	}
-	return (retval); /* not found */
-} /* isdn_divert_stat_callback */
-
-
-/********************/
-/* callback from ll */
-/********************/
-int ll_callback(isdn_ctrl *ic)
-{
-	switch (ic->command) {
-	case ISDN_STAT_ICALL:
-	case ISDN_STAT_ICALLW:
-		return (isdn_divert_icall(ic));
-		break;
-
-	case ISDN_STAT_PROT:
-		if ((ic->arg & 0xFF) == ISDN_PTYPE_EURO) {
-			if (ic->arg != DSS1_STAT_INVOKE_BRD)
-				return (prot_stat_callback(ic));
-			else
-				return (0); /* DSS1 invoke broadcast */
-		} else
-			return (-1); /* protocol not euro */
-
-	default:
-		return (isdn_divert_stat_callback(ic));
-	}
-} /* ll_callback */
diff --git a/drivers/isdn/divert/isdn_divert.h b/drivers/isdn/divert/isdn_divert.h
deleted file mode 100644
index 55033dd872c0..000000000000
--- a/drivers/isdn/divert/isdn_divert.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* $Id: isdn_divert.h,v 1.5.6.1 2001/09/23 22:24:36 kai Exp $
- *
- * Header for the diversion supplementary ioctl interface.
- *
- * Copyright 1998       by Werner Cornelius (werner@ikt.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/ioctl.h>
-#include <linux/types.h>
-
-/******************************************/
-/* IOCTL codes for interface to user prog */
-/******************************************/
-#define DIVERT_IIOC_VERSION 0x01 /* actual version */
-#define IIOCGETVER   _IO('I', 1)  /* get version of interface */
-#define IIOCGETDRV   _IO('I', 2)  /* get driver number */
-#define IIOCGETNAM   _IO('I', 3)  /* get driver name */
-#define IIOCGETRULE  _IO('I', 4)  /* read one rule */
-#define IIOCMODRULE  _IO('I', 5)  /* modify/replace a rule */
-#define IIOCINSRULE  _IO('I', 6)  /* insert/append one rule */
-#define IIOCDELRULE  _IO('I', 7)  /* delete a rule */
-#define IIOCDODFACT  _IO('I', 8)  /* hangup/reject/alert/immediately deflect a call */
-#define IIOCDOCFACT  _IO('I', 9)  /* activate control forwarding in PBX */
-#define IIOCDOCFDIS  _IO('I', 10)  /* deactivate control forwarding in PBX */
-#define IIOCDOCFINT  _IO('I', 11)  /* interrogate control forwarding in PBX */
-
-/*************************************/
-/* states reported through interface */
-/*************************************/
-#define DEFLECT_IGNORE    0  /* ignore incoming call */
-#define DEFLECT_REPORT    1  /* only report */
-#define DEFLECT_PROCEED   2  /* deflect when externally triggered */
-#define DEFLECT_ALERT     3  /* alert and deflect after delay */
-#define DEFLECT_REJECT    4  /* reject immediately */
-#define DIVERT_ACTIVATE   5  /* diversion activate */
-#define DIVERT_DEACTIVATE 6  /* diversion deactivate */
-#define DIVERT_REPORT     7  /* interrogation result */
-#define DEFLECT_AUTODEL 255  /* only for internal use */
-
-#define DEFLECT_ALL_IDS   0xFFFFFFFF /* all drivers selected */
-
-typedef struct {
-	ulong drvid;     /* driver ids, bit mapped */
-	char my_msn[35]; /* desired msn, subaddr allowed */
-	char caller[35]; /* caller id, partial string with * + subaddr allowed */
-	char to_nr[35];  /* deflected to number incl. subaddress */
-	u_char si1, si2;  /* service indicators, si1=bitmask, si1+2 0 = all */
-	u_char screen;   /* screening: 0 = no info, 1 = info, 2 = nfo with nr */
-	u_char callopt;  /* option for call handling:
-			    0 = all calls
-			    1 = only non waiting calls
-			    2 = only waiting calls */
-	u_char action;   /* desired action:
-			    0 = don't report call -> ignore
-			    1 = report call, do not allow/proceed for deflection
-			    2 = report call, send proceed, wait max waittime secs
-			    3 = report call, alert and deflect after waittime
-			    4 = report call, reject immediately
-			    actions 1-2 only take place if interface is opened
-			 */
-	u_char waittime; /* maximum wait time for proceeding */
-} divert_rule;
-
-typedef union {
-	int drv_version; /* return of driver version */
-	struct {
-		int drvid;		/* id of driver */
-		char drvnam[30];	/* name of driver */
-	} getid;
-	struct {
-		int ruleidx;	/* index of rule */
-		divert_rule rule;	/* rule parms */
-	} getsetrule;
-	struct {
-		u_char subcmd;  /* 0 = hangup/reject,
-			     1 = alert,
-			     2 = deflect */
-		ulong callid;   /* id of call delivered by ascii output */
-		char to_nr[35]; /* destination when deflect,
-				   else uus1 string (maxlen 31),
-				   data from rule used if empty */
-	} fwd_ctrl;
-	struct {
-		int drvid;      /* id of driver */
-		u_char cfproc;  /* cfu = 0, cfb = 1, cfnr = 2 */
-		ulong procid;   /* process id returned when no error */
-		u_char service; /* basically coded service, 0 = all */
-		char msn[25];   /* desired msn, empty = all */
-		char fwd_nr[35];/* forwarded to number + subaddress */
-	} cf_ctrl;
-} divert_ioctl;
-
-#ifdef __KERNEL__
-
-#include <linux/isdnif.h>
-#include <linux/isdn_divertif.h>
-
-#define AUTODEL_TIME 30 /* timeout in s to delete internal entries */
-
-/**************************************************/
-/* structure keeping ascii info for device output */
-/**************************************************/
-struct divert_info {
-	struct divert_info *next;
-	ulong usage_cnt; /* number of files still to work */
-	char info_start[2]; /* info string start */
-};
-
-
-/**************/
-/* Prototypes */
-/**************/
-extern spinlock_t divert_lock;
-
-extern ulong if_used; /* number of interface users */
-extern int divert_dev_deinit(void);
-extern int divert_dev_init(void);
-extern void put_info_buffer(char *);
-extern int ll_callback(isdn_ctrl *);
-extern isdn_divert_if divert_if;
-extern divert_rule *getruleptr(int);
-extern int insertrule(int, divert_rule *);
-extern int deleterule(int);
-extern void deleteprocs(void);
-extern int deflect_extern_action(u_char, ulong, char *);
-extern int cf_command(int, int, u_char, char *, u_char, char *, ulong *);
-
-#endif /* __KERNEL__ */
diff --git a/drivers/isdn/i4l/Kconfig b/drivers/isdn/i4l/Kconfig
deleted file mode 100644
index cacde8de38a3..000000000000
--- a/drivers/isdn/i4l/Kconfig
+++ /dev/null
@@ -1,127 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Old ISDN4Linux config
-#
-
-if ISDN_I4L
-
-config ISDN_PPP
-	bool "Support synchronous PPP"
-	depends on INET
-	select SLHC
-	help
-	  Over digital connections such as ISDN, there is no need to
-	  synchronize sender and recipient's clocks with start and stop bits
-	  as is done over analog telephone lines. Instead, one can use
-	  "synchronous PPP". Saying Y here will include this protocol. This
-	  protocol is used by Cisco and Sun for example. So you want to say Y
-	  here if the other end of your ISDN connection supports it. You will
-	  need a special version of pppd (called ipppd) for using this
-	  feature. See <file:Documentation/isdn/README.syncppp> and
-	  <file:Documentation/isdn/syncPPP.FAQ> for more information.
-
-config ISDN_PPP_VJ
-	bool "Use VJ-compression with synchronous PPP"
-	depends on ISDN_PPP
-	help
-	  This enables Van Jacobson header compression for synchronous PPP.
-	  Say Y if the other end of the connection supports it.
-
-config ISDN_MPP
-	bool "Support generic MP (RFC 1717)"
-	depends on ISDN_PPP
-	help
-	  With synchronous PPP enabled, it is possible to increase throughput
-	  by bundling several ISDN-connections, using this protocol. See
-	  <file:Documentation/isdn/README.syncppp> for more information.
-
-config IPPP_FILTER
-	bool "Filtering for synchronous PPP"
-	depends on ISDN_PPP
-	help
-	  Say Y here if you want to be able to filter the packets passing over
-	  IPPP interfaces.  This allows you to control which packets count as
-	  activity (i.e. which packets will reset the idle timer or bring up
-	  a demand-dialled link) and which packets are to be dropped entirely.
-	  You need to say Y here if you wish to use the pass-filter and
-	  active-filter options to ipppd.
-
-config ISDN_PPP_BSDCOMP
-	tristate "Support BSD compression"
-	depends on ISDN_PPP
-	help
-	  Support for the BSD-Compress compression method for PPP, which uses
-	  the LZW compression method to compress each PPP packet before it is
-	  sent over the wire. The machine at the other end of the PPP link
-	  (usually your ISP) has to support the BSD-Compress compression
-	  method as well for this to be useful. Even if they don't support it,
-	  it is safe to say Y here.
-
-config ISDN_AUDIO
-	bool "Support audio via ISDN"
-	help
-	  If you say Y here, the modem-emulator will support a subset of the
-	  EIA Class 8 Voice commands. Using a getty with voice-support
-	  (mgetty+sendfax by <gert@greenie.muc.de> with an extension, available
-	  with the ISDN utility package for example), you will be able to use
-	  your Linux box as an ISDN-answering machine. Of course, this must be
-	  supported by the lowlevel driver also. Currently, the HiSax driver
-	  is the only voice-supporting driver. See
-	  <file:Documentation/isdn/README.audio> for more information.
-
-config ISDN_TTY_FAX
-	bool "Support AT-Fax Class 1 and 2 commands"
-	depends on ISDN_AUDIO
-	help
-	  If you say Y here, the modem-emulator will support a subset of the
-	  Fax Class 1 and 2 commands. Using a getty with fax-support
-	  (mgetty+sendfax, hylafax), you will be able to use your Linux box as
-	  an ISDN-fax-machine. This must be supported by the lowlevel driver
-	  also. See <file:Documentation/isdn/README.fax> for more information.
-
-config ISDN_X25
-	bool "X.25 PLP on top of ISDN"
-	depends on X25
-	help
-	  This feature provides the X.25 protocol over ISDN connections.
-	  See <file:Documentation/isdn/README.x25> for more information
-	  if you are thinking about using this.
-
-
-menu "ISDN feature submodules"
-
-config ISDN_DRV_LOOP
-	tristate "isdnloop support"
-	depends on BROKEN_ON_SMP
-	help
-	  This driver provides a virtual ISDN card. Its primary purpose is
-	  testing of linklevel features or configuration without getting
-	  charged by your service-provider for lots of phone calls.
-	  You need will need the loopctrl utility from the latest isdn4k-utils
-	  package to set up this driver.
-
-config ISDN_DIVERSION
-	tristate "Support isdn diversion services"
-	help
-	  This option allows you to use some supplementary diversion
-	  services in conjunction with the HiSax driver on an EURO/DSS1
-	  line.
-
-	  Supported options are CD (call deflection), CFU (Call forward
-	  unconditional), CFB (Call forward when busy) and CFNR (call forward
-	  not reachable). Additionally the actual CFU, CFB and CFNR state may
-	  be interrogated.
-
-	  The use of CFU, CFB, CFNR and interrogation may be limited to some
-	  countries. The keypad protocol is still not implemented. CD should
-	  work in all countries if the service has been subscribed to.
-
-	  Please read the file <file:Documentation/isdn/README.diversion>.
-
-endmenu
-
-comment "ISDN4Linux hardware drivers"
-
-# end ISDN_I4L
-endif
-
diff --git a/drivers/isdn/i4l/Makefile b/drivers/isdn/i4l/Makefile
index be77500c9e86..11fe697739d5 100644
--- a/drivers/isdn/i4l/Makefile
+++ b/drivers/isdn/i4l/Makefile
@@ -3,18 +3,4 @@
 
 # Each configuration option enables a list of files.
 
-obj-$(CONFIG_ISDN_I4L)		+= isdn.o
-obj-$(CONFIG_ISDN_PPP_BSDCOMP)	+= isdn_bsdcomp.o
 obj-$(CONFIG_ISDN_HDLC)		+= isdnhdlc.o
-
-# Multipart objects.
-
-isdn-y				:= isdn_net.o isdn_tty.o isdn_v110.o isdn_common.o
-
-# Optional parts of multipart objects.
-
-isdn-$(CONFIG_ISDN_PPP)		+= isdn_ppp.o
-isdn-$(CONFIG_ISDN_X25)		+= isdn_concap.o isdn_x25iface.o
-isdn-$(CONFIG_ISDN_AUDIO)		+= isdn_audio.o
-isdn-$(CONFIG_ISDN_TTY_FAX)	+= isdn_ttyfax.o
-
diff --git a/drivers/isdn/i4l/isdn_audio.c b/drivers/isdn/i4l/isdn_audio.c
deleted file mode 100644
index b6bcd1eca128..000000000000
--- a/drivers/isdn/i4l/isdn_audio.c
+++ /dev/null
@@ -1,711 +0,0 @@
-/* $Id: isdn_audio.c,v 1.1.2.2 2004/01/12 22:37:18 keil Exp $
- *
- * Linux ISDN subsystem, audio conversion and compression (linklevel).
- *
- * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de)
- * DTMF code (c) 1996 by Christian Mock (cm@kukuruz.ping.at)
- * Silence detection (c) 1998 by Armin Schindler (mac@gismo.telekom.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/isdn.h>
-#include <linux/slab.h>
-#include "isdn_audio.h"
-#include "isdn_common.h"
-
-char *isdn_audio_revision = "$Revision: 1.1.2.2 $";
-
-/*
- * Misc. lookup-tables.
- */
-
-/* ulaw -> signed 16-bit */
-static short isdn_audio_ulaw_to_s16[] =
-{
-	0x8284, 0x8684, 0x8a84, 0x8e84, 0x9284, 0x9684, 0x9a84, 0x9e84,
-	0xa284, 0xa684, 0xaa84, 0xae84, 0xb284, 0xb684, 0xba84, 0xbe84,
-	0xc184, 0xc384, 0xc584, 0xc784, 0xc984, 0xcb84, 0xcd84, 0xcf84,
-	0xd184, 0xd384, 0xd584, 0xd784, 0xd984, 0xdb84, 0xdd84, 0xdf84,
-	0xe104, 0xe204, 0xe304, 0xe404, 0xe504, 0xe604, 0xe704, 0xe804,
-	0xe904, 0xea04, 0xeb04, 0xec04, 0xed04, 0xee04, 0xef04, 0xf004,
-	0xf0c4, 0xf144, 0xf1c4, 0xf244, 0xf2c4, 0xf344, 0xf3c4, 0xf444,
-	0xf4c4, 0xf544, 0xf5c4, 0xf644, 0xf6c4, 0xf744, 0xf7c4, 0xf844,
-	0xf8a4, 0xf8e4, 0xf924, 0xf964, 0xf9a4, 0xf9e4, 0xfa24, 0xfa64,
-	0xfaa4, 0xfae4, 0xfb24, 0xfb64, 0xfba4, 0xfbe4, 0xfc24, 0xfc64,
-	0xfc94, 0xfcb4, 0xfcd4, 0xfcf4, 0xfd14, 0xfd34, 0xfd54, 0xfd74,
-	0xfd94, 0xfdb4, 0xfdd4, 0xfdf4, 0xfe14, 0xfe34, 0xfe54, 0xfe74,
-	0xfe8c, 0xfe9c, 0xfeac, 0xfebc, 0xfecc, 0xfedc, 0xfeec, 0xfefc,
-	0xff0c, 0xff1c, 0xff2c, 0xff3c, 0xff4c, 0xff5c, 0xff6c, 0xff7c,
-	0xff88, 0xff90, 0xff98, 0xffa0, 0xffa8, 0xffb0, 0xffb8, 0xffc0,
-	0xffc8, 0xffd0, 0xffd8, 0xffe0, 0xffe8, 0xfff0, 0xfff8, 0x0000,
-	0x7d7c, 0x797c, 0x757c, 0x717c, 0x6d7c, 0x697c, 0x657c, 0x617c,
-	0x5d7c, 0x597c, 0x557c, 0x517c, 0x4d7c, 0x497c, 0x457c, 0x417c,
-	0x3e7c, 0x3c7c, 0x3a7c, 0x387c, 0x367c, 0x347c, 0x327c, 0x307c,
-	0x2e7c, 0x2c7c, 0x2a7c, 0x287c, 0x267c, 0x247c, 0x227c, 0x207c,
-	0x1efc, 0x1dfc, 0x1cfc, 0x1bfc, 0x1afc, 0x19fc, 0x18fc, 0x17fc,
-	0x16fc, 0x15fc, 0x14fc, 0x13fc, 0x12fc, 0x11fc, 0x10fc, 0x0ffc,
-	0x0f3c, 0x0ebc, 0x0e3c, 0x0dbc, 0x0d3c, 0x0cbc, 0x0c3c, 0x0bbc,
-	0x0b3c, 0x0abc, 0x0a3c, 0x09bc, 0x093c, 0x08bc, 0x083c, 0x07bc,
-	0x075c, 0x071c, 0x06dc, 0x069c, 0x065c, 0x061c, 0x05dc, 0x059c,
-	0x055c, 0x051c, 0x04dc, 0x049c, 0x045c, 0x041c, 0x03dc, 0x039c,
-	0x036c, 0x034c, 0x032c, 0x030c, 0x02ec, 0x02cc, 0x02ac, 0x028c,
-	0x026c, 0x024c, 0x022c, 0x020c, 0x01ec, 0x01cc, 0x01ac, 0x018c,
-	0x0174, 0x0164, 0x0154, 0x0144, 0x0134, 0x0124, 0x0114, 0x0104,
-	0x00f4, 0x00e4, 0x00d4, 0x00c4, 0x00b4, 0x00a4, 0x0094, 0x0084,
-	0x0078, 0x0070, 0x0068, 0x0060, 0x0058, 0x0050, 0x0048, 0x0040,
-	0x0038, 0x0030, 0x0028, 0x0020, 0x0018, 0x0010, 0x0008, 0x0000
-};
-
-/* alaw -> signed 16-bit */
-static short isdn_audio_alaw_to_s16[] =
-{
-	0x13fc, 0xec04, 0x0144, 0xfebc, 0x517c, 0xae84, 0x051c, 0xfae4,
-	0x0a3c, 0xf5c4, 0x0048, 0xffb8, 0x287c, 0xd784, 0x028c, 0xfd74,
-	0x1bfc, 0xe404, 0x01cc, 0xfe34, 0x717c, 0x8e84, 0x071c, 0xf8e4,
-	0x0e3c, 0xf1c4, 0x00c4, 0xff3c, 0x387c, 0xc784, 0x039c, 0xfc64,
-	0x0ffc, 0xf004, 0x0104, 0xfefc, 0x417c, 0xbe84, 0x041c, 0xfbe4,
-	0x083c, 0xf7c4, 0x0008, 0xfff8, 0x207c, 0xdf84, 0x020c, 0xfdf4,
-	0x17fc, 0xe804, 0x018c, 0xfe74, 0x617c, 0x9e84, 0x061c, 0xf9e4,
-	0x0c3c, 0xf3c4, 0x0084, 0xff7c, 0x307c, 0xcf84, 0x030c, 0xfcf4,
-	0x15fc, 0xea04, 0x0164, 0xfe9c, 0x597c, 0xa684, 0x059c, 0xfa64,
-	0x0b3c, 0xf4c4, 0x0068, 0xff98, 0x2c7c, 0xd384, 0x02cc, 0xfd34,
-	0x1dfc, 0xe204, 0x01ec, 0xfe14, 0x797c, 0x8684, 0x07bc, 0xf844,
-	0x0f3c, 0xf0c4, 0x00e4, 0xff1c, 0x3c7c, 0xc384, 0x03dc, 0xfc24,
-	0x11fc, 0xee04, 0x0124, 0xfedc, 0x497c, 0xb684, 0x049c, 0xfb64,
-	0x093c, 0xf6c4, 0x0028, 0xffd8, 0x247c, 0xdb84, 0x024c, 0xfdb4,
-	0x19fc, 0xe604, 0x01ac, 0xfe54, 0x697c, 0x9684, 0x069c, 0xf964,
-	0x0d3c, 0xf2c4, 0x00a4, 0xff5c, 0x347c, 0xcb84, 0x034c, 0xfcb4,
-	0x12fc, 0xed04, 0x0134, 0xfecc, 0x4d7c, 0xb284, 0x04dc, 0xfb24,
-	0x09bc, 0xf644, 0x0038, 0xffc8, 0x267c, 0xd984, 0x026c, 0xfd94,
-	0x1afc, 0xe504, 0x01ac, 0xfe54, 0x6d7c, 0x9284, 0x06dc, 0xf924,
-	0x0dbc, 0xf244, 0x00b4, 0xff4c, 0x367c, 0xc984, 0x036c, 0xfc94,
-	0x0f3c, 0xf0c4, 0x00f4, 0xff0c, 0x3e7c, 0xc184, 0x03dc, 0xfc24,
-	0x07bc, 0xf844, 0x0008, 0xfff8, 0x1efc, 0xe104, 0x01ec, 0xfe14,
-	0x16fc, 0xe904, 0x0174, 0xfe8c, 0x5d7c, 0xa284, 0x05dc, 0xfa24,
-	0x0bbc, 0xf444, 0x0078, 0xff88, 0x2e7c, 0xd184, 0x02ec, 0xfd14,
-	0x14fc, 0xeb04, 0x0154, 0xfeac, 0x557c, 0xaa84, 0x055c, 0xfaa4,
-	0x0abc, 0xf544, 0x0058, 0xffa8, 0x2a7c, 0xd584, 0x02ac, 0xfd54,
-	0x1cfc, 0xe304, 0x01cc, 0xfe34, 0x757c, 0x8a84, 0x075c, 0xf8a4,
-	0x0ebc, 0xf144, 0x00d4, 0xff2c, 0x3a7c, 0xc584, 0x039c, 0xfc64,
-	0x10fc, 0xef04, 0x0114, 0xfeec, 0x457c, 0xba84, 0x045c, 0xfba4,
-	0x08bc, 0xf744, 0x0018, 0xffe8, 0x227c, 0xdd84, 0x022c, 0xfdd4,
-	0x18fc, 0xe704, 0x018c, 0xfe74, 0x657c, 0x9a84, 0x065c, 0xf9a4,
-	0x0cbc, 0xf344, 0x0094, 0xff6c, 0x327c, 0xcd84, 0x032c, 0xfcd4
-};
-
-/* alaw -> ulaw */
-static char isdn_audio_alaw_to_ulaw[] =
-{
-	0xab, 0x2b, 0xe3, 0x63, 0x8b, 0x0b, 0xc9, 0x49,
-	0xba, 0x3a, 0xf6, 0x76, 0x9b, 0x1b, 0xd7, 0x57,
-	0xa3, 0x23, 0xdd, 0x5d, 0x83, 0x03, 0xc1, 0x41,
-	0xb2, 0x32, 0xeb, 0x6b, 0x93, 0x13, 0xcf, 0x4f,
-	0xaf, 0x2f, 0xe7, 0x67, 0x8f, 0x0f, 0xcd, 0x4d,
-	0xbe, 0x3e, 0xfe, 0x7e, 0x9f, 0x1f, 0xdb, 0x5b,
-	0xa7, 0x27, 0xdf, 0x5f, 0x87, 0x07, 0xc5, 0x45,
-	0xb6, 0x36, 0xef, 0x6f, 0x97, 0x17, 0xd3, 0x53,
-	0xa9, 0x29, 0xe1, 0x61, 0x89, 0x09, 0xc7, 0x47,
-	0xb8, 0x38, 0xf2, 0x72, 0x99, 0x19, 0xd5, 0x55,
-	0xa1, 0x21, 0xdc, 0x5c, 0x81, 0x01, 0xbf, 0x3f,
-	0xb0, 0x30, 0xe9, 0x69, 0x91, 0x11, 0xce, 0x4e,
-	0xad, 0x2d, 0xe5, 0x65, 0x8d, 0x0d, 0xcb, 0x4b,
-	0xbc, 0x3c, 0xfa, 0x7a, 0x9d, 0x1d, 0xd9, 0x59,
-	0xa5, 0x25, 0xde, 0x5e, 0x85, 0x05, 0xc3, 0x43,
-	0xb4, 0x34, 0xed, 0x6d, 0x95, 0x15, 0xd1, 0x51,
-	0xac, 0x2c, 0xe4, 0x64, 0x8c, 0x0c, 0xca, 0x4a,
-	0xbb, 0x3b, 0xf8, 0x78, 0x9c, 0x1c, 0xd8, 0x58,
-	0xa4, 0x24, 0xde, 0x5e, 0x84, 0x04, 0xc2, 0x42,
-	0xb3, 0x33, 0xec, 0x6c, 0x94, 0x14, 0xd0, 0x50,
-	0xb0, 0x30, 0xe8, 0x68, 0x90, 0x10, 0xce, 0x4e,
-	0xbf, 0x3f, 0xfe, 0x7e, 0xa0, 0x20, 0xdc, 0x5c,
-	0xa8, 0x28, 0xe0, 0x60, 0x88, 0x08, 0xc6, 0x46,
-	0xb7, 0x37, 0xf0, 0x70, 0x98, 0x18, 0xd4, 0x54,
-	0xaa, 0x2a, 0xe2, 0x62, 0x8a, 0x0a, 0xc8, 0x48,
-	0xb9, 0x39, 0xf4, 0x74, 0x9a, 0x1a, 0xd6, 0x56,
-	0xa2, 0x22, 0xdd, 0x5d, 0x82, 0x02, 0xc0, 0x40,
-	0xb1, 0x31, 0xea, 0x6a, 0x92, 0x12, 0xcf, 0x4f,
-	0xae, 0x2e, 0xe6, 0x66, 0x8e, 0x0e, 0xcc, 0x4c,
-	0xbd, 0x3d, 0xfc, 0x7c, 0x9e, 0x1e, 0xda, 0x5a,
-	0xa6, 0x26, 0xdf, 0x5f, 0x86, 0x06, 0xc4, 0x44,
-	0xb5, 0x35, 0xee, 0x6e, 0x96, 0x16, 0xd2, 0x52
-};
-
-/* ulaw -> alaw */
-static char isdn_audio_ulaw_to_alaw[] =
-{
-	0xab, 0x55, 0xd5, 0x15, 0x95, 0x75, 0xf5, 0x35,
-	0xb5, 0x45, 0xc5, 0x05, 0x85, 0x65, 0xe5, 0x25,
-	0xa5, 0x5d, 0xdd, 0x1d, 0x9d, 0x7d, 0xfd, 0x3d,
-	0xbd, 0x4d, 0xcd, 0x0d, 0x8d, 0x6d, 0xed, 0x2d,
-	0xad, 0x51, 0xd1, 0x11, 0x91, 0x71, 0xf1, 0x31,
-	0xb1, 0x41, 0xc1, 0x01, 0x81, 0x61, 0xe1, 0x21,
-	0x59, 0xd9, 0x19, 0x99, 0x79, 0xf9, 0x39, 0xb9,
-	0x49, 0xc9, 0x09, 0x89, 0x69, 0xe9, 0x29, 0xa9,
-	0xd7, 0x17, 0x97, 0x77, 0xf7, 0x37, 0xb7, 0x47,
-	0xc7, 0x07, 0x87, 0x67, 0xe7, 0x27, 0xa7, 0xdf,
-	0x9f, 0x7f, 0xff, 0x3f, 0xbf, 0x4f, 0xcf, 0x0f,
-	0x8f, 0x6f, 0xef, 0x2f, 0x53, 0x13, 0x73, 0x33,
-	0xb3, 0x43, 0xc3, 0x03, 0x83, 0x63, 0xe3, 0x23,
-	0xa3, 0x5b, 0xdb, 0x1b, 0x9b, 0x7b, 0xfb, 0x3b,
-	0xbb, 0xbb, 0x4b, 0x4b, 0xcb, 0xcb, 0x0b, 0x0b,
-	0x8b, 0x8b, 0x6b, 0x6b, 0xeb, 0xeb, 0x2b, 0x2b,
-	0xab, 0x54, 0xd4, 0x14, 0x94, 0x74, 0xf4, 0x34,
-	0xb4, 0x44, 0xc4, 0x04, 0x84, 0x64, 0xe4, 0x24,
-	0xa4, 0x5c, 0xdc, 0x1c, 0x9c, 0x7c, 0xfc, 0x3c,
-	0xbc, 0x4c, 0xcc, 0x0c, 0x8c, 0x6c, 0xec, 0x2c,
-	0xac, 0x50, 0xd0, 0x10, 0x90, 0x70, 0xf0, 0x30,
-	0xb0, 0x40, 0xc0, 0x00, 0x80, 0x60, 0xe0, 0x20,
-	0x58, 0xd8, 0x18, 0x98, 0x78, 0xf8, 0x38, 0xb8,
-	0x48, 0xc8, 0x08, 0x88, 0x68, 0xe8, 0x28, 0xa8,
-	0xd6, 0x16, 0x96, 0x76, 0xf6, 0x36, 0xb6, 0x46,
-	0xc6, 0x06, 0x86, 0x66, 0xe6, 0x26, 0xa6, 0xde,
-	0x9e, 0x7e, 0xfe, 0x3e, 0xbe, 0x4e, 0xce, 0x0e,
-	0x8e, 0x6e, 0xee, 0x2e, 0x52, 0x12, 0x72, 0x32,
-	0xb2, 0x42, 0xc2, 0x02, 0x82, 0x62, 0xe2, 0x22,
-	0xa2, 0x5a, 0xda, 0x1a, 0x9a, 0x7a, 0xfa, 0x3a,
-	0xba, 0xba, 0x4a, 0x4a, 0xca, 0xca, 0x0a, 0x0a,
-	0x8a, 0x8a, 0x6a, 0x6a, 0xea, 0xea, 0x2a, 0x2a
-};
-
-#define NCOEFF            8     /* number of frequencies to be analyzed       */
-#define DTMF_TRESH     4000     /* above this is dtmf                         */
-#define SILENCE_TRESH   200     /* below this is silence                      */
-#define AMP_BITS          9     /* bits per sample, reduced to avoid overflow */
-#define LOGRP             0
-#define HIGRP             1
-
-/* For DTMF recognition:
- * 2 * cos(2 * PI * k / N) precalculated for all k
- */
-static int cos2pik[NCOEFF] =
-{
-	55813, 53604, 51193, 48591, 38114, 33057, 25889, 18332
-};
-
-static char dtmf_matrix[4][4] =
-{
-	{'1', '2', '3', 'A'},
-	{'4', '5', '6', 'B'},
-	{'7', '8', '9', 'C'},
-	{'*', '0', '#', 'D'}
-};
-
-static inline void
-isdn_audio_tlookup(const u_char *table, u_char *buff, unsigned long n)
-{
-#ifdef __i386__
-	unsigned long d0, d1, d2, d3;
-	__asm__ __volatile__(
-		"cld\n"
-		"1:\tlodsb\n\t"
-		"xlatb\n\t"
-		"stosb\n\t"
-		"loop 1b\n\t"
-		:	"=&b"(d0), "=&c"(d1), "=&D"(d2), "=&S"(d3)
-		:	"0"((long) table), "1"(n), "2"((long) buff), "3"((long) buff)
-		:	"memory", "ax");
-#else
-	while (n--)
-		*buff = table[*(unsigned char *)buff], buff++;
-#endif
-}
-
-void
-isdn_audio_ulaw2alaw(unsigned char *buff, unsigned long len)
-{
-	isdn_audio_tlookup(isdn_audio_ulaw_to_alaw, buff, len);
-}
-
-void
-isdn_audio_alaw2ulaw(unsigned char *buff, unsigned long len)
-{
-	isdn_audio_tlookup(isdn_audio_alaw_to_ulaw, buff, len);
-}
-
-/*
- * linear <-> adpcm conversion stuff
- * Most parts from the mgetty-package.
- * (C) by Gert Doering and Klaus Weidner
- * Used by permission of Gert Doering
- */
-
-
-#define ZEROTRAP                /* turn on the trap as per the MIL-STD */
-#undef ZEROTRAP
-#define BIAS 0x84               /* define the add-in bias for 16 bit samples */
-#define CLIP 32635
-
-static unsigned char
-isdn_audio_linear2ulaw(int sample)
-{
-	static int exp_lut[256] =
-		{
-			0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
-			4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
-			5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-			5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-			6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-			6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-			6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-			6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-			7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-		};
-	int sign,
-		exponent,
-		mantissa;
-	unsigned char ulawbyte;
-
-	/* Get the sample into sign-magnitude. */
-	sign = (sample >> 8) & 0x80;	/* set aside the sign  */
-	if (sign != 0)
-		sample = -sample;	/* get magnitude       */
-	if (sample > CLIP)
-		sample = CLIP;  /* clip the magnitude  */
-
-	/* Convert from 16 bit linear to ulaw. */
-	sample = sample + BIAS;
-	exponent = exp_lut[(sample >> 7) & 0xFF];
-	mantissa = (sample >> (exponent + 3)) & 0x0F;
-	ulawbyte = ~(sign | (exponent << 4) | mantissa);
-#ifdef ZEROTRAP
-	/* optional CCITT trap */
-	if (ulawbyte == 0)
-		ulawbyte = 0x02;
-#endif
-	return (ulawbyte);
-}
-
-
-static int Mx[3][8] =
-{
-	{0x3800, 0x5600, 0, 0, 0, 0, 0, 0},
-	{0x399a, 0x3a9f, 0x4d14, 0x6607, 0, 0, 0, 0},
-	{0x3556, 0x3556, 0x399A, 0x3A9F, 0x4200, 0x4D14, 0x6607, 0x6607},
-};
-
-static int bitmask[9] =
-{
-	0, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff
-};
-
-static int
-isdn_audio_get_bits(adpcm_state *s, unsigned char **in, int *len)
-{
-	while (s->nleft < s->nbits) {
-		int d = *((*in)++);
-		(*len)--;
-		s->word = (s->word << 8) | d;
-		s->nleft += 8;
-	}
-	s->nleft -= s->nbits;
-	return (s->word >> s->nleft) & bitmask[s->nbits];
-}
-
-static void
-isdn_audio_put_bits(int data, int nbits, adpcm_state *s,
-		    unsigned char **out, int *len)
-{
-	s->word = (s->word << nbits) | (data & bitmask[nbits]);
-	s->nleft += nbits;
-	while (s->nleft >= 8) {
-		int d = (s->word >> (s->nleft - 8));
-		*(out[0]++) = d & 255;
-		(*len)++;
-		s->nleft -= 8;
-	}
-}
-
-adpcm_state *
-isdn_audio_adpcm_init(adpcm_state *s, int nbits)
-{
-	if (!s)
-		s = kmalloc(sizeof(adpcm_state), GFP_ATOMIC);
-	if (s) {
-		s->a = 0;
-		s->d = 5;
-		s->word = 0;
-		s->nleft = 0;
-		s->nbits = nbits;
-	}
-	return s;
-}
-
-dtmf_state *
-isdn_audio_dtmf_init(dtmf_state *s)
-{
-	if (!s)
-		s = kmalloc(sizeof(dtmf_state), GFP_ATOMIC);
-	if (s) {
-		s->idx = 0;
-		s->last = ' ';
-	}
-	return s;
-}
-
-/*
- * Decompression of adpcm data to a/u-law
- *
- */
-
-int
-isdn_audio_adpcm2xlaw(adpcm_state *s, int fmt, unsigned char *in,
-		      unsigned char *out, int len)
-{
-	int a = s->a;
-	int d = s->d;
-	int nbits = s->nbits;
-	int olen = 0;
-
-	while (len) {
-		int e = isdn_audio_get_bits(s, &in, &len);
-		int sign;
-
-		if (nbits == 4 && e == 0)
-			d = 4;
-		sign = (e >> (nbits - 1)) ? -1 : 1;
-		e &= bitmask[nbits - 1];
-		a += sign * ((e << 1) + 1) * d >> 1;
-		if (d & 1)
-			a++;
-		if (fmt)
-			*out++ = isdn_audio_ulaw_to_alaw[
-				isdn_audio_linear2ulaw(a << 2)];
-		else
-			*out++ = isdn_audio_linear2ulaw(a << 2);
-		olen++;
-		d = (d * Mx[nbits - 2][e] + 0x2000) >> 14;
-		if (d < 5)
-			d = 5;
-	}
-	s->a = a;
-	s->d = d;
-	return olen;
-}
-
-int
-isdn_audio_xlaw2adpcm(adpcm_state *s, int fmt, unsigned char *in,
-		      unsigned char *out, int len)
-{
-	int a = s->a;
-	int d = s->d;
-	int nbits = s->nbits;
-	int olen = 0;
-
-	while (len--) {
-		int e = 0,
-			nmax = 1 << (nbits - 1);
-		int sign,
-			delta;
-
-		if (fmt)
-			delta = (isdn_audio_alaw_to_s16[*in++] >> 2) - a;
-		else
-			delta = (isdn_audio_ulaw_to_s16[*in++] >> 2) - a;
-		if (delta < 0) {
-			e = nmax;
-			delta = -delta;
-		}
-		while (--nmax && delta > d) {
-			delta -= d;
-			e++;
-		}
-		if (nbits == 4 && ((e & 0x0f) == 0))
-			e = 8;
-		isdn_audio_put_bits(e, nbits, s, &out, &olen);
-		sign = (e >> (nbits - 1)) ? -1 : 1;
-		e &= bitmask[nbits - 1];
-
-		a += sign * ((e << 1) + 1) * d >> 1;
-		if (d & 1)
-			a++;
-		d = (d * Mx[nbits - 2][e] + 0x2000) >> 14;
-		if (d < 5)
-			d = 5;
-	}
-	s->a = a;
-	s->d = d;
-	return olen;
-}
-
-/*
- * Goertzel algorithm.
- * See http://ptolemy.eecs.berkeley.edu/papers/96/dtmf_ict/
- * for more info.
- * Result is stored into an sk_buff and queued up for later
- * evaluation.
- */
-static void
-isdn_audio_goertzel(int *sample, modem_info *info)
-{
-	int sk,
-		sk1,
-		sk2;
-	int k,
-		n;
-	struct sk_buff *skb;
-	int *result;
-
-	skb = dev_alloc_skb(sizeof(int) * NCOEFF);
-	if (!skb) {
-		printk(KERN_WARNING
-		       "isdn_audio: Could not alloc DTMF result for ttyI%d\n",
-		       info->line);
-		return;
-	}
-	result = skb_put(skb, sizeof(int) * NCOEFF);
-	for (k = 0; k < NCOEFF; k++) {
-		sk = sk1 = sk2 = 0;
-		for (n = 0; n < DTMF_NPOINTS; n++) {
-			sk = sample[n] + ((cos2pik[k] * sk1) >> 15) - sk2;
-			sk2 = sk1;
-			sk1 = sk;
-		}
-		/* Avoid overflows */
-		sk >>= 1;
-		sk2 >>= 1;
-		/* compute |X(k)|**2 */
-		/* report overflows. This should not happen. */
-		/* Comment this out if desired */
-		if (sk < -32768 || sk > 32767)
-			printk(KERN_DEBUG
-			       "isdn_audio: dtmf goertzel overflow, sk=%d\n", sk);
-		if (sk2 < -32768 || sk2 > 32767)
-			printk(KERN_DEBUG
-			       "isdn_audio: dtmf goertzel overflow, sk2=%d\n", sk2);
-		result[k] =
-			((sk * sk) >> AMP_BITS) -
-			((((cos2pik[k] * sk) >> 15) * sk2) >> AMP_BITS) +
-			((sk2 * sk2) >> AMP_BITS);
-	}
-	skb_queue_tail(&info->dtmf_queue, skb);
-	isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1);
-}
-
-void
-isdn_audio_eval_dtmf(modem_info *info)
-{
-	struct sk_buff *skb;
-	int *result;
-	dtmf_state *s;
-	int silence;
-	int i;
-	int di;
-	int ch;
-	int grp[2];
-	char what;
-	char *p;
-	int thresh;
-
-	while ((skb = skb_dequeue(&info->dtmf_queue))) {
-		result = (int *) skb->data;
-		s = info->dtmf_state;
-		grp[LOGRP] = grp[HIGRP] = -1;
-		silence = 0;
-		thresh = 0;
-		for (i = 0; i < NCOEFF; i++) {
-			if (result[i] > DTMF_TRESH) {
-				if (result[i] > thresh)
-					thresh = result[i];
-			}
-			else if (result[i] < SILENCE_TRESH)
-				silence++;
-		}
-		if (silence == NCOEFF)
-			what = ' ';
-		else {
-			if (thresh > 0)	{
-				thresh = thresh >> 4;  /* touchtones must match within 12 dB */
-				for (i = 0; i < NCOEFF; i++) {
-					if (result[i] < thresh)
-						continue;  /* ignore */
-					/* good level found. This is allowed only one time per group */
-					if (i < NCOEFF / 2) {
-						/* lowgroup*/
-						if (grp[LOGRP] >= 0) {
-							// Bad. Another tone found. */
-							grp[LOGRP] = -1;
-							break;
-						}
-						else
-							grp[LOGRP] = i;
-					}
-					else { /* higroup */
-						if (grp[HIGRP] >= 0) { // Bad. Another tone found. */
-							grp[HIGRP] = -1;
-							break;
-						}
-						else
-							grp[HIGRP] = i - NCOEFF/2;
-					}
-				}
-				if ((grp[LOGRP] >= 0) && (grp[HIGRP] >= 0)) {
-					what = dtmf_matrix[grp[LOGRP]][grp[HIGRP]];
-					if (s->last != ' ' && s->last != '.')
-						s->last = what;	/* min. 1 non-DTMF between DTMF */
-				} else
-					what = '.';
-			}
-			else
-				what = '.';
-		}
-		if ((what != s->last) && (what != ' ') && (what != '.')) {
-			printk(KERN_DEBUG "dtmf: tt='%c'\n", what);
-			p = skb->data;
-			*p++ = 0x10;
-			*p = what;
-			skb_trim(skb, 2);
-			ISDN_AUDIO_SKB_DLECOUNT(skb) = 0;
-			ISDN_AUDIO_SKB_LOCK(skb) = 0;
-			di = info->isdn_driver;
-			ch = info->isdn_channel;
-			__skb_queue_tail(&dev->drv[di]->rpqueue[ch], skb);
-			dev->drv[di]->rcvcount[ch] += 2;
-			/* Schedule dequeuing */
-			if ((dev->modempoll) && (info->rcvsched))
-				isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1);
-			wake_up_interruptible(&dev->drv[di]->rcv_waitq[ch]);
-		} else
-			kfree_skb(skb);
-		s->last = what;
-	}
-}
-
-/*
- * Decode DTMF tones, queue result in separate sk_buf for
- * later examination.
- * Parameters:
- *   s    = pointer to state-struct.
- *   buf  = input audio data
- *   len  = size of audio data.
- *   fmt  = audio data format (0 = ulaw, 1 = alaw)
- */
-void
-isdn_audio_calc_dtmf(modem_info *info, unsigned char *buf, int len, int fmt)
-{
-	dtmf_state *s = info->dtmf_state;
-	int i;
-	int c;
-
-	while (len) {
-		c = DTMF_NPOINTS - s->idx;
-		if (c > len)
-			c = len;
-		if (c <= 0)
-			break;
-		for (i = 0; i < c; i++) {
-			if (fmt)
-				s->buf[s->idx++] =
-					isdn_audio_alaw_to_s16[*buf++] >> (15 - AMP_BITS);
-			else
-				s->buf[s->idx++] =
-					isdn_audio_ulaw_to_s16[*buf++] >> (15 - AMP_BITS);
-		}
-		if (s->idx == DTMF_NPOINTS) {
-			isdn_audio_goertzel(s->buf, info);
-			s->idx = 0;
-		}
-		len -= c;
-	}
-}
-
-silence_state *
-isdn_audio_silence_init(silence_state *s)
-{
-	if (!s)
-		s = kmalloc(sizeof(silence_state), GFP_ATOMIC);
-	if (s) {
-		s->idx = 0;
-		s->state = 0;
-	}
-	return s;
-}
-
-void
-isdn_audio_calc_silence(modem_info *info, unsigned char *buf, int len, int fmt)
-{
-	silence_state *s = info->silence_state;
-	int i;
-	signed char c;
-
-	if (!info->emu.vpar[1]) return;
-
-	for (i = 0; i < len; i++) {
-		if (fmt)
-			c = isdn_audio_alaw_to_ulaw[*buf++];
-		else
-			c = *buf++;
-
-		if (c > 0) c -= 128;
-		c = abs(c);
-
-		if (c > (info->emu.vpar[1] * 4)) {
-			s->idx = 0;
-			s->state = 1;
-		} else {
-			if (s->idx < 210000) s->idx++;
-		}
-	}
-}
-
-void
-isdn_audio_put_dle_code(modem_info *info, u_char code)
-{
-	struct sk_buff *skb;
-	int di;
-	int ch;
-	char *p;
-
-	skb = dev_alloc_skb(2);
-	if (!skb) {
-		printk(KERN_WARNING
-		       "isdn_audio: Could not alloc skb for ttyI%d\n",
-		       info->line);
-		return;
-	}
-	p = skb_put(skb, 2);
-	p[0] = 0x10;
-	p[1] = code;
-	ISDN_AUDIO_SKB_DLECOUNT(skb) = 0;
-	ISDN_AUDIO_SKB_LOCK(skb) = 0;
-	di = info->isdn_driver;
-	ch = info->isdn_channel;
-	__skb_queue_tail(&dev->drv[di]->rpqueue[ch], skb);
-	dev->drv[di]->rcvcount[ch] += 2;
-	/* Schedule dequeuing */
-	if ((dev->modempoll) && (info->rcvsched))
-		isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1);
-	wake_up_interruptible(&dev->drv[di]->rcv_waitq[ch]);
-}
-
-void
-isdn_audio_eval_silence(modem_info *info)
-{
-	silence_state *s = info->silence_state;
-	char what;
-
-	what = ' ';
-
-	if (s->idx > (info->emu.vpar[2] * 800)) {
-		s->idx = 0;
-		if (!s->state) {	/* silence from beginning of rec */
-			what = 's';
-		} else {
-			what = 'q';
-		}
-	}
-	if ((what == 's') || (what == 'q')) {
-		printk(KERN_DEBUG "ttyI%d: %s\n", info->line,
-		       (what == 's') ? "silence" : "quiet");
-		isdn_audio_put_dle_code(info, what);
-	}
-}
diff --git a/drivers/isdn/i4l/isdn_audio.h b/drivers/isdn/i4l/isdn_audio.h
deleted file mode 100644
index 013c3582e0d1..000000000000
--- a/drivers/isdn/i4l/isdn_audio.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* $Id: isdn_audio.h,v 1.1.2.2 2004/01/12 22:37:18 keil Exp $
- *
- * Linux ISDN subsystem, audio conversion and compression (linklevel).
- *
- * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#define DTMF_NPOINTS 205        /* Number of samples for DTMF recognition */
-typedef struct adpcm_state {
-	int a;
-	int d;
-	int word;
-	int nleft;
-	int nbits;
-} adpcm_state;
-
-typedef struct dtmf_state {
-	char last;
-	char llast;
-	int idx;
-	int buf[DTMF_NPOINTS];
-} dtmf_state;
-
-typedef struct silence_state {
-	int state;
-	unsigned int idx;
-} silence_state;
-
-extern void isdn_audio_ulaw2alaw(unsigned char *, unsigned long);
-extern void isdn_audio_alaw2ulaw(unsigned char *, unsigned long);
-extern adpcm_state *isdn_audio_adpcm_init(adpcm_state *, int);
-extern int isdn_audio_adpcm2xlaw(adpcm_state *, int, unsigned char *, unsigned char *, int);
-extern int isdn_audio_xlaw2adpcm(adpcm_state *, int, unsigned char *, unsigned char *, int);
-extern void isdn_audio_calc_dtmf(modem_info *, unsigned char *, int, int);
-extern void isdn_audio_eval_dtmf(modem_info *);
-dtmf_state *isdn_audio_dtmf_init(dtmf_state *);
-extern void isdn_audio_calc_silence(modem_info *, unsigned char *, int, int);
-extern void isdn_audio_eval_silence(modem_info *);
-silence_state *isdn_audio_silence_init(silence_state *);
-extern void isdn_audio_put_dle_code(modem_info *, u_char);
diff --git a/drivers/isdn/i4l/isdn_bsdcomp.c b/drivers/isdn/i4l/isdn_bsdcomp.c
deleted file mode 100644
index 7f28b967ed19..000000000000
--- a/drivers/isdn/i4l/isdn_bsdcomp.c
+++ /dev/null
@@ -1,930 +0,0 @@
-/*
- * BSD compression module
- *
- * Patched version for ISDN syncPPP written 1997/1998 by Michael Hipp
- * The whole module is now SKB based.
- *
- */
-
-/*
- * Update: The Berkeley copyright was changed, and the change
- * is retroactive to all "true" BSD software (ie everything
- * from UCB as opposed to other peoples code that just carried
- * the same license). The new copyright doesn't clash with the
- * GPL, so the module-only restriction has been removed..
- */
-
-/*
- * Original copyright notice:
- *
- * Copyright (c) 1985, 1986 The Regents of the University of California.
- * All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * James A. Woods, derived from original work by Spencer Thomas
- * and Joseph Orost.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed by the University of
- *	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/interrupt.h>
-#include <linux/ptrace.h>
-#include <linux/ioport.h>
-#include <linux/in.h>
-#include <linux/slab.h>
-#include <linux/tty.h>
-#include <linux/errno.h>
-#include <linux/string.h>	/* used in new tty drivers */
-#include <linux/signal.h>	/* used in new tty drivers */
-#include <linux/bitops.h>
-
-#include <asm/byteorder.h>
-#include <asm/types.h>
-
-#include <linux/if.h>
-
-#include <linux/if_ether.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <linux/inet.h>
-#include <linux/ioctl.h>
-#include <linux/vmalloc.h>
-
-#include <linux/ppp_defs.h>
-
-#include <linux/isdn.h>
-#include <linux/isdn_ppp.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/if_arp.h>
-#include <linux/ppp-comp.h>
-
-#include "isdn_ppp.h"
-
-MODULE_DESCRIPTION("ISDN4Linux: BSD Compression for PPP over ISDN");
-MODULE_LICENSE("Dual BSD/GPL");
-
-#define BSD_VERSION(x)	((x) >> 5)
-#define BSD_NBITS(x)	((x) & 0x1F)
-
-#define BSD_CURRENT_VERSION	1
-
-#define DEBUG 1
-
-/*
- * A dictionary for doing BSD compress.
- */
-
-struct bsd_dict {
-	u32 fcode;
-	u16 codem1;		/* output of hash table -1 */
-	u16 cptr;		/* map code to hash table entry */
-};
-
-struct bsd_db {
-	int            totlen;		/* length of this structure */
-	unsigned int   hsize;		/* size of the hash table */
-	unsigned char  hshift;		/* used in hash function */
-	unsigned char  n_bits;		/* current bits/code */
-	unsigned char  maxbits;		/* maximum bits/code */
-	unsigned char  debug;		/* non-zero if debug desired */
-	unsigned char  unit;		/* ppp unit number */
-	u16 seqno;			/* sequence # of next packet */
-	unsigned int   mru;		/* size of receive (decompress) bufr */
-	unsigned int   maxmaxcode;	/* largest valid code */
-	unsigned int   max_ent;		/* largest code in use */
-	unsigned int   in_count;	/* uncompressed bytes, aged */
-	unsigned int   bytes_out;	/* compressed bytes, aged */
-	unsigned int   ratio;		/* recent compression ratio */
-	unsigned int   checkpoint;	/* when to next check the ratio */
-	unsigned int   clear_count;	/* times dictionary cleared */
-	unsigned int   incomp_count;	/* incompressible packets */
-	unsigned int   incomp_bytes;	/* incompressible bytes */
-	unsigned int   uncomp_count;	/* uncompressed packets */
-	unsigned int   uncomp_bytes;	/* uncompressed bytes */
-	unsigned int   comp_count;	/* compressed packets */
-	unsigned int   comp_bytes;	/* compressed bytes */
-	unsigned short  *lens;		/* array of lengths of codes */
-	struct bsd_dict *dict;		/* dictionary */
-	int xmit;
-};
-
-#define BSD_OVHD	2		/* BSD compress overhead/packet */
-#define MIN_BSD_BITS	9
-#define BSD_INIT_BITS	MIN_BSD_BITS
-#define MAX_BSD_BITS	15
-
-/*
- * the next two codes should not be changed lightly, as they must not
- * lie within the contiguous general code space.
- */
-#define CLEAR	256			/* table clear output code */
-#define FIRST	257			/* first free entry */
-#define LAST	255
-
-#define MAXCODE(b)	((1 << (b)) - 1)
-#define BADCODEM1	MAXCODE(MAX_BSD_BITS)
-
-#define BSD_HASH(prefix, suffix, hshift) ((((unsigned long)(suffix)) << (hshift)) \
-					  ^ (unsigned long)(prefix))
-#define BSD_KEY(prefix, suffix)		((((unsigned long)(suffix)) << 16) \
-					 + (unsigned long)(prefix))
-
-#define CHECK_GAP	10000		/* Ratio check interval */
-
-#define RATIO_SCALE_LOG	8
-#define RATIO_SCALE	(1 << RATIO_SCALE_LOG)
-#define RATIO_MAX	(0x7fffffff >> RATIO_SCALE_LOG)
-
-/*
- * clear the dictionary
- */
-
-static void bsd_clear(struct bsd_db *db)
-{
-	db->clear_count++;
-	db->max_ent      = FIRST - 1;
-	db->n_bits       = BSD_INIT_BITS;
-	db->bytes_out    = 0;
-	db->in_count     = 0;
-	db->incomp_count = 0;
-	db->ratio	     = 0;
-	db->checkpoint   = CHECK_GAP;
-}
-
-/*
- * If the dictionary is full, then see if it is time to reset it.
- *
- * Compute the compression ratio using fixed-point arithmetic
- * with 8 fractional bits.
- *
- * Since we have an infinite stream instead of a single file,
- * watch only the local compression ratio.
- *
- * Since both peers must reset the dictionary at the same time even in
- * the absence of CLEAR codes (while packets are incompressible), they
- * must compute the same ratio.
- */
-static int bsd_check(struct bsd_db *db)	/* 1=output CLEAR */
-{
-	unsigned int new_ratio;
-
-	if (db->in_count >= db->checkpoint)
-	{
-		/* age the ratio by limiting the size of the counts */
-		if (db->in_count >= RATIO_MAX || db->bytes_out >= RATIO_MAX)
-		{
-			db->in_count  -= (db->in_count  >> 2);
-			db->bytes_out -= (db->bytes_out >> 2);
-		}
-
-		db->checkpoint = db->in_count + CHECK_GAP;
-
-		if (db->max_ent >= db->maxmaxcode)
-		{
-			/* Reset the dictionary only if the ratio is worse,
-			 * or if it looks as if it has been poisoned
-			 * by incompressible data.
-			 *
-			 * This does not overflow, because
-			 *	db->in_count <= RATIO_MAX.
-			 */
-
-			new_ratio = db->in_count << RATIO_SCALE_LOG;
-			if (db->bytes_out != 0)
-			{
-				new_ratio /= db->bytes_out;
-			}
-
-			if (new_ratio < db->ratio || new_ratio < 1 * RATIO_SCALE)
-			{
-				bsd_clear(db);
-				return 1;
-			}
-			db->ratio = new_ratio;
-		}
-	}
-	return 0;
-}
-
-/*
- * Return statistics.
- */
-
-static void bsd_stats(void *state, struct compstat *stats)
-{
-	struct bsd_db *db = (struct bsd_db *) state;
-
-	stats->unc_bytes    = db->uncomp_bytes;
-	stats->unc_packets  = db->uncomp_count;
-	stats->comp_bytes   = db->comp_bytes;
-	stats->comp_packets = db->comp_count;
-	stats->inc_bytes    = db->incomp_bytes;
-	stats->inc_packets  = db->incomp_count;
-	stats->in_count     = db->in_count;
-	stats->bytes_out    = db->bytes_out;
-}
-
-/*
- * Reset state, as on a CCP ResetReq.
- */
-static void bsd_reset(void *state, unsigned char code, unsigned char id,
-		      unsigned char *data, unsigned len,
-		      struct isdn_ppp_resetparams *rsparm)
-{
-	struct bsd_db *db = (struct bsd_db *) state;
-
-	bsd_clear(db);
-	db->seqno       = 0;
-	db->clear_count = 0;
-}
-
-/*
- * Release the compression structure
- */
-static void bsd_free(void *state)
-{
-	struct bsd_db *db = (struct bsd_db *) state;
-
-	if (db) {
-		/*
-		 * Release the dictionary
-		 */
-		vfree(db->dict);
-		db->dict = NULL;
-
-		/*
-		 * Release the string buffer
-		 */
-		vfree(db->lens);
-		db->lens = NULL;
-
-		/*
-		 * Finally release the structure itself.
-		 */
-		kfree(db);
-	}
-}
-
-
-/*
- * Allocate space for a (de) compressor.
- */
-static void *bsd_alloc(struct isdn_ppp_comp_data *data)
-{
-	int bits;
-	unsigned int hsize, hshift, maxmaxcode;
-	struct bsd_db *db;
-	int decomp;
-
-	static unsigned int htab[][2] = {
-		{ 5003 , 4 } , { 5003 , 4 } , { 5003 , 4 } , { 5003 , 4 } ,
-		{ 9001 , 5 } , { 18013 , 6 } , { 35023 , 7 } , { 69001 , 8 }
-	};
-
-	if (data->optlen != 1 || data->num != CI_BSD_COMPRESS
-	    || BSD_VERSION(data->options[0]) != BSD_CURRENT_VERSION)
-		return NULL;
-
-	bits = BSD_NBITS(data->options[0]);
-
-	if (bits < 9 || bits > 15)
-		return NULL;
-
-	hsize = htab[bits - 9][0];
-	hshift = htab[bits - 9][1];
-
-	/*
-	 * Allocate the main control structure for this instance.
-	 */
-	maxmaxcode = MAXCODE(bits);
-	db = kzalloc(sizeof(struct bsd_db), GFP_KERNEL);
-	if (!db)
-		return NULL;
-
-	db->xmit = data->flags & IPPP_COMP_FLAG_XMIT;
-	decomp = db->xmit ? 0 : 1;
-
-	/*
-	 * Allocate space for the dictionary. This may be more than one page in
-	 * length.
-	 */
-	db->dict = vmalloc(array_size(hsize, sizeof(struct bsd_dict)));
-	if (!db->dict) {
-		bsd_free(db);
-		return NULL;
-	}
-
-	/*
-	 * If this is the compression buffer then there is no length data.
-	 * For decompression, the length information is needed as well.
-	 */
-	if (!decomp)
-		db->lens = NULL;
-	else {
-		db->lens = vmalloc(array_size(sizeof(db->lens[0]),
-					      maxmaxcode + 1));
-		if (!db->lens) {
-			bsd_free(db);
-			return (NULL);
-		}
-	}
-
-	/*
-	 * Initialize the data information for the compression code
-	 */
-	db->totlen = sizeof(struct bsd_db) + (sizeof(struct bsd_dict) * hsize);
-	db->hsize = hsize;
-	db->hshift = hshift;
-	db->maxmaxcode = maxmaxcode;
-	db->maxbits = bits;
-
-	return (void *)db;
-}
-
-/*
- * Initialize the database.
- */
-static int bsd_init(void *state, struct isdn_ppp_comp_data *data, int unit, int debug)
-{
-	struct bsd_db *db = state;
-	int indx;
-	int decomp;
-
-	if (!state || !data) {
-		printk(KERN_ERR "isdn_bsd_init: [%d] ERR, state %lx data %lx\n", unit, (long)state, (long)data);
-		return 0;
-	}
-
-	decomp = db->xmit ? 0 : 1;
-
-	if (data->optlen != 1 || data->num != CI_BSD_COMPRESS
-	    || (BSD_VERSION(data->options[0]) != BSD_CURRENT_VERSION)
-	    || (BSD_NBITS(data->options[0]) != db->maxbits)
-	    || (decomp && db->lens == NULL)) {
-		printk(KERN_ERR "isdn_bsd: %d %d %d %d %lx\n", data->optlen, data->num, data->options[0], decomp, (unsigned long)db->lens);
-		return 0;
-	}
-
-	if (decomp)
-		for (indx = LAST; indx >= 0; indx--)
-			db->lens[indx] = 1;
-
-	indx = db->hsize;
-	while (indx-- != 0) {
-		db->dict[indx].codem1 = BADCODEM1;
-		db->dict[indx].cptr   = 0;
-	}
-
-	db->unit = unit;
-	db->mru  = 0;
-
-	db->debug = 1;
-
-	bsd_reset(db, 0, 0, NULL, 0, NULL);
-
-	return 1;
-}
-
-/*
- * Obtain pointers to the various structures in the compression tables
- */
-
-#define dict_ptrx(p, idx) &(p->dict[idx])
-#define lens_ptrx(p, idx) &(p->lens[idx])
-
-#ifdef DEBUG
-static unsigned short *lens_ptr(struct bsd_db *db, int idx)
-{
-	if ((unsigned int) idx > (unsigned int) db->maxmaxcode) {
-		printk(KERN_DEBUG "<9>ppp: lens_ptr(%d) > max\n", idx);
-		idx = 0;
-	}
-	return lens_ptrx(db, idx);
-}
-
-static struct bsd_dict *dict_ptr(struct bsd_db *db, int idx)
-{
-	if ((unsigned int) idx >= (unsigned int) db->hsize) {
-		printk(KERN_DEBUG "<9>ppp: dict_ptr(%d) > max\n", idx);
-		idx = 0;
-	}
-	return dict_ptrx(db, idx);
-}
-
-#else
-#define lens_ptr(db, idx) lens_ptrx(db, idx)
-#define dict_ptr(db, idx) dict_ptrx(db, idx)
-#endif
-
-/*
- * compress a packet
- */
-static int bsd_compress(void *state, struct sk_buff *skb_in, struct sk_buff *skb_out, int proto)
-{
-	struct bsd_db *db;
-	int hshift;
-	unsigned int max_ent;
-	unsigned int n_bits;
-	unsigned int bitno;
-	unsigned long accm;
-	int ent;
-	unsigned long fcode;
-	struct bsd_dict *dictp;
-	unsigned char c;
-	int hval, disp, ilen, mxcode;
-	unsigned char *rptr = skb_in->data;
-	int isize = skb_in->len;
-
-#define OUTPUT(ent)							\
-	{								\
-		bitno -= n_bits;					\
-		accm |= ((ent) << bitno);				\
-		do	{						\
-			if (skb_out && skb_tailroom(skb_out) > 0)	\
-				skb_put_u8(skb_out, (u8)(accm >> 24));	\
-			accm <<= 8;					\
-			bitno += 8;					\
-		} while (bitno <= 24);					\
-	}
-
-	/*
-	 * If the protocol is not in the range we're interested in,
-	 * just return without compressing the packet.  If it is,
-	 * the protocol becomes the first byte to compress.
-	 */
-	printk(KERN_DEBUG "bsd_compress called with %x\n", proto);
-
-	ent = proto;
-	if (proto < 0x21 || proto > 0xf9 || !(proto & 0x1))
-		return 0;
-
-	db      = (struct bsd_db *) state;
-	hshift  = db->hshift;
-	max_ent = db->max_ent;
-	n_bits  = db->n_bits;
-	bitno   = 32;
-	accm    = 0;
-	mxcode  = MAXCODE(n_bits);
-
-	/* This is the PPP header information */
-	if (skb_out && skb_tailroom(skb_out) >= 2) {
-		char *v = skb_put(skb_out, 2);
-		/* we only push our own data on the header,
-		   AC,PC and protos is pushed by caller  */
-		v[0] = db->seqno >> 8;
-		v[1] = db->seqno;
-	}
-
-	ilen = ++isize; /* This is off by one, but that is what is in draft! */
-
-	while (--ilen > 0) {
-		c = *rptr++;
-		fcode = BSD_KEY(ent, c);
-		hval = BSD_HASH(ent, c, hshift);
-		dictp = dict_ptr(db, hval);
-
-		/* Validate and then check the entry. */
-		if (dictp->codem1 >= max_ent)
-			goto nomatch;
-
-		if (dictp->fcode == fcode) {
-			ent = dictp->codem1 + 1;
-			continue;	/* found (prefix,suffix) */
-		}
-
-		/* continue probing until a match or invalid entry */
-		disp = (hval == 0) ? 1 : hval;
-
-		do {
-			hval += disp;
-			if (hval >= db->hsize)
-				hval -= db->hsize;
-			dictp = dict_ptr(db, hval);
-			if (dictp->codem1 >= max_ent)
-				goto nomatch;
-		} while (dictp->fcode != fcode);
-
-		ent = dictp->codem1 + 1;	/* finally found (prefix,suffix) */
-		continue;
-
-	nomatch:
-		OUTPUT(ent);		/* output the prefix */
-
-		/* code -> hashtable */
-		if (max_ent < db->maxmaxcode) {
-			struct bsd_dict *dictp2;
-			struct bsd_dict *dictp3;
-			int indx;
-
-			/* expand code size if needed */
-			if (max_ent >= mxcode) {
-				db->n_bits = ++n_bits;
-				mxcode = MAXCODE(n_bits);
-			}
-
-			/*
-			 * Invalidate old hash table entry using
-			 * this code, and then take it over.
-			 */
-			dictp2 = dict_ptr(db, max_ent + 1);
-			indx   = dictp2->cptr;
-			dictp3 = dict_ptr(db, indx);
-
-			if (dictp3->codem1 == max_ent)
-				dictp3->codem1 = BADCODEM1;
-
-			dictp2->cptr   = hval;
-			dictp->codem1  = max_ent;
-			dictp->fcode = fcode;
-			db->max_ent    = ++max_ent;
-
-			if (db->lens) {
-				unsigned short *len1 = lens_ptr(db, max_ent);
-				unsigned short *len2 = lens_ptr(db, ent);
-				*len1 = *len2 + 1;
-			}
-		}
-		ent = c;
-	}
-
-	OUTPUT(ent);		/* output the last code */
-
-	if (skb_out)
-		db->bytes_out    += skb_out->len; /* Do not count bytes from here */
-	db->uncomp_bytes += isize;
-	db->in_count     += isize;
-	++db->uncomp_count;
-	++db->seqno;
-
-	if (bitno < 32)
-		++db->bytes_out; /* must be set before calling bsd_check */
-
-	/*
-	 * Generate the clear command if needed
-	 */
-
-	if (bsd_check(db))
-		OUTPUT(CLEAR);
-
-	/*
-	 * Pad dribble bits of last code with ones.
-	 * Do not emit a completely useless byte of ones.
-	 */
-	if (bitno < 32 && skb_out && skb_tailroom(skb_out) > 0)
-		skb_put_u8(skb_out,
-			   (unsigned char)((accm | (0xff << (bitno - 8))) >> 24));
-
-	/*
-	 * Increase code size if we would have without the packet
-	 * boundary because the decompressor will do so.
-	 */
-	if (max_ent >= mxcode && max_ent < db->maxmaxcode)
-		db->n_bits++;
-
-	/* If output length is too large then this is an incompressible frame. */
-	if (!skb_out || skb_out->len >= skb_in->len) {
-		++db->incomp_count;
-		db->incomp_bytes += isize;
-		return 0;
-	}
-
-	/* Count the number of compressed frames */
-	++db->comp_count;
-	db->comp_bytes += skb_out->len;
-	return skb_out->len;
-
-#undef OUTPUT
-}
-
-/*
- * Update the "BSD Compress" dictionary on the receiver for
- * incompressible data by pretending to compress the incoming data.
- */
-static void bsd_incomp(void *state, struct sk_buff *skb_in, int proto)
-{
-	bsd_compress(state, skb_in, NULL, proto);
-}
-
-/*
- * Decompress "BSD Compress".
- */
-static int bsd_decompress(void *state, struct sk_buff *skb_in, struct sk_buff *skb_out,
-			  struct isdn_ppp_resetparams *rsparm)
-{
-	struct bsd_db *db;
-	unsigned int max_ent;
-	unsigned long accm;
-	unsigned int bitno;		/* 1st valid bit in accm */
-	unsigned int n_bits;
-	unsigned int tgtbitno;	/* bitno when we have a code */
-	struct bsd_dict *dictp;
-	int seq;
-	unsigned int incode;
-	unsigned int oldcode;
-	unsigned int finchar;
-	unsigned char *p, *ibuf;
-	int ilen;
-	int codelen;
-	int extra;
-
-	db       = (struct bsd_db *) state;
-	max_ent  = db->max_ent;
-	accm     = 0;
-	bitno    = 32;		/* 1st valid bit in accm */
-	n_bits   = db->n_bits;
-	tgtbitno = 32 - n_bits;	/* bitno when we have a code */
-
-	printk(KERN_DEBUG "bsd_decompress called\n");
-
-	if (!skb_in || !skb_out) {
-		printk(KERN_ERR "bsd_decompress called with NULL parameter\n");
-		return DECOMP_ERROR;
-	}
-
-	/*
-	 * Get the sequence number.
-	 */
-	if ((p = skb_pull(skb_in, 2)) == NULL) {
-		return DECOMP_ERROR;
-	}
-	p -= 2;
-	seq = (p[0] << 8) + p[1];
-	ilen = skb_in->len;
-	ibuf = skb_in->data;
-
-	/*
-	 * Check the sequence number and give up if it differs from
-	 * the value we're expecting.
-	 */
-	if (seq != db->seqno) {
-		if (db->debug) {
-			printk(KERN_DEBUG "bsd_decomp%d: bad sequence # %d, expected %d\n",
-			       db->unit, seq, db->seqno - 1);
-		}
-		return DECOMP_ERROR;
-	}
-
-	++db->seqno;
-	db->bytes_out += ilen;
-
-	if (skb_tailroom(skb_out) > 0)
-		skb_put_u8(skb_out, 0);
-	else
-		return DECOMP_ERR_NOMEM;
-
-	oldcode = CLEAR;
-
-	/*
-	 * Keep the checkpoint correctly so that incompressible packets
-	 * clear the dictionary at the proper times.
-	 */
-
-	for (;;) {
-		if (ilen-- <= 0) {
-			db->in_count += (skb_out->len - 1); /* don't count the header */
-			break;
-		}
-
-		/*
-		 * Accumulate bytes until we have a complete code.
-		 * Then get the next code, relying on the 32-bit,
-		 * unsigned accm to mask the result.
-		 */
-
-		bitno -= 8;
-		accm  |= *ibuf++ << bitno;
-		if (tgtbitno < bitno)
-			continue;
-
-		incode = accm >> tgtbitno;
-		accm <<= n_bits;
-		bitno += n_bits;
-
-		/*
-		 * The dictionary must only be cleared at the end of a packet.
-		 */
-
-		if (incode == CLEAR) {
-			if (ilen > 0) {
-				if (db->debug)
-					printk(KERN_DEBUG "bsd_decomp%d: bad CLEAR\n", db->unit);
-				return DECOMP_FATALERROR;	/* probably a bug */
-			}
-			bsd_clear(db);
-			break;
-		}
-
-		if ((incode > max_ent + 2) || (incode > db->maxmaxcode)
-		    || (incode > max_ent && oldcode == CLEAR)) {
-			if (db->debug) {
-				printk(KERN_DEBUG "bsd_decomp%d: bad code 0x%x oldcode=0x%x ",
-				       db->unit, incode, oldcode);
-				printk(KERN_DEBUG "max_ent=0x%x skb->Len=%d seqno=%d\n",
-				       max_ent, skb_out->len, db->seqno);
-			}
-			return DECOMP_FATALERROR;	/* probably a bug */
-		}
-
-		/* Special case for KwKwK string. */
-		if (incode > max_ent) {
-			finchar = oldcode;
-			extra   = 1;
-		} else {
-			finchar = incode;
-			extra   = 0;
-		}
-
-		codelen = *(lens_ptr(db, finchar));
-		if (skb_tailroom(skb_out) < codelen + extra) {
-			if (db->debug) {
-				printk(KERN_DEBUG "bsd_decomp%d: ran out of mru\n", db->unit);
-#ifdef DEBUG
-				printk(KERN_DEBUG "  len=%d, finchar=0x%x, codelen=%d,skblen=%d\n",
-				       ilen, finchar, codelen, skb_out->len);
-#endif
-			}
-			return DECOMP_FATALERROR;
-		}
-
-		/*
-		 * Decode this code and install it in the decompressed buffer.
-		 */
-
-		p = skb_put(skb_out, codelen);
-		p += codelen;
-		while (finchar > LAST) {
-			struct bsd_dict *dictp2 = dict_ptr(db, finchar);
-
-			dictp = dict_ptr(db, dictp2->cptr);
-
-#ifdef DEBUG
-			if (--codelen <= 0 || dictp->codem1 != finchar - 1) {
-				if (codelen <= 0) {
-					printk(KERN_ERR "bsd_decomp%d: fell off end of chain ", db->unit);
-					printk(KERN_ERR "0x%x at 0x%x by 0x%x, max_ent=0x%x\n", incode, finchar, dictp2->cptr, max_ent);
-				} else {
-					if (dictp->codem1 != finchar - 1) {
-						printk(KERN_ERR "bsd_decomp%d: bad code chain 0x%x finchar=0x%x ", db->unit, incode, finchar);
-						printk(KERN_ERR "oldcode=0x%x cptr=0x%x codem1=0x%x\n", oldcode, dictp2->cptr, dictp->codem1);
-					}
-				}
-				return DECOMP_FATALERROR;
-			}
-#endif
-
-			{
-				u32 fcode = dictp->fcode;
-				*--p    = (fcode >> 16) & 0xff;
-				finchar = fcode & 0xffff;
-			}
-		}
-		*--p = finchar;
-
-#ifdef DEBUG
-		if (--codelen != 0)
-			printk(KERN_ERR "bsd_decomp%d: short by %d after code 0x%x, max_ent=0x%x\n", db->unit, codelen, incode, max_ent);
-#endif
-
-		if (extra)		/* the KwKwK case again */
-			skb_put_u8(skb_out, finchar);
-
-		/*
-		 * If not first code in a packet, and
-		 * if not out of code space, then allocate a new code.
-		 *
-		 * Keep the hash table correct so it can be used
-		 * with uncompressed packets.
-		 */
-		if (oldcode != CLEAR && max_ent < db->maxmaxcode) {
-			struct bsd_dict *dictp2, *dictp3;
-			u16 *lens1, *lens2;
-			unsigned long fcode;
-			int hval, disp, indx;
-
-			fcode = BSD_KEY(oldcode, finchar);
-			hval  = BSD_HASH(oldcode, finchar, db->hshift);
-			dictp = dict_ptr(db, hval);
-
-			/* look for a free hash table entry */
-			if (dictp->codem1 < max_ent) {
-				disp = (hval == 0) ? 1 : hval;
-				do {
-					hval += disp;
-					if (hval >= db->hsize)
-						hval -= db->hsize;
-					dictp = dict_ptr(db, hval);
-				} while (dictp->codem1 < max_ent);
-			}
-
-			/*
-			 * Invalidate previous hash table entry
-			 * assigned this code, and then take it over
-			 */
-
-			dictp2 = dict_ptr(db, max_ent + 1);
-			indx   = dictp2->cptr;
-			dictp3 = dict_ptr(db, indx);
-
-			if (dictp3->codem1 == max_ent)
-				dictp3->codem1 = BADCODEM1;
-
-			dictp2->cptr   = hval;
-			dictp->codem1  = max_ent;
-			dictp->fcode = fcode;
-			db->max_ent    = ++max_ent;
-
-			/* Update the length of this string. */
-			lens1  = lens_ptr(db, max_ent);
-			lens2  = lens_ptr(db, oldcode);
-			*lens1 = *lens2 + 1;
-
-			/* Expand code size if needed. */
-			if (max_ent >= MAXCODE(n_bits) && max_ent < db->maxmaxcode) {
-				db->n_bits = ++n_bits;
-				tgtbitno   = 32-n_bits;
-			}
-		}
-		oldcode = incode;
-	}
-
-	++db->comp_count;
-	++db->uncomp_count;
-	db->comp_bytes   += skb_in->len - BSD_OVHD;
-	db->uncomp_bytes += skb_out->len;
-
-	if (bsd_check(db)) {
-		if (db->debug)
-			printk(KERN_DEBUG "bsd_decomp%d: peer should have cleared dictionary on %d\n",
-			       db->unit, db->seqno - 1);
-	}
-	return skb_out->len;
-}
-
-/*************************************************************
- * Table of addresses for the BSD compression module
- *************************************************************/
-
-static struct isdn_ppp_compressor ippp_bsd_compress = {
-	.owner          = THIS_MODULE,
-	.num            = CI_BSD_COMPRESS,
-	.alloc          = bsd_alloc,
-	.free           = bsd_free,
-	.init           = bsd_init,
-	.reset          = bsd_reset,
-	.compress       = bsd_compress,
-	.decompress     = bsd_decompress,
-	.incomp         = bsd_incomp,
-	.stat           = bsd_stats,
-};
-
-/*************************************************************
- * Module support routines
- *************************************************************/
-
-static int __init isdn_bsdcomp_init(void)
-{
-	int answer = isdn_ppp_register_compressor(&ippp_bsd_compress);
-	if (answer == 0)
-		printk(KERN_INFO "PPP BSD Compression module registered\n");
-	return answer;
-}
-
-static void __exit isdn_bsdcomp_exit(void)
-{
-	isdn_ppp_unregister_compressor(&ippp_bsd_compress);
-}
-
-module_init(isdn_bsdcomp_init);
-module_exit(isdn_bsdcomp_exit);
diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
deleted file mode 100644
index 74ee00f5b310..000000000000
--- a/drivers/isdn/i4l/isdn_common.c
+++ /dev/null
@@ -1,2368 +0,0 @@
-/* $Id: isdn_common.c,v 1.1.2.3 2004/02/10 01:07:13 keil Exp $
- *
- * Linux ISDN subsystem, common used functions (linklevel).
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/isdn.h>
-#include <linux/mutex.h>
-#include "isdn_common.h"
-#include "isdn_tty.h"
-#include "isdn_net.h"
-#include "isdn_ppp.h"
-#ifdef CONFIG_ISDN_AUDIO
-#include "isdn_audio.h"
-#endif
-#ifdef CONFIG_ISDN_DIVERSION_MODULE
-#define CONFIG_ISDN_DIVERSION
-#endif
-#ifdef CONFIG_ISDN_DIVERSION
-#include <linux/isdn_divertif.h>
-#endif /* CONFIG_ISDN_DIVERSION */
-#include "isdn_v110.h"
-
-/* Debugflags */
-#undef ISDN_DEBUG_STATCALLB
-
-MODULE_DESCRIPTION("ISDN4Linux: link layer");
-MODULE_AUTHOR("Fritz Elfert");
-MODULE_LICENSE("GPL");
-
-isdn_dev *dev;
-
-static DEFINE_MUTEX(isdn_mutex);
-static char *isdn_revision = "$Revision: 1.1.2.3 $";
-
-extern char *isdn_net_revision;
-#ifdef CONFIG_ISDN_PPP
-extern char *isdn_ppp_revision;
-#else
-static char *isdn_ppp_revision = ": none $";
-#endif
-#ifdef CONFIG_ISDN_AUDIO
-extern char *isdn_audio_revision;
-#else
-static char *isdn_audio_revision = ": none $";
-#endif
-extern char *isdn_v110_revision;
-
-#ifdef CONFIG_ISDN_DIVERSION
-static isdn_divert_if *divert_if; /* = NULL */
-#endif /* CONFIG_ISDN_DIVERSION */
-
-
-static int isdn_writebuf_stub(int, int, const u_char __user *, int);
-static void set_global_features(void);
-static int isdn_wildmat(char *s, char *p);
-static int isdn_add_channels(isdn_driver_t *d, int drvidx, int n, int adding);
-
-static inline void
-isdn_lock_driver(isdn_driver_t *drv)
-{
-	try_module_get(drv->interface->owner);
-	drv->locks++;
-}
-
-void
-isdn_lock_drivers(void)
-{
-	int i;
-
-	for (i = 0; i < ISDN_MAX_DRIVERS; i++) {
-		if (!dev->drv[i])
-			continue;
-		isdn_lock_driver(dev->drv[i]);
-	}
-}
-
-static inline void
-isdn_unlock_driver(isdn_driver_t *drv)
-{
-	if (drv->locks > 0) {
-		drv->locks--;
-		module_put(drv->interface->owner);
-	}
-}
-
-void
-isdn_unlock_drivers(void)
-{
-	int i;
-
-	for (i = 0; i < ISDN_MAX_DRIVERS; i++) {
-		if (!dev->drv[i])
-			continue;
-		isdn_unlock_driver(dev->drv[i]);
-	}
-}
-
-#if defined(ISDN_DEBUG_NET_DUMP) || defined(ISDN_DEBUG_MODEM_DUMP)
-void
-isdn_dumppkt(char *s, u_char *p, int len, int dumplen)
-{
-	int dumpc;
-
-	printk(KERN_DEBUG "%s(%d) ", s, len);
-	for (dumpc = 0; (dumpc < dumplen) && (len); len--, dumpc++)
-		printk(" %02x", *p++);
-	printk("\n");
-}
-#endif
-
-/*
- * I picked the pattern-matching-functions from an old GNU-tar version (1.10)
- * It was originally written and put to PD by rs@mirror.TMC.COM (Rich Salz)
- */
-static int
-isdn_star(char *s, char *p)
-{
-	while (isdn_wildmat(s, p)) {
-		if (*++s == '\0')
-			return (2);
-	}
-	return (0);
-}
-
-/*
- * Shell-type Pattern-matching for incoming caller-Ids
- * This function gets a string in s and checks, if it matches the pattern
- * given in p.
- *
- * Return:
- *   0 = match.
- *   1 = no match.
- *   2 = no match. Would eventually match, if s would be longer.
- *
- * Possible Patterns:
- *
- * '?'     matches one character
- * '*'     matches zero or more characters
- * [xyz]   matches the set of characters in brackets.
- * [^xyz]  matches any single character not in the set of characters
- */
-
-static int
-isdn_wildmat(char *s, char *p)
-{
-	register int last;
-	register int matched;
-	register int reverse;
-	register int nostar = 1;
-
-	if (!(*s) && !(*p))
-		return (1);
-	for (; *p; s++, p++)
-		switch (*p) {
-		case '\\':
-			/* Literal match with following character. */
-			p++;
-			/* fall through */
-		default:
-			if (*s != *p)
-				return (*s == '\0') ? 2 : 1;
-					continue;
-		case '?':
-			/* Match anything. */
-			if (*s == '\0')
-				return (2);
-			continue;
-		case '*':
-			nostar = 0;
-			/* Trailing star matches everything. */
-			return (*++p ? isdn_star(s, p) : 0);
-		case '[':
-			/* [^....] means inverse character class. */
-			if ((reverse = (p[1] == '^')))
-				p++;
-			for (last = 0, matched = 0; *++p && (*p != ']'); last = *p)
-				/* This next line requires a good C compiler. */
-				if (*p == '-' ? *s <= *++p && *s >= last : *s == *p)
-					matched = 1;
-			if (matched == reverse)
-				return (1);
-			continue;
-		}
-	return (*s == '\0') ? 0 : nostar;
-}
-
-int isdn_msncmp(const char *msn1, const char *msn2)
-{
-	char TmpMsn1[ISDN_MSNLEN];
-	char TmpMsn2[ISDN_MSNLEN];
-	char *p;
-
-	for (p = TmpMsn1; *msn1 && *msn1 != ':';)  // Strip off a SPID
-		*p++ = *msn1++;
-	*p = '\0';
-
-	for (p = TmpMsn2; *msn2 && *msn2 != ':';)  // Strip off a SPID
-		*p++ = *msn2++;
-	*p = '\0';
-
-	return isdn_wildmat(TmpMsn1, TmpMsn2);
-}
-
-int
-isdn_dc2minor(int di, int ch)
-{
-	int i;
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		if (dev->chanmap[i] == ch && dev->drvmap[i] == di)
-			return i;
-	return -1;
-}
-
-static int isdn_timer_cnt1 = 0;
-static int isdn_timer_cnt2 = 0;
-static int isdn_timer_cnt3 = 0;
-
-static void
-isdn_timer_funct(struct timer_list *unused)
-{
-	int tf = dev->tflags;
-	if (tf & ISDN_TIMER_FAST) {
-		if (tf & ISDN_TIMER_MODEMREAD)
-			isdn_tty_readmodem();
-		if (tf & ISDN_TIMER_MODEMPLUS)
-			isdn_tty_modem_escape();
-		if (tf & ISDN_TIMER_MODEMXMIT)
-			isdn_tty_modem_xmit();
-	}
-	if (tf & ISDN_TIMER_SLOW) {
-		if (++isdn_timer_cnt1 >= ISDN_TIMER_02SEC) {
-			isdn_timer_cnt1 = 0;
-			if (tf & ISDN_TIMER_NETDIAL)
-				isdn_net_dial();
-		}
-		if (++isdn_timer_cnt2 >= ISDN_TIMER_1SEC) {
-			isdn_timer_cnt2 = 0;
-			if (tf & ISDN_TIMER_NETHANGUP)
-				isdn_net_autohup();
-			if (++isdn_timer_cnt3 >= ISDN_TIMER_RINGING) {
-				isdn_timer_cnt3 = 0;
-				if (tf & ISDN_TIMER_MODEMRING)
-					isdn_tty_modem_ring();
-			}
-			if (tf & ISDN_TIMER_CARRIER)
-				isdn_tty_carrier_timeout();
-		}
-	}
-	if (tf)
-		mod_timer(&dev->timer, jiffies + ISDN_TIMER_RES);
-}
-
-void
-isdn_timer_ctrl(int tf, int onoff)
-{
-	unsigned long flags;
-	int old_tflags;
-
-	spin_lock_irqsave(&dev->timerlock, flags);
-	if ((tf & ISDN_TIMER_SLOW) && (!(dev->tflags & ISDN_TIMER_SLOW))) {
-		/* If the slow-timer wasn't activated until now */
-		isdn_timer_cnt1 = 0;
-		isdn_timer_cnt2 = 0;
-	}
-	old_tflags = dev->tflags;
-	if (onoff)
-		dev->tflags |= tf;
-	else
-		dev->tflags &= ~tf;
-	if (dev->tflags && !old_tflags)
-		mod_timer(&dev->timer, jiffies + ISDN_TIMER_RES);
-	spin_unlock_irqrestore(&dev->timerlock, flags);
-}
-
-/*
- * Receive a packet from B-Channel. (Called from low-level-module)
- */
-static void
-isdn_receive_skb_callback(int di, int channel, struct sk_buff *skb)
-{
-	int i;
-
-	if ((i = isdn_dc2minor(di, channel)) == -1) {
-		dev_kfree_skb(skb);
-		return;
-	}
-	/* Update statistics */
-	dev->ibytes[i] += skb->len;
-
-	/* First, try to deliver data to network-device */
-	if (isdn_net_rcv_skb(i, skb))
-		return;
-
-	/* V.110 handling
-	 * makes sense for async streams only, so it is
-	 * called after possible net-device delivery.
-	 */
-	if (dev->v110[i]) {
-		atomic_inc(&dev->v110use[i]);
-		skb = isdn_v110_decode(dev->v110[i], skb);
-		atomic_dec(&dev->v110use[i]);
-		if (!skb)
-			return;
-	}
-
-	/* No network-device found, deliver to tty or raw-channel */
-	if (skb->len) {
-		if (isdn_tty_rcv_skb(i, di, channel, skb))
-			return;
-		wake_up_interruptible(&dev->drv[di]->rcv_waitq[channel]);
-	} else
-		dev_kfree_skb(skb);
-}
-
-/*
- * Intercept command from Linklevel to Lowlevel.
- * If layer 2 protocol is V.110 and this is not supported by current
- * lowlevel-driver, use driver's transparent mode and handle V.110 in
- * linklevel instead.
- */
-int
-isdn_command(isdn_ctrl *cmd)
-{
-	if (cmd->driver == -1) {
-		printk(KERN_WARNING "isdn_command command(%x) driver -1\n", cmd->command);
-		return (1);
-	}
-	if (!dev->drv[cmd->driver]) {
-		printk(KERN_WARNING "isdn_command command(%x) dev->drv[%d] NULL\n",
-		       cmd->command, cmd->driver);
-		return (1);
-	}
-	if (!dev->drv[cmd->driver]->interface) {
-		printk(KERN_WARNING "isdn_command command(%x) dev->drv[%d]->interface NULL\n",
-		       cmd->command, cmd->driver);
-		return (1);
-	}
-	if (cmd->command == ISDN_CMD_SETL2) {
-		int idx = isdn_dc2minor(cmd->driver, cmd->arg & 255);
-		unsigned long l2prot = (cmd->arg >> 8) & 255;
-		unsigned long features = (dev->drv[cmd->driver]->interface->features
-					  >> ISDN_FEATURE_L2_SHIFT) &
-			ISDN_FEATURE_L2_MASK;
-		unsigned long l2_feature = (1 << l2prot);
-
-		switch (l2prot) {
-		case ISDN_PROTO_L2_V11096:
-		case ISDN_PROTO_L2_V11019:
-		case ISDN_PROTO_L2_V11038:
-			/* If V.110 requested, but not supported by
-			 * HL-driver, set emulator-flag and change
-			 * Layer-2 to transparent
-			 */
-			if (!(features & l2_feature)) {
-				dev->v110emu[idx] = l2prot;
-				cmd->arg = (cmd->arg & 255) |
-					(ISDN_PROTO_L2_TRANS << 8);
-			} else
-				dev->v110emu[idx] = 0;
-		}
-	}
-	return dev->drv[cmd->driver]->interface->command(cmd);
-}
-
-void
-isdn_all_eaz(int di, int ch)
-{
-	isdn_ctrl cmd;
-
-	if (di < 0)
-		return;
-	cmd.driver = di;
-	cmd.arg = ch;
-	cmd.command = ISDN_CMD_SETEAZ;
-	cmd.parm.num[0] = '\0';
-	isdn_command(&cmd);
-}
-
-/*
- * Begin of a CAPI like LL<->HL interface, currently used only for
- * supplementary service (CAPI 2.0 part III)
- */
-#include <linux/isdn/capicmd.h>
-
-static int
-isdn_capi_rec_hl_msg(capi_msg *cm)
-{
-	switch (cm->Command) {
-	case CAPI_FACILITY:
-		/* in the moment only handled in tty */
-		return (isdn_tty_capi_facility(cm));
-	default:
-		return (-1);
-	}
-}
-
-static int
-isdn_status_callback(isdn_ctrl *c)
-{
-	int di;
-	u_long flags;
-	int i;
-	int r;
-	int retval = 0;
-	isdn_ctrl cmd;
-	isdn_net_dev *p;
-
-	di = c->driver;
-	i = isdn_dc2minor(di, c->arg);
-	switch (c->command) {
-	case ISDN_STAT_BSENT:
-		if (i < 0)
-			return -1;
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		if (isdn_net_stat_callback(i, c))
-			return 0;
-		if (isdn_v110_stat_callback(i, c))
-			return 0;
-		if (isdn_tty_stat_callback(i, c))
-			return 0;
-		wake_up_interruptible(&dev->drv[di]->snd_waitq[c->arg]);
-		break;
-	case ISDN_STAT_STAVAIL:
-		dev->drv[di]->stavail += c->arg;
-		wake_up_interruptible(&dev->drv[di]->st_waitq);
-		break;
-	case ISDN_STAT_RUN:
-		dev->drv[di]->flags |= DRV_FLAG_RUNNING;
-		for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-			if (dev->drvmap[i] == di)
-				isdn_all_eaz(di, dev->chanmap[i]);
-		set_global_features();
-		break;
-	case ISDN_STAT_STOP:
-		dev->drv[di]->flags &= ~DRV_FLAG_RUNNING;
-		break;
-	case ISDN_STAT_ICALL:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "ICALL (net): %d %ld %s\n", di, c->arg, c->parm.num);
-#endif
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED) {
-			cmd.driver = di;
-			cmd.arg = c->arg;
-			cmd.command = ISDN_CMD_HANGUP;
-			isdn_command(&cmd);
-			return 0;
-		}
-		/* Try to find a network-interface which will accept incoming call */
-		r = ((c->command == ISDN_STAT_ICALLW) ? 0 : isdn_net_find_icall(di, c->arg, i, &c->parm.setup));
-		switch (r) {
-		case 0:
-			/* No network-device replies.
-			 * Try ttyI's.
-			 * These return 0 on no match, 1 on match and
-			 * 3 on eventually match, if CID is longer.
-			 */
-			if (c->command == ISDN_STAT_ICALL)
-				if ((retval = isdn_tty_find_icall(di, c->arg, &c->parm.setup))) return (retval);
-#ifdef CONFIG_ISDN_DIVERSION
-			if (divert_if)
-				if ((retval = divert_if->stat_callback(c)))
-					return (retval); /* processed */
-#endif /* CONFIG_ISDN_DIVERSION */
-			if ((!retval) && (dev->drv[di]->flags & DRV_FLAG_REJBUS)) {
-				/* No tty responding */
-				cmd.driver = di;
-				cmd.arg = c->arg;
-				cmd.command = ISDN_CMD_HANGUP;
-				isdn_command(&cmd);
-				retval = 2;
-			}
-			break;
-		case 1:
-			/* Schedule connection-setup */
-			isdn_net_dial();
-			cmd.driver = di;
-			cmd.arg = c->arg;
-			cmd.command = ISDN_CMD_ACCEPTD;
-			for (p = dev->netdev; p; p = p->next)
-				if (p->local->isdn_channel == cmd.arg)
-				{
-					strcpy(cmd.parm.setup.eazmsn, p->local->msn);
-					isdn_command(&cmd);
-					retval = 1;
-					break;
-				}
-			break;
-
-		case 2:	/* For calling back, first reject incoming call ... */
-		case 3:	/* Interface found, but down, reject call actively  */
-			retval = 2;
-			printk(KERN_INFO "isdn: Rejecting Call\n");
-			cmd.driver = di;
-			cmd.arg = c->arg;
-			cmd.command = ISDN_CMD_HANGUP;
-			isdn_command(&cmd);
-			if (r == 3)
-				break;
-			/* Fall through */
-		case 4:
-			/* ... then start callback. */
-			isdn_net_dial();
-			break;
-		case 5:
-			/* Number would eventually match, if longer */
-			retval = 3;
-			break;
-		}
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "ICALL: ret=%d\n", retval);
-#endif
-		return retval;
-		break;
-	case ISDN_STAT_CINF:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "CINF: %ld %s\n", c->arg, c->parm.num);
-#endif
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		if (strcmp(c->parm.num, "0"))
-			isdn_net_stat_callback(i, c);
-		isdn_tty_stat_callback(i, c);
-		break;
-	case ISDN_STAT_CAUSE:
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "CAUSE: %ld %s\n", c->arg, c->parm.num);
-#endif
-		printk(KERN_INFO "isdn: %s,ch%ld cause: %s\n",
-		       dev->drvid[di], c->arg, c->parm.num);
-		isdn_tty_stat_callback(i, c);
-#ifdef CONFIG_ISDN_DIVERSION
-		if (divert_if)
-			divert_if->stat_callback(c);
-#endif /* CONFIG_ISDN_DIVERSION */
-		break;
-	case ISDN_STAT_DISPLAY:
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "DISPLAY: %ld %s\n", c->arg, c->parm.display);
-#endif
-		isdn_tty_stat_callback(i, c);
-#ifdef CONFIG_ISDN_DIVERSION
-		if (divert_if)
-			divert_if->stat_callback(c);
-#endif /* CONFIG_ISDN_DIVERSION */
-		break;
-	case ISDN_STAT_DCONN:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "DCONN: %ld\n", c->arg);
-#endif
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		/* Find any net-device, waiting for D-channel setup */
-		if (isdn_net_stat_callback(i, c))
-			break;
-		isdn_v110_stat_callback(i, c);
-		/* Find any ttyI, waiting for D-channel setup */
-		if (isdn_tty_stat_callback(i, c)) {
-			cmd.driver = di;
-			cmd.arg = c->arg;
-			cmd.command = ISDN_CMD_ACCEPTB;
-			isdn_command(&cmd);
-			break;
-		}
-		break;
-	case ISDN_STAT_DHUP:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "DHUP: %ld\n", c->arg);
-#endif
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		dev->drv[di]->online &= ~(1 << (c->arg));
-		isdn_info_update();
-		/* Signal hangup to network-devices */
-		if (isdn_net_stat_callback(i, c))
-			break;
-		isdn_v110_stat_callback(i, c);
-		if (isdn_tty_stat_callback(i, c))
-			break;
-#ifdef CONFIG_ISDN_DIVERSION
-		if (divert_if)
-			divert_if->stat_callback(c);
-#endif /* CONFIG_ISDN_DIVERSION */
-		break;
-		break;
-	case ISDN_STAT_BCONN:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "BCONN: %ld\n", c->arg);
-#endif
-		/* Signal B-channel-connect to network-devices */
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		dev->drv[di]->online |= (1 << (c->arg));
-		isdn_info_update();
-		if (isdn_net_stat_callback(i, c))
-			break;
-		isdn_v110_stat_callback(i, c);
-		if (isdn_tty_stat_callback(i, c))
-			break;
-		break;
-	case ISDN_STAT_BHUP:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "BHUP: %ld\n", c->arg);
-#endif
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		dev->drv[di]->online &= ~(1 << (c->arg));
-		isdn_info_update();
-#ifdef CONFIG_ISDN_X25
-		/* Signal hangup to network-devices */
-		if (isdn_net_stat_callback(i, c))
-			break;
-#endif
-		isdn_v110_stat_callback(i, c);
-		if (isdn_tty_stat_callback(i, c))
-			break;
-		break;
-	case ISDN_STAT_NODCH:
-		if (i < 0)
-			return -1;
-#ifdef ISDN_DEBUG_STATCALLB
-		printk(KERN_DEBUG "NODCH: %ld\n", c->arg);
-#endif
-		if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-			return 0;
-		if (isdn_net_stat_callback(i, c))
-			break;
-		if (isdn_tty_stat_callback(i, c))
-			break;
-		break;
-	case ISDN_STAT_ADDCH:
-		spin_lock_irqsave(&dev->lock, flags);
-		if (isdn_add_channels(dev->drv[di], di, c->arg, 1)) {
-			spin_unlock_irqrestore(&dev->lock, flags);
-			return -1;
-		}
-		spin_unlock_irqrestore(&dev->lock, flags);
-		isdn_info_update();
-		break;
-	case ISDN_STAT_DISCH:
-		spin_lock_irqsave(&dev->lock, flags);
-		for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-			if ((dev->drvmap[i] == di) &&
-			    (dev->chanmap[i] == c->arg)) {
-				if (c->parm.num[0])
-					dev->usage[i] &= ~ISDN_USAGE_DISABLED;
-				else
-					if (USG_NONE(dev->usage[i])) {
-						dev->usage[i] |= ISDN_USAGE_DISABLED;
-					}
-					else
-						retval = -1;
-				break;
-			}
-		spin_unlock_irqrestore(&dev->lock, flags);
-		isdn_info_update();
-		break;
-	case ISDN_STAT_UNLOAD:
-		while (dev->drv[di]->locks > 0) {
-			isdn_unlock_driver(dev->drv[di]);
-		}
-		spin_lock_irqsave(&dev->lock, flags);
-		isdn_tty_stat_callback(i, c);
-		for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-			if (dev->drvmap[i] == di) {
-				dev->drvmap[i] = -1;
-				dev->chanmap[i] = -1;
-				dev->usage[i] &= ~ISDN_USAGE_DISABLED;
-			}
-		dev->drivers--;
-		dev->channels -= dev->drv[di]->channels;
-		kfree(dev->drv[di]->rcverr);
-		kfree(dev->drv[di]->rcvcount);
-		for (i = 0; i < dev->drv[di]->channels; i++)
-			skb_queue_purge(&dev->drv[di]->rpqueue[i]);
-		kfree(dev->drv[di]->rpqueue);
-		kfree(dev->drv[di]->rcv_waitq);
-		kfree(dev->drv[di]);
-		dev->drv[di] = NULL;
-		dev->drvid[di][0] = '\0';
-		isdn_info_update();
-		set_global_features();
-		spin_unlock_irqrestore(&dev->lock, flags);
-		return 0;
-	case ISDN_STAT_L1ERR:
-		break;
-	case CAPI_PUT_MESSAGE:
-		return (isdn_capi_rec_hl_msg(&c->parm.cmsg));
-#ifdef CONFIG_ISDN_TTY_FAX
-	case ISDN_STAT_FAXIND:
-		isdn_tty_stat_callback(i, c);
-		break;
-#endif
-#ifdef CONFIG_ISDN_AUDIO
-	case ISDN_STAT_AUDIO:
-		isdn_tty_stat_callback(i, c);
-		break;
-#endif
-#ifdef CONFIG_ISDN_DIVERSION
-	case ISDN_STAT_PROT:
-	case ISDN_STAT_REDIR:
-		if (divert_if)
-			return (divert_if->stat_callback(c));
-#endif /* CONFIG_ISDN_DIVERSION */
-		/* fall through */
-	default:
-		return -1;
-	}
-	return 0;
-}
-
-/*
- * Get integer from char-pointer, set pointer to end of number
- */
-int
-isdn_getnum(char **p)
-{
-	int v = -1;
-
-	while (*p[0] >= '0' && *p[0] <= '9')
-		v = ((v < 0) ? 0 : (v * 10)) + (int) ((*p[0]++) - '0');
-	return v;
-}
-
-#define DLE 0x10
-
-/*
- * isdn_readbchan() tries to get data from the read-queue.
- * It MUST be called with interrupts off.
- *
- * Be aware that this is not an atomic operation when sleep != 0, even though
- * interrupts are turned off! Well, like that we are currently only called
- * on behalf of a read system call on raw device files (which are documented
- * to be dangerous and for debugging purpose only). The inode semaphore
- * takes care that this is not called for the same minor device number while
- * we are sleeping, but access is not serialized against simultaneous read()
- * from the corresponding ttyI device. Can other ugly events, like changes
- * of the mapping (di,ch)<->minor, happen during the sleep? --he
- */
-int
-isdn_readbchan(int di, int channel, u_char *buf, u_char *fp, int len, wait_queue_head_t *sleep)
-{
-	int count;
-	int count_pull;
-	int count_put;
-	int dflag;
-	struct sk_buff *skb;
-	u_char *cp;
-
-	if (!dev->drv[di])
-		return 0;
-	if (skb_queue_empty(&dev->drv[di]->rpqueue[channel])) {
-		if (sleep)
-			wait_event_interruptible(*sleep,
-				!skb_queue_empty(&dev->drv[di]->rpqueue[channel]));
-		else
-			return 0;
-	}
-	if (len > dev->drv[di]->rcvcount[channel])
-		len = dev->drv[di]->rcvcount[channel];
-	cp = buf;
-	count = 0;
-	while (len) {
-		if (!(skb = skb_peek(&dev->drv[di]->rpqueue[channel])))
-			break;
-#ifdef CONFIG_ISDN_AUDIO
-		if (ISDN_AUDIO_SKB_LOCK(skb))
-			break;
-		ISDN_AUDIO_SKB_LOCK(skb) = 1;
-		if ((ISDN_AUDIO_SKB_DLECOUNT(skb)) || (dev->drv[di]->DLEflag & (1 << channel))) {
-			char *p = skb->data;
-			unsigned long DLEmask = (1 << channel);
-
-			dflag = 0;
-			count_pull = count_put = 0;
-			while ((count_pull < skb->len) && (len > 0)) {
-				len--;
-				if (dev->drv[di]->DLEflag & DLEmask) {
-					*cp++ = DLE;
-					dev->drv[di]->DLEflag &= ~DLEmask;
-				} else {
-					*cp++ = *p;
-					if (*p == DLE) {
-						dev->drv[di]->DLEflag |= DLEmask;
-						(ISDN_AUDIO_SKB_DLECOUNT(skb))--;
-					}
-					p++;
-					count_pull++;
-				}
-				count_put++;
-			}
-			if (count_pull >= skb->len)
-				dflag = 1;
-		} else {
-#endif
-			/* No DLE's in buff, so simply copy it */
-			dflag = 1;
-			if ((count_pull = skb->len) > len) {
-				count_pull = len;
-				dflag = 0;
-			}
-			count_put = count_pull;
-			skb_copy_from_linear_data(skb, cp, count_put);
-			cp += count_put;
-			len -= count_put;
-#ifdef CONFIG_ISDN_AUDIO
-		}
-#endif
-		count += count_put;
-		if (fp) {
-			memset(fp, 0, count_put);
-			fp += count_put;
-		}
-		if (dflag) {
-			/* We got all the data in this buff.
-			 * Now we can dequeue it.
-			 */
-			if (fp)
-				*(fp - 1) = 0xff;
-#ifdef CONFIG_ISDN_AUDIO
-			ISDN_AUDIO_SKB_LOCK(skb) = 0;
-#endif
-			skb = skb_dequeue(&dev->drv[di]->rpqueue[channel]);
-			dev_kfree_skb(skb);
-		} else {
-			/* Not yet emptied this buff, so it
-			 * must stay in the queue, for further calls
-			 * but we pull off the data we got until now.
-			 */
-			skb_pull(skb, count_pull);
-#ifdef CONFIG_ISDN_AUDIO
-			ISDN_AUDIO_SKB_LOCK(skb) = 0;
-#endif
-		}
-		dev->drv[di]->rcvcount[channel] -= count_put;
-	}
-	return count;
-}
-
-/*
- * isdn_readbchan_tty() tries to get data from the read-queue.
- * It MUST be called with interrupts off.
- *
- * Be aware that this is not an atomic operation when sleep != 0, even though
- * interrupts are turned off! Well, like that we are currently only called
- * on behalf of a read system call on raw device files (which are documented
- * to be dangerous and for debugging purpose only). The inode semaphore
- * takes care that this is not called for the same minor device number while
- * we are sleeping, but access is not serialized against simultaneous read()
- * from the corresponding ttyI device. Can other ugly events, like changes
- * of the mapping (di,ch)<->minor, happen during the sleep? --he
- */
-int
-isdn_readbchan_tty(int di, int channel, struct tty_port *port, int cisco_hack)
-{
-	int count;
-	int count_pull;
-	int count_put;
-	int dflag;
-	struct sk_buff *skb;
-	char last = 0;
-	int len;
-
-	if (!dev->drv[di])
-		return 0;
-	if (skb_queue_empty(&dev->drv[di]->rpqueue[channel]))
-		return 0;
-
-	len = tty_buffer_request_room(port, dev->drv[di]->rcvcount[channel]);
-	if (len == 0)
-		return len;
-
-	count = 0;
-	while (len) {
-		if (!(skb = skb_peek(&dev->drv[di]->rpqueue[channel])))
-			break;
-#ifdef CONFIG_ISDN_AUDIO
-		if (ISDN_AUDIO_SKB_LOCK(skb))
-			break;
-		ISDN_AUDIO_SKB_LOCK(skb) = 1;
-		if ((ISDN_AUDIO_SKB_DLECOUNT(skb)) || (dev->drv[di]->DLEflag & (1 << channel))) {
-			char *p = skb->data;
-			unsigned long DLEmask = (1 << channel);
-
-			dflag = 0;
-			count_pull = count_put = 0;
-			while ((count_pull < skb->len) && (len > 0)) {
-				/* push every character but the last to the tty buffer directly */
-				if (count_put)
-					tty_insert_flip_char(port, last, TTY_NORMAL);
-				len--;
-				if (dev->drv[di]->DLEflag & DLEmask) {
-					last = DLE;
-					dev->drv[di]->DLEflag &= ~DLEmask;
-				} else {
-					last = *p;
-					if (last == DLE) {
-						dev->drv[di]->DLEflag |= DLEmask;
-						(ISDN_AUDIO_SKB_DLECOUNT(skb))--;
-					}
-					p++;
-					count_pull++;
-				}
-				count_put++;
-			}
-			if (count_pull >= skb->len)
-				dflag = 1;
-		} else {
-#endif
-			/* No DLE's in buff, so simply copy it */
-			dflag = 1;
-			if ((count_pull = skb->len) > len) {
-				count_pull = len;
-				dflag = 0;
-			}
-			count_put = count_pull;
-			if (count_put > 1)
-				tty_insert_flip_string(port, skb->data, count_put - 1);
-			last = skb->data[count_put - 1];
-			len -= count_put;
-#ifdef CONFIG_ISDN_AUDIO
-		}
-#endif
-		count += count_put;
-		if (dflag) {
-			/* We got all the data in this buff.
-			 * Now we can dequeue it.
-			 */
-			if (cisco_hack)
-				tty_insert_flip_char(port, last, 0xFF);
-			else
-				tty_insert_flip_char(port, last, TTY_NORMAL);
-#ifdef CONFIG_ISDN_AUDIO
-			ISDN_AUDIO_SKB_LOCK(skb) = 0;
-#endif
-			skb = skb_dequeue(&dev->drv[di]->rpqueue[channel]);
-			dev_kfree_skb(skb);
-		} else {
-			tty_insert_flip_char(port, last, TTY_NORMAL);
-			/* Not yet emptied this buff, so it
-			 * must stay in the queue, for further calls
-			 * but we pull off the data we got until now.
-			 */
-			skb_pull(skb, count_pull);
-#ifdef CONFIG_ISDN_AUDIO
-			ISDN_AUDIO_SKB_LOCK(skb) = 0;
-#endif
-		}
-		dev->drv[di]->rcvcount[channel] -= count_put;
-	}
-	return count;
-}
-
-
-static inline int
-isdn_minor2drv(int minor)
-{
-	return (dev->drvmap[minor]);
-}
-
-static inline int
-isdn_minor2chan(int minor)
-{
-	return (dev->chanmap[minor]);
-}
-
-static char *
-isdn_statstr(void)
-{
-	static char istatbuf[2048];
-	char *p;
-	int i;
-
-	sprintf(istatbuf, "idmap:\t");
-	p = istatbuf + strlen(istatbuf);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		sprintf(p, "%s ", (dev->drvmap[i] < 0) ? "-" : dev->drvid[dev->drvmap[i]]);
-		p = istatbuf + strlen(istatbuf);
-	}
-	sprintf(p, "\nchmap:\t");
-	p = istatbuf + strlen(istatbuf);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		sprintf(p, "%d ", dev->chanmap[i]);
-		p = istatbuf + strlen(istatbuf);
-	}
-	sprintf(p, "\ndrmap:\t");
-	p = istatbuf + strlen(istatbuf);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		sprintf(p, "%d ", dev->drvmap[i]);
-		p = istatbuf + strlen(istatbuf);
-	}
-	sprintf(p, "\nusage:\t");
-	p = istatbuf + strlen(istatbuf);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		sprintf(p, "%d ", dev->usage[i]);
-		p = istatbuf + strlen(istatbuf);
-	}
-	sprintf(p, "\nflags:\t");
-	p = istatbuf + strlen(istatbuf);
-	for (i = 0; i < ISDN_MAX_DRIVERS; i++) {
-		if (dev->drv[i]) {
-			sprintf(p, "%ld ", dev->drv[i]->online);
-			p = istatbuf + strlen(istatbuf);
-		} else {
-			sprintf(p, "? ");
-			p = istatbuf + strlen(istatbuf);
-		}
-	}
-	sprintf(p, "\nphone:\t");
-	p = istatbuf + strlen(istatbuf);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		sprintf(p, "%s ", dev->num[i]);
-		p = istatbuf + strlen(istatbuf);
-	}
-	sprintf(p, "\n");
-	return istatbuf;
-}
-
-/* Module interface-code */
-
-void
-isdn_info_update(void)
-{
-	infostruct *p = dev->infochain;
-
-	while (p) {
-		*(p->private) = 1;
-		p = (infostruct *) p->next;
-	}
-	wake_up_interruptible(&(dev->info_waitq));
-}
-
-static ssize_t
-isdn_read(struct file *file, char __user *buf, size_t count, loff_t *off)
-{
-	uint minor = iminor(file_inode(file));
-	int len = 0;
-	int drvidx;
-	int chidx;
-	int retval;
-	char *p;
-
-	mutex_lock(&isdn_mutex);
-	if (minor == ISDN_MINOR_STATUS) {
-		if (!file->private_data) {
-			if (file->f_flags & O_NONBLOCK) {
-				retval = -EAGAIN;
-				goto out;
-			}
-			wait_event_interruptible(dev->info_waitq,
-						 file->private_data);
-		}
-		p = isdn_statstr();
-		file->private_data = NULL;
-		if ((len = strlen(p)) <= count) {
-			if (copy_to_user(buf, p, len)) {
-				retval = -EFAULT;
-				goto out;
-			}
-			*off += len;
-			retval = len;
-			goto out;
-		}
-		retval = 0;
-		goto out;
-	}
-	if (!dev->drivers) {
-		retval = -ENODEV;
-		goto out;
-	}
-	if (minor <= ISDN_MINOR_BMAX) {
-		printk(KERN_WARNING "isdn_read minor %d obsolete!\n", minor);
-		drvidx = isdn_minor2drv(minor);
-		if (drvidx < 0) {
-			retval = -ENODEV;
-			goto out;
-		}
-		if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING)) {
-			retval = -ENODEV;
-			goto out;
-		}
-		chidx = isdn_minor2chan(minor);
-		if (!(p = kmalloc(count, GFP_KERNEL))) {
-			retval = -ENOMEM;
-			goto out;
-		}
-		len = isdn_readbchan(drvidx, chidx, p, NULL, count,
-				     &dev->drv[drvidx]->rcv_waitq[chidx]);
-		*off += len;
-		if (copy_to_user(buf, p, len))
-			len = -EFAULT;
-		kfree(p);
-		retval = len;
-		goto out;
-	}
-	if (minor <= ISDN_MINOR_CTRLMAX) {
-		drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL);
-		if (drvidx < 0) {
-			retval = -ENODEV;
-			goto out;
-		}
-		if (!dev->drv[drvidx]->stavail) {
-			if (file->f_flags & O_NONBLOCK) {
-				retval = -EAGAIN;
-				goto out;
-			}
-			wait_event_interruptible(dev->drv[drvidx]->st_waitq,
-						 dev->drv[drvidx]->stavail);
-		}
-		if (dev->drv[drvidx]->interface->readstat) {
-			if (count > dev->drv[drvidx]->stavail)
-				count = dev->drv[drvidx]->stavail;
-			len = dev->drv[drvidx]->interface->readstat(buf, count,
-								    drvidx, isdn_minor2chan(minor - ISDN_MINOR_CTRL));
-			if (len < 0) {
-				retval = len;
-				goto out;
-			}
-		} else {
-			len = 0;
-		}
-		if (len)
-			dev->drv[drvidx]->stavail -= len;
-		else
-			dev->drv[drvidx]->stavail = 0;
-		*off += len;
-		retval = len;
-		goto out;
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (minor <= ISDN_MINOR_PPPMAX) {
-		retval = isdn_ppp_read(minor - ISDN_MINOR_PPP, file, buf, count);
-		goto out;
-	}
-#endif
-	retval = -ENODEV;
-out:
-	mutex_unlock(&isdn_mutex);
-	return retval;
-}
-
-static ssize_t
-isdn_write(struct file *file, const char __user *buf, size_t count, loff_t *off)
-{
-	uint minor = iminor(file_inode(file));
-	int drvidx;
-	int chidx;
-	int retval;
-
-	if (minor == ISDN_MINOR_STATUS)
-		return -EPERM;
-	if (!dev->drivers)
-		return -ENODEV;
-
-	mutex_lock(&isdn_mutex);
-	if (minor <= ISDN_MINOR_BMAX) {
-		printk(KERN_WARNING "isdn_write minor %d obsolete!\n", minor);
-		drvidx = isdn_minor2drv(minor);
-		if (drvidx < 0) {
-			retval = -ENODEV;
-			goto out;
-		}
-		if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING)) {
-			retval = -ENODEV;
-			goto out;
-		}
-		chidx = isdn_minor2chan(minor);
-		wait_event_interruptible(dev->drv[drvidx]->snd_waitq[chidx],
-			(retval = isdn_writebuf_stub(drvidx, chidx, buf, count)));
-		goto out;
-	}
-	if (minor <= ISDN_MINOR_CTRLMAX) {
-		drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL);
-		if (drvidx < 0) {
-			retval = -ENODEV;
-			goto out;
-		}
-		/*
-		 * We want to use the isdnctrl device to load the firmware
-		 *
-		 if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING))
-		 return -ENODEV;
-		*/
-		if (dev->drv[drvidx]->interface->writecmd)
-			retval = dev->drv[drvidx]->interface->
-				writecmd(buf, count, drvidx,
-					 isdn_minor2chan(minor - ISDN_MINOR_CTRL));
-		else
-			retval = count;
-		goto out;
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (minor <= ISDN_MINOR_PPPMAX) {
-		retval = isdn_ppp_write(minor - ISDN_MINOR_PPP, file, buf, count);
-		goto out;
-	}
-#endif
-	retval = -ENODEV;
-out:
-	mutex_unlock(&isdn_mutex);
-	return retval;
-}
-
-static __poll_t
-isdn_poll(struct file *file, poll_table *wait)
-{
-	__poll_t mask = 0;
-	unsigned int minor = iminor(file_inode(file));
-	int drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL);
-
-	mutex_lock(&isdn_mutex);
-	if (minor == ISDN_MINOR_STATUS) {
-		poll_wait(file, &(dev->info_waitq), wait);
-		/* mask = EPOLLOUT | EPOLLWRNORM; */
-		if (file->private_data) {
-			mask |= EPOLLIN | EPOLLRDNORM;
-		}
-		goto out;
-	}
-	if (minor >= ISDN_MINOR_CTRL && minor <= ISDN_MINOR_CTRLMAX) {
-		if (drvidx < 0) {
-			/* driver deregistered while file open */
-			mask = EPOLLHUP;
-			goto out;
-		}
-		poll_wait(file, &(dev->drv[drvidx]->st_waitq), wait);
-		mask = EPOLLOUT | EPOLLWRNORM;
-		if (dev->drv[drvidx]->stavail) {
-			mask |= EPOLLIN | EPOLLRDNORM;
-		}
-		goto out;
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (minor <= ISDN_MINOR_PPPMAX) {
-		mask = isdn_ppp_poll(file, wait);
-		goto out;
-	}
-#endif
-	mask = EPOLLERR;
-out:
-	mutex_unlock(&isdn_mutex);
-	return mask;
-}
-
-
-static int
-isdn_ioctl(struct file *file, uint cmd, ulong arg)
-{
-	uint minor = iminor(file_inode(file));
-	isdn_ctrl c;
-	int drvidx;
-	int ret;
-	int i;
-	char __user *p;
-	char *s;
-	union iocpar {
-		char name[10];
-		char bname[22];
-		isdn_ioctl_struct iocts;
-		isdn_net_ioctl_phone phone;
-		isdn_net_ioctl_cfg cfg;
-	} iocpar;
-	void __user *argp = (void __user *)arg;
-
-#define name  iocpar.name
-#define bname iocpar.bname
-#define iocts iocpar.iocts
-#define phone iocpar.phone
-#define cfg   iocpar.cfg
-
-	if (minor == ISDN_MINOR_STATUS) {
-		switch (cmd) {
-		case IIOCGETDVR:
-			return (TTY_DV +
-				(NET_DV << 8) +
-				(INF_DV << 16));
-		case IIOCGETCPS:
-			if (arg) {
-				ulong __user *p = argp;
-				int i;
-				for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-					put_user(dev->ibytes[i], p++);
-					put_user(dev->obytes[i], p++);
-				}
-				return 0;
-			} else
-				return -EINVAL;
-			break;
-		case IIOCNETGPN:
-			/* Get peer phone number of a connected
-			 * isdn network interface */
-			if (arg) {
-				if (copy_from_user(&phone, argp, sizeof(phone)))
-					return -EFAULT;
-				return isdn_net_getpeer(&phone, argp);
-			} else
-				return -EINVAL;
-		default:
-			return -EINVAL;
-		}
-	}
-	if (!dev->drivers)
-		return -ENODEV;
-	if (minor <= ISDN_MINOR_BMAX) {
-		drvidx = isdn_minor2drv(minor);
-		if (drvidx < 0)
-			return -ENODEV;
-		if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING))
-			return -ENODEV;
-		return 0;
-	}
-	if (minor <= ISDN_MINOR_CTRLMAX) {
-/*
- * isdn net devices manage lots of configuration variables as linked lists.
- * Those lists must only be manipulated from user space. Some of the ioctl's
- * service routines access user space and are not atomic. Therefore, ioctl's
- * manipulating the lists and ioctl's sleeping while accessing the lists
- * are serialized by means of a semaphore.
- */
-		switch (cmd) {
-		case IIOCNETDWRSET:
-			printk(KERN_INFO "INFO: ISDN_DW_ABC_EXTENSION not enabled\n");
-			return (-EINVAL);
-		case IIOCNETLCR:
-			printk(KERN_INFO "INFO: ISDN_ABC_LCR_SUPPORT not enabled\n");
-			return -ENODEV;
-		case IIOCNETAIF:
-			/* Add a network-interface */
-			if (arg) {
-				if (copy_from_user(name, argp, sizeof(name)))
-					return -EFAULT;
-				s = name;
-			} else {
-				s = NULL;
-			}
-			ret = mutex_lock_interruptible(&dev->mtx);
-			if (ret) return ret;
-			if ((s = isdn_net_new(s, NULL))) {
-				if (copy_to_user(argp, s, strlen(s) + 1)) {
-					ret = -EFAULT;
-				} else {
-					ret = 0;
-				}
-			} else
-				ret = -ENODEV;
-			mutex_unlock(&dev->mtx);
-			return ret;
-		case IIOCNETASL:
-			/* Add a slave to a network-interface */
-			if (arg) {
-				if (copy_from_user(bname, argp, sizeof(bname) - 1))
-					return -EFAULT;
-				bname[sizeof(bname)-1] = 0;
-			} else
-				return -EINVAL;
-			ret = mutex_lock_interruptible(&dev->mtx);
-			if (ret) return ret;
-			if ((s = isdn_net_newslave(bname))) {
-				if (copy_to_user(argp, s, strlen(s) + 1)) {
-					ret = -EFAULT;
-				} else {
-					ret = 0;
-				}
-			} else
-				ret = -ENODEV;
-			mutex_unlock(&dev->mtx);
-			return ret;
-		case IIOCNETDIF:
-			/* Delete a network-interface */
-			if (arg) {
-				if (copy_from_user(name, argp, sizeof(name)))
-					return -EFAULT;
-				ret = mutex_lock_interruptible(&dev->mtx);
-				if (ret) return ret;
-				ret = isdn_net_rm(name);
-				mutex_unlock(&dev->mtx);
-				return ret;
-			} else
-				return -EINVAL;
-		case IIOCNETSCF:
-			/* Set configurable parameters of a network-interface */
-			if (arg) {
-				if (copy_from_user(&cfg, argp, sizeof(cfg)))
-					return -EFAULT;
-				return isdn_net_setcfg(&cfg);
-			} else
-				return -EINVAL;
-		case IIOCNETGCF:
-			/* Get configurable parameters of a network-interface */
-			if (arg) {
-				if (copy_from_user(&cfg, argp, sizeof(cfg)))
-					return -EFAULT;
-				if (!(ret = isdn_net_getcfg(&cfg))) {
-					if (copy_to_user(argp, &cfg, sizeof(cfg)))
-						return -EFAULT;
-				}
-				return ret;
-			} else
-				return -EINVAL;
-		case IIOCNETANM:
-			/* Add a phone-number to a network-interface */
-			if (arg) {
-				if (copy_from_user(&phone, argp, sizeof(phone)))
-					return -EFAULT;
-				ret = mutex_lock_interruptible(&dev->mtx);
-				if (ret) return ret;
-				ret = isdn_net_addphone(&phone);
-				mutex_unlock(&dev->mtx);
-				return ret;
-			} else
-				return -EINVAL;
-		case IIOCNETGNM:
-			/* Get list of phone-numbers of a network-interface */
-			if (arg) {
-				if (copy_from_user(&phone, argp, sizeof(phone)))
-					return -EFAULT;
-				ret = mutex_lock_interruptible(&dev->mtx);
-				if (ret) return ret;
-				ret = isdn_net_getphones(&phone, argp);
-				mutex_unlock(&dev->mtx);
-				return ret;
-			} else
-				return -EINVAL;
-		case IIOCNETDNM:
-			/* Delete a phone-number of a network-interface */
-			if (arg) {
-				if (copy_from_user(&phone, argp, sizeof(phone)))
-					return -EFAULT;
-				ret = mutex_lock_interruptible(&dev->mtx);
-				if (ret) return ret;
-				ret = isdn_net_delphone(&phone);
-				mutex_unlock(&dev->mtx);
-				return ret;
-			} else
-				return -EINVAL;
-		case IIOCNETDIL:
-			/* Force dialing of a network-interface */
-			if (arg) {
-				if (copy_from_user(name, argp, sizeof(name)))
-					return -EFAULT;
-				return isdn_net_force_dial(name);
-			} else
-				return -EINVAL;
-#ifdef CONFIG_ISDN_PPP
-		case IIOCNETALN:
-			if (!arg)
-				return -EINVAL;
-			if (copy_from_user(name, argp, sizeof(name)))
-				return -EFAULT;
-			return isdn_ppp_dial_slave(name);
-		case IIOCNETDLN:
-			if (!arg)
-				return -EINVAL;
-			if (copy_from_user(name, argp, sizeof(name)))
-				return -EFAULT;
-			return isdn_ppp_hangup_slave(name);
-#endif
-		case IIOCNETHUP:
-			/* Force hangup of a network-interface */
-			if (!arg)
-				return -EINVAL;
-			if (copy_from_user(name, argp, sizeof(name)))
-				return -EFAULT;
-			return isdn_net_force_hangup(name);
-			break;
-		case IIOCSETVER:
-			dev->net_verbose = arg;
-			printk(KERN_INFO "isdn: Verbose-Level is %d\n", dev->net_verbose);
-			return 0;
-		case IIOCSETGST:
-			if (arg)
-				dev->global_flags |= ISDN_GLOBAL_STOPPED;
-			else
-				dev->global_flags &= ~ISDN_GLOBAL_STOPPED;
-			printk(KERN_INFO "isdn: Global Mode %s\n",
-			       (dev->global_flags & ISDN_GLOBAL_STOPPED) ? "stopped" : "running");
-			return 0;
-		case IIOCSETBRJ:
-			drvidx = -1;
-			if (arg) {
-				int i;
-				char *p;
-				if (copy_from_user(&iocts, argp,
-						   sizeof(isdn_ioctl_struct)))
-					return -EFAULT;
-				iocts.drvid[sizeof(iocts.drvid) - 1] = 0;
-				if (strlen(iocts.drvid)) {
-					if ((p = strchr(iocts.drvid, ',')))
-						*p = 0;
-					drvidx = -1;
-					for (i = 0; i < ISDN_MAX_DRIVERS; i++)
-						if (!(strcmp(dev->drvid[i], iocts.drvid))) {
-							drvidx = i;
-							break;
-						}
-				}
-			}
-			if (drvidx == -1)
-				return -ENODEV;
-			if (iocts.arg)
-				dev->drv[drvidx]->flags |= DRV_FLAG_REJBUS;
-			else
-				dev->drv[drvidx]->flags &= ~DRV_FLAG_REJBUS;
-			return 0;
-		case IIOCSIGPRF:
-			dev->profd = current;
-			return 0;
-			break;
-		case IIOCGETPRF:
-			/* Get all Modem-Profiles */
-			if (arg) {
-				char __user *p = argp;
-				int i;
-
-				for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-					if (copy_to_user(p, dev->mdm.info[i].emu.profile,
-							 ISDN_MODEM_NUMREG))
-						return -EFAULT;
-					p += ISDN_MODEM_NUMREG;
-					if (copy_to_user(p, dev->mdm.info[i].emu.pmsn, ISDN_MSNLEN))
-						return -EFAULT;
-					p += ISDN_MSNLEN;
-					if (copy_to_user(p, dev->mdm.info[i].emu.plmsn, ISDN_LMSNLEN))
-						return -EFAULT;
-					p += ISDN_LMSNLEN;
-				}
-				return (ISDN_MODEM_NUMREG + ISDN_MSNLEN + ISDN_LMSNLEN) * ISDN_MAX_CHANNELS;
-			} else
-				return -EINVAL;
-			break;
-		case IIOCSETPRF:
-			/* Set all Modem-Profiles */
-			if (arg) {
-				char __user *p = argp;
-				int i;
-
-				for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-					if (copy_from_user(dev->mdm.info[i].emu.profile, p,
-							   ISDN_MODEM_NUMREG))
-						return -EFAULT;
-					p += ISDN_MODEM_NUMREG;
-					if (copy_from_user(dev->mdm.info[i].emu.plmsn, p, ISDN_LMSNLEN))
-						return -EFAULT;
-					p += ISDN_LMSNLEN;
-					if (copy_from_user(dev->mdm.info[i].emu.pmsn, p, ISDN_MSNLEN))
-						return -EFAULT;
-					p += ISDN_MSNLEN;
-				}
-				return 0;
-			} else
-				return -EINVAL;
-			break;
-		case IIOCSETMAP:
-		case IIOCGETMAP:
-			/* Set/Get MSN->EAZ-Mapping for a driver */
-			if (arg) {
-
-				if (copy_from_user(&iocts, argp,
-						   sizeof(isdn_ioctl_struct)))
-					return -EFAULT;
-				iocts.drvid[sizeof(iocts.drvid) - 1] = 0;
-				if (strlen(iocts.drvid)) {
-					drvidx = -1;
-					for (i = 0; i < ISDN_MAX_DRIVERS; i++)
-						if (!(strcmp(dev->drvid[i], iocts.drvid))) {
-							drvidx = i;
-							break;
-						}
-				} else
-					drvidx = 0;
-				if (drvidx == -1)
-					return -ENODEV;
-				if (cmd == IIOCSETMAP) {
-					int loop = 1;
-
-					p = (char __user *) iocts.arg;
-					i = 0;
-					while (loop) {
-						int j = 0;
-
-						while (1) {
-							get_user(bname[j], p++);
-							switch (bname[j]) {
-							case '\0':
-								loop = 0;
-								/* Fall through */
-							case ',':
-								bname[j] = '\0';
-								strcpy(dev->drv[drvidx]->msn2eaz[i], bname);
-								j = ISDN_MSNLEN;
-								break;
-							default:
-								j++;
-							}
-							if (j >= ISDN_MSNLEN)
-								break;
-						}
-						if (++i > 9)
-							break;
-					}
-				} else {
-					p = (char __user *) iocts.arg;
-					for (i = 0; i < 10; i++) {
-						snprintf(bname, sizeof(bname), "%s%s",
-							 strlen(dev->drv[drvidx]->msn2eaz[i]) ?
-							 dev->drv[drvidx]->msn2eaz[i] : "_",
-							 (i < 9) ? "," : "\0");
-						if (copy_to_user(p, bname, strlen(bname) + 1))
-							return -EFAULT;
-						p += strlen(bname);
-					}
-				}
-				return 0;
-			} else
-				return -EINVAL;
-		case IIOCDBGVAR:
-			return -EINVAL;
-		default:
-			if ((cmd & IIOCDRVCTL) == IIOCDRVCTL)
-				cmd = ((cmd >> _IOC_NRSHIFT) & _IOC_NRMASK) & ISDN_DRVIOCTL_MASK;
-			else
-				return -EINVAL;
-			if (arg) {
-				int i;
-				char *p;
-				if (copy_from_user(&iocts, argp, sizeof(isdn_ioctl_struct)))
-					return -EFAULT;
-				iocts.drvid[sizeof(iocts.drvid) - 1] = 0;
-				if (strlen(iocts.drvid)) {
-					if ((p = strchr(iocts.drvid, ',')))
-						*p = 0;
-					drvidx = -1;
-					for (i = 0; i < ISDN_MAX_DRIVERS; i++)
-						if (!(strcmp(dev->drvid[i], iocts.drvid))) {
-							drvidx = i;
-							break;
-						}
-				} else
-					drvidx = 0;
-				if (drvidx == -1)
-					return -ENODEV;
-				c.driver = drvidx;
-				c.command = ISDN_CMD_IOCTL;
-				c.arg = cmd;
-				memcpy(c.parm.num, &iocts.arg, sizeof(ulong));
-				ret = isdn_command(&c);
-				memcpy(&iocts.arg, c.parm.num, sizeof(ulong));
-				if (copy_to_user(argp, &iocts, sizeof(isdn_ioctl_struct)))
-					return -EFAULT;
-				return ret;
-			} else
-				return -EINVAL;
-		}
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (minor <= ISDN_MINOR_PPPMAX)
-		return (isdn_ppp_ioctl(minor - ISDN_MINOR_PPP, file, cmd, arg));
-#endif
-	return -ENODEV;
-
-#undef name
-#undef bname
-#undef iocts
-#undef phone
-#undef cfg
-}
-
-static long
-isdn_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	int ret;
-
-	mutex_lock(&isdn_mutex);
-	ret = isdn_ioctl(file, cmd, arg);
-	mutex_unlock(&isdn_mutex);
-
-	return ret;
-}
-
-/*
- * Open the device code.
- */
-static int
-isdn_open(struct inode *ino, struct file *filep)
-{
-	uint minor = iminor(ino);
-	int drvidx;
-	int chidx;
-	int retval = -ENODEV;
-
-	mutex_lock(&isdn_mutex);
-	if (minor == ISDN_MINOR_STATUS) {
-		infostruct *p;
-
-		if ((p = kmalloc(sizeof(infostruct), GFP_KERNEL))) {
-			p->next = (char *) dev->infochain;
-			p->private = (char *) &(filep->private_data);
-			dev->infochain = p;
-			/* At opening we allow a single update */
-			filep->private_data = (char *) 1;
-			retval = 0;
-			goto out;
-		} else {
-			retval = -ENOMEM;
-			goto out;
-		}
-	}
-	if (!dev->channels)
-		goto out;
-	if (minor <= ISDN_MINOR_BMAX) {
-		printk(KERN_WARNING "isdn_open minor %d obsolete!\n", minor);
-		drvidx = isdn_minor2drv(minor);
-		if (drvidx < 0)
-			goto out;
-		chidx = isdn_minor2chan(minor);
-		if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING))
-			goto out;
-		if (!(dev->drv[drvidx]->online & (1 << chidx)))
-			goto out;
-		isdn_lock_drivers();
-		retval = 0;
-		goto out;
-	}
-	if (minor <= ISDN_MINOR_CTRLMAX) {
-		drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL);
-		if (drvidx < 0)
-			goto out;
-		isdn_lock_drivers();
-		retval = 0;
-		goto out;
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (minor <= ISDN_MINOR_PPPMAX) {
-		retval = isdn_ppp_open(minor - ISDN_MINOR_PPP, filep);
-		if (retval == 0)
-			isdn_lock_drivers();
-		goto out;
-	}
-#endif
-out:
-	nonseekable_open(ino, filep);
-	mutex_unlock(&isdn_mutex);
-	return retval;
-}
-
-static int
-isdn_close(struct inode *ino, struct file *filep)
-{
-	uint minor = iminor(ino);
-
-	mutex_lock(&isdn_mutex);
-	if (minor == ISDN_MINOR_STATUS) {
-		infostruct *p = dev->infochain;
-		infostruct *q = NULL;
-
-		while (p) {
-			if (p->private == (char *) &(filep->private_data)) {
-				if (q)
-					q->next = p->next;
-				else
-					dev->infochain = (infostruct *) (p->next);
-				kfree(p);
-				goto out;
-			}
-			q = p;
-			p = (infostruct *) (p->next);
-		}
-		printk(KERN_WARNING "isdn: No private data while closing isdnctrl\n");
-		goto out;
-	}
-	isdn_unlock_drivers();
-	if (minor <= ISDN_MINOR_BMAX)
-		goto out;
-	if (minor <= ISDN_MINOR_CTRLMAX) {
-		if (dev->profd == current)
-			dev->profd = NULL;
-		goto out;
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (minor <= ISDN_MINOR_PPPMAX)
-		isdn_ppp_release(minor - ISDN_MINOR_PPP, filep);
-#endif
-
-out:
-	mutex_unlock(&isdn_mutex);
-	return 0;
-}
-
-static const struct file_operations isdn_fops =
-{
-	.owner		= THIS_MODULE,
-	.llseek		= no_llseek,
-	.read		= isdn_read,
-	.write		= isdn_write,
-	.poll		= isdn_poll,
-	.unlocked_ioctl	= isdn_unlocked_ioctl,
-	.open		= isdn_open,
-	.release	= isdn_close,
-};
-
-char *
-isdn_map_eaz2msn(char *msn, int di)
-{
-	isdn_driver_t *this = dev->drv[di];
-	int i;
-
-	if (strlen(msn) == 1) {
-		i = msn[0] - '0';
-		if ((i >= 0) && (i <= 9))
-			if (strlen(this->msn2eaz[i]))
-				return (this->msn2eaz[i]);
-	}
-	return (msn);
-}
-
-/*
- * Find an unused ISDN-channel, whose feature-flags match the
- * given L2- and L3-protocols.
- */
-#define L2V (~(ISDN_FEATURE_L2_V11096 | ISDN_FEATURE_L2_V11019 | ISDN_FEATURE_L2_V11038))
-
-/*
- * This function must be called with holding the dev->lock.
- */
-int
-isdn_get_free_channel(int usage, int l2_proto, int l3_proto, int pre_dev
-		      , int pre_chan, char *msn)
-{
-	int i;
-	ulong features;
-	ulong vfeatures;
-
-	features = ((1 << l2_proto) | (0x10000 << l3_proto));
-	vfeatures = (((1 << l2_proto) | (0x10000 << l3_proto)) &
-		     ~(ISDN_FEATURE_L2_V11096 | ISDN_FEATURE_L2_V11019 | ISDN_FEATURE_L2_V11038));
-	/* If Layer-2 protocol is V.110, accept drivers with
-	 * transparent feature even if these don't support V.110
-	 * because we can emulate this in linklevel.
-	 */
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		if (USG_NONE(dev->usage[i]) &&
-		    (dev->drvmap[i] != -1)) {
-			int d = dev->drvmap[i];
-			if ((dev->usage[i] & ISDN_USAGE_EXCLUSIVE) &&
-			    ((pre_dev != d) || (pre_chan != dev->chanmap[i])))
-				continue;
-			if (!strcmp(isdn_map_eaz2msn(msn, d), "-"))
-				continue;
-			if (dev->usage[i] & ISDN_USAGE_DISABLED)
-				continue; /* usage not allowed */
-			if (dev->drv[d]->flags & DRV_FLAG_RUNNING) {
-				if (((dev->drv[d]->interface->features & features) == features) ||
-				    (((dev->drv[d]->interface->features & vfeatures) == vfeatures) &&
-				     (dev->drv[d]->interface->features & ISDN_FEATURE_L2_TRANS))) {
-					if ((pre_dev < 0) || (pre_chan < 0)) {
-						dev->usage[i] &= ISDN_USAGE_EXCLUSIVE;
-						dev->usage[i] |= usage;
-						isdn_info_update();
-						return i;
-					} else {
-						if ((pre_dev == d) && (pre_chan == dev->chanmap[i])) {
-							dev->usage[i] &= ISDN_USAGE_EXCLUSIVE;
-							dev->usage[i] |= usage;
-							isdn_info_update();
-							return i;
-						}
-					}
-				}
-			}
-		}
-	return -1;
-}
-
-/*
- * Set state of ISDN-channel to 'unused'
- */
-void
-isdn_free_channel(int di, int ch, int usage)
-{
-	int i;
-
-	if ((di < 0) || (ch < 0)) {
-		printk(KERN_WARNING "%s: called with invalid drv(%d) or channel(%d)\n",
-		       __func__, di, ch);
-		return;
-	}
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		if (((!usage) || ((dev->usage[i] & ISDN_USAGE_MASK) == usage)) &&
-		    (dev->drvmap[i] == di) &&
-		    (dev->chanmap[i] == ch)) {
-			dev->usage[i] &= (ISDN_USAGE_NONE | ISDN_USAGE_EXCLUSIVE);
-			strcpy(dev->num[i], "???");
-			dev->ibytes[i] = 0;
-			dev->obytes[i] = 0;
-// 20.10.99 JIM, try to reinitialize v110 !
-			dev->v110emu[i] = 0;
-			atomic_set(&(dev->v110use[i]), 0);
-			isdn_v110_close(dev->v110[i]);
-			dev->v110[i] = NULL;
-// 20.10.99 JIM, try to reinitialize v110 !
-			isdn_info_update();
-			if (dev->drv[di])
-				skb_queue_purge(&dev->drv[di]->rpqueue[ch]);
-		}
-}
-
-/*
- * Cancel Exclusive-Flag for ISDN-channel
- */
-void
-isdn_unexclusive_channel(int di, int ch)
-{
-	int i;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		if ((dev->drvmap[i] == di) &&
-		    (dev->chanmap[i] == ch)) {
-			dev->usage[i] &= ~ISDN_USAGE_EXCLUSIVE;
-			isdn_info_update();
-			return;
-		}
-}
-
-/*
- *  writebuf replacement for SKB_ABLE drivers
- */
-static int
-isdn_writebuf_stub(int drvidx, int chan, const u_char __user *buf, int len)
-{
-	int ret;
-	int hl = dev->drv[drvidx]->interface->hl_hdrlen;
-	struct sk_buff *skb = alloc_skb(hl + len, GFP_ATOMIC);
-
-	if (!skb)
-		return -ENOMEM;
-	skb_reserve(skb, hl);
-	if (copy_from_user(skb_put(skb, len), buf, len)) {
-		dev_kfree_skb(skb);
-		return -EFAULT;
-	}
-	ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, 1, skb);
-	if (ret <= 0)
-		dev_kfree_skb(skb);
-	if (ret > 0)
-		dev->obytes[isdn_dc2minor(drvidx, chan)] += ret;
-	return ret;
-}
-
-/*
- * Return: length of data on success, -ERRcode on failure.
- */
-int
-isdn_writebuf_skb_stub(int drvidx, int chan, int ack, struct sk_buff *skb)
-{
-	int ret;
-	struct sk_buff *nskb = NULL;
-	int v110_ret = skb->len;
-	int idx = isdn_dc2minor(drvidx, chan);
-
-	if (dev->v110[idx]) {
-		atomic_inc(&dev->v110use[idx]);
-		nskb = isdn_v110_encode(dev->v110[idx], skb);
-		atomic_dec(&dev->v110use[idx]);
-		if (!nskb)
-			return 0;
-		v110_ret = *((int *)nskb->data);
-		skb_pull(nskb, sizeof(int));
-		if (!nskb->len) {
-			dev_kfree_skb(nskb);
-			return v110_ret;
-		}
-		/* V.110 must always be acknowledged */
-		ack = 1;
-		ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, ack, nskb);
-	} else {
-		int hl = dev->drv[drvidx]->interface->hl_hdrlen;
-
-		if (skb_headroom(skb) < hl) {
-			/*
-			 * This should only occur when new HL driver with
-			 * increased hl_hdrlen was loaded after netdevice
-			 * was created and connected to the new driver.
-			 *
-			 * The V.110 branch (re-allocates on its own) does
-			 * not need this
-			 */
-			struct sk_buff *skb_tmp;
-
-			skb_tmp = skb_realloc_headroom(skb, hl);
-			printk(KERN_DEBUG "isdn_writebuf_skb_stub: reallocating headroom%s\n", skb_tmp ? "" : " failed");
-			if (!skb_tmp) return -ENOMEM; /* 0 better? */
-			ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, ack, skb_tmp);
-			if (ret > 0) {
-				dev_kfree_skb(skb);
-			} else {
-				dev_kfree_skb(skb_tmp);
-			}
-		} else {
-			ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, ack, skb);
-		}
-	}
-	if (ret > 0) {
-		dev->obytes[idx] += ret;
-		if (dev->v110[idx]) {
-			atomic_inc(&dev->v110use[idx]);
-			dev->v110[idx]->skbuser++;
-			atomic_dec(&dev->v110use[idx]);
-			/* For V.110 return unencoded data length */
-			ret = v110_ret;
-			/* if the complete frame was send we free the skb;
-			   if not upper function will requeue the skb */
-			if (ret == skb->len)
-				dev_kfree_skb(skb);
-		}
-	} else
-		if (dev->v110[idx])
-			dev_kfree_skb(nskb);
-	return ret;
-}
-
-static int
-isdn_add_channels(isdn_driver_t *d, int drvidx, int n, int adding)
-{
-	int j, k, m;
-
-	init_waitqueue_head(&d->st_waitq);
-	if (d->flags & DRV_FLAG_RUNNING)
-		return -1;
-	if (n < 1) return 0;
-
-	m = (adding) ? d->channels + n : n;
-
-	if (dev->channels + n > ISDN_MAX_CHANNELS) {
-		printk(KERN_WARNING "register_isdn: Max. %d channels supported\n",
-		       ISDN_MAX_CHANNELS);
-		return -1;
-	}
-
-	if ((adding) && (d->rcverr))
-		kfree(d->rcverr);
-	if (!(d->rcverr = kcalloc(m, sizeof(int), GFP_ATOMIC))) {
-		printk(KERN_WARNING "register_isdn: Could not alloc rcverr\n");
-		return -1;
-	}
-
-	if ((adding) && (d->rcvcount))
-		kfree(d->rcvcount);
-	if (!(d->rcvcount = kcalloc(m, sizeof(int), GFP_ATOMIC))) {
-		printk(KERN_WARNING "register_isdn: Could not alloc rcvcount\n");
-		if (!adding)
-			kfree(d->rcverr);
-		return -1;
-	}
-
-	if ((adding) && (d->rpqueue)) {
-		for (j = 0; j < d->channels; j++)
-			skb_queue_purge(&d->rpqueue[j]);
-		kfree(d->rpqueue);
-	}
-	d->rpqueue = kmalloc_array(m, sizeof(struct sk_buff_head), GFP_ATOMIC);
-	if (!d->rpqueue) {
-		printk(KERN_WARNING "register_isdn: Could not alloc rpqueue\n");
-		if (!adding) {
-			kfree(d->rcvcount);
-			kfree(d->rcverr);
-		}
-		return -1;
-	}
-	for (j = 0; j < m; j++) {
-		skb_queue_head_init(&d->rpqueue[j]);
-	}
-
-	if ((adding) && (d->rcv_waitq))
-		kfree(d->rcv_waitq);
-	d->rcv_waitq = kmalloc(array3_size(sizeof(wait_queue_head_t), 2, m),
-			       GFP_ATOMIC);
-	if (!d->rcv_waitq) {
-		printk(KERN_WARNING "register_isdn: Could not alloc rcv_waitq\n");
-		if (!adding) {
-			kfree(d->rpqueue);
-			kfree(d->rcvcount);
-			kfree(d->rcverr);
-		}
-		return -1;
-	}
-	d->snd_waitq = d->rcv_waitq + m;
-	for (j = 0; j < m; j++) {
-		init_waitqueue_head(&d->rcv_waitq[j]);
-		init_waitqueue_head(&d->snd_waitq[j]);
-	}
-
-	dev->channels += n;
-	for (j = d->channels; j < m; j++)
-		for (k = 0; k < ISDN_MAX_CHANNELS; k++)
-			if (dev->chanmap[k] < 0) {
-				dev->chanmap[k] = j;
-				dev->drvmap[k] = drvidx;
-				break;
-			}
-	d->channels = m;
-	return 0;
-}
-
-/*
- * Low-level-driver registration
- */
-
-static void
-set_global_features(void)
-{
-	int drvidx;
-
-	dev->global_features = 0;
-	for (drvidx = 0; drvidx < ISDN_MAX_DRIVERS; drvidx++) {
-		if (!dev->drv[drvidx])
-			continue;
-		if (dev->drv[drvidx]->interface)
-			dev->global_features |= dev->drv[drvidx]->interface->features;
-	}
-}
-
-#ifdef CONFIG_ISDN_DIVERSION
-
-static char *map_drvname(int di)
-{
-	if ((di < 0) || (di >= ISDN_MAX_DRIVERS))
-		return (NULL);
-	return (dev->drvid[di]); /* driver name */
-} /* map_drvname */
-
-static int map_namedrv(char *id)
-{  int i;
-
-	for (i = 0; i < ISDN_MAX_DRIVERS; i++)
-	{ if (!strcmp(dev->drvid[i], id))
-			return (i);
-	}
-	return (-1);
-} /* map_namedrv */
-
-int DIVERT_REG_NAME(isdn_divert_if *i_div)
-{
-	if (i_div->if_magic != DIVERT_IF_MAGIC)
-		return (DIVERT_VER_ERR);
-	switch (i_div->cmd)
-	{
-	case DIVERT_CMD_REL:
-		if (divert_if != i_div)
-			return (DIVERT_REL_ERR);
-		divert_if = NULL; /* free interface */
-		return (DIVERT_NO_ERR);
-
-	case DIVERT_CMD_REG:
-		if (divert_if)
-			return (DIVERT_REG_ERR);
-		i_div->ll_cmd = isdn_command; /* set command function */
-		i_div->drv_to_name = map_drvname;
-		i_div->name_to_drv = map_namedrv;
-		divert_if = i_div; /* remember interface */
-		return (DIVERT_NO_ERR);
-
-	default:
-		return (DIVERT_CMD_ERR);
-	}
-} /* DIVERT_REG_NAME */
-
-EXPORT_SYMBOL(DIVERT_REG_NAME);
-
-#endif /* CONFIG_ISDN_DIVERSION */
-
-
-EXPORT_SYMBOL(register_isdn);
-#ifdef CONFIG_ISDN_PPP
-EXPORT_SYMBOL(isdn_ppp_register_compressor);
-EXPORT_SYMBOL(isdn_ppp_unregister_compressor);
-#endif
-
-int
-register_isdn(isdn_if *i)
-{
-	isdn_driver_t *d;
-	int j;
-	ulong flags;
-	int drvidx;
-
-	if (dev->drivers >= ISDN_MAX_DRIVERS) {
-		printk(KERN_WARNING "register_isdn: Max. %d drivers supported\n",
-		       ISDN_MAX_DRIVERS);
-		return 0;
-	}
-	if (!i->writebuf_skb) {
-		printk(KERN_WARNING "register_isdn: No write routine given.\n");
-		return 0;
-	}
-	if (!(d = kzalloc(sizeof(isdn_driver_t), GFP_KERNEL))) {
-		printk(KERN_WARNING "register_isdn: Could not alloc driver-struct\n");
-		return 0;
-	}
-
-	d->maxbufsize = i->maxbufsize;
-	d->pktcount = 0;
-	d->stavail = 0;
-	d->flags = DRV_FLAG_LOADED;
-	d->online = 0;
-	d->interface = i;
-	d->channels = 0;
-	spin_lock_irqsave(&dev->lock, flags);
-	for (drvidx = 0; drvidx < ISDN_MAX_DRIVERS; drvidx++)
-		if (!dev->drv[drvidx])
-			break;
-	if (isdn_add_channels(d, drvidx, i->channels, 0)) {
-		spin_unlock_irqrestore(&dev->lock, flags);
-		kfree(d);
-		return 0;
-	}
-	i->channels = drvidx;
-	i->rcvcallb_skb = isdn_receive_skb_callback;
-	i->statcallb = isdn_status_callback;
-	if (!strlen(i->id))
-		sprintf(i->id, "line%d", drvidx);
-	for (j = 0; j < drvidx; j++)
-		if (!strcmp(i->id, dev->drvid[j]))
-			sprintf(i->id, "line%d", drvidx);
-	dev->drv[drvidx] = d;
-	strcpy(dev->drvid[drvidx], i->id);
-	isdn_info_update();
-	dev->drivers++;
-	set_global_features();
-	spin_unlock_irqrestore(&dev->lock, flags);
-	return 1;
-}
-
-/*
-*****************************************************************************
-* And now the modules code.
-*****************************************************************************
-*/
-
-static char *
-isdn_getrev(const char *revision)
-{
-	char *rev;
-	char *p;
-
-	if ((p = strchr(revision, ':'))) {
-		rev = p + 2;
-		p = strchr(rev, '$');
-		*--p = 0;
-	} else
-		rev = "???";
-	return rev;
-}
-
-/*
- * Allocate and initialize all data, register modem-devices
- */
-static int __init isdn_init(void)
-{
-	int i;
-	char tmprev[50];
-
-	dev = vzalloc(sizeof(isdn_dev));
-	if (!dev) {
-		printk(KERN_WARNING "isdn: Could not allocate device-struct.\n");
-		return -EIO;
-	}
-	timer_setup(&dev->timer, isdn_timer_funct, 0);
-	spin_lock_init(&dev->lock);
-	spin_lock_init(&dev->timerlock);
-#ifdef MODULE
-	dev->owner = THIS_MODULE;
-#endif
-	mutex_init(&dev->mtx);
-	init_waitqueue_head(&dev->info_waitq);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		dev->drvmap[i] = -1;
-		dev->chanmap[i] = -1;
-		dev->m_idx[i] = -1;
-		strcpy(dev->num[i], "???");
-	}
-	if (register_chrdev(ISDN_MAJOR, "isdn", &isdn_fops)) {
-		printk(KERN_WARNING "isdn: Could not register control devices\n");
-		vfree(dev);
-		return -EIO;
-	}
-	if ((isdn_tty_modem_init()) < 0) {
-		printk(KERN_WARNING "isdn: Could not register tty devices\n");
-		vfree(dev);
-		unregister_chrdev(ISDN_MAJOR, "isdn");
-		return -EIO;
-	}
-#ifdef CONFIG_ISDN_PPP
-	if (isdn_ppp_init() < 0) {
-		printk(KERN_WARNING "isdn: Could not create PPP-device-structs\n");
-		isdn_tty_exit();
-		unregister_chrdev(ISDN_MAJOR, "isdn");
-		vfree(dev);
-		return -EIO;
-	}
-#endif                          /* CONFIG_ISDN_PPP */
-
-	strcpy(tmprev, isdn_revision);
-	printk(KERN_NOTICE "ISDN subsystem Rev: %s/", isdn_getrev(tmprev));
-	strcpy(tmprev, isdn_net_revision);
-	printk("%s/", isdn_getrev(tmprev));
-	strcpy(tmprev, isdn_ppp_revision);
-	printk("%s/", isdn_getrev(tmprev));
-	strcpy(tmprev, isdn_audio_revision);
-	printk("%s/", isdn_getrev(tmprev));
-	strcpy(tmprev, isdn_v110_revision);
-	printk("%s", isdn_getrev(tmprev));
-
-#ifdef MODULE
-	printk(" loaded\n");
-#else
-	printk("\n");
-#endif
-	isdn_info_update();
-	return 0;
-}
-
-/*
- * Unload module
- */
-static void __exit isdn_exit(void)
-{
-#ifdef CONFIG_ISDN_PPP
-	isdn_ppp_cleanup();
-#endif
-	if (isdn_net_rmall() < 0) {
-		printk(KERN_WARNING "isdn: net-device busy, remove cancelled\n");
-		return;
-	}
-	isdn_tty_exit();
-	unregister_chrdev(ISDN_MAJOR, "isdn");
-	del_timer_sync(&dev->timer);
-	/* call vfree with interrupts enabled, else it will hang */
-	vfree(dev);
-	printk(KERN_NOTICE "ISDN-subsystem unloaded\n");
-}
-
-module_init(isdn_init);
-module_exit(isdn_exit);
diff --git a/drivers/isdn/i4l/isdn_common.h b/drivers/isdn/i4l/isdn_common.h
deleted file mode 100644
index 2260ef07ab9c..000000000000
--- a/drivers/isdn/i4l/isdn_common.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* $Id: isdn_common.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * header for Linux ISDN subsystem
- * common used functions and debugging-switches (linklevel).
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#undef  ISDN_DEBUG_MODEM_OPEN
-#undef  ISDN_DEBUG_MODEM_IOCTL
-#undef  ISDN_DEBUG_MODEM_WAITSENT
-#undef  ISDN_DEBUG_MODEM_HUP
-#undef  ISDN_DEBUG_MODEM_ICALL
-#undef  ISDN_DEBUG_MODEM_DUMP
-#undef  ISDN_DEBUG_MODEM_VOICE
-#undef  ISDN_DEBUG_AT
-#undef  ISDN_DEBUG_NET_DUMP
-#undef  ISDN_DEBUG_NET_DIAL
-#undef  ISDN_DEBUG_NET_ICALL
-
-/* Prototypes */
-extern void isdn_lock_drivers(void);
-extern void isdn_unlock_drivers(void);
-extern void isdn_free_channel(int di, int ch, int usage);
-extern void isdn_all_eaz(int di, int ch);
-extern int isdn_command(isdn_ctrl *);
-extern int isdn_dc2minor(int di, int ch);
-extern void isdn_info_update(void);
-extern char *isdn_map_eaz2msn(char *msn, int di);
-extern void isdn_timer_ctrl(int tf, int onoff);
-extern void isdn_unexclusive_channel(int di, int ch);
-extern int isdn_getnum(char **);
-extern int isdn_readbchan(int, int, u_char *, u_char *, int, wait_queue_head_t *);
-extern int isdn_readbchan_tty(int, int, struct tty_port *, int);
-extern int isdn_get_free_channel(int, int, int, int, int, char *);
-extern int isdn_writebuf_skb_stub(int, int, int, struct sk_buff *);
-extern int register_isdn(isdn_if *i);
-extern int isdn_msncmp(const char *,  const char *);
-#if defined(ISDN_DEBUG_NET_DUMP) || defined(ISDN_DEBUG_MODEM_DUMP)
-extern void isdn_dumppkt(char *, u_char *, int, int);
-#endif
diff --git a/drivers/isdn/i4l/isdn_concap.c b/drivers/isdn/i4l/isdn_concap.c
deleted file mode 100644
index 336523ec077c..000000000000
--- a/drivers/isdn/i4l/isdn_concap.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/* $Id: isdn_concap.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, protocol encapsulation
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-/* Stuff to support the concap_proto by isdn4linux. isdn4linux - specific
- * stuff goes here. Stuff that depends only on the concap protocol goes to
- * another -- protocol specific -- source file.
- *
- */
-
-
-#include <linux/isdn.h>
-#include "isdn_x25iface.h"
-#include "isdn_net.h"
-#include <linux/concap.h>
-#include "isdn_concap.h"
-
-
-/* The following set of device service operations are for encapsulation
-   protocols that require for reliable datalink semantics. That means:
-
-   - before any data is to be submitted the connection must explicitly
-   be set up.
-   - after the successful set up of the connection is signalled the
-   connection is considered to be reliably up.
-
-   Auto-dialing ist not compatible with this requirements. Thus, auto-dialing
-   is completely bypassed.
-
-   It might be possible to implement a (non standardized) datalink protocol
-   that provides a reliable data link service while using some auto dialing
-   mechanism. Such a protocol would need an auxiliary channel (i.e. user-user-
-   signaling on the D-channel) while the B-channel is down.
-*/
-
-
-static int isdn_concap_dl_data_req(struct concap_proto *concap, struct sk_buff *skb)
-{
-	struct net_device *ndev = concap->net_dev;
-	isdn_net_dev *nd = ((isdn_net_local *) netdev_priv(ndev))->netdev;
-	isdn_net_local *lp = isdn_net_get_locked_lp(nd);
-
-	IX25DEBUG("isdn_concap_dl_data_req: %s \n", concap->net_dev->name);
-	if (!lp) {
-		IX25DEBUG("isdn_concap_dl_data_req: %s : isdn_net_send_skb returned %d\n", concap->net_dev->name, 1);
-		return 1;
-	}
-	lp->huptimer = 0;
-	isdn_net_writebuf_skb(lp, skb);
-	spin_unlock_bh(&lp->xmit_lock);
-	IX25DEBUG("isdn_concap_dl_data_req: %s : isdn_net_send_skb returned %d\n", concap->net_dev->name, 0);
-	return 0;
-}
-
-
-static int isdn_concap_dl_connect_req(struct concap_proto *concap)
-{
-	struct net_device *ndev = concap->net_dev;
-	isdn_net_local *lp = netdev_priv(ndev);
-	int ret;
-	IX25DEBUG("isdn_concap_dl_connect_req: %s \n", ndev->name);
-
-	/* dial ... */
-	ret = isdn_net_dial_req(lp);
-	if (ret) IX25DEBUG("dialing failed\n");
-	return ret;
-}
-
-static int isdn_concap_dl_disconn_req(struct concap_proto *concap)
-{
-	IX25DEBUG("isdn_concap_dl_disconn_req: %s \n", concap->net_dev->name);
-
-	isdn_net_hangup(concap->net_dev);
-	return 0;
-}
-
-struct concap_device_ops isdn_concap_reliable_dl_dops = {
-	.data_req = &isdn_concap_dl_data_req,
-	.connect_req = &isdn_concap_dl_connect_req,
-	.disconn_req = &isdn_concap_dl_disconn_req
-};
-
-/* The following should better go into a dedicated source file such that
-   this sourcefile does not need to include any protocol specific header
-   files. For now:
-*/
-struct concap_proto *isdn_concap_new(int encap)
-{
-	switch (encap) {
-	case ISDN_NET_ENCAP_X25IFACE:
-		return isdn_x25iface_proto_new();
-	}
-	return NULL;
-}
diff --git a/drivers/isdn/i4l/isdn_concap.h b/drivers/isdn/i4l/isdn_concap.h
deleted file mode 100644
index cd7e3ba74e25..000000000000
--- a/drivers/isdn/i4l/isdn_concap.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* $Id: isdn_concap.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, protocol encapsulation
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-extern struct concap_device_ops isdn_concap_reliable_dl_dops;
-extern struct concap_proto *isdn_concap_new(int);
diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c
deleted file mode 100644
index c138f66f2659..000000000000
--- a/drivers/isdn/i4l/isdn_net.c
+++ /dev/null
@@ -1,3198 +0,0 @@
-/* $Id: isdn_net.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, network interfaces and related functions (linklevel).
- *
- * Copyright 1994-1998  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- * Data Over Voice (DOV) support added - Guy Ellis 23-Mar-02
- *                                       guy@traverse.com.au
- * Outgoing calls - looks for a 'V' in first char of dialed number
- * Incoming calls - checks first character of eaz as follows:
- *   Numeric - accept DATA only - original functionality
- *   'V'     - accept VOICE (DOV) only
- *   'B'     - accept BOTH DATA and DOV types
- *
- * Jan 2001: fix CISCO HDLC      Bjoern A. Zeeb <i4l@zabbadoz.net>
- *           for info on the protocol, see
- *           http://i4l.zabbadoz.net/i4l/cisco-hdlc.txt
- */
-
-#include <linux/isdn.h>
-#include <linux/slab.h>
-#include <net/arp.h>
-#include <net/dst.h>
-#include <net/pkt_sched.h>
-#include <linux/inetdevice.h>
-#include "isdn_common.h"
-#include "isdn_net.h"
-#ifdef CONFIG_ISDN_PPP
-#include "isdn_ppp.h"
-#endif
-#ifdef CONFIG_ISDN_X25
-#include <linux/concap.h>
-#include "isdn_concap.h"
-#endif
-
-
-/*
- * Outline of new tbusy handling:
- *
- * Old method, roughly spoken, consisted of setting tbusy when entering
- * isdn_net_start_xmit() and at several other locations and clearing
- * it from isdn_net_start_xmit() thread when sending was successful.
- *
- * With 2.3.x multithreaded network core, to prevent problems, tbusy should
- * only be set by the isdn_net_start_xmit() thread and only when a tx-busy
- * condition is detected. Other threads (in particular isdn_net_stat_callb())
- * are only allowed to clear tbusy.
- *
- * -HE
- */
-
-/*
- * About SOFTNET:
- * Most of the changes were pretty obvious and basically done by HE already.
- *
- * One problem of the isdn net device code is that it uses struct net_device
- * for masters and slaves. However, only master interface are registered to
- * the network layer, and therefore, it only makes sense to call netif_*
- * functions on them.
- *
- * --KG
- */
-
-/*
- * Find out if the netdevice has been ifup-ed yet.
- * For slaves, look at the corresponding master.
- */
-static __inline__ int isdn_net_device_started(isdn_net_dev *n)
-{
-	isdn_net_local *lp = n->local;
-	struct net_device *dev;
-
-	if (lp->master)
-		dev = lp->master;
-	else
-		dev = n->dev;
-	return netif_running(dev);
-}
-
-/*
- * wake up the network -> net_device queue.
- * For slaves, wake the corresponding master interface.
- */
-static __inline__ void isdn_net_device_wake_queue(isdn_net_local *lp)
-{
-	if (lp->master)
-		netif_wake_queue(lp->master);
-	else
-		netif_wake_queue(lp->netdev->dev);
-}
-
-/*
- * stop the network -> net_device queue.
- * For slaves, stop the corresponding master interface.
- */
-static __inline__ void isdn_net_device_stop_queue(isdn_net_local *lp)
-{
-	if (lp->master)
-		netif_stop_queue(lp->master);
-	else
-		netif_stop_queue(lp->netdev->dev);
-}
-
-/*
- * find out if the net_device which this lp belongs to (lp can be
- * master or slave) is busy. It's busy iff all (master and slave)
- * queues are busy
- */
-static __inline__ int isdn_net_device_busy(isdn_net_local *lp)
-{
-	isdn_net_local *nlp;
-	isdn_net_dev *nd;
-	unsigned long flags;
-
-	if (!isdn_net_lp_busy(lp))
-		return 0;
-
-	if (lp->master)
-		nd = ISDN_MASTER_PRIV(lp)->netdev;
-	else
-		nd = lp->netdev;
-
-	spin_lock_irqsave(&nd->queue_lock, flags);
-	nlp = lp->next;
-	while (nlp != lp) {
-		if (!isdn_net_lp_busy(nlp)) {
-			spin_unlock_irqrestore(&nd->queue_lock, flags);
-			return 0;
-		}
-		nlp = nlp->next;
-	}
-	spin_unlock_irqrestore(&nd->queue_lock, flags);
-	return 1;
-}
-
-static __inline__ void isdn_net_inc_frame_cnt(isdn_net_local *lp)
-{
-	atomic_inc(&lp->frame_cnt);
-	if (isdn_net_device_busy(lp))
-		isdn_net_device_stop_queue(lp);
-}
-
-static __inline__ void isdn_net_dec_frame_cnt(isdn_net_local *lp)
-{
-	atomic_dec(&lp->frame_cnt);
-
-	if (!(isdn_net_device_busy(lp))) {
-		if (!skb_queue_empty(&lp->super_tx_queue)) {
-			schedule_work(&lp->tqueue);
-		} else {
-			isdn_net_device_wake_queue(lp);
-		}
-	}
-}
-
-static __inline__ void isdn_net_zero_frame_cnt(isdn_net_local *lp)
-{
-	atomic_set(&lp->frame_cnt, 0);
-}
-
-/* For 2.2.x we leave the transmitter busy timeout at 2 secs, just
- * to be safe.
- * For 2.3.x we push it up to 20 secs, because call establishment
- * (in particular callback) may take such a long time, and we
- * don't want confusing messages in the log. However, there is a slight
- * possibility that this large timeout will break other things like MPPP,
- * which might rely on the tx timeout. If so, we'll find out this way...
- */
-
-#define ISDN_NET_TX_TIMEOUT (20 * HZ)
-
-/* Prototypes */
-
-static int isdn_net_force_dial_lp(isdn_net_local *);
-static netdev_tx_t isdn_net_start_xmit(struct sk_buff *,
-				       struct net_device *);
-
-static void isdn_net_ciscohdlck_connected(isdn_net_local *lp);
-static void isdn_net_ciscohdlck_disconnected(isdn_net_local *lp);
-
-char *isdn_net_revision = "$Revision: 1.1.2.2 $";
-
-/*
- * Code for raw-networking over ISDN
- */
-
-static void
-isdn_net_unreachable(struct net_device *dev, struct sk_buff *skb, char *reason)
-{
-	if (skb) {
-
-		u_short proto = ntohs(skb->protocol);
-
-		printk(KERN_DEBUG "isdn_net: %s: %s, signalling dst_link_failure %s\n",
-		       dev->name,
-		       (reason != NULL) ? reason : "unknown",
-		       (proto != ETH_P_IP) ? "Protocol != ETH_P_IP" : "");
-
-		dst_link_failure(skb);
-	}
-	else {  /* dial not triggered by rawIP packet */
-		printk(KERN_DEBUG "isdn_net: %s: %s\n",
-		       dev->name,
-		       (reason != NULL) ? reason : "reason unknown");
-	}
-}
-
-static void
-isdn_net_reset(struct net_device *dev)
-{
-#ifdef CONFIG_ISDN_X25
-	struct concap_device_ops *dops =
-		((isdn_net_local *)netdev_priv(dev))->dops;
-	struct concap_proto *cprot =
-		((isdn_net_local *)netdev_priv(dev))->netdev->cprot;
-#endif
-#ifdef CONFIG_ISDN_X25
-	if (cprot && cprot->pops && dops)
-		cprot->pops->restart(cprot, dev, dops);
-#endif
-}
-
-/* Open/initialize the board. */
-static int
-isdn_net_open(struct net_device *dev)
-{
-	int i;
-	struct net_device *p;
-	struct in_device *in_dev;
-
-	/* moved here from isdn_net_reset, because only the master has an
-	   interface associated which is supposed to be started. BTW:
-	   we need to call netif_start_queue, not netif_wake_queue here */
-	netif_start_queue(dev);
-
-	isdn_net_reset(dev);
-	/* Fill in the MAC-level header (not needed, but for compatibility... */
-	for (i = 0; i < ETH_ALEN - sizeof(u32); i++)
-		dev->dev_addr[i] = 0xfc;
-	if ((in_dev = dev->ip_ptr) != NULL) {
-		/*
-		 *      Any address will do - we take the first
-		 */
-		struct in_ifaddr *ifa = in_dev->ifa_list;
-		if (ifa != NULL)
-			memcpy(dev->dev_addr + 2, &ifa->ifa_local, 4);
-	}
-
-	/* If this interface has slaves, start them also */
-	p = MASTER_TO_SLAVE(dev);
-	if (p) {
-		while (p) {
-			isdn_net_reset(p);
-			p = MASTER_TO_SLAVE(p);
-		}
-	}
-	isdn_lock_drivers();
-	return 0;
-}
-
-/*
- * Assign an ISDN-channel to a net-interface
- */
-static void
-isdn_net_bind_channel(isdn_net_local *lp, int idx)
-{
-	lp->flags |= ISDN_NET_CONNECTED;
-	lp->isdn_device = dev->drvmap[idx];
-	lp->isdn_channel = dev->chanmap[idx];
-	dev->rx_netdev[idx] = lp->netdev;
-	dev->st_netdev[idx] = lp->netdev;
-}
-
-/*
- * unbind a net-interface (resets interface after an error)
- */
-static void
-isdn_net_unbind_channel(isdn_net_local *lp)
-{
-	skb_queue_purge(&lp->super_tx_queue);
-
-	if (!lp->master) {	/* reset only master device */
-		/* Moral equivalent of dev_purge_queues():
-		   BEWARE! This chunk of code cannot be called from hardware
-		   interrupt handler. I hope it is true. --ANK
-		*/
-		qdisc_reset_all_tx(lp->netdev->dev);
-	}
-	lp->dialstate = 0;
-	dev->rx_netdev[isdn_dc2minor(lp->isdn_device, lp->isdn_channel)] = NULL;
-	dev->st_netdev[isdn_dc2minor(lp->isdn_device, lp->isdn_channel)] = NULL;
-	if (lp->isdn_device != -1 && lp->isdn_channel != -1)
-		isdn_free_channel(lp->isdn_device, lp->isdn_channel,
-				  ISDN_USAGE_NET);
-	lp->flags &= ~ISDN_NET_CONNECTED;
-	lp->isdn_device = -1;
-	lp->isdn_channel = -1;
-}
-
-/*
- * Perform auto-hangup and cps-calculation for net-interfaces.
- *
- * auto-hangup:
- * Increment idle-counter (this counter is reset on any incoming or
- * outgoing packet), if counter exceeds configured limit either do a
- * hangup immediately or - if configured - wait until just before the next
- * charge-info.
- *
- * cps-calculation (needed for dynamic channel-bundling):
- * Since this function is called every second, simply reset the
- * byte-counter of the interface after copying it to the cps-variable.
- */
-static unsigned long last_jiffies = -HZ;
-
-void
-isdn_net_autohup(void)
-{
-	isdn_net_dev *p = dev->netdev;
-	int anymore;
-
-	anymore = 0;
-	while (p) {
-		isdn_net_local *l = p->local;
-		if (jiffies == last_jiffies)
-			l->cps = l->transcount;
-		else
-			l->cps = (l->transcount * HZ) / (jiffies - last_jiffies);
-		l->transcount = 0;
-		if (dev->net_verbose > 3)
-			printk(KERN_DEBUG "%s: %d bogocps\n", p->dev->name, l->cps);
-		if ((l->flags & ISDN_NET_CONNECTED) && (!l->dialstate)) {
-			anymore = 1;
-			l->huptimer++;
-			/*
-			 * if there is some dialmode where timeout-hangup
-			 * should _not_ be done, check for that here
-			 */
-			if ((l->onhtime) &&
-			    (l->huptimer > l->onhtime))
-			{
-				if (l->hupflags & ISDN_MANCHARGE &&
-				    l->hupflags & ISDN_CHARGEHUP) {
-					while (time_after(jiffies, l->chargetime + l->chargeint))
-						l->chargetime += l->chargeint;
-					if (time_after(jiffies, l->chargetime + l->chargeint - 2 * HZ))
-						if (l->outgoing || l->hupflags & ISDN_INHUP)
-							isdn_net_hangup(p->dev);
-				} else if (l->outgoing) {
-					if (l->hupflags & ISDN_CHARGEHUP) {
-						if (l->hupflags & ISDN_WAITCHARGE) {
-							printk(KERN_DEBUG "isdn_net: Hupflags of %s are %X\n",
-							       p->dev->name, l->hupflags);
-							isdn_net_hangup(p->dev);
-						} else if (time_after(jiffies, l->chargetime + l->chargeint)) {
-							printk(KERN_DEBUG
-							       "isdn_net: %s: chtime = %lu, chint = %d\n",
-							       p->dev->name, l->chargetime, l->chargeint);
-							isdn_net_hangup(p->dev);
-						}
-					} else
-						isdn_net_hangup(p->dev);
-				} else if (l->hupflags & ISDN_INHUP)
-					isdn_net_hangup(p->dev);
-			}
-
-			if (dev->global_flags & ISDN_GLOBAL_STOPPED || (ISDN_NET_DIALMODE(*l) == ISDN_NET_DM_OFF)) {
-				isdn_net_hangup(p->dev);
-				break;
-			}
-		}
-		p = (isdn_net_dev *) p->next;
-	}
-	last_jiffies = jiffies;
-	isdn_timer_ctrl(ISDN_TIMER_NETHANGUP, anymore);
-}
-
-static void isdn_net_lp_disconnected(isdn_net_local *lp)
-{
-	isdn_net_rm_from_bundle(lp);
-}
-
-/*
- * Handle status-messages from ISDN-interfacecard.
- * This function is called from within the main-status-dispatcher
- * isdn_status_callback, which itself is called from the low-level driver.
- * Return: 1 = Event handled, 0 = not for us or unknown Event.
- */
-int
-isdn_net_stat_callback(int idx, isdn_ctrl *c)
-{
-	isdn_net_dev *p = dev->st_netdev[idx];
-	int cmd = c->command;
-
-	if (p) {
-		isdn_net_local *lp = p->local;
-#ifdef CONFIG_ISDN_X25
-		struct concap_proto *cprot = lp->netdev->cprot;
-		struct concap_proto_ops *pops = cprot ? cprot->pops : NULL;
-#endif
-		switch (cmd) {
-		case ISDN_STAT_BSENT:
-			/* A packet has successfully been sent out */
-			if ((lp->flags & ISDN_NET_CONNECTED) &&
-			    (!lp->dialstate)) {
-				isdn_net_dec_frame_cnt(lp);
-				lp->stats.tx_packets++;
-				lp->stats.tx_bytes += c->parm.length;
-			}
-			return 1;
-		case ISDN_STAT_DCONN:
-			/* D-Channel is up */
-			switch (lp->dialstate) {
-			case 4:
-			case 7:
-			case 8:
-				lp->dialstate++;
-				return 1;
-			case 12:
-				lp->dialstate = 5;
-				return 1;
-			}
-			break;
-		case ISDN_STAT_DHUP:
-			/* Either D-Channel-hangup or error during dialout */
-#ifdef CONFIG_ISDN_X25
-			/* If we are not connencted then dialing had
-			   failed. If there are generic encap protocol
-			   receiver routines signal the closure of
-			   the link*/
-
-			if (!(lp->flags & ISDN_NET_CONNECTED)
-			    && pops && pops->disconn_ind)
-				pops->disconn_ind(cprot);
-#endif /* CONFIG_ISDN_X25 */
-			if ((!lp->dialstate) && (lp->flags & ISDN_NET_CONNECTED)) {
-				if (lp->p_encap == ISDN_NET_ENCAP_CISCOHDLCK)
-					isdn_net_ciscohdlck_disconnected(lp);
-#ifdef CONFIG_ISDN_PPP
-				if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP)
-					isdn_ppp_free(lp);
-#endif
-				isdn_net_lp_disconnected(lp);
-				isdn_all_eaz(lp->isdn_device, lp->isdn_channel);
-				printk(KERN_INFO "%s: remote hangup\n", p->dev->name);
-				printk(KERN_INFO "%s: Chargesum is %d\n", p->dev->name,
-				       lp->charge);
-				isdn_net_unbind_channel(lp);
-				return 1;
-			}
-			break;
-#ifdef CONFIG_ISDN_X25
-		case ISDN_STAT_BHUP:
-			/* B-Channel-hangup */
-			/* try if there are generic encap protocol
-			   receiver routines and signal the closure of
-			   the link */
-			if (pops && pops->disconn_ind) {
-				pops->disconn_ind(cprot);
-				return 1;
-			}
-			break;
-#endif /* CONFIG_ISDN_X25 */
-		case ISDN_STAT_BCONN:
-			/* B-Channel is up */
-			isdn_net_zero_frame_cnt(lp);
-			switch (lp->dialstate) {
-			case 5:
-			case 6:
-			case 7:
-			case 8:
-			case 9:
-			case 10:
-			case 12:
-				if (lp->dialstate <= 6) {
-					dev->usage[idx] |= ISDN_USAGE_OUTGOING;
-					isdn_info_update();
-				} else
-					dev->rx_netdev[idx] = p;
-				lp->dialstate = 0;
-				isdn_timer_ctrl(ISDN_TIMER_NETHANGUP, 1);
-				if (lp->p_encap == ISDN_NET_ENCAP_CISCOHDLCK)
-					isdn_net_ciscohdlck_connected(lp);
-				if (lp->p_encap != ISDN_NET_ENCAP_SYNCPPP) {
-					if (lp->master) { /* is lp a slave? */
-						isdn_net_dev *nd = ISDN_MASTER_PRIV(lp)->netdev;
-						isdn_net_add_to_bundle(nd, lp);
-					}
-				}
-				printk(KERN_INFO "isdn_net: %s connected\n", p->dev->name);
-				/* If first Chargeinfo comes before B-Channel connect,
-				 * we correct the timestamp here.
-				 */
-				lp->chargetime = jiffies;
-
-				/* reset dial-timeout */
-				lp->dialstarted = 0;
-				lp->dialwait_timer = 0;
-
-#ifdef CONFIG_ISDN_PPP
-				if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP)
-					isdn_ppp_wakeup_daemon(lp);
-#endif
-#ifdef CONFIG_ISDN_X25
-				/* try if there are generic concap receiver routines */
-				if (pops)
-					if (pops->connect_ind)
-						pops->connect_ind(cprot);
-#endif /* CONFIG_ISDN_X25 */
-				/* ppp needs to do negotiations first */
-				if (lp->p_encap != ISDN_NET_ENCAP_SYNCPPP)
-					isdn_net_device_wake_queue(lp);
-				return 1;
-			}
-			break;
-		case ISDN_STAT_NODCH:
-			/* No D-Channel avail. */
-			if (lp->dialstate == 4) {
-				lp->dialstate--;
-				return 1;
-			}
-			break;
-		case ISDN_STAT_CINF:
-			/* Charge-info from TelCo. Calculate interval between
-			 * charge-infos and set timestamp for last info for
-			 * usage by isdn_net_autohup()
-			 */
-			lp->charge++;
-			if (lp->hupflags & ISDN_HAVECHARGE) {
-				lp->hupflags &= ~ISDN_WAITCHARGE;
-				lp->chargeint = jiffies - lp->chargetime - (2 * HZ);
-			}
-			if (lp->hupflags & ISDN_WAITCHARGE)
-				lp->hupflags |= ISDN_HAVECHARGE;
-			lp->chargetime = jiffies;
-			printk(KERN_DEBUG "isdn_net: Got CINF chargetime of %s now %lu\n",
-			       p->dev->name, lp->chargetime);
-			return 1;
-		}
-	}
-	return 0;
-}
-
-/*
- * Perform dialout for net-interfaces and timeout-handling for
- * D-Channel-up and B-Channel-up Messages.
- * This function is initially called from within isdn_net_start_xmit() or
- * or isdn_net_find_icall() after initializing the dialstate for an
- * interface. If further calls are needed, the function schedules itself
- * for a timer-callback via isdn_timer_function().
- * The dialstate is also affected by incoming status-messages from
- * the ISDN-Channel which are handled in isdn_net_stat_callback() above.
- */
-void
-isdn_net_dial(void)
-{
-	isdn_net_dev *p = dev->netdev;
-	int anymore = 0;
-	int i;
-	isdn_ctrl cmd;
-	u_char *phone_number;
-
-	while (p) {
-		isdn_net_local *lp = p->local;
-
-#ifdef ISDN_DEBUG_NET_DIAL
-		if (lp->dialstate)
-			printk(KERN_DEBUG "%s: dialstate=%d\n", p->dev->name, lp->dialstate);
-#endif
-		switch (lp->dialstate) {
-		case 0:
-			/* Nothing to do for this interface */
-			break;
-		case 1:
-			/* Initiate dialout. Set phone-number-pointer to first number
-			 * of interface.
-			 */
-			lp->dial = lp->phone[1];
-			if (!lp->dial) {
-				printk(KERN_WARNING "%s: phone number deleted?\n",
-				       p->dev->name);
-				isdn_net_hangup(p->dev);
-				break;
-			}
-			anymore = 1;
-
-			if (lp->dialtimeout > 0)
-				if (lp->dialstarted == 0 || time_after(jiffies, lp->dialstarted + lp->dialtimeout + lp->dialwait)) {
-					lp->dialstarted = jiffies;
-					lp->dialwait_timer = 0;
-				}
-
-			lp->dialstate++;
-			/* Fall through */
-		case 2:
-			/* Prepare dialing. Clear EAZ, then set EAZ. */
-			cmd.driver = lp->isdn_device;
-			cmd.arg = lp->isdn_channel;
-			cmd.command = ISDN_CMD_CLREAZ;
-			isdn_command(&cmd);
-			sprintf(cmd.parm.num, "%s", isdn_map_eaz2msn(lp->msn, cmd.driver));
-			cmd.command = ISDN_CMD_SETEAZ;
-			isdn_command(&cmd);
-			lp->dialretry = 0;
-			anymore = 1;
-			lp->dialstate++;
-			/* Fall through */
-		case 3:
-			/* Setup interface, dial current phone-number, switch to next number.
-			 * If list of phone-numbers is exhausted, increment
-			 * retry-counter.
-			 */
-			if (dev->global_flags & ISDN_GLOBAL_STOPPED || (ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_OFF)) {
-				char *s;
-				if (dev->global_flags & ISDN_GLOBAL_STOPPED)
-					s = "dial suppressed: isdn system stopped";
-				else
-					s = "dial suppressed: dialmode `off'";
-				isdn_net_unreachable(p->dev, NULL, s);
-				isdn_net_hangup(p->dev);
-				break;
-			}
-			cmd.driver = lp->isdn_device;
-			cmd.command = ISDN_CMD_SETL2;
-			cmd.arg = lp->isdn_channel + (lp->l2_proto << 8);
-			isdn_command(&cmd);
-			cmd.driver = lp->isdn_device;
-			cmd.command = ISDN_CMD_SETL3;
-			cmd.arg = lp->isdn_channel + (lp->l3_proto << 8);
-			isdn_command(&cmd);
-			cmd.driver = lp->isdn_device;
-			cmd.arg = lp->isdn_channel;
-			if (!lp->dial) {
-				printk(KERN_WARNING "%s: phone number deleted?\n",
-				       p->dev->name);
-				isdn_net_hangup(p->dev);
-				break;
-			}
-			if (!strncmp(lp->dial->num, "LEASED", strlen("LEASED"))) {
-				lp->dialstate = 4;
-				printk(KERN_INFO "%s: Open leased line ...\n", p->dev->name);
-			} else {
-				if (lp->dialtimeout > 0)
-					if (time_after(jiffies, lp->dialstarted + lp->dialtimeout)) {
-						lp->dialwait_timer = jiffies + lp->dialwait;
-						lp->dialstarted = 0;
-						isdn_net_unreachable(p->dev, NULL, "dial: timed out");
-						isdn_net_hangup(p->dev);
-						break;
-					}
-
-				cmd.driver = lp->isdn_device;
-				cmd.command = ISDN_CMD_DIAL;
-				cmd.parm.setup.si2 = 0;
-
-				/* check for DOV */
-				phone_number = lp->dial->num;
-				if ((*phone_number == 'v') ||
-				    (*phone_number == 'V')) { /* DOV call */
-					cmd.parm.setup.si1 = 1;
-				} else { /* DATA call */
-					cmd.parm.setup.si1 = 7;
-				}
-
-				strcpy(cmd.parm.setup.phone, phone_number);
-				/*
-				 * Switch to next number or back to start if at end of list.
-				 */
-				if (!(lp->dial = (isdn_net_phone *) lp->dial->next)) {
-					lp->dial = lp->phone[1];
-					lp->dialretry++;
-
-					if (lp->dialretry > lp->dialmax) {
-						if (lp->dialtimeout == 0) {
-							lp->dialwait_timer = jiffies + lp->dialwait;
-							lp->dialstarted = 0;
-							isdn_net_unreachable(p->dev, NULL, "dial: tried all numbers dialmax times");
-						}
-						isdn_net_hangup(p->dev);
-						break;
-					}
-				}
-				sprintf(cmd.parm.setup.eazmsn, "%s",
-					isdn_map_eaz2msn(lp->msn, cmd.driver));
-				i = isdn_dc2minor(lp->isdn_device, lp->isdn_channel);
-				if (i >= 0) {
-					strcpy(dev->num[i], cmd.parm.setup.phone);
-					dev->usage[i] |= ISDN_USAGE_OUTGOING;
-					isdn_info_update();
-				}
-				printk(KERN_INFO "%s: dialing %d %s... %s\n", p->dev->name,
-				       lp->dialretry, cmd.parm.setup.phone,
-				       (cmd.parm.setup.si1 == 1) ? "DOV" : "");
-				lp->dtimer = 0;
-#ifdef ISDN_DEBUG_NET_DIAL
-				printk(KERN_DEBUG "dial: d=%d c=%d\n", lp->isdn_device,
-				       lp->isdn_channel);
-#endif
-				isdn_command(&cmd);
-			}
-			lp->huptimer = 0;
-			lp->outgoing = 1;
-			if (lp->chargeint) {
-				lp->hupflags |= ISDN_HAVECHARGE;
-				lp->hupflags &= ~ISDN_WAITCHARGE;
-			} else {
-				lp->hupflags |= ISDN_WAITCHARGE;
-				lp->hupflags &= ~ISDN_HAVECHARGE;
-			}
-			anymore = 1;
-			lp->dialstate =
-				(lp->cbdelay &&
-				 (lp->flags & ISDN_NET_CBOUT)) ? 12 : 4;
-			break;
-		case 4:
-			/* Wait for D-Channel-connect.
-			 * If timeout, switch back to state 3.
-			 * Dialmax-handling moved to state 3.
-			 */
-			if (lp->dtimer++ > ISDN_TIMER_DTIMEOUT10)
-				lp->dialstate = 3;
-			anymore = 1;
-			break;
-		case 5:
-			/* Got D-Channel-Connect, send B-Channel-request */
-			cmd.driver = lp->isdn_device;
-			cmd.arg = lp->isdn_channel;
-			cmd.command = ISDN_CMD_ACCEPTB;
-			anymore = 1;
-			lp->dtimer = 0;
-			lp->dialstate++;
-			isdn_command(&cmd);
-			break;
-		case 6:
-			/* Wait for B- or D-Channel-connect. If timeout,
-			 * switch back to state 3.
-			 */
-#ifdef ISDN_DEBUG_NET_DIAL
-			printk(KERN_DEBUG "dialtimer2: %d\n", lp->dtimer);
-#endif
-			if (lp->dtimer++ > ISDN_TIMER_DTIMEOUT10)
-				lp->dialstate = 3;
-			anymore = 1;
-			break;
-		case 7:
-			/* Got incoming Call, setup L2 and L3 protocols,
-			 * then wait for D-Channel-connect
-			 */
-#ifdef ISDN_DEBUG_NET_DIAL
-			printk(KERN_DEBUG "dialtimer4: %d\n", lp->dtimer);
-#endif
-			cmd.driver = lp->isdn_device;
-			cmd.command = ISDN_CMD_SETL2;
-			cmd.arg = lp->isdn_channel + (lp->l2_proto << 8);
-			isdn_command(&cmd);
-			cmd.driver = lp->isdn_device;
-			cmd.command = ISDN_CMD_SETL3;
-			cmd.arg = lp->isdn_channel + (lp->l3_proto << 8);
-			isdn_command(&cmd);
-			if (lp->dtimer++ > ISDN_TIMER_DTIMEOUT15)
-				isdn_net_hangup(p->dev);
-			else {
-				anymore = 1;
-				lp->dialstate++;
-			}
-			break;
-		case 9:
-			/* Got incoming D-Channel-Connect, send B-Channel-request */
-			cmd.driver = lp->isdn_device;
-			cmd.arg = lp->isdn_channel;
-			cmd.command = ISDN_CMD_ACCEPTB;
-			isdn_command(&cmd);
-			anymore = 1;
-			lp->dtimer = 0;
-			lp->dialstate++;
-			break;
-		case 8:
-		case 10:
-			/*  Wait for B- or D-channel-connect */
-#ifdef ISDN_DEBUG_NET_DIAL
-			printk(KERN_DEBUG "dialtimer4: %d\n", lp->dtimer);
-#endif
-			if (lp->dtimer++ > ISDN_TIMER_DTIMEOUT10)
-				isdn_net_hangup(p->dev);
-			else
-				anymore = 1;
-			break;
-		case 11:
-			/* Callback Delay */
-			if (lp->dtimer++ > lp->cbdelay)
-				lp->dialstate = 1;
-			anymore = 1;
-			break;
-		case 12:
-			/* Remote does callback. Hangup after cbdelay, then wait for incoming
-			 * call (in state 4).
-			 */
-			if (lp->dtimer++ > lp->cbdelay)
-			{
-				printk(KERN_INFO "%s: hangup waiting for callback ...\n", p->dev->name);
-				lp->dtimer = 0;
-				lp->dialstate = 4;
-				cmd.driver = lp->isdn_device;
-				cmd.command = ISDN_CMD_HANGUP;
-				cmd.arg = lp->isdn_channel;
-				isdn_command(&cmd);
-				isdn_all_eaz(lp->isdn_device, lp->isdn_channel);
-			}
-			anymore = 1;
-			break;
-		default:
-			printk(KERN_WARNING "isdn_net: Illegal dialstate %d for device %s\n",
-			       lp->dialstate, p->dev->name);
-		}
-		p = (isdn_net_dev *) p->next;
-	}
-	isdn_timer_ctrl(ISDN_TIMER_NETDIAL, anymore);
-}
-
-/*
- * Perform hangup for a net-interface.
- */
-void
-isdn_net_hangup(struct net_device *d)
-{
-	isdn_net_local *lp = netdev_priv(d);
-	isdn_ctrl cmd;
-#ifdef CONFIG_ISDN_X25
-	struct concap_proto *cprot = lp->netdev->cprot;
-	struct concap_proto_ops *pops = cprot ? cprot->pops : NULL;
-#endif
-
-	if (lp->flags & ISDN_NET_CONNECTED) {
-		if (lp->slave != NULL) {
-			isdn_net_local *slp = ISDN_SLAVE_PRIV(lp);
-			if (slp->flags & ISDN_NET_CONNECTED) {
-				printk(KERN_INFO
-				       "isdn_net: hang up slave %s before %s\n",
-				       lp->slave->name, d->name);
-				isdn_net_hangup(lp->slave);
-			}
-		}
-		printk(KERN_INFO "isdn_net: local hangup %s\n", d->name);
-#ifdef CONFIG_ISDN_PPP
-		if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP)
-			isdn_ppp_free(lp);
-#endif
-		isdn_net_lp_disconnected(lp);
-#ifdef CONFIG_ISDN_X25
-		/* try if there are generic encap protocol
-		   receiver routines and signal the closure of
-		   the link */
-		if (pops && pops->disconn_ind)
-			pops->disconn_ind(cprot);
-#endif /* CONFIG_ISDN_X25 */
-
-		cmd.driver = lp->isdn_device;
-		cmd.command = ISDN_CMD_HANGUP;
-		cmd.arg = lp->isdn_channel;
-		isdn_command(&cmd);
-		printk(KERN_INFO "%s: Chargesum is %d\n", d->name, lp->charge);
-		isdn_all_eaz(lp->isdn_device, lp->isdn_channel);
-	}
-	isdn_net_unbind_channel(lp);
-}
-
-typedef struct {
-	__be16 source;
-	__be16 dest;
-} ip_ports;
-
-static void
-isdn_net_log_skb(struct sk_buff *skb, isdn_net_local *lp)
-{
-	/* hopefully, this was set correctly */
-	const u_char *p = skb_network_header(skb);
-	unsigned short proto = ntohs(skb->protocol);
-	int data_ofs;
-	ip_ports *ipp;
-	char addinfo[100];
-
-	addinfo[0] = '\0';
-	/* This check stolen from 2.1.72 dev_queue_xmit_nit() */
-	if (p < skb->data || skb_network_header(skb) >= skb_tail_pointer(skb)) {
-		/* fall back to old isdn_net_log_packet method() */
-		char *buf = skb->data;
-
-		printk(KERN_DEBUG "isdn_net: protocol %04x is buggy, dev %s\n", skb->protocol, lp->netdev->dev->name);
-		p = buf;
-		proto = ETH_P_IP;
-		switch (lp->p_encap) {
-		case ISDN_NET_ENCAP_IPTYP:
-			proto = ntohs(*(__be16 *)&buf[0]);
-			p = &buf[2];
-			break;
-		case ISDN_NET_ENCAP_ETHER:
-			proto = ntohs(*(__be16 *)&buf[12]);
-			p = &buf[14];
-			break;
-		case ISDN_NET_ENCAP_CISCOHDLC:
-			proto = ntohs(*(__be16 *)&buf[2]);
-			p = &buf[4];
-			break;
-#ifdef CONFIG_ISDN_PPP
-		case ISDN_NET_ENCAP_SYNCPPP:
-			proto = ntohs(skb->protocol);
-			p = &buf[IPPP_MAX_HEADER];
-			break;
-#endif
-		}
-	}
-	data_ofs = ((p[0] & 15) * 4);
-	switch (proto) {
-	case ETH_P_IP:
-		switch (p[9]) {
-		case 1:
-			strcpy(addinfo, " ICMP");
-			break;
-		case 2:
-			strcpy(addinfo, " IGMP");
-			break;
-		case 4:
-			strcpy(addinfo, " IPIP");
-			break;
-		case 6:
-			ipp = (ip_ports *) (&p[data_ofs]);
-			sprintf(addinfo, " TCP, port: %d -> %d", ntohs(ipp->source),
-				ntohs(ipp->dest));
-			break;
-		case 8:
-			strcpy(addinfo, " EGP");
-			break;
-		case 12:
-			strcpy(addinfo, " PUP");
-			break;
-		case 17:
-			ipp = (ip_ports *) (&p[data_ofs]);
-			sprintf(addinfo, " UDP, port: %d -> %d", ntohs(ipp->source),
-				ntohs(ipp->dest));
-			break;
-		case 22:
-			strcpy(addinfo, " IDP");
-			break;
-		}
-		printk(KERN_INFO "OPEN: %pI4 -> %pI4%s\n",
-		       p + 12, p + 16, addinfo);
-		break;
-	case ETH_P_ARP:
-		printk(KERN_INFO "OPEN: ARP %pI4 -> *.*.*.* ?%pI4\n",
-		       p + 14, p + 24);
-		break;
-	}
-}
-
-/*
- * this function is used to send supervisory data, i.e. data which was
- * not received from the network layer, but e.g. frames from ipppd, CCP
- * reset frames etc.
- */
-void isdn_net_write_super(isdn_net_local *lp, struct sk_buff *skb)
-{
-	if (in_irq()) {
-		// we can't grab the lock from irq context,
-		// so we just queue the packet
-		skb_queue_tail(&lp->super_tx_queue, skb);
-		schedule_work(&lp->tqueue);
-		return;
-	}
-
-	spin_lock_bh(&lp->xmit_lock);
-	if (!isdn_net_lp_busy(lp)) {
-		isdn_net_writebuf_skb(lp, skb);
-	} else {
-		skb_queue_tail(&lp->super_tx_queue, skb);
-	}
-	spin_unlock_bh(&lp->xmit_lock);
-}
-
-/*
- * called from tq_immediate
- */
-static void isdn_net_softint(struct work_struct *work)
-{
-	isdn_net_local *lp = container_of(work, isdn_net_local, tqueue);
-	struct sk_buff *skb;
-
-	spin_lock_bh(&lp->xmit_lock);
-	while (!isdn_net_lp_busy(lp)) {
-		skb = skb_dequeue(&lp->super_tx_queue);
-		if (!skb)
-			break;
-		isdn_net_writebuf_skb(lp, skb);
-	}
-	spin_unlock_bh(&lp->xmit_lock);
-}
-
-/*
- * all frames sent from the (net) LL to a HL driver should go via this function
- * it's serialized by the caller holding the lp->xmit_lock spinlock
- */
-void isdn_net_writebuf_skb(isdn_net_local *lp, struct sk_buff *skb)
-{
-	int ret;
-	int len = skb->len;     /* save len */
-
-	/* before obtaining the lock the caller should have checked that
-	   the lp isn't busy */
-	if (isdn_net_lp_busy(lp)) {
-		printk("isdn BUG at %s:%d!\n", __FILE__, __LINE__);
-		goto error;
-	}
-
-	if (!(lp->flags & ISDN_NET_CONNECTED)) {
-		printk("isdn BUG at %s:%d!\n", __FILE__, __LINE__);
-		goto error;
-	}
-	ret = isdn_writebuf_skb_stub(lp->isdn_device, lp->isdn_channel, 1, skb);
-	if (ret != len) {
-		/* we should never get here */
-		printk(KERN_WARNING "%s: HL driver queue full\n", lp->netdev->dev->name);
-		goto error;
-	}
-
-	lp->transcount += len;
-	isdn_net_inc_frame_cnt(lp);
-	return;
-
-error:
-	dev_kfree_skb(skb);
-	lp->stats.tx_errors++;
-
-}
-
-
-/*
- *  Helper function for isdn_net_start_xmit.
- *  When called, the connection is already established.
- *  Based on cps-calculation, check if device is overloaded.
- *  If so, and if a slave exists, trigger dialing for it.
- *  If any slave is online, deliver packets using a simple round robin
- *  scheme.
- *
- *  Return: 0 on success, !0 on failure.
- */
-
-static int
-isdn_net_xmit(struct net_device *ndev, struct sk_buff *skb)
-{
-	isdn_net_dev *nd;
-	isdn_net_local *slp;
-	isdn_net_local *lp = netdev_priv(ndev);
-	int retv = NETDEV_TX_OK;
-
-	if (((isdn_net_local *) netdev_priv(ndev))->master) {
-		printk("isdn BUG at %s:%d!\n", __FILE__, __LINE__);
-		dev_kfree_skb(skb);
-		return NETDEV_TX_OK;
-	}
-
-	/* For the other encaps the header has already been built */
-#ifdef CONFIG_ISDN_PPP
-	if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) {
-		return isdn_ppp_xmit(skb, ndev);
-	}
-#endif
-	nd = ((isdn_net_local *) netdev_priv(ndev))->netdev;
-	lp = isdn_net_get_locked_lp(nd);
-	if (!lp) {
-		printk(KERN_WARNING "%s: all channels busy - requeuing!\n", ndev->name);
-		return NETDEV_TX_BUSY;
-	}
-	/* we have our lp locked from now on */
-
-	/* Reset hangup-timeout */
-	lp->huptimer = 0; // FIXME?
-	isdn_net_writebuf_skb(lp, skb);
-	spin_unlock_bh(&lp->xmit_lock);
-
-	/* the following stuff is here for backwards compatibility.
-	 * in future, start-up and hangup of slaves (based on current load)
-	 * should move to userspace and get based on an overall cps
-	 * calculation
-	 */
-	if (lp->cps > lp->triggercps) {
-		if (lp->slave) {
-			if (!lp->sqfull) {
-				/* First time overload: set timestamp only */
-				lp->sqfull = 1;
-				lp->sqfull_stamp = jiffies;
-			} else {
-				/* subsequent overload: if slavedelay exceeded, start dialing */
-				if (time_after(jiffies, lp->sqfull_stamp + lp->slavedelay)) {
-					slp = ISDN_SLAVE_PRIV(lp);
-					if (!(slp->flags & ISDN_NET_CONNECTED)) {
-						isdn_net_force_dial_lp(ISDN_SLAVE_PRIV(lp));
-					}
-				}
-			}
-		}
-	} else {
-		if (lp->sqfull && time_after(jiffies, lp->sqfull_stamp + lp->slavedelay + (10 * HZ))) {
-			lp->sqfull = 0;
-		}
-		/* this is a hack to allow auto-hangup for slaves on moderate loads */
-		nd->queue = nd->local;
-	}
-
-	return retv;
-
-}
-
-static void
-isdn_net_adjust_hdr(struct sk_buff *skb, struct net_device *dev)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-	if (!skb)
-		return;
-	if (lp->p_encap == ISDN_NET_ENCAP_ETHER) {
-		const int pullsize = skb_network_offset(skb) - ETH_HLEN;
-		if (pullsize > 0) {
-			printk(KERN_DEBUG "isdn_net: Pull junk %d\n", pullsize);
-			skb_pull(skb, pullsize);
-		}
-	}
-}
-
-
-static void isdn_net_tx_timeout(struct net_device *ndev)
-{
-	isdn_net_local *lp = netdev_priv(ndev);
-
-	printk(KERN_WARNING "isdn_tx_timeout dev %s dialstate %d\n", ndev->name, lp->dialstate);
-	if (!lp->dialstate) {
-		lp->stats.tx_errors++;
-		/*
-		 * There is a certain probability that this currently
-		 * works at all because if we always wake up the interface,
-		 * then upper layer will try to send the next packet
-		 * immediately. And then, the old clean_up logic in the
-		 * driver will hopefully continue to work as it used to do.
-		 *
-		 * This is rather primitive right know, we better should
-		 * clean internal queues here, in particular for multilink and
-		 * ppp, and reset HL driver's channel, too.   --HE
-		 *
-		 * actually, this may not matter at all, because ISDN hardware
-		 * should not see transmitter hangs at all IMO
-		 * changed KERN_DEBUG to KERN_WARNING to find out if this is
-		 * ever called   --KG
-		 */
-	}
-	netif_trans_update(ndev);
-	netif_wake_queue(ndev);
-}
-
-/*
- * Try sending a packet.
- * If this interface isn't connected to a ISDN-Channel, find a free channel,
- * and start dialing.
- */
-static netdev_tx_t
-isdn_net_start_xmit(struct sk_buff *skb, struct net_device *ndev)
-{
-	isdn_net_local *lp = netdev_priv(ndev);
-#ifdef CONFIG_ISDN_X25
-	struct concap_proto *cprot = lp->netdev->cprot;
-/* At this point hard_start_xmit() passes control to the encapsulation
-   protocol (if present).
-   For X.25 auto-dialing is completly bypassed because:
-   - It does not conform with the semantics of a reliable datalink
-   service as needed by X.25 PLP.
-   - I don't want that the interface starts dialing when the network layer
-   sends a message which requests to disconnect the lapb link (or if it
-   sends any other message not resulting in data transmission).
-   Instead, dialing will be initiated by the encapsulation protocol entity
-   when a dl_establish request is received from the upper layer.
-*/
-	if (cprot && cprot->pops) {
-		int ret = cprot->pops->encap_and_xmit(cprot, skb);
-
-		if (ret)
-			netif_stop_queue(ndev);
-		return ret;
-	} else
-#endif
-		/* auto-dialing xmit function */
-	{
-#ifdef ISDN_DEBUG_NET_DUMP
-		u_char *buf;
-#endif
-		isdn_net_adjust_hdr(skb, ndev);
-#ifdef ISDN_DEBUG_NET_DUMP
-		buf = skb->data;
-		isdn_dumppkt("S:", buf, skb->len, 40);
-#endif
-
-		if (!(lp->flags & ISDN_NET_CONNECTED)) {
-			int chi;
-			/* only do autodial if allowed by config */
-			if (!(ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_AUTO)) {
-				isdn_net_unreachable(ndev, skb, "dial rejected: interface not in dialmode `auto'");
-				dev_kfree_skb(skb);
-				return NETDEV_TX_OK;
-			}
-			if (lp->phone[1]) {
-				ulong flags;
-
-				if (lp->dialwait_timer <= 0)
-					if (lp->dialstarted > 0 && lp->dialtimeout > 0 && time_before(jiffies, lp->dialstarted + lp->dialtimeout + lp->dialwait))
-						lp->dialwait_timer = lp->dialstarted + lp->dialtimeout + lp->dialwait;
-
-				if (lp->dialwait_timer > 0) {
-					if (time_before(jiffies, lp->dialwait_timer)) {
-						isdn_net_unreachable(ndev, skb, "dial rejected: retry-time not reached");
-						dev_kfree_skb(skb);
-						return NETDEV_TX_OK;
-					} else
-						lp->dialwait_timer = 0;
-				}
-				/* Grab a free ISDN-Channel */
-				spin_lock_irqsave(&dev->lock, flags);
-				if (((chi =
-				      isdn_get_free_channel(
-					      ISDN_USAGE_NET,
-					      lp->l2_proto,
-					      lp->l3_proto,
-					      lp->pre_device,
-					      lp->pre_channel,
-					      lp->msn)
-					     ) < 0) &&
-				    ((chi =
-				      isdn_get_free_channel(
-					      ISDN_USAGE_NET,
-					      lp->l2_proto,
-					      lp->l3_proto,
-					      lp->pre_device,
-					      lp->pre_channel^1,
-					      lp->msn)
-					    ) < 0)) {
-					spin_unlock_irqrestore(&dev->lock, flags);
-					isdn_net_unreachable(ndev, skb,
-							     "No channel");
-					dev_kfree_skb(skb);
-					return NETDEV_TX_OK;
-				}
-				/* Log packet, which triggered dialing */
-				if (dev->net_verbose)
-					isdn_net_log_skb(skb, lp);
-				lp->dialstate = 1;
-				/* Connect interface with channel */
-				isdn_net_bind_channel(lp, chi);
-#ifdef CONFIG_ISDN_PPP
-				if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) {
-					/* no 'first_skb' handling for syncPPP */
-					if (isdn_ppp_bind(lp) < 0) {
-						dev_kfree_skb(skb);
-						isdn_net_unbind_channel(lp);
-						spin_unlock_irqrestore(&dev->lock, flags);
-						return NETDEV_TX_OK;	/* STN (skb to nirvana) ;) */
-					}
-#ifdef CONFIG_IPPP_FILTER
-					if (isdn_ppp_autodial_filter(skb, lp)) {
-						isdn_ppp_free(lp);
-						isdn_net_unbind_channel(lp);
-						spin_unlock_irqrestore(&dev->lock, flags);
-						isdn_net_unreachable(ndev, skb, "dial rejected: packet filtered");
-						dev_kfree_skb(skb);
-						return NETDEV_TX_OK;
-					}
-#endif
-					spin_unlock_irqrestore(&dev->lock, flags);
-					isdn_net_dial();	/* Initiate dialing */
-					netif_stop_queue(ndev);
-					return NETDEV_TX_BUSY;	/* let upper layer requeue skb packet */
-				}
-#endif
-				/* Initiate dialing */
-				spin_unlock_irqrestore(&dev->lock, flags);
-				isdn_net_dial();
-				isdn_net_device_stop_queue(lp);
-				return NETDEV_TX_BUSY;
-			} else {
-				isdn_net_unreachable(ndev, skb,
-						     "No phone number");
-				dev_kfree_skb(skb);
-				return NETDEV_TX_OK;
-			}
-		} else {
-			/* Device is connected to an ISDN channel */
-			netif_trans_update(ndev);
-			if (!lp->dialstate) {
-				/* ISDN connection is established, try sending */
-				int ret;
-				ret = (isdn_net_xmit(ndev, skb));
-				if (ret) netif_stop_queue(ndev);
-				return ret;
-			} else
-				netif_stop_queue(ndev);
-		}
-	}
-	return NETDEV_TX_BUSY;
-}
-
-/*
- * Shutdown a net-interface.
- */
-static int
-isdn_net_close(struct net_device *dev)
-{
-	struct net_device *p;
-#ifdef CONFIG_ISDN_X25
-	struct concap_proto *cprot =
-		((isdn_net_local *)netdev_priv(dev))->netdev->cprot;
-	/* printk(KERN_DEBUG "isdn_net_close %s\n" , dev-> name); */
-#endif
-
-#ifdef CONFIG_ISDN_X25
-	if (cprot && cprot->pops) cprot->pops->close(cprot);
-#endif
-	netif_stop_queue(dev);
-	p = MASTER_TO_SLAVE(dev);
-	if (p) {
-		/* If this interface has slaves, stop them also */
-		while (p) {
-#ifdef CONFIG_ISDN_X25
-			cprot = ((isdn_net_local *)netdev_priv(p))
-				->netdev->cprot;
-			if (cprot && cprot->pops)
-				cprot->pops->close(cprot);
-#endif
-			isdn_net_hangup(p);
-			p = MASTER_TO_SLAVE(p);
-		}
-	}
-	isdn_net_hangup(dev);
-	isdn_unlock_drivers();
-	return 0;
-}
-
-/*
- * Get statistics
- */
-static struct net_device_stats *
-isdn_net_get_stats(struct net_device *dev)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-	return &lp->stats;
-}
-
-/*      This is simply a copy from std. eth.c EXCEPT we pull ETH_HLEN
- *      instead of dev->hard_header_len off. This is done because the
- *      lowlevel-driver has already pulled off its stuff when we get
- *      here and this routine only gets called with p_encap == ETHER.
- *      Determine the packet's protocol ID. The rule here is that we
- *      assume 802.3 if the type field is short enough to be a length.
- *      This is normal practice and works for any 'now in use' protocol.
- */
-
-static __be16
-isdn_net_type_trans(struct sk_buff *skb, struct net_device *dev)
-{
-	struct ethhdr *eth;
-	unsigned char *rawp;
-
-	skb_reset_mac_header(skb);
-	skb_pull(skb, ETH_HLEN);
-	eth = eth_hdr(skb);
-
-	if (*eth->h_dest & 1) {
-		if (ether_addr_equal(eth->h_dest, dev->broadcast))
-			skb->pkt_type = PACKET_BROADCAST;
-		else
-			skb->pkt_type = PACKET_MULTICAST;
-	}
-	/*
-	 *      This ALLMULTI check should be redundant by 1.4
-	 *      so don't forget to remove it.
-	 */
-
-	else if (dev->flags & (IFF_PROMISC /*| IFF_ALLMULTI*/)) {
-		if (!ether_addr_equal(eth->h_dest, dev->dev_addr))
-			skb->pkt_type = PACKET_OTHERHOST;
-	}
-	if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN)
-		return eth->h_proto;
-
-	rawp = skb->data;
-
-	/*
-	 *      This is a magic hack to spot IPX packets. Older Novell breaks
-	 *      the protocol design and runs IPX over 802.3 without an 802.2 LLC
-	 *      layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
-	 *      won't work for fault tolerant netware but does for the rest.
-	 */
-	if (*(unsigned short *) rawp == 0xFFFF)
-		return htons(ETH_P_802_3);
-	/*
-	 *      Real 802.2 LLC
-	 */
-	return htons(ETH_P_802_2);
-}
-
-
-/*
- * CISCO HDLC keepalive specific stuff
- */
-static struct sk_buff*
-isdn_net_ciscohdlck_alloc_skb(isdn_net_local *lp, int len)
-{
-	unsigned short hl = dev->drv[lp->isdn_device]->interface->hl_hdrlen;
-	struct sk_buff *skb;
-
-	skb = alloc_skb(hl + len, GFP_ATOMIC);
-	if (skb)
-		skb_reserve(skb, hl);
-	else
-		printk("isdn out of mem at %s:%d!\n", __FILE__, __LINE__);
-	return skb;
-}
-
-/* cisco hdlck device private ioctls */
-static int
-isdn_ciscohdlck_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-	unsigned long len = 0;
-	unsigned long expires = 0;
-	int tmp = 0;
-	int period = lp->cisco_keepalive_period;
-	s8 debserint = lp->cisco_debserint;
-	int rc = 0;
-
-	if (lp->p_encap != ISDN_NET_ENCAP_CISCOHDLCK)
-		return -EINVAL;
-
-	switch (cmd) {
-		/* get/set keepalive period */
-	case SIOCGKEEPPERIOD:
-		len = (unsigned long)sizeof(lp->cisco_keepalive_period);
-		if (copy_to_user(ifr->ifr_data,
-				 &lp->cisco_keepalive_period, len))
-			rc = -EFAULT;
-		break;
-	case SIOCSKEEPPERIOD:
-		tmp = lp->cisco_keepalive_period;
-		len = (unsigned long)sizeof(lp->cisco_keepalive_period);
-		if (copy_from_user(&period, ifr->ifr_data, len))
-			rc = -EFAULT;
-		if ((period > 0) && (period <= 32767))
-			lp->cisco_keepalive_period = period;
-		else
-			rc = -EINVAL;
-		if (!rc && (tmp != lp->cisco_keepalive_period)) {
-			expires = (unsigned long)(jiffies +
-						  lp->cisco_keepalive_period * HZ);
-			mod_timer(&lp->cisco_timer, expires);
-			printk(KERN_INFO "%s: Keepalive period set "
-			       "to %d seconds.\n",
-			       dev->name, lp->cisco_keepalive_period);
-		}
-		break;
-
-		/* get/set debugging */
-	case SIOCGDEBSERINT:
-		len = (unsigned long)sizeof(lp->cisco_debserint);
-		if (copy_to_user(ifr->ifr_data,
-				 &lp->cisco_debserint, len))
-			rc = -EFAULT;
-		break;
-	case SIOCSDEBSERINT:
-		len = (unsigned long)sizeof(lp->cisco_debserint);
-		if (copy_from_user(&debserint,
-				   ifr->ifr_data, len))
-			rc = -EFAULT;
-		if ((debserint >= 0) && (debserint <= 64))
-			lp->cisco_debserint = debserint;
-		else
-			rc = -EINVAL;
-		break;
-
-	default:
-		rc = -EINVAL;
-		break;
-	}
-	return (rc);
-}
-
-
-static int isdn_net_ioctl(struct net_device *dev,
-			  struct ifreq *ifr, int cmd)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-
-	switch (lp->p_encap) {
-#ifdef CONFIG_ISDN_PPP
-	case ISDN_NET_ENCAP_SYNCPPP:
-		return isdn_ppp_dev_ioctl(dev, ifr, cmd);
-#endif
-	case ISDN_NET_ENCAP_CISCOHDLCK:
-		return isdn_ciscohdlck_dev_ioctl(dev, ifr, cmd);
-	default:
-		return -EINVAL;
-	}
-}
-
-/* called via cisco_timer.function */
-static void
-isdn_net_ciscohdlck_slarp_send_keepalive(struct timer_list *t)
-{
-	isdn_net_local *lp = from_timer(lp, t, cisco_timer);
-	struct sk_buff *skb;
-	unsigned char *p;
-	unsigned long last_cisco_myseq = lp->cisco_myseq;
-	int myseq_diff = 0;
-
-	if (!(lp->flags & ISDN_NET_CONNECTED) || lp->dialstate) {
-		printk("isdn BUG at %s:%d!\n", __FILE__, __LINE__);
-		return;
-	}
-	lp->cisco_myseq++;
-
-	myseq_diff = (lp->cisco_myseq - lp->cisco_mineseen);
-	if ((lp->cisco_line_state) && ((myseq_diff >= 3) || (myseq_diff <= -3))) {
-		/* line up -> down */
-		lp->cisco_line_state = 0;
-		printk(KERN_WARNING
-		       "UPDOWN: Line protocol on Interface %s,"
-		       " changed state to down\n", lp->netdev->dev->name);
-		/* should stop routing higher-level data across */
-	} else if ((!lp->cisco_line_state) &&
-		   (myseq_diff >= 0) && (myseq_diff <= 2)) {
-		/* line down -> up */
-		lp->cisco_line_state = 1;
-		printk(KERN_WARNING
-		       "UPDOWN: Line protocol on Interface %s,"
-		       " changed state to up\n", lp->netdev->dev->name);
-		/* restart routing higher-level data across */
-	}
-
-	if (lp->cisco_debserint)
-		printk(KERN_DEBUG "%s: HDLC "
-		       "myseq %lu, mineseen %lu%c, yourseen %lu, %s\n",
-		       lp->netdev->dev->name, last_cisco_myseq, lp->cisco_mineseen,
-		       ((last_cisco_myseq == lp->cisco_mineseen) ? '*' : 040),
-		       lp->cisco_yourseq,
-		       ((lp->cisco_line_state) ? "line up" : "line down"));
-
-	skb = isdn_net_ciscohdlck_alloc_skb(lp, 4 + 14);
-	if (!skb)
-		return;
-
-	p = skb_put(skb, 4 + 14);
-
-	/* cisco header */
-	*(u8 *)(p + 0) = CISCO_ADDR_UNICAST;
-	*(u8 *)(p + 1) = CISCO_CTRL;
-	*(__be16 *)(p + 2) = cpu_to_be16(CISCO_TYPE_SLARP);
-
-	/* slarp keepalive */
-	*(__be32 *)(p +  4) = cpu_to_be32(CISCO_SLARP_KEEPALIVE);
-	*(__be32 *)(p +  8) = cpu_to_be32(lp->cisco_myseq);
-	*(__be32 *)(p + 12) = cpu_to_be32(lp->cisco_yourseq);
-	*(__be16 *)(p + 16) = cpu_to_be16(0xffff); // reliability, always 0xffff
-	p += 18;
-
-	isdn_net_write_super(lp, skb);
-
-	lp->cisco_timer.expires = jiffies + lp->cisco_keepalive_period * HZ;
-
-	add_timer(&lp->cisco_timer);
-}
-
-static void
-isdn_net_ciscohdlck_slarp_send_request(isdn_net_local *lp)
-{
-	struct sk_buff *skb;
-	unsigned char *p;
-
-	skb = isdn_net_ciscohdlck_alloc_skb(lp, 4 + 14);
-	if (!skb)
-		return;
-
-	p = skb_put(skb, 4 + 14);
-
-	/* cisco header */
-	*(u8 *)(p + 0) = CISCO_ADDR_UNICAST;
-	*(u8 *)(p + 1) = CISCO_CTRL;
-	*(__be16 *)(p + 2) = cpu_to_be16(CISCO_TYPE_SLARP);
-
-	/* slarp request */
-	*(__be32 *)(p +  4) = cpu_to_be32(CISCO_SLARP_REQUEST);
-	*(__be32 *)(p +  8) = cpu_to_be32(0); // address
-	*(__be32 *)(p + 12) = cpu_to_be32(0); // netmask
-	*(__be16 *)(p + 16) = cpu_to_be16(0); // unused
-	p += 18;
-
-	isdn_net_write_super(lp, skb);
-}
-
-static void
-isdn_net_ciscohdlck_connected(isdn_net_local *lp)
-{
-	lp->cisco_myseq = 0;
-	lp->cisco_mineseen = 0;
-	lp->cisco_yourseq = 0;
-	lp->cisco_keepalive_period = ISDN_TIMER_KEEPINT;
-	lp->cisco_last_slarp_in = 0;
-	lp->cisco_line_state = 0;
-	lp->cisco_debserint = 0;
-
-	/* send slarp request because interface/seq.no.s reset */
-	isdn_net_ciscohdlck_slarp_send_request(lp);
-
-	timer_setup(&lp->cisco_timer,
-		    isdn_net_ciscohdlck_slarp_send_keepalive, 0);
-	lp->cisco_timer.expires = jiffies + lp->cisco_keepalive_period * HZ;
-	add_timer(&lp->cisco_timer);
-}
-
-static void
-isdn_net_ciscohdlck_disconnected(isdn_net_local *lp)
-{
-	del_timer(&lp->cisco_timer);
-}
-
-static void
-isdn_net_ciscohdlck_slarp_send_reply(isdn_net_local *lp)
-{
-	struct sk_buff *skb;
-	unsigned char *p;
-	struct in_device *in_dev = NULL;
-	__be32 addr = 0;		/* local ipv4 address */
-	__be32 mask = 0;		/* local netmask */
-
-	if ((in_dev = lp->netdev->dev->ip_ptr) != NULL) {
-		/* take primary(first) address of interface */
-		struct in_ifaddr *ifa = in_dev->ifa_list;
-		if (ifa != NULL) {
-			addr = ifa->ifa_local;
-			mask = ifa->ifa_mask;
-		}
-	}
-
-	skb = isdn_net_ciscohdlck_alloc_skb(lp, 4 + 14);
-	if (!skb)
-		return;
-
-	p = skb_put(skb, 4 + 14);
-
-	/* cisco header */
-	*(u8 *)(p + 0) = CISCO_ADDR_UNICAST;
-	*(u8 *)(p + 1) = CISCO_CTRL;
-	*(__be16 *)(p + 2) = cpu_to_be16(CISCO_TYPE_SLARP);
-
-	/* slarp reply, send own ip/netmask; if values are nonsense remote
-	 * should think we are unable to provide it with an address via SLARP */
-	*(__be32 *)(p +  4) = cpu_to_be32(CISCO_SLARP_REPLY);
-	*(__be32 *)(p +  8) = addr; // address
-	*(__be32 *)(p + 12) = mask; // netmask
-	*(__be16 *)(p + 16) = cpu_to_be16(0); // unused
-	p += 18;
-
-	isdn_net_write_super(lp, skb);
-}
-
-static void
-isdn_net_ciscohdlck_slarp_in(isdn_net_local *lp, struct sk_buff *skb)
-{
-	unsigned char *p;
-	int period;
-	u32 code;
-	u32 my_seq;
-	u32 your_seq;
-	__be32 local;
-	__be32 *addr, *mask;
-
-	if (skb->len < 14)
-		return;
-
-	p = skb->data;
-	code = be32_to_cpup((__be32 *)p);
-	p += 4;
-
-	switch (code) {
-	case CISCO_SLARP_REQUEST:
-		lp->cisco_yourseq = 0;
-		isdn_net_ciscohdlck_slarp_send_reply(lp);
-		break;
-	case CISCO_SLARP_REPLY:
-		addr = (__be32 *)p;
-		mask = (__be32 *)(p + 4);
-		if (*mask != cpu_to_be32(0xfffffffc))
-			goto slarp_reply_out;
-		if ((*addr & cpu_to_be32(3)) == cpu_to_be32(0) ||
-		    (*addr & cpu_to_be32(3)) == cpu_to_be32(3))
-			goto slarp_reply_out;
-		local = *addr ^ cpu_to_be32(3);
-		printk(KERN_INFO "%s: got slarp reply: remote ip: %pI4, local ip: %pI4 mask: %pI4\n",
-		       lp->netdev->dev->name, addr, &local, mask);
-		break;
-	slarp_reply_out:
-		printk(KERN_INFO "%s: got invalid slarp reply (%pI4/%pI4) - ignored\n",
-		       lp->netdev->dev->name, addr, mask);
-		break;
-	case CISCO_SLARP_KEEPALIVE:
-		period = (int)((jiffies - lp->cisco_last_slarp_in
-				+ HZ / 2 - 1) / HZ);
-		if (lp->cisco_debserint &&
-		    (period != lp->cisco_keepalive_period) &&
-		    lp->cisco_last_slarp_in) {
-			printk(KERN_DEBUG "%s: Keepalive period mismatch - "
-			       "is %d but should be %d.\n",
-			       lp->netdev->dev->name, period,
-			       lp->cisco_keepalive_period);
-		}
-		lp->cisco_last_slarp_in = jiffies;
-		my_seq = be32_to_cpup((__be32 *)(p + 0));
-		your_seq = be32_to_cpup((__be32 *)(p + 4));
-		p += 10;
-		lp->cisco_yourseq = my_seq;
-		lp->cisco_mineseen = your_seq;
-		break;
-	}
-}
-
-static void
-isdn_net_ciscohdlck_receive(isdn_net_local *lp, struct sk_buff *skb)
-{
-	unsigned char *p;
-	u8 addr;
-	u8 ctrl;
-	u16 type;
-
-	if (skb->len < 4)
-		goto out_free;
-
-	p = skb->data;
-	addr = *(u8 *)(p + 0);
-	ctrl = *(u8 *)(p + 1);
-	type = be16_to_cpup((__be16 *)(p + 2));
-	p += 4;
-	skb_pull(skb, 4);
-
-	if (addr != CISCO_ADDR_UNICAST && addr != CISCO_ADDR_BROADCAST) {
-		printk(KERN_WARNING "%s: Unknown Cisco addr 0x%02x\n",
-		       lp->netdev->dev->name, addr);
-		goto out_free;
-	}
-	if (ctrl != CISCO_CTRL) {
-		printk(KERN_WARNING "%s: Unknown Cisco ctrl 0x%02x\n",
-		       lp->netdev->dev->name, ctrl);
-		goto out_free;
-	}
-
-	switch (type) {
-	case CISCO_TYPE_SLARP:
-		isdn_net_ciscohdlck_slarp_in(lp, skb);
-		goto out_free;
-	case CISCO_TYPE_CDP:
-		if (lp->cisco_debserint)
-			printk(KERN_DEBUG "%s: Received CDP packet. use "
-			       "\"no cdp enable\" on cisco.\n",
-			       lp->netdev->dev->name);
-		goto out_free;
-	default:
-		/* no special cisco protocol */
-		skb->protocol = htons(type);
-		netif_rx(skb);
-		return;
-	}
-
-out_free:
-	kfree_skb(skb);
-}
-
-/*
- * Got a packet from ISDN-Channel.
- */
-static void
-isdn_net_receive(struct net_device *ndev, struct sk_buff *skb)
-{
-	isdn_net_local *lp = netdev_priv(ndev);
-	isdn_net_local *olp = lp;	/* original 'lp' */
-#ifdef CONFIG_ISDN_X25
-	struct concap_proto *cprot = lp->netdev->cprot;
-#endif
-	lp->transcount += skb->len;
-
-	lp->stats.rx_packets++;
-	lp->stats.rx_bytes += skb->len;
-	if (lp->master) {
-		/* Bundling: If device is a slave-device, deliver to master, also
-		 * handle master's statistics and hangup-timeout
-		 */
-		ndev = lp->master;
-		lp = netdev_priv(ndev);
-		lp->stats.rx_packets++;
-		lp->stats.rx_bytes += skb->len;
-	}
-	skb->dev = ndev;
-	skb->pkt_type = PACKET_HOST;
-	skb_reset_mac_header(skb);
-#ifdef ISDN_DEBUG_NET_DUMP
-	isdn_dumppkt("R:", skb->data, skb->len, 40);
-#endif
-	switch (lp->p_encap) {
-	case ISDN_NET_ENCAP_ETHER:
-		/* Ethernet over ISDN */
-		olp->huptimer = 0;
-		lp->huptimer = 0;
-		skb->protocol = isdn_net_type_trans(skb, ndev);
-		break;
-	case ISDN_NET_ENCAP_UIHDLC:
-		/* HDLC with UI-frame (for ispa with -h1 option) */
-		olp->huptimer = 0;
-		lp->huptimer = 0;
-		skb_pull(skb, 2);
-		/* Fall through */
-	case ISDN_NET_ENCAP_RAWIP:
-		/* RAW-IP without MAC-Header */
-		olp->huptimer = 0;
-		lp->huptimer = 0;
-		skb->protocol = htons(ETH_P_IP);
-		break;
-	case ISDN_NET_ENCAP_CISCOHDLCK:
-		isdn_net_ciscohdlck_receive(lp, skb);
-		return;
-	case ISDN_NET_ENCAP_CISCOHDLC:
-		/* CISCO-HDLC IP with type field and  fake I-frame-header */
-		skb_pull(skb, 2);
-		/* Fall through */
-	case ISDN_NET_ENCAP_IPTYP:
-		/* IP with type field */
-		olp->huptimer = 0;
-		lp->huptimer = 0;
-		skb->protocol = *(__be16 *)&(skb->data[0]);
-		skb_pull(skb, 2);
-		if (*(unsigned short *) skb->data == 0xFFFF)
-			skb->protocol = htons(ETH_P_802_3);
-		break;
-#ifdef CONFIG_ISDN_PPP
-	case ISDN_NET_ENCAP_SYNCPPP:
-		/* huptimer is done in isdn_ppp_push_higher */
-		isdn_ppp_receive(lp->netdev, olp, skb);
-		return;
-#endif
-
-	default:
-#ifdef CONFIG_ISDN_X25
-		/* try if there are generic sync_device receiver routines */
-		if (cprot) if (cprot->pops)
-				   if (cprot->pops->data_ind) {
-					   cprot->pops->data_ind(cprot, skb);
-					   return;
-				   };
-#endif /* CONFIG_ISDN_X25 */
-		printk(KERN_WARNING "%s: unknown encapsulation, dropping\n",
-		       lp->netdev->dev->name);
-		kfree_skb(skb);
-		return;
-	}
-
-	netif_rx(skb);
-	return;
-}
-
-/*
- * A packet arrived via ISDN. Search interface-chain for a corresponding
- * interface. If found, deliver packet to receiver-function and return 1,
- * else return 0.
- */
-int
-isdn_net_rcv_skb(int idx, struct sk_buff *skb)
-{
-	isdn_net_dev *p = dev->rx_netdev[idx];
-
-	if (p) {
-		isdn_net_local *lp = p->local;
-		if ((lp->flags & ISDN_NET_CONNECTED) &&
-		    (!lp->dialstate)) {
-			isdn_net_receive(p->dev, skb);
-			return 1;
-		}
-	}
-	return 0;
-}
-
-/*
- *  build an header
- *  depends on encaps that is being used.
- */
-
-static int isdn_net_header(struct sk_buff *skb, struct net_device *dev,
-			   unsigned short type,
-			   const void *daddr, const void *saddr, unsigned plen)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-	unsigned char *p;
-	int len = 0;
-
-	switch (lp->p_encap) {
-	case ISDN_NET_ENCAP_ETHER:
-		len = eth_header(skb, dev, type, daddr, saddr, plen);
-		break;
-#ifdef CONFIG_ISDN_PPP
-	case ISDN_NET_ENCAP_SYNCPPP:
-		/* stick on a fake header to keep fragmentation code happy. */
-		len = IPPP_MAX_HEADER;
-		skb_push(skb, len);
-		break;
-#endif
-	case ISDN_NET_ENCAP_RAWIP:
-		printk(KERN_WARNING "isdn_net_header called with RAW_IP!\n");
-		len = 0;
-		break;
-	case ISDN_NET_ENCAP_IPTYP:
-		/* ethernet type field */
-		*((__be16 *)skb_push(skb, 2)) = htons(type);
-		len = 2;
-		break;
-	case ISDN_NET_ENCAP_UIHDLC:
-		/* HDLC with UI-Frames (for ispa with -h1 option) */
-		*((__be16 *)skb_push(skb, 2)) = htons(0x0103);
-		len = 2;
-		break;
-	case ISDN_NET_ENCAP_CISCOHDLC:
-	case ISDN_NET_ENCAP_CISCOHDLCK:
-		p = skb_push(skb, 4);
-		*(u8 *)(p + 0) = CISCO_ADDR_UNICAST;
-		*(u8 *)(p + 1) = CISCO_CTRL;
-		*(__be16 *)(p + 2) = cpu_to_be16(type);
-		p += 4;
-		len = 4;
-		break;
-#ifdef CONFIG_ISDN_X25
-	default:
-		/* try if there are generic concap protocol routines */
-		if (lp->netdev->cprot) {
-			printk(KERN_WARNING "isdn_net_header called with concap_proto!\n");
-			len = 0;
-			break;
-		}
-		break;
-#endif /* CONFIG_ISDN_X25 */
-	}
-	return len;
-}
-
-static int isdn_header_cache(const struct neighbour *neigh, struct hh_cache *hh,
-			     __be16 type)
-{
-	const struct net_device *dev = neigh->dev;
-	isdn_net_local *lp = netdev_priv(dev);
-
-	if (lp->p_encap == ISDN_NET_ENCAP_ETHER)
-		return eth_header_cache(neigh, hh, type);
-	return -1;
-}
-
-static void isdn_header_cache_update(struct hh_cache *hh,
-				     const struct net_device *dev,
-				     const unsigned char *haddr)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-	if (lp->p_encap == ISDN_NET_ENCAP_ETHER)
-		eth_header_cache_update(hh, dev, haddr);
-}
-
-static const struct header_ops isdn_header_ops = {
-	.create = isdn_net_header,
-	.cache = isdn_header_cache,
-	.cache_update = isdn_header_cache_update,
-};
-
-/*
- * Interface-setup. (just after registering a new interface)
- */
-static int
-isdn_net_init(struct net_device *ndev)
-{
-	ushort max_hlhdr_len = 0;
-	int drvidx;
-
-	/*
-	 *  up till binding we ask the protocol layer to reserve as much
-	 *  as we might need for HL layer
-	 */
-
-	for (drvidx = 0; drvidx < ISDN_MAX_DRIVERS; drvidx++)
-		if (dev->drv[drvidx])
-			if (max_hlhdr_len < dev->drv[drvidx]->interface->hl_hdrlen)
-				max_hlhdr_len = dev->drv[drvidx]->interface->hl_hdrlen;
-
-	ndev->hard_header_len = ETH_HLEN + max_hlhdr_len;
-	return 0;
-}
-
-static void
-isdn_net_swapbind(int drvidx)
-{
-	isdn_net_dev *p;
-
-#ifdef ISDN_DEBUG_NET_ICALL
-	printk(KERN_DEBUG "n_fi: swapping ch of %d\n", drvidx);
-#endif
-	p = dev->netdev;
-	while (p) {
-		if (p->local->pre_device == drvidx)
-			switch (p->local->pre_channel) {
-			case 0:
-				p->local->pre_channel = 1;
-				break;
-			case 1:
-				p->local->pre_channel = 0;
-				break;
-			}
-		p = (isdn_net_dev *) p->next;
-	}
-}
-
-static void
-isdn_net_swap_usage(int i1, int i2)
-{
-	int u1 = dev->usage[i1] & ISDN_USAGE_EXCLUSIVE;
-	int u2 = dev->usage[i2] & ISDN_USAGE_EXCLUSIVE;
-
-#ifdef ISDN_DEBUG_NET_ICALL
-	printk(KERN_DEBUG "n_fi: usage of %d and %d\n", i1, i2);
-#endif
-	dev->usage[i1] &= ~ISDN_USAGE_EXCLUSIVE;
-	dev->usage[i1] |= u2;
-	dev->usage[i2] &= ~ISDN_USAGE_EXCLUSIVE;
-	dev->usage[i2] |= u1;
-	isdn_info_update();
-}
-
-/*
- * An incoming call-request has arrived.
- * Search the interface-chain for an appropriate interface.
- * If found, connect the interface to the ISDN-channel and initiate
- * D- and B-Channel-setup. If secure-flag is set, accept only
- * configured phone-numbers. If callback-flag is set, initiate
- * callback-dialing.
- *
- * Return-Value: 0 = No appropriate interface for this call.
- *               1 = Call accepted
- *               2 = Reject call, wait cbdelay, then call back
- *               3 = Reject call
- *               4 = Wait cbdelay, then call back
- *               5 = No appropriate interface for this call,
- *                   would eventually match if CID was longer.
- */
-
-int
-isdn_net_find_icall(int di, int ch, int idx, setup_parm *setup)
-{
-	char *eaz;
-	int si1;
-	int si2;
-	int ematch;
-	int wret;
-	int swapped;
-	int sidx = 0;
-	u_long flags;
-	isdn_net_dev *p;
-	isdn_net_phone *n;
-	char nr[ISDN_MSNLEN];
-	char *my_eaz;
-
-	/* Search name in netdev-chain */
-	if (!setup->phone[0]) {
-		nr[0] = '0';
-		nr[1] = '\0';
-		printk(KERN_INFO "isdn_net: Incoming call without OAD, assuming '0'\n");
-	} else
-		strlcpy(nr, setup->phone, ISDN_MSNLEN);
-	si1 = (int) setup->si1;
-	si2 = (int) setup->si2;
-	if (!setup->eazmsn[0]) {
-		printk(KERN_WARNING "isdn_net: Incoming call without CPN, assuming '0'\n");
-		eaz = "0";
-	} else
-		eaz = setup->eazmsn;
-	if (dev->net_verbose > 1)
-		printk(KERN_INFO "isdn_net: call from %s,%d,%d -> %s\n", nr, si1, si2, eaz);
-	/* Accept DATA and VOICE calls at this stage
-	 * local eaz is checked later for allowed call types
-	 */
-	if ((si1 != 7) && (si1 != 1)) {
-		if (dev->net_verbose > 1)
-			printk(KERN_INFO "isdn_net: Service-Indicator not 1 or 7, ignored\n");
-		return 0;
-	}
-	n = (isdn_net_phone *) 0;
-	p = dev->netdev;
-	ematch = wret = swapped = 0;
-#ifdef ISDN_DEBUG_NET_ICALL
-	printk(KERN_DEBUG "n_fi: di=%d ch=%d idx=%d usg=%d\n", di, ch, idx,
-	       dev->usage[idx]);
-#endif
-	while (p) {
-		int matchret;
-		isdn_net_local *lp = p->local;
-
-		/* If last check has triggered as binding-swap, revert it */
-		switch (swapped) {
-		case 2:
-			isdn_net_swap_usage(idx, sidx);
-			/* fall through */
-		case 1:
-			isdn_net_swapbind(di);
-			break;
-		}
-		swapped = 0;
-		/* check acceptable call types for DOV */
-		my_eaz = isdn_map_eaz2msn(lp->msn, di);
-		if (si1 == 1) { /* it's a DOV call, check if we allow it */
-			if (*my_eaz == 'v' || *my_eaz == 'V' ||
-			    *my_eaz == 'b' || *my_eaz == 'B')
-				my_eaz++; /* skip to allow a match */
-			else
-				my_eaz = NULL; /* force non match */
-		} else { /* it's a DATA call, check if we allow it */
-			if (*my_eaz == 'b' || *my_eaz == 'B')
-				my_eaz++; /* skip to allow a match */
-		}
-		if (my_eaz)
-			matchret = isdn_msncmp(eaz, my_eaz);
-		else
-			matchret = 1;
-		if (!matchret)
-			ematch = 1;
-
-		/* Remember if more numbers eventually can match */
-		if (matchret > wret)
-			wret = matchret;
-#ifdef ISDN_DEBUG_NET_ICALL
-		printk(KERN_DEBUG "n_fi: if='%s', l.msn=%s, l.flags=%d, l.dstate=%d\n",
-		       p->dev->name, lp->msn, lp->flags, lp->dialstate);
-#endif
-		if ((!matchret) &&                                        /* EAZ is matching   */
-		    (((!(lp->flags & ISDN_NET_CONNECTED)) &&              /* but not connected */
-		      (USG_NONE(dev->usage[idx]))) ||                     /* and ch. unused or */
-		     ((((lp->dialstate == 4) || (lp->dialstate == 12)) && /* if dialing        */
-		       (!(lp->flags & ISDN_NET_CALLBACK)))                /* but no callback   */
-			     )))
-		{
-#ifdef ISDN_DEBUG_NET_ICALL
-			printk(KERN_DEBUG "n_fi: match1, pdev=%d pch=%d\n",
-			       lp->pre_device, lp->pre_channel);
-#endif
-			if (dev->usage[idx] & ISDN_USAGE_EXCLUSIVE) {
-				if ((lp->pre_channel != ch) ||
-				    (lp->pre_device != di)) {
-					/* Here we got a problem:
-					 * If using an ICN-Card, an incoming call is always signaled on
-					 * on the first channel of the card, if both channels are
-					 * down. However this channel may be bound exclusive. If the
-					 * second channel is free, this call should be accepted.
-					 * The solution is horribly but it runs, so what:
-					 * We exchange the exclusive bindings of the two channels, the
-					 * corresponding variables in the interface-structs.
-					 */
-					if (ch == 0) {
-						sidx = isdn_dc2minor(di, 1);
-#ifdef ISDN_DEBUG_NET_ICALL
-						printk(KERN_DEBUG "n_fi: ch is 0\n");
-#endif
-						if (USG_NONE(dev->usage[sidx])) {
-							/* Second Channel is free, now see if it is bound
-							 * exclusive too. */
-							if (dev->usage[sidx] & ISDN_USAGE_EXCLUSIVE) {
-#ifdef ISDN_DEBUG_NET_ICALL
-								printk(KERN_DEBUG "n_fi: 2nd channel is down and bound\n");
-#endif
-								/* Yes, swap bindings only, if the original
-								 * binding is bound to channel 1 of this driver */
-								if ((lp->pre_device == di) &&
-								    (lp->pre_channel == 1)) {
-									isdn_net_swapbind(di);
-									swapped = 1;
-								} else {
-									/* ... else iterate next device */
-									p = (isdn_net_dev *) p->next;
-									continue;
-								}
-							} else {
-#ifdef ISDN_DEBUG_NET_ICALL
-								printk(KERN_DEBUG "n_fi: 2nd channel is down and unbound\n");
-#endif
-								/* No, swap always and swap excl-usage also */
-								isdn_net_swap_usage(idx, sidx);
-								isdn_net_swapbind(di);
-								swapped = 2;
-							}
-							/* Now check for exclusive binding again */
-#ifdef ISDN_DEBUG_NET_ICALL
-							printk(KERN_DEBUG "n_fi: final check\n");
-#endif
-							if ((dev->usage[idx] & ISDN_USAGE_EXCLUSIVE) &&
-							    ((lp->pre_channel != ch) ||
-							     (lp->pre_device != di))) {
-#ifdef ISDN_DEBUG_NET_ICALL
-								printk(KERN_DEBUG "n_fi: final check failed\n");
-#endif
-								p = (isdn_net_dev *) p->next;
-								continue;
-							}
-						}
-					} else {
-						/* We are already on the second channel, so nothing to do */
-#ifdef ISDN_DEBUG_NET_ICALL
-						printk(KERN_DEBUG "n_fi: already on 2nd channel\n");
-#endif
-					}
-				}
-			}
-#ifdef ISDN_DEBUG_NET_ICALL
-			printk(KERN_DEBUG "n_fi: match2\n");
-#endif
-			n = lp->phone[0];
-			if (lp->flags & ISDN_NET_SECURE) {
-				while (n) {
-					if (!isdn_msncmp(nr, n->num))
-						break;
-					n = (isdn_net_phone *) n->next;
-				}
-			}
-			if (n || (!(lp->flags & ISDN_NET_SECURE))) {
-#ifdef ISDN_DEBUG_NET_ICALL
-				printk(KERN_DEBUG "n_fi: match3\n");
-#endif
-				/* matching interface found */
-
-				/*
-				 * Is the state STOPPED?
-				 * If so, no dialin is allowed,
-				 * so reject actively.
-				 * */
-				if (ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_OFF) {
-					printk(KERN_INFO "incoming call, interface %s `stopped' -> rejected\n",
-					       p->dev->name);
-					return 3;
-				}
-				/*
-				 * Is the interface up?
-				 * If not, reject the call actively.
-				 */
-				if (!isdn_net_device_started(p)) {
-					printk(KERN_INFO "%s: incoming call, interface down -> rejected\n",
-					       p->dev->name);
-					return 3;
-				}
-				/* Interface is up, now see if it's a slave. If so, see if
-				 * it's master and parent slave is online. If not, reject the call.
-				 */
-				if (lp->master) {
-					isdn_net_local *mlp = ISDN_MASTER_PRIV(lp);
-					printk(KERN_DEBUG "ICALLslv: %s\n", p->dev->name);
-					printk(KERN_DEBUG "master=%s\n", lp->master->name);
-					if (mlp->flags & ISDN_NET_CONNECTED) {
-						printk(KERN_DEBUG "master online\n");
-						/* Master is online, find parent-slave (master if first slave) */
-						while (mlp->slave) {
-							if (ISDN_SLAVE_PRIV(mlp) == lp)
-								break;
-							mlp = ISDN_SLAVE_PRIV(mlp);
-						}
-					} else
-						printk(KERN_DEBUG "master offline\n");
-					/* Found parent, if it's offline iterate next device */
-					printk(KERN_DEBUG "mlpf: %d\n", mlp->flags & ISDN_NET_CONNECTED);
-					if (!(mlp->flags & ISDN_NET_CONNECTED)) {
-						p = (isdn_net_dev *) p->next;
-						continue;
-					}
-				}
-				if (lp->flags & ISDN_NET_CALLBACK) {
-					int chi;
-					/*
-					 * Is the state MANUAL?
-					 * If so, no callback can be made,
-					 * so reject actively.
-					 * */
-					if (ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_OFF) {
-						printk(KERN_INFO "incoming call for callback, interface %s `off' -> rejected\n",
-						       p->dev->name);
-						return 3;
-					}
-					printk(KERN_DEBUG "%s: call from %s -> %s, start callback\n",
-					       p->dev->name, nr, eaz);
-					if (lp->phone[1]) {
-						/* Grab a free ISDN-Channel */
-						spin_lock_irqsave(&dev->lock, flags);
-						if ((chi =
-						     isdn_get_free_channel(
-							     ISDN_USAGE_NET,
-							     lp->l2_proto,
-							     lp->l3_proto,
-							     lp->pre_device,
-							     lp->pre_channel,
-							     lp->msn)
-							    ) < 0) {
-
-							printk(KERN_WARNING "isdn_net_find_icall: No channel for %s\n",
-							       p->dev->name);
-							spin_unlock_irqrestore(&dev->lock, flags);
-							return 0;
-						}
-						/* Setup dialstate. */
-						lp->dtimer = 0;
-						lp->dialstate = 11;
-						/* Connect interface with channel */
-						isdn_net_bind_channel(lp, chi);
-#ifdef CONFIG_ISDN_PPP
-						if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP)
-							if (isdn_ppp_bind(lp) < 0) {
-								spin_unlock_irqrestore(&dev->lock, flags);
-								isdn_net_unbind_channel(lp);
-								return 0;
-							}
-#endif
-						spin_unlock_irqrestore(&dev->lock, flags);
-						/* Initiate dialing by returning 2 or 4 */
-						return (lp->flags & ISDN_NET_CBHUP) ? 2 : 4;
-					} else
-						printk(KERN_WARNING "isdn_net: %s: No phone number\n",
-						       p->dev->name);
-					return 0;
-				} else {
-					printk(KERN_DEBUG "%s: call from %s -> %s accepted\n",
-					       p->dev->name, nr, eaz);
-					/* if this interface is dialing, it does it probably on a different
-					   device, so free this device */
-					if ((lp->dialstate == 4) || (lp->dialstate == 12)) {
-#ifdef CONFIG_ISDN_PPP
-						if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP)
-							isdn_ppp_free(lp);
-#endif
-						isdn_net_lp_disconnected(lp);
-						isdn_free_channel(lp->isdn_device, lp->isdn_channel,
-								  ISDN_USAGE_NET);
-					}
-					spin_lock_irqsave(&dev->lock, flags);
-					dev->usage[idx] &= ISDN_USAGE_EXCLUSIVE;
-					dev->usage[idx] |= ISDN_USAGE_NET;
-					strcpy(dev->num[idx], nr);
-					isdn_info_update();
-					dev->st_netdev[idx] = lp->netdev;
-					lp->isdn_device = di;
-					lp->isdn_channel = ch;
-					lp->ppp_slot = -1;
-					lp->flags |= ISDN_NET_CONNECTED;
-					lp->dialstate = 7;
-					lp->dtimer = 0;
-					lp->outgoing = 0;
-					lp->huptimer = 0;
-					lp->hupflags |= ISDN_WAITCHARGE;
-					lp->hupflags &= ~ISDN_HAVECHARGE;
-#ifdef CONFIG_ISDN_PPP
-					if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) {
-						if (isdn_ppp_bind(lp) < 0) {
-							isdn_net_unbind_channel(lp);
-							spin_unlock_irqrestore(&dev->lock, flags);
-							return 0;
-						}
-					}
-#endif
-					spin_unlock_irqrestore(&dev->lock, flags);
-					return 1;
-				}
-			}
-		}
-		p = (isdn_net_dev *) p->next;
-	}
-	/* If none of configured EAZ/MSN matched and not verbose, be silent */
-	if (!ematch || dev->net_verbose)
-		printk(KERN_INFO "isdn_net: call from %s -> %d %s ignored\n", nr, di, eaz);
-	return (wret == 2) ? 5 : 0;
-}
-
-/*
- * Search list of net-interfaces for an interface with given name.
- */
-isdn_net_dev *
-isdn_net_findif(char *name)
-{
-	isdn_net_dev *p = dev->netdev;
-
-	while (p) {
-		if (!strcmp(p->dev->name, name))
-			return p;
-		p = (isdn_net_dev *) p->next;
-	}
-	return (isdn_net_dev *) NULL;
-}
-
-/*
- * Force a net-interface to dial out.
- * This is called from the userlevel-routine below or
- * from isdn_net_start_xmit().
- */
-static int
-isdn_net_force_dial_lp(isdn_net_local *lp)
-{
-	if ((!(lp->flags & ISDN_NET_CONNECTED)) && !lp->dialstate) {
-		int chi;
-		if (lp->phone[1]) {
-			ulong flags;
-
-			/* Grab a free ISDN-Channel */
-			spin_lock_irqsave(&dev->lock, flags);
-			if ((chi = isdn_get_free_channel(
-				     ISDN_USAGE_NET,
-				     lp->l2_proto,
-				     lp->l3_proto,
-				     lp->pre_device,
-				     lp->pre_channel,
-				     lp->msn)) < 0) {
-				printk(KERN_WARNING "isdn_net_force_dial: No channel for %s\n",
-				       lp->netdev->dev->name);
-				spin_unlock_irqrestore(&dev->lock, flags);
-				return -EAGAIN;
-			}
-			lp->dialstate = 1;
-			/* Connect interface with channel */
-			isdn_net_bind_channel(lp, chi);
-#ifdef CONFIG_ISDN_PPP
-			if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP)
-				if (isdn_ppp_bind(lp) < 0) {
-					isdn_net_unbind_channel(lp);
-					spin_unlock_irqrestore(&dev->lock, flags);
-					return -EAGAIN;
-				}
-#endif
-			/* Initiate dialing */
-			spin_unlock_irqrestore(&dev->lock, flags);
-			isdn_net_dial();
-			return 0;
-		} else
-			return -EINVAL;
-	} else
-		return -EBUSY;
-}
-
-/*
- * This is called from certain upper protocol layers (multilink ppp
- * and x25iface encapsulation module) that want to initiate dialing
- * themselves.
- */
-int
-isdn_net_dial_req(isdn_net_local *lp)
-{
-	/* is there a better error code? */
-	if (!(ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_AUTO)) return -EBUSY;
-
-	return isdn_net_force_dial_lp(lp);
-}
-
-/*
- * Force a net-interface to dial out.
- * This is always called from within userspace (ISDN_IOCTL_NET_DIAL).
- */
-int
-isdn_net_force_dial(char *name)
-{
-	isdn_net_dev *p = isdn_net_findif(name);
-
-	if (!p)
-		return -ENODEV;
-	return (isdn_net_force_dial_lp(p->local));
-}
-
-/* The ISDN-specific entries in the device structure. */
-static const struct net_device_ops isdn_netdev_ops = {
-	.ndo_init	      = isdn_net_init,
-	.ndo_open	      = isdn_net_open,
-	.ndo_stop	      = isdn_net_close,
-	.ndo_do_ioctl	      = isdn_net_ioctl,
-
-	.ndo_start_xmit	      = isdn_net_start_xmit,
-	.ndo_get_stats	      = isdn_net_get_stats,
-	.ndo_tx_timeout	      = isdn_net_tx_timeout,
-};
-
-/*
- * Helper for alloc_netdev()
- */
-static void _isdn_setup(struct net_device *dev)
-{
-	isdn_net_local *lp = netdev_priv(dev);
-
-	ether_setup(dev);
-
-	/* Setup the generic properties */
-	dev->flags = IFF_NOARP | IFF_POINTOPOINT;
-
-	/* isdn prepends a header in the tx path, can't share skbs */
-	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
-	dev->header_ops = NULL;
-	dev->netdev_ops = &isdn_netdev_ops;
-
-	/* for clients with MPPP maybe higher values better */
-	dev->tx_queue_len = 30;
-
-	lp->p_encap = ISDN_NET_ENCAP_RAWIP;
-	lp->magic = ISDN_NET_MAGIC;
-	lp->last = lp;
-	lp->next = lp;
-	lp->isdn_device = -1;
-	lp->isdn_channel = -1;
-	lp->pre_device = -1;
-	lp->pre_channel = -1;
-	lp->exclusive = -1;
-	lp->ppp_slot = -1;
-	lp->pppbind = -1;
-	skb_queue_head_init(&lp->super_tx_queue);
-	lp->l2_proto = ISDN_PROTO_L2_X75I;
-	lp->l3_proto = ISDN_PROTO_L3_TRANS;
-	lp->triggercps = 6000;
-	lp->slavedelay = 10 * HZ;
-	lp->hupflags = ISDN_INHUP;	/* Do hangup even on incoming calls */
-	lp->onhtime = 10;	/* Default hangup-time for saving costs */
-	lp->dialmax = 1;
-	/* Hangup before Callback, manual dial */
-	lp->flags = ISDN_NET_CBHUP | ISDN_NET_DM_MANUAL;
-	lp->cbdelay = 25;	/* Wait 5 secs before Callback */
-	lp->dialtimeout = -1;  /* Infinite Dial-Timeout */
-	lp->dialwait = 5 * HZ; /* Wait 5 sec. after failed dial */
-	lp->dialstarted = 0;   /* Jiffies of last dial-start */
-	lp->dialwait_timer = 0;  /* Jiffies of earliest next dial-start */
-}
-
-/*
- * Allocate a new network-interface and initialize its data structures.
- */
-char *
-isdn_net_new(char *name, struct net_device *master)
-{
-	isdn_net_dev *netdev;
-
-	/* Avoid creating an existing interface */
-	if (isdn_net_findif(name)) {
-		printk(KERN_WARNING "isdn_net: interface %s already exists\n", name);
-		return NULL;
-	}
-	if (name == NULL)
-		return NULL;
-	if (!(netdev = kzalloc(sizeof(isdn_net_dev), GFP_KERNEL))) {
-		printk(KERN_WARNING "isdn_net: Could not allocate net-device\n");
-		return NULL;
-	}
-	netdev->dev = alloc_netdev(sizeof(isdn_net_local), name,
-				   NET_NAME_UNKNOWN, _isdn_setup);
-	if (!netdev->dev) {
-		printk(KERN_WARNING "isdn_net: Could not allocate network device\n");
-		kfree(netdev);
-		return NULL;
-	}
-	netdev->local = netdev_priv(netdev->dev);
-
-	if (master) {
-		/* Device shall be a slave */
-		struct net_device *p = MASTER_TO_SLAVE(master);
-		struct net_device *q = master;
-
-		netdev->local->master = master;
-		/* Put device at end of slave-chain */
-		while (p) {
-			q = p;
-			p = MASTER_TO_SLAVE(p);
-		}
-		MASTER_TO_SLAVE(q) = netdev->dev;
-	} else {
-		/* Device shall be a master */
-		/*
-		 * Watchdog timer (currently) for master only.
-		 */
-		netdev->dev->watchdog_timeo = ISDN_NET_TX_TIMEOUT;
-		if (register_netdev(netdev->dev) != 0) {
-			printk(KERN_WARNING "isdn_net: Could not register net-device\n");
-			free_netdev(netdev->dev);
-			kfree(netdev);
-			return NULL;
-		}
-	}
-	netdev->queue = netdev->local;
-	spin_lock_init(&netdev->queue_lock);
-
-	netdev->local->netdev = netdev;
-
-	INIT_WORK(&netdev->local->tqueue, isdn_net_softint);
-	spin_lock_init(&netdev->local->xmit_lock);
-
-	/* Put into to netdev-chain */
-	netdev->next = (void *) dev->netdev;
-	dev->netdev = netdev;
-	return netdev->dev->name;
-}
-
-char *
-isdn_net_newslave(char *parm)
-{
-	char *p = strchr(parm, ',');
-	isdn_net_dev *n;
-	char newname[10];
-
-	if (p) {
-		/* Slave-Name MUST not be empty or overflow 'newname' */
-		if (strscpy(newname, p + 1, sizeof(newname)) <= 0)
-			return NULL;
-		*p = 0;
-		/* Master must already exist */
-		if (!(n = isdn_net_findif(parm)))
-			return NULL;
-		/* Master must be a real interface, not a slave */
-		if (n->local->master)
-			return NULL;
-		/* Master must not be started yet */
-		if (isdn_net_device_started(n))
-			return NULL;
-		return (isdn_net_new(newname, n->dev));
-	}
-	return NULL;
-}
-
-/*
- * Set interface-parameters.
- * Always set all parameters, so the user-level application is responsible
- * for not overwriting existing setups. It has to get the current
- * setup first, if only selected parameters are to be changed.
- */
-int
-isdn_net_setcfg(isdn_net_ioctl_cfg *cfg)
-{
-	isdn_net_dev *p = isdn_net_findif(cfg->name);
-	ulong features;
-	int i;
-	int drvidx;
-	int chidx;
-	char drvid[25];
-
-	if (p) {
-		isdn_net_local *lp = p->local;
-
-		/* See if any registered driver supports the features we want */
-		features = ((1 << cfg->l2_proto) << ISDN_FEATURE_L2_SHIFT) |
-			((1 << cfg->l3_proto) << ISDN_FEATURE_L3_SHIFT);
-		for (i = 0; i < ISDN_MAX_DRIVERS; i++)
-			if (dev->drv[i])
-				if ((dev->drv[i]->interface->features & features) == features)
-					break;
-		if (i == ISDN_MAX_DRIVERS) {
-			printk(KERN_WARNING "isdn_net: No driver with selected features\n");
-			return -ENODEV;
-		}
-		if (lp->p_encap != cfg->p_encap) {
-#ifdef CONFIG_ISDN_X25
-			struct concap_proto *cprot = p->cprot;
-#endif
-			if (isdn_net_device_started(p)) {
-				printk(KERN_WARNING "%s: cannot change encap when if is up\n",
-				       p->dev->name);
-				return -EBUSY;
-			}
-#ifdef CONFIG_ISDN_X25
-			if (cprot && cprot->pops)
-				cprot->pops->proto_del(cprot);
-			p->cprot = NULL;
-			lp->dops = NULL;
-			/* ... ,  prepare for configuration of new one ... */
-			switch (cfg->p_encap) {
-			case ISDN_NET_ENCAP_X25IFACE:
-				lp->dops = &isdn_concap_reliable_dl_dops;
-			}
-			/* ... and allocate new one ... */
-			p->cprot = isdn_concap_new(cfg->p_encap);
-			/* p -> cprot == NULL now if p_encap is not supported
-			   by means of the concap_proto mechanism */
-			/* the protocol is not configured yet; this will
-			   happen later when isdn_net_reset() is called */
-#endif
-		}
-		switch (cfg->p_encap) {
-		case ISDN_NET_ENCAP_SYNCPPP:
-#ifndef CONFIG_ISDN_PPP
-			printk(KERN_WARNING "%s: SyncPPP support not configured\n",
-			       p->dev->name);
-			return -EINVAL;
-#else
-			p->dev->type = ARPHRD_PPP;	/* change ARP type */
-			p->dev->addr_len = 0;
-#endif
-			break;
-		case ISDN_NET_ENCAP_X25IFACE:
-#ifndef CONFIG_ISDN_X25
-			printk(KERN_WARNING "%s: isdn-x25 support not configured\n",
-			       p->dev->name);
-			return -EINVAL;
-#else
-			p->dev->type = ARPHRD_X25;	/* change ARP type */
-			p->dev->addr_len = 0;
-#endif
-			break;
-		case ISDN_NET_ENCAP_CISCOHDLCK:
-			break;
-		default:
-			if (cfg->p_encap >= 0 &&
-			    cfg->p_encap <= ISDN_NET_ENCAP_MAX_ENCAP)
-				break;
-			printk(KERN_WARNING
-			       "%s: encapsulation protocol %d not supported\n",
-			       p->dev->name, cfg->p_encap);
-			return -EINVAL;
-		}
-		if (strlen(cfg->drvid)) {
-			/* A bind has been requested ... */
-			char *c,
-				*e;
-
-			if (strnlen(cfg->drvid, sizeof(cfg->drvid)) ==
-			    sizeof(cfg->drvid))
-				return -EINVAL;
-			drvidx = -1;
-			chidx = -1;
-			strcpy(drvid, cfg->drvid);
-			if ((c = strchr(drvid, ','))) {
-				/* The channel-number is appended to the driver-Id with a comma */
-				chidx = (int) simple_strtoul(c + 1, &e, 10);
-				if (e == c)
-					chidx = -1;
-				*c = '\0';
-			}
-			for (i = 0; i < ISDN_MAX_DRIVERS; i++)
-				/* Lookup driver-Id in array */
-				if (!(strcmp(dev->drvid[i], drvid))) {
-					drvidx = i;
-					break;
-				}
-			if ((drvidx == -1) || (chidx == -1))
-				/* Either driver-Id or channel-number invalid */
-				return -ENODEV;
-		} else {
-			/* Parameters are valid, so get them */
-			drvidx = lp->pre_device;
-			chidx = lp->pre_channel;
-		}
-		if (cfg->exclusive > 0) {
-			unsigned long flags;
-
-			/* If binding is exclusive, try to grab the channel */
-			spin_lock_irqsave(&dev->lock, flags);
-			if ((i = isdn_get_free_channel(ISDN_USAGE_NET,
-						       lp->l2_proto, lp->l3_proto, drvidx,
-						       chidx, lp->msn)) < 0) {
-				/* Grab failed, because desired channel is in use */
-				lp->exclusive = -1;
-				spin_unlock_irqrestore(&dev->lock, flags);
-				return -EBUSY;
-			}
-			/* All went ok, so update isdninfo */
-			dev->usage[i] = ISDN_USAGE_EXCLUSIVE;
-			isdn_info_update();
-			spin_unlock_irqrestore(&dev->lock, flags);
-			lp->exclusive = i;
-		} else {
-			/* Non-exclusive binding or unbind. */
-			lp->exclusive = -1;
-			if ((lp->pre_device != -1) && (cfg->exclusive == -1)) {
-				isdn_unexclusive_channel(lp->pre_device, lp->pre_channel);
-				isdn_free_channel(lp->pre_device, lp->pre_channel, ISDN_USAGE_NET);
-				drvidx = -1;
-				chidx = -1;
-			}
-		}
-		strlcpy(lp->msn, cfg->eaz, sizeof(lp->msn));
-		lp->pre_device = drvidx;
-		lp->pre_channel = chidx;
-		lp->onhtime = cfg->onhtime;
-		lp->charge = cfg->charge;
-		lp->l2_proto = cfg->l2_proto;
-		lp->l3_proto = cfg->l3_proto;
-		lp->cbdelay = cfg->cbdelay;
-		lp->dialmax = cfg->dialmax;
-		lp->triggercps = cfg->triggercps;
-		lp->slavedelay = cfg->slavedelay * HZ;
-		lp->pppbind = cfg->pppbind;
-		lp->dialtimeout = cfg->dialtimeout >= 0 ? cfg->dialtimeout * HZ : -1;
-		lp->dialwait = cfg->dialwait * HZ;
-		if (cfg->secure)
-			lp->flags |= ISDN_NET_SECURE;
-		else
-			lp->flags &= ~ISDN_NET_SECURE;
-		if (cfg->cbhup)
-			lp->flags |= ISDN_NET_CBHUP;
-		else
-			lp->flags &= ~ISDN_NET_CBHUP;
-		switch (cfg->callback) {
-		case 0:
-			lp->flags &= ~(ISDN_NET_CALLBACK | ISDN_NET_CBOUT);
-			break;
-		case 1:
-			lp->flags |= ISDN_NET_CALLBACK;
-			lp->flags &= ~ISDN_NET_CBOUT;
-			break;
-		case 2:
-			lp->flags |= ISDN_NET_CBOUT;
-			lp->flags &= ~ISDN_NET_CALLBACK;
-			break;
-		}
-		lp->flags &= ~ISDN_NET_DIALMODE_MASK;	/* first all bits off */
-		if (cfg->dialmode && !(cfg->dialmode & ISDN_NET_DIALMODE_MASK)) {
-			/* old isdnctrl version, where only 0 or 1 is given */
-			printk(KERN_WARNING
-			       "Old isdnctrl version detected! Please update.\n");
-			lp->flags |= ISDN_NET_DM_OFF; /* turn on `off' bit */
-		}
-		else {
-			lp->flags |= cfg->dialmode;  /* turn on selected bits */
-		}
-		if (cfg->chargehup)
-			lp->hupflags |= ISDN_CHARGEHUP;
-		else
-			lp->hupflags &= ~ISDN_CHARGEHUP;
-		if (cfg->ihup)
-			lp->hupflags |= ISDN_INHUP;
-		else
-			lp->hupflags &= ~ISDN_INHUP;
-		if (cfg->chargeint > 10) {
-			lp->hupflags |= ISDN_CHARGEHUP | ISDN_HAVECHARGE | ISDN_MANCHARGE;
-			lp->chargeint = cfg->chargeint * HZ;
-		}
-		if (cfg->p_encap != lp->p_encap) {
-			if (cfg->p_encap == ISDN_NET_ENCAP_RAWIP) {
-				p->dev->header_ops = NULL;
-				p->dev->flags = IFF_NOARP | IFF_POINTOPOINT;
-			} else {
-				p->dev->header_ops = &isdn_header_ops;
-				if (cfg->p_encap == ISDN_NET_ENCAP_ETHER)
-					p->dev->flags = IFF_BROADCAST | IFF_MULTICAST;
-				else
-					p->dev->flags = IFF_NOARP | IFF_POINTOPOINT;
-			}
-		}
-		lp->p_encap = cfg->p_encap;
-		return 0;
-	}
-	return -ENODEV;
-}
-
-/*
- * Perform get-interface-parameters.ioctl
- */
-int
-isdn_net_getcfg(isdn_net_ioctl_cfg *cfg)
-{
-	isdn_net_dev *p = isdn_net_findif(cfg->name);
-
-	if (p) {
-		isdn_net_local *lp = p->local;
-
-		strcpy(cfg->eaz, lp->msn);
-		cfg->exclusive = lp->exclusive;
-		if (lp->pre_device >= 0) {
-			sprintf(cfg->drvid, "%s,%d", dev->drvid[lp->pre_device],
-				lp->pre_channel);
-		} else
-			cfg->drvid[0] = '\0';
-		cfg->onhtime = lp->onhtime;
-		cfg->charge = lp->charge;
-		cfg->l2_proto = lp->l2_proto;
-		cfg->l3_proto = lp->l3_proto;
-		cfg->p_encap = lp->p_encap;
-		cfg->secure = (lp->flags & ISDN_NET_SECURE) ? 1 : 0;
-		cfg->callback = 0;
-		if (lp->flags & ISDN_NET_CALLBACK)
-			cfg->callback = 1;
-		if (lp->flags & ISDN_NET_CBOUT)
-			cfg->callback = 2;
-		cfg->cbhup = (lp->flags & ISDN_NET_CBHUP) ? 1 : 0;
-		cfg->dialmode = lp->flags & ISDN_NET_DIALMODE_MASK;
-		cfg->chargehup = (lp->hupflags & ISDN_CHARGEHUP) ? 1 : 0;
-		cfg->ihup = (lp->hupflags & ISDN_INHUP) ? 1 : 0;
-		cfg->cbdelay = lp->cbdelay;
-		cfg->dialmax = lp->dialmax;
-		cfg->triggercps = lp->triggercps;
-		cfg->slavedelay = lp->slavedelay / HZ;
-		cfg->chargeint = (lp->hupflags & ISDN_CHARGEHUP) ?
-			(lp->chargeint / HZ) : 0;
-		cfg->pppbind = lp->pppbind;
-		cfg->dialtimeout = lp->dialtimeout >= 0 ? lp->dialtimeout / HZ : -1;
-		cfg->dialwait = lp->dialwait / HZ;
-		if (lp->slave) {
-			if (strlen(lp->slave->name) >= 10)
-				strcpy(cfg->slave, "too-long");
-			else
-				strcpy(cfg->slave, lp->slave->name);
-		} else
-			cfg->slave[0] = '\0';
-		if (lp->master) {
-			if (strlen(lp->master->name) >= 10)
-				strcpy(cfg->master, "too-long");
-			else
-				strcpy(cfg->master, lp->master->name);
-		} else
-			cfg->master[0] = '\0';
-		return 0;
-	}
-	return -ENODEV;
-}
-
-/*
- * Add a phone-number to an interface.
- */
-int
-isdn_net_addphone(isdn_net_ioctl_phone *phone)
-{
-	isdn_net_dev *p = isdn_net_findif(phone->name);
-	isdn_net_phone *n;
-
-	if (p) {
-		if (!(n = kmalloc(sizeof(isdn_net_phone), GFP_KERNEL)))
-			return -ENOMEM;
-		strlcpy(n->num, phone->phone, sizeof(n->num));
-		n->next = p->local->phone[phone->outgoing & 1];
-		p->local->phone[phone->outgoing & 1] = n;
-		return 0;
-	}
-	return -ENODEV;
-}
-
-/*
- * Copy a string of all phone-numbers of an interface to user space.
- * This might sleep and must be called with the isdn semaphore down.
- */
-int
-isdn_net_getphones(isdn_net_ioctl_phone *phone, char __user *phones)
-{
-	isdn_net_dev *p = isdn_net_findif(phone->name);
-	int inout = phone->outgoing & 1;
-	int more = 0;
-	int count = 0;
-	isdn_net_phone *n;
-
-	if (!p)
-		return -ENODEV;
-	inout &= 1;
-	for (n = p->local->phone[inout]; n; n = n->next) {
-		if (more) {
-			put_user(' ', phones++);
-			count++;
-		}
-		if (copy_to_user(phones, n->num, strlen(n->num) + 1)) {
-			return -EFAULT;
-		}
-		phones += strlen(n->num);
-		count += strlen(n->num);
-		more = 1;
-	}
-	put_user(0, phones);
-	count++;
-	return count;
-}
-
-/*
- * Copy a string containing the peer's phone number of a connected interface
- * to user space.
- */
-int
-isdn_net_getpeer(isdn_net_ioctl_phone *phone, isdn_net_ioctl_phone __user *peer)
-{
-	isdn_net_dev *p = isdn_net_findif(phone->name);
-	int ch, dv, idx;
-
-	if (!p)
-		return -ENODEV;
-	/*
-	 * Theoretical race: while this executes, the remote number might
-	 * become invalid (hang up) or change (new connection), resulting
-	 * in (partially) wrong number copied to user. This race
-	 * currently ignored.
-	 */
-	ch = p->local->isdn_channel;
-	dv = p->local->isdn_device;
-	if (ch < 0 && dv < 0)
-		return -ENOTCONN;
-	idx = isdn_dc2minor(dv, ch);
-	if (idx < 0)
-		return -ENODEV;
-	/* for pre-bound channels, we need this extra check */
-	if (strncmp(dev->num[idx], "???", 3) == 0)
-		return -ENOTCONN;
-	strncpy(phone->phone, dev->num[idx], ISDN_MSNLEN);
-	phone->outgoing = USG_OUTGOING(dev->usage[idx]);
-	if (copy_to_user(peer, phone, sizeof(*peer)))
-		return -EFAULT;
-	return 0;
-}
-/*
- * Delete a phone-number from an interface.
- */
-int
-isdn_net_delphone(isdn_net_ioctl_phone *phone)
-{
-	isdn_net_dev *p = isdn_net_findif(phone->name);
-	int inout = phone->outgoing & 1;
-	isdn_net_phone *n;
-	isdn_net_phone *m;
-
-	if (p) {
-		n = p->local->phone[inout];
-		m = NULL;
-		while (n) {
-			if (!strcmp(n->num, phone->phone)) {
-				if (p->local->dial == n)
-					p->local->dial = n->next;
-				if (m)
-					m->next = n->next;
-				else
-					p->local->phone[inout] = n->next;
-				kfree(n);
-				return 0;
-			}
-			m = n;
-			n = (isdn_net_phone *) n->next;
-		}
-		return -EINVAL;
-	}
-	return -ENODEV;
-}
-
-/*
- * Delete all phone-numbers of an interface.
- */
-static int
-isdn_net_rmallphone(isdn_net_dev *p)
-{
-	isdn_net_phone *n;
-	isdn_net_phone *m;
-	int i;
-
-	for (i = 0; i < 2; i++) {
-		n = p->local->phone[i];
-		while (n) {
-			m = n->next;
-			kfree(n);
-			n = m;
-		}
-		p->local->phone[i] = NULL;
-	}
-	p->local->dial = NULL;
-	return 0;
-}
-
-/*
- * Force a hangup of a network-interface.
- */
-int
-isdn_net_force_hangup(char *name)
-{
-	isdn_net_dev *p = isdn_net_findif(name);
-	struct net_device *q;
-
-	if (p) {
-		if (p->local->isdn_device < 0)
-			return 1;
-		q = p->local->slave;
-		/* If this interface has slaves, do a hangup for them also. */
-		while (q) {
-			isdn_net_hangup(q);
-			q = MASTER_TO_SLAVE(q);
-		}
-		isdn_net_hangup(p->dev);
-		return 0;
-	}
-	return -ENODEV;
-}
-
-/*
- * Helper-function for isdn_net_rm: Do the real work.
- */
-static int
-isdn_net_realrm(isdn_net_dev *p, isdn_net_dev *q)
-{
-	u_long flags;
-
-	if (isdn_net_device_started(p)) {
-		return -EBUSY;
-	}
-#ifdef CONFIG_ISDN_X25
-	if (p->cprot && p->cprot->pops)
-		p->cprot->pops->proto_del(p->cprot);
-#endif
-	/* Free all phone-entries */
-	isdn_net_rmallphone(p);
-	/* If interface is bound exclusive, free channel-usage */
-	if (p->local->exclusive != -1)
-		isdn_unexclusive_channel(p->local->pre_device, p->local->pre_channel);
-	if (p->local->master) {
-		/* It's a slave-device, so update master's slave-pointer if necessary */
-		if (((isdn_net_local *) ISDN_MASTER_PRIV(p->local))->slave ==
-		    p->dev)
-			((isdn_net_local *)ISDN_MASTER_PRIV(p->local))->slave =
-				p->local->slave;
-	} else {
-		/* Unregister only if it's a master-device */
-		unregister_netdev(p->dev);
-	}
-	/* Unlink device from chain */
-	spin_lock_irqsave(&dev->lock, flags);
-	if (q)
-		q->next = p->next;
-	else
-		dev->netdev = p->next;
-	if (p->local->slave) {
-		/* If this interface has a slave, remove it also */
-		char *slavename = p->local->slave->name;
-		isdn_net_dev *n = dev->netdev;
-		q = NULL;
-		while (n) {
-			if (!strcmp(n->dev->name, slavename)) {
-				spin_unlock_irqrestore(&dev->lock, flags);
-				isdn_net_realrm(n, q);
-				spin_lock_irqsave(&dev->lock, flags);
-				break;
-			}
-			q = n;
-			n = (isdn_net_dev *)n->next;
-		}
-	}
-	spin_unlock_irqrestore(&dev->lock, flags);
-	/* If no more net-devices remain, disable auto-hangup timer */
-	if (dev->netdev == NULL)
-		isdn_timer_ctrl(ISDN_TIMER_NETHANGUP, 0);
-	free_netdev(p->dev);
-	kfree(p);
-
-	return 0;
-}
-
-/*
- * Remove a single network-interface.
- */
-int
-isdn_net_rm(char *name)
-{
-	u_long flags;
-	isdn_net_dev *p;
-	isdn_net_dev *q;
-
-	/* Search name in netdev-chain */
-	spin_lock_irqsave(&dev->lock, flags);
-	p = dev->netdev;
-	q = NULL;
-	while (p) {
-		if (!strcmp(p->dev->name, name)) {
-			spin_unlock_irqrestore(&dev->lock, flags);
-			return (isdn_net_realrm(p, q));
-		}
-		q = p;
-		p = (isdn_net_dev *) p->next;
-	}
-	spin_unlock_irqrestore(&dev->lock, flags);
-	/* If no more net-devices remain, disable auto-hangup timer */
-	if (dev->netdev == NULL)
-		isdn_timer_ctrl(ISDN_TIMER_NETHANGUP, 0);
-	return -ENODEV;
-}
-
-/*
- * Remove all network-interfaces
- */
-int
-isdn_net_rmall(void)
-{
-	u_long flags;
-	int ret;
-
-	/* Walk through netdev-chain */
-	spin_lock_irqsave(&dev->lock, flags);
-	while (dev->netdev) {
-		if (!dev->netdev->local->master) {
-			/* Remove master-devices only, slaves get removed with their master */
-			spin_unlock_irqrestore(&dev->lock, flags);
-			if ((ret = isdn_net_realrm(dev->netdev, NULL))) {
-				return ret;
-			}
-			spin_lock_irqsave(&dev->lock, flags);
-		}
-	}
-	dev->netdev = NULL;
-	spin_unlock_irqrestore(&dev->lock, flags);
-	return 0;
-}
diff --git a/drivers/isdn/i4l/isdn_net.h b/drivers/isdn/i4l/isdn_net.h
deleted file mode 100644
index cca6d68da171..000000000000
--- a/drivers/isdn/i4l/isdn_net.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/* $Id: isdn_net.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * header for Linux ISDN subsystem, network related functions (linklevel).
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-/* Definitions for hupflags:                */
-#define ISDN_WAITCHARGE  1      /* did not get a charge info yet            */
-#define ISDN_HAVECHARGE  2      /* We know a charge info                    */
-#define ISDN_CHARGEHUP   4      /* We want to use the charge mechanism      */
-#define ISDN_INHUP       8      /* Even if incoming, close after huptimeout */
-#define ISDN_MANCHARGE  16      /* Charge Interval manually set             */
-
-/*
- * Definitions for Cisco-HDLC header.
- */
-
-#define CISCO_ADDR_UNICAST    0x0f
-#define CISCO_ADDR_BROADCAST  0x8f
-#define CISCO_CTRL            0x00
-#define CISCO_TYPE_CDP        0x2000
-#define CISCO_TYPE_SLARP      0x8035
-#define CISCO_SLARP_REQUEST   0
-#define CISCO_SLARP_REPLY     1
-#define CISCO_SLARP_KEEPALIVE 2
-
-extern char *isdn_net_new(char *, struct net_device *);
-extern char *isdn_net_newslave(char *);
-extern int isdn_net_rm(char *);
-extern int isdn_net_rmall(void);
-extern int isdn_net_stat_callback(int, isdn_ctrl *);
-extern int isdn_net_setcfg(isdn_net_ioctl_cfg *);
-extern int isdn_net_getcfg(isdn_net_ioctl_cfg *);
-extern int isdn_net_addphone(isdn_net_ioctl_phone *);
-extern int isdn_net_getphones(isdn_net_ioctl_phone *, char __user *);
-extern int isdn_net_getpeer(isdn_net_ioctl_phone *, isdn_net_ioctl_phone __user *);
-extern int isdn_net_delphone(isdn_net_ioctl_phone *);
-extern int isdn_net_find_icall(int, int, int, setup_parm *);
-extern void isdn_net_hangup(struct net_device *);
-extern void isdn_net_dial(void);
-extern void isdn_net_autohup(void);
-extern int isdn_net_force_hangup(char *);
-extern int isdn_net_force_dial(char *);
-extern isdn_net_dev *isdn_net_findif(char *);
-extern int isdn_net_rcv_skb(int, struct sk_buff *);
-extern int isdn_net_dial_req(isdn_net_local *);
-extern void isdn_net_writebuf_skb(isdn_net_local *lp, struct sk_buff *skb);
-extern void isdn_net_write_super(isdn_net_local *lp, struct sk_buff *skb);
-
-#define ISDN_NET_MAX_QUEUE_LENGTH 2
-
-#define ISDN_MASTER_PRIV(lp) ((isdn_net_local *) netdev_priv(lp->master))
-#define ISDN_SLAVE_PRIV(lp) ((isdn_net_local *) netdev_priv(lp->slave))
-#define MASTER_TO_SLAVE(master)					\
-	(((isdn_net_local *) netdev_priv(master))->slave)
-
-/*
- * is this particular channel busy?
- */
-static __inline__ int isdn_net_lp_busy(isdn_net_local *lp)
-{
-	if (atomic_read(&lp->frame_cnt) < ISDN_NET_MAX_QUEUE_LENGTH)
-		return 0;
-	else
-		return 1;
-}
-
-/*
- * For the given net device, this will get a non-busy channel out of the
- * corresponding bundle. The returned channel is locked.
- */
-static __inline__ isdn_net_local *isdn_net_get_locked_lp(isdn_net_dev *nd)
-{
-	unsigned long flags;
-	isdn_net_local *lp;
-
-	spin_lock_irqsave(&nd->queue_lock, flags);
-	lp = nd->queue;         /* get lp on top of queue */
-	while (isdn_net_lp_busy(nd->queue)) {
-		nd->queue = nd->queue->next;
-		if (nd->queue == lp) { /* not found -- should never happen */
-			lp = NULL;
-			goto errout;
-		}
-	}
-	lp = nd->queue;
-	nd->queue = nd->queue->next;
-	spin_unlock_irqrestore(&nd->queue_lock, flags);
-	spin_lock(&lp->xmit_lock);
-	local_bh_disable();
-	return lp;
-errout:
-	spin_unlock_irqrestore(&nd->queue_lock, flags);
-	return lp;
-}
-
-/*
- * add a channel to a bundle
- */
-static __inline__ void isdn_net_add_to_bundle(isdn_net_dev *nd, isdn_net_local *nlp)
-{
-	isdn_net_local *lp;
-	unsigned long flags;
-
-	spin_lock_irqsave(&nd->queue_lock, flags);
-
-	lp = nd->queue;
-//	printk(KERN_DEBUG "%s: lp:%s(%p) nlp:%s(%p) last(%p)\n",
-//		__func__, lp->name, lp, nlp->name, nlp, lp->last);
-	nlp->last = lp->last;
-	lp->last->next = nlp;
-	lp->last = nlp;
-	nlp->next = lp;
-	nd->queue = nlp;
-
-	spin_unlock_irqrestore(&nd->queue_lock, flags);
-}
-/*
- * remove a channel from the bundle it belongs to
- */
-static __inline__ void isdn_net_rm_from_bundle(isdn_net_local *lp)
-{
-	isdn_net_local *master_lp = lp;
-	unsigned long flags;
-
-	if (lp->master)
-		master_lp = ISDN_MASTER_PRIV(lp);
-
-//	printk(KERN_DEBUG "%s: lp:%s(%p) mlp:%s(%p) last(%p) next(%p) mndq(%p)\n",
-//		__func__, lp->name, lp, master_lp->name, master_lp, lp->last, lp->next, master_lp->netdev->queue);
-	spin_lock_irqsave(&master_lp->netdev->queue_lock, flags);
-	lp->last->next = lp->next;
-	lp->next->last = lp->last;
-	if (master_lp->netdev->queue == lp) {
-		master_lp->netdev->queue = lp->next;
-		if (lp->next == lp) { /* last in queue */
-			master_lp->netdev->queue = master_lp->netdev->local;
-		}
-	}
-	lp->next = lp->last = lp;	/* (re)set own pointers */
-//	printk(KERN_DEBUG "%s: mndq(%p)\n",
-//		__func__, master_lp->netdev->queue);
-	spin_unlock_irqrestore(&master_lp->netdev->queue_lock, flags);
-}
diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c
deleted file mode 100644
index 7e0f419c14f8..000000000000
--- a/drivers/isdn/i4l/isdn_ppp.c
+++ /dev/null
@@ -1,3046 +0,0 @@
-/* $Id: isdn_ppp.c,v 1.1.2.3 2004/02/10 01:07:13 keil Exp $
- *
- * Linux ISDN subsystem, functions for synchronous PPP (linklevel).
- *
- * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/isdn.h>
-#include <linux/poll.h>
-#include <linux/ppp-comp.h>
-#include <linux/slab.h>
-#ifdef CONFIG_IPPP_FILTER
-#include <linux/filter.h>
-#endif
-
-#include "isdn_common.h"
-#include "isdn_ppp.h"
-#include "isdn_net.h"
-
-#ifndef PPP_IPX
-#define PPP_IPX 0x002b
-#endif
-
-/* Prototypes */
-static int isdn_ppp_fill_rq(unsigned char *buf, int len, int proto, int slot);
-static int isdn_ppp_closewait(int slot);
-static void isdn_ppp_push_higher(isdn_net_dev *net_dev, isdn_net_local *lp,
-				 struct sk_buff *skb, int proto);
-static int isdn_ppp_if_get_unit(char *namebuf);
-static int isdn_ppp_set_compressor(struct ippp_struct *is, struct isdn_ppp_comp_data *);
-static struct sk_buff *isdn_ppp_decompress(struct sk_buff *,
-					   struct ippp_struct *, struct ippp_struct *, int *proto);
-static void isdn_ppp_receive_ccp(isdn_net_dev *net_dev, isdn_net_local *lp,
-				 struct sk_buff *skb, int proto);
-static struct sk_buff *isdn_ppp_compress(struct sk_buff *skb_in, int *proto,
-					 struct ippp_struct *is, struct ippp_struct *master, int type);
-static void isdn_ppp_send_ccp(isdn_net_dev *net_dev, isdn_net_local *lp,
-			      struct sk_buff *skb);
-
-/* New CCP stuff */
-static void isdn_ppp_ccp_kickup(struct ippp_struct *is);
-static void isdn_ppp_ccp_xmit_reset(struct ippp_struct *is, int proto,
-				    unsigned char code, unsigned char id,
-				    unsigned char *data, int len);
-static struct ippp_ccp_reset *isdn_ppp_ccp_reset_alloc(struct ippp_struct *is);
-static void isdn_ppp_ccp_reset_free(struct ippp_struct *is);
-static void isdn_ppp_ccp_reset_free_state(struct ippp_struct *is,
-					  unsigned char id);
-static void isdn_ppp_ccp_timer_callback(struct timer_list *t);
-static struct ippp_ccp_reset_state *isdn_ppp_ccp_reset_alloc_state(struct ippp_struct *is,
-								   unsigned char id);
-static void isdn_ppp_ccp_reset_trans(struct ippp_struct *is,
-				     struct isdn_ppp_resetparams *rp);
-static void isdn_ppp_ccp_reset_ack_rcvd(struct ippp_struct *is,
-					unsigned char id);
-
-
-
-#ifdef CONFIG_ISDN_MPP
-static ippp_bundle *isdn_ppp_bundle_arr = NULL;
-
-static int isdn_ppp_mp_bundle_array_init(void);
-static int isdn_ppp_mp_init(isdn_net_local *lp, ippp_bundle *add_to);
-static void isdn_ppp_mp_receive(isdn_net_dev *net_dev, isdn_net_local *lp,
-				struct sk_buff *skb);
-static void isdn_ppp_mp_cleanup(isdn_net_local *lp);
-
-static int isdn_ppp_bundle(struct ippp_struct *, int unit);
-#endif	/* CONFIG_ISDN_MPP */
-
-char *isdn_ppp_revision = "$Revision: 1.1.2.3 $";
-
-static struct ippp_struct *ippp_table[ISDN_MAX_CHANNELS];
-
-static struct isdn_ppp_compressor *ipc_head = NULL;
-
-/*
- * frame log (debug)
- */
-static void
-isdn_ppp_frame_log(char *info, char *data, int len, int maxlen, int unit, int slot)
-{
-	int cnt,
-		j,
-		i;
-	char buf[80];
-
-	if (len < maxlen)
-		maxlen = len;
-
-	for (i = 0, cnt = 0; cnt < maxlen; i++) {
-		for (j = 0; j < 16 && cnt < maxlen; j++, cnt++)
-			sprintf(buf + j * 3, "%02x ", (unsigned char)data[cnt]);
-		printk(KERN_DEBUG "[%d/%d].%s[%d]: %s\n", unit, slot, info, i, buf);
-	}
-}
-
-/*
- * unbind isdn_net_local <=> ippp-device
- * note: it can happen, that we hangup/free the master before the slaves
- *       in this case we bind another lp to the master device
- */
-int
-isdn_ppp_free(isdn_net_local *lp)
-{
-	struct ippp_struct *is;
-
-	if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: ppp_slot(%d) out of range\n",
-		       __func__, lp->ppp_slot);
-		return 0;
-	}
-
-#ifdef CONFIG_ISDN_MPP
-	spin_lock(&lp->netdev->pb->lock);
-#endif
-	isdn_net_rm_from_bundle(lp);
-#ifdef CONFIG_ISDN_MPP
-	if (lp->netdev->pb->ref_ct == 1)	/* last link in queue? */
-		isdn_ppp_mp_cleanup(lp);
-
-	lp->netdev->pb->ref_ct--;
-	spin_unlock(&lp->netdev->pb->lock);
-#endif /* CONFIG_ISDN_MPP */
-	if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: ppp_slot(%d) now invalid\n",
-		       __func__, lp->ppp_slot);
-		return 0;
-	}
-	is = ippp_table[lp->ppp_slot];
-	if ((is->state & IPPP_CONNECT))
-		isdn_ppp_closewait(lp->ppp_slot);	/* force wakeup on ippp device */
-	else if (is->state & IPPP_ASSIGNED)
-		is->state = IPPP_OPEN;	/* fallback to 'OPEN but not ASSIGNED' state */
-
-	if (is->debug & 0x1)
-		printk(KERN_DEBUG "isdn_ppp_free %d %lx %lx\n", lp->ppp_slot, (long) lp, (long) is->lp);
-
-	is->lp = NULL;          /* link is down .. set lp to NULL */
-	lp->ppp_slot = -1;      /* is this OK ?? */
-
-	return 0;
-}
-
-/*
- * bind isdn_net_local <=> ippp-device
- *
- * This function is allways called with holding dev->lock so
- * no additional lock is needed
- */
-int
-isdn_ppp_bind(isdn_net_local *lp)
-{
-	int i;
-	int unit = 0;
-	struct ippp_struct *is;
-	int retval;
-
-	if (lp->pppbind < 0) {  /* device bounded to ippp device ? */
-		isdn_net_dev *net_dev = dev->netdev;
-		char exclusive[ISDN_MAX_CHANNELS];	/* exclusive flags */
-		memset(exclusive, 0, ISDN_MAX_CHANNELS);
-		while (net_dev) {	/* step through net devices to find exclusive minors */
-			isdn_net_local *lp = net_dev->local;
-			if (lp->pppbind >= 0)
-				exclusive[lp->pppbind] = 1;
-			net_dev = net_dev->next;
-		}
-		/*
-		 * search a free device / slot
-		 */
-		for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-			if (ippp_table[i]->state == IPPP_OPEN && !exclusive[ippp_table[i]->minor]) {	/* OPEN, but not connected! */
-				break;
-			}
-		}
-	} else {
-		for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-			if (ippp_table[i]->minor == lp->pppbind &&
-			    (ippp_table[i]->state & IPPP_OPEN) == IPPP_OPEN)
-				break;
-		}
-	}
-
-	if (i >= ISDN_MAX_CHANNELS) {
-		printk(KERN_WARNING "isdn_ppp_bind: Can't find a (free) connection to the ipppd daemon.\n");
-		retval = -1;
-		goto out;
-	}
-	/* get unit number from interface name .. ugly! */
-	unit = isdn_ppp_if_get_unit(lp->netdev->dev->name);
-	if (unit < 0) {
-		printk(KERN_ERR "isdn_ppp_bind: illegal interface name %s.\n",
-		       lp->netdev->dev->name);
-		retval = -1;
-		goto out;
-	}
-
-	lp->ppp_slot = i;
-	is = ippp_table[i];
-	is->lp = lp;
-	is->unit = unit;
-	is->state = IPPP_OPEN | IPPP_ASSIGNED;	/* assigned to a netdevice but not connected */
-#ifdef CONFIG_ISDN_MPP
-	retval = isdn_ppp_mp_init(lp, NULL);
-	if (retval < 0)
-		goto out;
-#endif /* CONFIG_ISDN_MPP */
-
-	retval = lp->ppp_slot;
-
-out:
-	return retval;
-}
-
-/*
- * kick the ipppd on the device
- * (wakes up daemon after B-channel connect)
- */
-
-void
-isdn_ppp_wakeup_daemon(isdn_net_local *lp)
-{
-	if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: ppp_slot(%d) out of range\n",
-		       __func__, lp->ppp_slot);
-		return;
-	}
-	ippp_table[lp->ppp_slot]->state = IPPP_OPEN | IPPP_CONNECT | IPPP_NOBLOCK;
-	wake_up_interruptible(&ippp_table[lp->ppp_slot]->wq);
-}
-
-/*
- * there was a hangup on the netdevice
- * force wakeup of the ippp device
- * go into 'device waits for release' state
- */
-static int
-isdn_ppp_closewait(int slot)
-{
-	struct ippp_struct *is;
-
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: slot(%d) out of range\n",
-		       __func__, slot);
-		return 0;
-	}
-	is = ippp_table[slot];
-	if (is->state)
-		wake_up_interruptible(&is->wq);
-	is->state = IPPP_CLOSEWAIT;
-	return 1;
-}
-
-/*
- * isdn_ppp_find_slot / isdn_ppp_free_slot
- */
-
-static int
-isdn_ppp_get_slot(void)
-{
-	int i;
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		if (!ippp_table[i]->state)
-			return i;
-	}
-	return -1;
-}
-
-/*
- * isdn_ppp_open
- */
-
-int
-isdn_ppp_open(int min, struct file *file)
-{
-	int slot;
-	struct ippp_struct *is;
-
-	if (min < 0 || min >= ISDN_MAX_CHANNELS)
-		return -ENODEV;
-
-	slot = isdn_ppp_get_slot();
-	if (slot < 0) {
-		return -EBUSY;
-	}
-	is = file->private_data = ippp_table[slot];
-
-	printk(KERN_DEBUG "ippp, open, slot: %d, minor: %d, state: %04x\n",
-	       slot, min, is->state);
-
-	/* compression stuff */
-	is->link_compressor   = is->compressor = NULL;
-	is->link_decompressor = is->decompressor = NULL;
-	is->link_comp_stat    = is->comp_stat = NULL;
-	is->link_decomp_stat  = is->decomp_stat = NULL;
-	is->compflags = 0;
-
-	is->reset = isdn_ppp_ccp_reset_alloc(is);
-	if (!is->reset)
-		return -ENOMEM;
-
-	is->lp = NULL;
-	is->mp_seqno = 0;       /* MP sequence number */
-	is->pppcfg = 0;         /* ppp configuration */
-	is->mpppcfg = 0;        /* mppp configuration */
-	is->last_link_seqno = -1;	/* MP: maybe set to Bundle-MIN, when joining a bundle ?? */
-	is->unit = -1;          /* set, when we have our interface */
-	is->mru = 1524;         /* MRU, default 1524 */
-	is->maxcid = 16;        /* VJ: maxcid */
-	is->tk = current;
-	init_waitqueue_head(&is->wq);
-	is->first = is->rq + NUM_RCV_BUFFS - 1;	/* receive queue */
-	is->last = is->rq;
-	is->minor = min;
-#ifdef CONFIG_ISDN_PPP_VJ
-	/*
-	 * VJ header compression init
-	 */
-	is->slcomp = slhc_init(16, 16);	/* not necessary for 2. link in bundle */
-	if (IS_ERR(is->slcomp)) {
-		isdn_ppp_ccp_reset_free(is);
-		return PTR_ERR(is->slcomp);
-	}
-#endif
-#ifdef CONFIG_IPPP_FILTER
-	is->pass_filter = NULL;
-	is->active_filter = NULL;
-#endif
-	is->state = IPPP_OPEN;
-
-	return 0;
-}
-
-/*
- * release ippp device
- */
-void
-isdn_ppp_release(int min, struct file *file)
-{
-	int i;
-	struct ippp_struct *is;
-
-	if (min < 0 || min >= ISDN_MAX_CHANNELS)
-		return;
-	is = file->private_data;
-
-	if (!is) {
-		printk(KERN_ERR "%s: no file->private_data\n", __func__);
-		return;
-	}
-	if (is->debug & 0x1)
-		printk(KERN_DEBUG "ippp: release, minor: %d %lx\n", min, (long) is->lp);
-
-	if (is->lp) {           /* a lp address says: this link is still up */
-		isdn_net_dev *p = is->lp->netdev;
-
-		if (!p) {
-			printk(KERN_ERR "%s: no lp->netdev\n", __func__);
-			return;
-		}
-		is->state &= ~IPPP_CONNECT;	/* -> effect: no call of wakeup */
-		/*
-		 * isdn_net_hangup() calls isdn_ppp_free()
-		 * isdn_ppp_free() sets is->lp to NULL and lp->ppp_slot to -1
-		 * removing the IPPP_CONNECT flag omits calling of isdn_ppp_wakeup_daemon()
-		 */
-		isdn_net_hangup(p->dev);
-	}
-	for (i = 0; i < NUM_RCV_BUFFS; i++) {
-		kfree(is->rq[i].buf);
-		is->rq[i].buf = NULL;
-	}
-	is->first = is->rq + NUM_RCV_BUFFS - 1;	/* receive queue */
-	is->last = is->rq;
-
-#ifdef CONFIG_ISDN_PPP_VJ
-/* TODO: if this was the previous master: link the slcomp to the new master */
-	slhc_free(is->slcomp);
-	is->slcomp = NULL;
-#endif
-#ifdef CONFIG_IPPP_FILTER
-	if (is->pass_filter) {
-		bpf_prog_destroy(is->pass_filter);
-		is->pass_filter = NULL;
-	}
-
-	if (is->active_filter) {
-		bpf_prog_destroy(is->active_filter);
-		is->active_filter = NULL;
-	}
-#endif
-
-/* TODO: if this was the previous master: link the stuff to the new master */
-	if (is->comp_stat)
-		is->compressor->free(is->comp_stat);
-	if (is->link_comp_stat)
-		is->link_compressor->free(is->link_comp_stat);
-	if (is->link_decomp_stat)
-		is->link_decompressor->free(is->link_decomp_stat);
-	if (is->decomp_stat)
-		is->decompressor->free(is->decomp_stat);
-	is->compressor   = is->link_compressor   = NULL;
-	is->decompressor = is->link_decompressor = NULL;
-	is->comp_stat    = is->link_comp_stat    = NULL;
-	is->decomp_stat  = is->link_decomp_stat  = NULL;
-
-	/* Clean up if necessary */
-	if (is->reset)
-		isdn_ppp_ccp_reset_free(is);
-
-	/* this slot is ready for new connections */
-	is->state = 0;
-}
-
-/*
- * get_arg .. ioctl helper
- */
-static int
-get_arg(void __user *b, void *val, int len)
-{
-	if (len <= 0)
-		len = sizeof(void *);
-	if (copy_from_user(val, b, len))
-		return -EFAULT;
-	return 0;
-}
-
-/*
- * set arg .. ioctl helper
- */
-static int
-set_arg(void __user *b, void *val, int len)
-{
-	if (len <= 0)
-		len = sizeof(void *);
-	if (copy_to_user(b, val, len))
-		return -EFAULT;
-	return 0;
-}
-
-#ifdef CONFIG_IPPP_FILTER
-static int get_filter(void __user *arg, struct sock_filter **p)
-{
-	struct sock_fprog uprog;
-	struct sock_filter *code = NULL;
-	int len;
-
-	if (copy_from_user(&uprog, arg, sizeof(uprog)))
-		return -EFAULT;
-
-	if (!uprog.len) {
-		*p = NULL;
-		return 0;
-	}
-
-	/* uprog.len is unsigned short, so no overflow here */
-	len = uprog.len * sizeof(struct sock_filter);
-	code = memdup_user(uprog.filter, len);
-	if (IS_ERR(code))
-		return PTR_ERR(code);
-
-	*p = code;
-	return uprog.len;
-}
-#endif /* CONFIG_IPPP_FILTER */
-
-/*
- * ippp device ioctl
- */
-int
-isdn_ppp_ioctl(int min, struct file *file, unsigned int cmd, unsigned long arg)
-{
-	unsigned long val;
-	int r, i, j;
-	struct ippp_struct *is;
-	isdn_net_local *lp;
-	struct isdn_ppp_comp_data data;
-	void __user *argp = (void __user *)arg;
-
-	is = file->private_data;
-	lp = is->lp;
-
-	if (is->debug & 0x1)
-		printk(KERN_DEBUG "isdn_ppp_ioctl: minor: %d cmd: %x state: %x\n", min, cmd, is->state);
-
-	if (!(is->state & IPPP_OPEN))
-		return -EINVAL;
-
-	switch (cmd) {
-	case PPPIOCBUNDLE:
-#ifdef CONFIG_ISDN_MPP
-		if (!(is->state & IPPP_CONNECT))
-			return -EINVAL;
-		if ((r = get_arg(argp, &val, sizeof(val))))
-			return r;
-		printk(KERN_DEBUG "iPPP-bundle: minor: %d, slave unit: %d, master unit: %d\n",
-		       (int) min, (int) is->unit, (int) val);
-		return isdn_ppp_bundle(is, val);
-#else
-		return -1;
-#endif
-		break;
-	case PPPIOCGUNIT:	/* get ppp/isdn unit number */
-		if ((r = set_arg(argp, &is->unit, sizeof(is->unit))))
-			return r;
-		break;
-	case PPPIOCGIFNAME:
-		if (!lp)
-			return -EINVAL;
-		if ((r = set_arg(argp, lp->netdev->dev->name,
-				 strlen(lp->netdev->dev->name))))
-			return r;
-		break;
-	case PPPIOCGMPFLAGS:	/* get configuration flags */
-		if ((r = set_arg(argp, &is->mpppcfg, sizeof(is->mpppcfg))))
-			return r;
-		break;
-	case PPPIOCSMPFLAGS:	/* set configuration flags */
-		if ((r = get_arg(argp, &val, sizeof(val))))
-			return r;
-		is->mpppcfg = val;
-		break;
-	case PPPIOCGFLAGS:	/* get configuration flags */
-		if ((r = set_arg(argp, &is->pppcfg, sizeof(is->pppcfg))))
-			return r;
-		break;
-	case PPPIOCSFLAGS:	/* set configuration flags */
-		if ((r = get_arg(argp, &val, sizeof(val)))) {
-			return r;
-		}
-		if (val & SC_ENABLE_IP && !(is->pppcfg & SC_ENABLE_IP) && (is->state & IPPP_CONNECT)) {
-			if (lp) {
-				/* OK .. we are ready to send buffers */
-				is->pppcfg = val; /* isdn_ppp_xmit test for SC_ENABLE_IP !!! */
-				netif_wake_queue(lp->netdev->dev);
-				break;
-			}
-		}
-		is->pppcfg = val;
-		break;
-	case PPPIOCGIDLE:	/* get idle time information */
-		if (lp) {
-			struct ppp_idle pidle;
-			pidle.xmit_idle = pidle.recv_idle = lp->huptimer;
-			if ((r = set_arg(argp, &pidle, sizeof(struct ppp_idle))))
-				return r;
-		}
-		break;
-	case PPPIOCSMRU:	/* set receive unit size for PPP */
-		if ((r = get_arg(argp, &val, sizeof(val))))
-			return r;
-		is->mru = val;
-		break;
-	case PPPIOCSMPMRU:
-		break;
-	case PPPIOCSMPMTU:
-		break;
-	case PPPIOCSMAXCID:	/* set the maximum compression slot id */
-		if ((r = get_arg(argp, &val, sizeof(val))))
-			return r;
-		val++;
-		if (is->maxcid != val) {
-#ifdef CONFIG_ISDN_PPP_VJ
-			struct slcompress *sltmp;
-#endif
-			if (is->debug & 0x1)
-				printk(KERN_DEBUG "ippp, ioctl: changed MAXCID to %ld\n", val);
-			is->maxcid = val;
-#ifdef CONFIG_ISDN_PPP_VJ
-			sltmp = slhc_init(16, val);
-			if (IS_ERR(sltmp))
-				return PTR_ERR(sltmp);
-			if (is->slcomp)
-				slhc_free(is->slcomp);
-			is->slcomp = sltmp;
-#endif
-		}
-		break;
-	case PPPIOCGDEBUG:
-		if ((r = set_arg(argp, &is->debug, sizeof(is->debug))))
-			return r;
-		break;
-	case PPPIOCSDEBUG:
-		if ((r = get_arg(argp, &val, sizeof(val))))
-			return r;
-		is->debug = val;
-		break;
-	case PPPIOCGCOMPRESSORS:
-	{
-		unsigned long protos[8] = {0,};
-		struct isdn_ppp_compressor *ipc = ipc_head;
-		while (ipc) {
-			j = ipc->num / (sizeof(long) * 8);
-			i = ipc->num % (sizeof(long) * 8);
-			if (j < 8)
-				protos[j] |= (1UL << i);
-			ipc = ipc->next;
-		}
-		if ((r = set_arg(argp, protos, 8 * sizeof(long))))
-			return r;
-	}
-	break;
-	case PPPIOCSCOMPRESSOR:
-		if ((r = get_arg(argp, &data, sizeof(struct isdn_ppp_comp_data))))
-			return r;
-		return isdn_ppp_set_compressor(is, &data);
-	case PPPIOCGCALLINFO:
-	{
-		struct pppcallinfo pci;
-		memset((char *)&pci, 0, sizeof(struct pppcallinfo));
-		if (lp)
-		{
-			strncpy(pci.local_num, lp->msn, 63);
-			if (lp->dial) {
-				strncpy(pci.remote_num, lp->dial->num, 63);
-			}
-			pci.charge_units = lp->charge;
-			if (lp->outgoing)
-				pci.calltype = CALLTYPE_OUTGOING;
-			else
-				pci.calltype = CALLTYPE_INCOMING;
-			if (lp->flags & ISDN_NET_CALLBACK)
-				pci.calltype |= CALLTYPE_CALLBACK;
-		}
-		return set_arg(argp, &pci, sizeof(struct pppcallinfo));
-	}
-#ifdef CONFIG_IPPP_FILTER
-	case PPPIOCSPASS:
-	{
-		struct sock_fprog_kern fprog;
-		struct sock_filter *code;
-		int err, len = get_filter(argp, &code);
-
-		if (len < 0)
-			return len;
-
-		fprog.len = len;
-		fprog.filter = code;
-
-		if (is->pass_filter) {
-			bpf_prog_destroy(is->pass_filter);
-			is->pass_filter = NULL;
-		}
-		if (fprog.filter != NULL)
-			err = bpf_prog_create(&is->pass_filter, &fprog);
-		else
-			err = 0;
-		kfree(code);
-
-		return err;
-	}
-	case PPPIOCSACTIVE:
-	{
-		struct sock_fprog_kern fprog;
-		struct sock_filter *code;
-		int err, len = get_filter(argp, &code);
-
-		if (len < 0)
-			return len;
-
-		fprog.len = len;
-		fprog.filter = code;
-
-		if (is->active_filter) {
-			bpf_prog_destroy(is->active_filter);
-			is->active_filter = NULL;
-		}
-		if (fprog.filter != NULL)
-			err = bpf_prog_create(&is->active_filter, &fprog);
-		else
-			err = 0;
-		kfree(code);
-
-		return err;
-	}
-#endif /* CONFIG_IPPP_FILTER */
-	default:
-		break;
-	}
-	return 0;
-}
-
-__poll_t
-isdn_ppp_poll(struct file *file, poll_table *wait)
-{
-	__poll_t mask;
-	struct ippp_buf_queue *bf, *bl;
-	u_long flags;
-	struct ippp_struct *is;
-
-	is = file->private_data;
-
-	if (is->debug & 0x2)
-		printk(KERN_DEBUG "isdn_ppp_poll: minor: %d\n",
-		       iminor(file_inode(file)));
-
-	/* just registers wait_queue hook. This doesn't really wait. */
-	poll_wait(file, &is->wq, wait);
-
-	if (!(is->state & IPPP_OPEN)) {
-		if (is->state == IPPP_CLOSEWAIT)
-			return EPOLLHUP;
-		printk(KERN_DEBUG "isdn_ppp: device not open\n");
-		return EPOLLERR;
-	}
-	/* we're always ready to send .. */
-	mask = EPOLLOUT | EPOLLWRNORM;
-
-	spin_lock_irqsave(&is->buflock, flags);
-	bl = is->last;
-	bf = is->first;
-	/*
-	 * if IPPP_NOBLOCK is set we return even if we have nothing to read
-	 */
-	if (bf->next != bl || (is->state & IPPP_NOBLOCK)) {
-		is->state &= ~IPPP_NOBLOCK;
-		mask |= EPOLLIN | EPOLLRDNORM;
-	}
-	spin_unlock_irqrestore(&is->buflock, flags);
-	return mask;
-}
-
-/*
- *  fill up isdn_ppp_read() queue ..
- */
-
-static int
-isdn_ppp_fill_rq(unsigned char *buf, int len, int proto, int slot)
-{
-	struct ippp_buf_queue *bf, *bl;
-	u_long flags;
-	u_char *nbuf;
-	struct ippp_struct *is;
-
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_WARNING "ippp: illegal slot(%d).\n", slot);
-		return 0;
-	}
-	is = ippp_table[slot];
-
-	if (!(is->state & IPPP_CONNECT)) {
-		printk(KERN_DEBUG "ippp: device not activated.\n");
-		return 0;
-	}
-	nbuf = kmalloc(len + 4, GFP_ATOMIC);
-	if (!nbuf) {
-		printk(KERN_WARNING "ippp: Can't alloc buf\n");
-		return 0;
-	}
-	nbuf[0] = PPP_ALLSTATIONS;
-	nbuf[1] = PPP_UI;
-	nbuf[2] = proto >> 8;
-	nbuf[3] = proto & 0xff;
-	memcpy(nbuf + 4, buf, len);
-
-	spin_lock_irqsave(&is->buflock, flags);
-	bf = is->first;
-	bl = is->last;
-
-	if (bf == bl) {
-		printk(KERN_WARNING "ippp: Queue is full; discarding first buffer\n");
-		bf = bf->next;
-		kfree(bf->buf);
-		is->first = bf;
-	}
-	bl->buf = (char *) nbuf;
-	bl->len = len + 4;
-
-	is->last = bl->next;
-	spin_unlock_irqrestore(&is->buflock, flags);
-	wake_up_interruptible(&is->wq);
-	return len;
-}
-
-/*
- * read() .. non-blocking: ipppd calls it only after select()
- *           reports, that there is data
- */
-
-int
-isdn_ppp_read(int min, struct file *file, char __user *buf, int count)
-{
-	struct ippp_struct *is;
-	struct ippp_buf_queue *b;
-	u_long flags;
-	u_char *save_buf;
-
-	is = file->private_data;
-
-	if (!(is->state & IPPP_OPEN))
-		return 0;
-
-	spin_lock_irqsave(&is->buflock, flags);
-	b = is->first->next;
-	save_buf = b->buf;
-	if (!save_buf) {
-		spin_unlock_irqrestore(&is->buflock, flags);
-		return -EAGAIN;
-	}
-	if (b->len < count)
-		count = b->len;
-	b->buf = NULL;
-	is->first = b;
-
-	spin_unlock_irqrestore(&is->buflock, flags);
-	if (copy_to_user(buf, save_buf, count))
-		count = -EFAULT;
-	kfree(save_buf);
-
-	return count;
-}
-
-/*
- * ipppd wanna write a packet to the card .. non-blocking
- */
-
-int
-isdn_ppp_write(int min, struct file *file, const char __user *buf, int count)
-{
-	isdn_net_local *lp;
-	struct ippp_struct *is;
-	int proto;
-
-	is = file->private_data;
-
-	if (!(is->state & IPPP_CONNECT))
-		return 0;
-
-	lp = is->lp;
-
-	/* -> push it directly to the lowlevel interface */
-
-	if (!lp)
-		printk(KERN_DEBUG "isdn_ppp_write: lp == NULL\n");
-	else {
-		if (lp->isdn_device < 0 || lp->isdn_channel < 0) {
-			unsigned char protobuf[4];
-			/*
-			 * Don't reset huptimer for
-			 * LCP packets. (Echo requests).
-			 */
-			if (copy_from_user(protobuf, buf, 4))
-				return -EFAULT;
-
-			proto = PPP_PROTOCOL(protobuf);
-			if (proto != PPP_LCP)
-				lp->huptimer = 0;
-
-			return 0;
-		}
-
-		if ((dev->drv[lp->isdn_device]->flags & DRV_FLAG_RUNNING) &&
-		    lp->dialstate == 0 &&
-		    (lp->flags & ISDN_NET_CONNECTED)) {
-			unsigned short hl;
-			struct sk_buff *skb;
-			unsigned char *cpy_buf;
-			/*
-			 * we need to reserve enough space in front of
-			 * sk_buff. old call to dev_alloc_skb only reserved
-			 * 16 bytes, now we are looking what the driver want
-			 */
-			hl = dev->drv[lp->isdn_device]->interface->hl_hdrlen;
-			skb = alloc_skb(hl + count, GFP_ATOMIC);
-			if (!skb) {
-				printk(KERN_WARNING "isdn_ppp_write: out of memory!\n");
-				return count;
-			}
-			skb_reserve(skb, hl);
-			cpy_buf = skb_put(skb, count);
-			if (copy_from_user(cpy_buf, buf, count))
-			{
-				kfree_skb(skb);
-				return -EFAULT;
-			}
-
-			/*
-			 * Don't reset huptimer for
-			 * LCP packets. (Echo requests).
-			 */
-			proto = PPP_PROTOCOL(cpy_buf);
-			if (proto != PPP_LCP)
-				lp->huptimer = 0;
-
-			if (is->debug & 0x40) {
-				printk(KERN_DEBUG "ppp xmit: len %d\n", (int) skb->len);
-				isdn_ppp_frame_log("xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot);
-			}
-
-			isdn_ppp_send_ccp(lp->netdev, lp, skb); /* keeps CCP/compression states in sync */
-
-			isdn_net_write_super(lp, skb);
-		}
-	}
-	return count;
-}
-
-/*
- * init memory, structures etc.
- */
-
-int
-isdn_ppp_init(void)
-{
-	int i,
-		j;
-
-#ifdef CONFIG_ISDN_MPP
-	if (isdn_ppp_mp_bundle_array_init() < 0)
-		return -ENOMEM;
-#endif /* CONFIG_ISDN_MPP */
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		if (!(ippp_table[i] = kzalloc(sizeof(struct ippp_struct), GFP_KERNEL))) {
-			printk(KERN_WARNING "isdn_ppp_init: Could not alloc ippp_table\n");
-			for (j = 0; j < i; j++)
-				kfree(ippp_table[j]);
-			return -1;
-		}
-		spin_lock_init(&ippp_table[i]->buflock);
-		ippp_table[i]->state = 0;
-		ippp_table[i]->first = ippp_table[i]->rq + NUM_RCV_BUFFS - 1;
-		ippp_table[i]->last = ippp_table[i]->rq;
-
-		for (j = 0; j < NUM_RCV_BUFFS; j++) {
-			ippp_table[i]->rq[j].buf = NULL;
-			ippp_table[i]->rq[j].last = ippp_table[i]->rq +
-				(NUM_RCV_BUFFS + j - 1) % NUM_RCV_BUFFS;
-			ippp_table[i]->rq[j].next = ippp_table[i]->rq + (j + 1) % NUM_RCV_BUFFS;
-		}
-	}
-	return 0;
-}
-
-void
-isdn_ppp_cleanup(void)
-{
-	int i;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		kfree(ippp_table[i]);
-
-#ifdef CONFIG_ISDN_MPP
-	kfree(isdn_ppp_bundle_arr);
-#endif /* CONFIG_ISDN_MPP */
-
-}
-
-/*
- * check for address/control field and skip if allowed
- * retval != 0 -> discard packet silently
- */
-static int isdn_ppp_skip_ac(struct ippp_struct *is, struct sk_buff *skb)
-{
-	if (skb->len < 1)
-		return -1;
-
-	if (skb->data[0] == 0xff) {
-		if (skb->len < 2)
-			return -1;
-
-		if (skb->data[1] != 0x03)
-			return -1;
-
-		// skip address/control (AC) field
-		skb_pull(skb, 2);
-	} else {
-		if (is->pppcfg & SC_REJ_COMP_AC)
-			// if AC compression was not negotiated, but used, discard packet
-			return -1;
-	}
-	return 0;
-}
-
-/*
- * get the PPP protocol header and pull skb
- * retval < 0 -> discard packet silently
- */
-static int isdn_ppp_strip_proto(struct sk_buff *skb)
-{
-	int proto;
-
-	if (skb->len < 1)
-		return -1;
-
-	if (skb->data[0] & 0x1) {
-		// protocol field is compressed
-		proto = skb->data[0];
-		skb_pull(skb, 1);
-	} else {
-		if (skb->len < 2)
-			return -1;
-		proto = ((int) skb->data[0] << 8) + skb->data[1];
-		skb_pull(skb, 2);
-	}
-	return proto;
-}
-
-
-/*
- * handler for incoming packets on a syncPPP interface
- */
-void isdn_ppp_receive(isdn_net_dev *net_dev, isdn_net_local *lp, struct sk_buff *skb)
-{
-	struct ippp_struct *is;
-	int slot;
-	int proto;
-
-	BUG_ON(net_dev->local->master); // we're called with the master device always
-
-	slot = lp->ppp_slot;
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "isdn_ppp_receive: lp->ppp_slot(%d)\n",
-		       lp->ppp_slot);
-		kfree_skb(skb);
-		return;
-	}
-	is = ippp_table[slot];
-
-	if (is->debug & 0x4) {
-		printk(KERN_DEBUG "ippp_receive: is:%08lx lp:%08lx slot:%d unit:%d len:%d\n",
-		       (long)is, (long)lp, lp->ppp_slot, is->unit, (int)skb->len);
-		isdn_ppp_frame_log("receive", skb->data, skb->len, 32, is->unit, lp->ppp_slot);
-	}
-
-	if (isdn_ppp_skip_ac(is, skb) < 0) {
-		kfree_skb(skb);
-		return;
-	}
-	proto = isdn_ppp_strip_proto(skb);
-	if (proto < 0) {
-		kfree_skb(skb);
-		return;
-	}
-
-#ifdef CONFIG_ISDN_MPP
-	if (is->compflags & SC_LINK_DECOMP_ON) {
-		skb = isdn_ppp_decompress(skb, is, NULL, &proto);
-		if (!skb) // decompression error
-			return;
-	}
-
-	if (!(is->mpppcfg & SC_REJ_MP_PROT)) { // we agreed to receive MPPP
-		if (proto == PPP_MP) {
-			isdn_ppp_mp_receive(net_dev, lp, skb);
-			return;
-		}
-	}
-#endif
-	isdn_ppp_push_higher(net_dev, lp, skb, proto);
-}
-
-/*
- * we receive a reassembled frame, MPPP has been taken care of before.
- * address/control and protocol have been stripped from the skb
- * note: net_dev has to be master net_dev
- */
-static void
-isdn_ppp_push_higher(isdn_net_dev *net_dev, isdn_net_local *lp, struct sk_buff *skb, int proto)
-{
-	struct net_device *dev = net_dev->dev;
-	struct ippp_struct *is, *mis;
-	isdn_net_local *mlp = NULL;
-	int slot;
-
-	slot = lp->ppp_slot;
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "isdn_ppp_push_higher: lp->ppp_slot(%d)\n",
-		       lp->ppp_slot);
-		goto drop_packet;
-	}
-	is = ippp_table[slot];
-
-	if (lp->master) { // FIXME?
-		mlp = ISDN_MASTER_PRIV(lp);
-		slot = mlp->ppp_slot;
-		if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-			printk(KERN_ERR "isdn_ppp_push_higher: master->ppp_slot(%d)\n",
-			       lp->ppp_slot);
-			goto drop_packet;
-		}
-	}
-	mis = ippp_table[slot];
-
-	if (is->debug & 0x10) {
-		printk(KERN_DEBUG "push, skb %d %04x\n", (int) skb->len, proto);
-		isdn_ppp_frame_log("rpush", skb->data, skb->len, 32, is->unit, lp->ppp_slot);
-	}
-	if (mis->compflags & SC_DECOMP_ON) {
-		skb = isdn_ppp_decompress(skb, is, mis, &proto);
-		if (!skb) // decompression error
-			return;
-	}
-	switch (proto) {
-	case PPP_IPX:  /* untested */
-		if (is->debug & 0x20)
-			printk(KERN_DEBUG "isdn_ppp: IPX\n");
-		skb->protocol = htons(ETH_P_IPX);
-		break;
-	case PPP_IP:
-		if (is->debug & 0x20)
-			printk(KERN_DEBUG "isdn_ppp: IP\n");
-		skb->protocol = htons(ETH_P_IP);
-		break;
-	case PPP_COMP:
-	case PPP_COMPFRAG:
-		printk(KERN_INFO "isdn_ppp: unexpected compressed frame dropped\n");
-		goto drop_packet;
-#ifdef CONFIG_ISDN_PPP_VJ
-	case PPP_VJC_UNCOMP:
-		if (is->debug & 0x20)
-			printk(KERN_DEBUG "isdn_ppp: VJC_UNCOMP\n");
-		if (net_dev->local->ppp_slot < 0) {
-			printk(KERN_ERR "%s: net_dev->local->ppp_slot(%d) out of range\n",
-			       __func__, net_dev->local->ppp_slot);
-			goto drop_packet;
-		}
-		if (slhc_remember(ippp_table[net_dev->local->ppp_slot]->slcomp, skb->data, skb->len) <= 0) {
-			printk(KERN_WARNING "isdn_ppp: received illegal VJC_UNCOMP frame!\n");
-			goto drop_packet;
-		}
-		skb->protocol = htons(ETH_P_IP);
-		break;
-	case PPP_VJC_COMP:
-		if (is->debug & 0x20)
-			printk(KERN_DEBUG "isdn_ppp: VJC_COMP\n");
-		{
-			struct sk_buff *skb_old = skb;
-			int pkt_len;
-			skb = dev_alloc_skb(skb_old->len + 128);
-
-			if (!skb) {
-				printk(KERN_WARNING "%s: Memory squeeze, dropping packet.\n", dev->name);
-				skb = skb_old;
-				goto drop_packet;
-			}
-			skb_put(skb, skb_old->len + 128);
-			skb_copy_from_linear_data(skb_old, skb->data,
-						  skb_old->len);
-			if (net_dev->local->ppp_slot < 0) {
-				printk(KERN_ERR "%s: net_dev->local->ppp_slot(%d) out of range\n",
-				       __func__, net_dev->local->ppp_slot);
-				goto drop_packet;
-			}
-			pkt_len = slhc_uncompress(ippp_table[net_dev->local->ppp_slot]->slcomp,
-						  skb->data, skb_old->len);
-			kfree_skb(skb_old);
-			if (pkt_len < 0)
-				goto drop_packet;
-
-			skb_trim(skb, pkt_len);
-			skb->protocol = htons(ETH_P_IP);
-		}
-		break;
-#endif
-	case PPP_CCP:
-	case PPP_CCPFRAG:
-		isdn_ppp_receive_ccp(net_dev, lp, skb, proto);
-		/* Dont pop up ResetReq/Ack stuff to the daemon any
-		   longer - the job is done already */
-		if (skb->data[0] == CCP_RESETREQ ||
-		    skb->data[0] == CCP_RESETACK)
-			break;
-		/* fall through */
-	default:
-		isdn_ppp_fill_rq(skb->data, skb->len, proto, lp->ppp_slot);	/* push data to pppd device */
-		kfree_skb(skb);
-		return;
-	}
-
-#ifdef CONFIG_IPPP_FILTER
-	/* check if the packet passes the pass and active filters
-	 * the filter instructions are constructed assuming
-	 * a four-byte PPP header on each packet (which is still present) */
-	skb_push(skb, 4);
-
-	{
-		u_int16_t *p = (u_int16_t *) skb->data;
-
-		*p = 0;	/* indicate inbound */
-	}
-
-	if (is->pass_filter
-	    && BPF_PROG_RUN(is->pass_filter, skb) == 0) {
-		if (is->debug & 0x2)
-			printk(KERN_DEBUG "IPPP: inbound frame filtered.\n");
-		kfree_skb(skb);
-		return;
-	}
-	if (!(is->active_filter
-	      && BPF_PROG_RUN(is->active_filter, skb) == 0)) {
-		if (is->debug & 0x2)
-			printk(KERN_DEBUG "IPPP: link-active filter: resetting huptimer.\n");
-		lp->huptimer = 0;
-		if (mlp)
-			mlp->huptimer = 0;
-	}
-	skb_pull(skb, 4);
-#else /* CONFIG_IPPP_FILTER */
-	lp->huptimer = 0;
-	if (mlp)
-		mlp->huptimer = 0;
-#endif /* CONFIG_IPPP_FILTER */
-	skb->dev = dev;
-	skb_reset_mac_header(skb);
-	netif_rx(skb);
-	/* net_dev->local->stats.rx_packets++; done in isdn_net.c */
-	return;
-
-drop_packet:
-	net_dev->local->stats.rx_dropped++;
-	kfree_skb(skb);
-}
-
-/*
- * isdn_ppp_skb_push ..
- * checks whether we have enough space at the beginning of the skb
- * and allocs a new SKB if necessary
- */
-static unsigned char *isdn_ppp_skb_push(struct sk_buff **skb_p, int len)
-{
-	struct sk_buff *skb = *skb_p;
-
-	if (skb_headroom(skb) < len) {
-		struct sk_buff *nskb = skb_realloc_headroom(skb, len);
-
-		if (!nskb) {
-			printk(KERN_ERR "isdn_ppp_skb_push: can't realloc headroom!\n");
-			dev_kfree_skb(skb);
-			return NULL;
-		}
-		printk(KERN_DEBUG "isdn_ppp_skb_push:under %d %d\n", skb_headroom(skb), len);
-		dev_kfree_skb(skb);
-		*skb_p = nskb;
-		return skb_push(nskb, len);
-	}
-	return skb_push(skb, len);
-}
-
-/*
- * send ppp frame .. we expect a PIDCOMPressable proto --
- *  (here: currently always PPP_IP,PPP_VJC_COMP,PPP_VJC_UNCOMP)
- *
- * VJ compression may change skb pointer!!! .. requeue with old
- * skb isn't allowed!!
- */
-
-int
-isdn_ppp_xmit(struct sk_buff *skb, struct net_device *netdev)
-{
-	isdn_net_local *lp, *mlp;
-	isdn_net_dev *nd;
-	unsigned int proto = PPP_IP;     /* 0x21 */
-	struct ippp_struct *ipt, *ipts;
-	int slot, retval = NETDEV_TX_OK;
-
-	mlp = netdev_priv(netdev);
-	nd = mlp->netdev;       /* get master lp */
-
-	slot = mlp->ppp_slot;
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "isdn_ppp_xmit: lp->ppp_slot(%d)\n",
-		       mlp->ppp_slot);
-		kfree_skb(skb);
-		goto out;
-	}
-	ipts = ippp_table[slot];
-
-	if (!(ipts->pppcfg & SC_ENABLE_IP)) {	/* PPP connected ? */
-		if (ipts->debug & 0x1)
-			printk(KERN_INFO "%s: IP frame delayed.\n", netdev->name);
-		retval = NETDEV_TX_BUSY;
-		goto out;
-	}
-
-	switch (ntohs(skb->protocol)) {
-	case ETH_P_IP:
-		proto = PPP_IP;
-		break;
-	case ETH_P_IPX:
-		proto = PPP_IPX;	/* untested */
-		break;
-	default:
-		printk(KERN_ERR "isdn_ppp: skipped unsupported protocol: %#x.\n",
-		       skb->protocol);
-		dev_kfree_skb(skb);
-		goto out;
-	}
-
-	lp = isdn_net_get_locked_lp(nd);
-	if (!lp) {
-		printk(KERN_WARNING "%s: all channels busy - requeuing!\n", netdev->name);
-		retval = NETDEV_TX_BUSY;
-		goto out;
-	}
-	/* we have our lp locked from now on */
-
-	slot = lp->ppp_slot;
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "isdn_ppp_xmit: lp->ppp_slot(%d)\n",
-		       lp->ppp_slot);
-		kfree_skb(skb);
-		goto unlock;
-	}
-	ipt = ippp_table[slot];
-
-	/*
-	 * after this line .. requeueing in the device queue is no longer allowed!!!
-	 */
-
-	/* Pull off the fake header we stuck on earlier to keep
-	 * the fragmentation code happy.
-	 */
-	skb_pull(skb, IPPP_MAX_HEADER);
-
-#ifdef CONFIG_IPPP_FILTER
-	/* check if we should pass this packet
-	 * the filter instructions are constructed assuming
-	 * a four-byte PPP header on each packet */
-	*(u8 *)skb_push(skb, 4) = 1; /* indicate outbound */
-
-	{
-		__be16 *p = (__be16 *)skb->data;
-
-		p++;
-		*p = htons(proto);
-	}
-
-	if (ipt->pass_filter
-	    && BPF_PROG_RUN(ipt->pass_filter, skb) == 0) {
-		if (ipt->debug & 0x4)
-			printk(KERN_DEBUG "IPPP: outbound frame filtered.\n");
-		kfree_skb(skb);
-		goto unlock;
-	}
-	if (!(ipt->active_filter
-	      && BPF_PROG_RUN(ipt->active_filter, skb) == 0)) {
-		if (ipt->debug & 0x4)
-			printk(KERN_DEBUG "IPPP: link-active filter: resetting huptimer.\n");
-		lp->huptimer = 0;
-	}
-	skb_pull(skb, 4);
-#else /* CONFIG_IPPP_FILTER */
-	lp->huptimer = 0;
-#endif /* CONFIG_IPPP_FILTER */
-
-	if (ipt->debug & 0x4)
-		printk(KERN_DEBUG "xmit skb, len %d\n", (int) skb->len);
-	if (ipts->debug & 0x40)
-		isdn_ppp_frame_log("xmit0", skb->data, skb->len, 32, ipts->unit, lp->ppp_slot);
-
-#ifdef CONFIG_ISDN_PPP_VJ
-	if (proto == PPP_IP && ipts->pppcfg & SC_COMP_TCP) {	/* ipts here? probably yes, but check this again */
-		struct sk_buff *new_skb;
-		unsigned short hl;
-		/*
-		 * we need to reserve enough space in front of
-		 * sk_buff. old call to dev_alloc_skb only reserved
-		 * 16 bytes, now we are looking what the driver want.
-		 */
-		hl = dev->drv[lp->isdn_device]->interface->hl_hdrlen + IPPP_MAX_HEADER;
-		/*
-		 * Note: hl might still be insufficient because the method
-		 * above does not account for a possibible MPPP slave channel
-		 * which had larger HL header space requirements than the
-		 * master.
-		 */
-		new_skb = alloc_skb(hl + skb->len, GFP_ATOMIC);
-		if (new_skb) {
-			u_char *buf;
-			int pktlen;
-
-			skb_reserve(new_skb, hl);
-			new_skb->dev = skb->dev;
-			skb_put(new_skb, skb->len);
-			buf = skb->data;
-
-			pktlen = slhc_compress(ipts->slcomp, skb->data, skb->len, new_skb->data,
-					       &buf, !(ipts->pppcfg & SC_NO_TCP_CCID));
-
-			if (buf != skb->data) {
-				if (new_skb->data != buf)
-					printk(KERN_ERR "isdn_ppp: FATAL error after slhc_compress!!\n");
-				dev_kfree_skb(skb);
-				skb = new_skb;
-			} else {
-				dev_kfree_skb(new_skb);
-			}
-
-			skb_trim(skb, pktlen);
-			if (skb->data[0] & SL_TYPE_COMPRESSED_TCP) {	/* cslip? style -> PPP */
-				proto = PPP_VJC_COMP;
-				skb->data[0] ^= SL_TYPE_COMPRESSED_TCP;
-			} else {
-				if (skb->data[0] >= SL_TYPE_UNCOMPRESSED_TCP)
-					proto = PPP_VJC_UNCOMP;
-				skb->data[0] = (skb->data[0] & 0x0f) | 0x40;
-			}
-		}
-	}
-#endif
-
-	/*
-	 * normal (single link) or bundle compression
-	 */
-	if (ipts->compflags & SC_COMP_ON) {
-		/* We send compressed only if both down- und upstream
-		   compression is negotiated, that means, CCP is up */
-		if (ipts->compflags & SC_DECOMP_ON) {
-			skb = isdn_ppp_compress(skb, &proto, ipt, ipts, 0);
-		} else {
-			printk(KERN_DEBUG "isdn_ppp: CCP not yet up - sending as-is\n");
-		}
-	}
-
-	if (ipt->debug & 0x24)
-		printk(KERN_DEBUG "xmit2 skb, len %d, proto %04x\n", (int) skb->len, proto);
-
-#ifdef CONFIG_ISDN_MPP
-	if (ipt->mpppcfg & SC_MP_PROT) {
-		/* we get mp_seqno from static isdn_net_local */
-		long mp_seqno = ipts->mp_seqno;
-		ipts->mp_seqno++;
-		if (ipt->mpppcfg & SC_OUT_SHORT_SEQ) {
-			unsigned char *data = isdn_ppp_skb_push(&skb, 3);
-			if (!data)
-				goto unlock;
-			mp_seqno &= 0xfff;
-			data[0] = MP_BEGIN_FRAG | MP_END_FRAG | ((mp_seqno >> 8) & 0xf);	/* (B)egin & (E)ndbit .. */
-			data[1] = mp_seqno & 0xff;
-			data[2] = proto;	/* PID compression */
-		} else {
-			unsigned char *data = isdn_ppp_skb_push(&skb, 5);
-			if (!data)
-				goto unlock;
-			data[0] = MP_BEGIN_FRAG | MP_END_FRAG;	/* (B)egin & (E)ndbit .. */
-			data[1] = (mp_seqno >> 16) & 0xff;	/* sequence number: 24bit */
-			data[2] = (mp_seqno >> 8) & 0xff;
-			data[3] = (mp_seqno >> 0) & 0xff;
-			data[4] = proto;	/* PID compression */
-		}
-		proto = PPP_MP; /* MP Protocol, 0x003d */
-	}
-#endif
-
-	/*
-	 * 'link in bundle' compression  ...
-	 */
-	if (ipt->compflags & SC_LINK_COMP_ON)
-		skb = isdn_ppp_compress(skb, &proto, ipt, ipts, 1);
-
-	if ((ipt->pppcfg & SC_COMP_PROT) && (proto <= 0xff)) {
-		unsigned char *data = isdn_ppp_skb_push(&skb, 1);
-		if (!data)
-			goto unlock;
-		data[0] = proto & 0xff;
-	}
-	else {
-		unsigned char *data = isdn_ppp_skb_push(&skb, 2);
-		if (!data)
-			goto unlock;
-		data[0] = (proto >> 8) & 0xff;
-		data[1] = proto & 0xff;
-	}
-	if (!(ipt->pppcfg & SC_COMP_AC)) {
-		unsigned char *data = isdn_ppp_skb_push(&skb, 2);
-		if (!data)
-			goto unlock;
-		data[0] = 0xff;    /* All Stations */
-		data[1] = 0x03;    /* Unnumbered information */
-	}
-
-	/* tx-stats are now updated via BSENT-callback */
-
-	if (ipts->debug & 0x40) {
-		printk(KERN_DEBUG "skb xmit: len: %d\n", (int) skb->len);
-		isdn_ppp_frame_log("xmit", skb->data, skb->len, 32, ipt->unit, lp->ppp_slot);
-	}
-
-	isdn_net_writebuf_skb(lp, skb);
-
-unlock:
-	spin_unlock_bh(&lp->xmit_lock);
-out:
-	return retval;
-}
-
-#ifdef CONFIG_IPPP_FILTER
-/*
- * check if this packet may trigger auto-dial.
- */
-
-int isdn_ppp_autodial_filter(struct sk_buff *skb, isdn_net_local *lp)
-{
-	struct ippp_struct *is = ippp_table[lp->ppp_slot];
-	u_int16_t proto;
-	int drop = 0;
-
-	switch (ntohs(skb->protocol)) {
-	case ETH_P_IP:
-		proto = PPP_IP;
-		break;
-	case ETH_P_IPX:
-		proto = PPP_IPX;
-		break;
-	default:
-		printk(KERN_ERR "isdn_ppp_autodial_filter: unsupported protocol 0x%x.\n",
-		       skb->protocol);
-		return 1;
-	}
-
-	/* the filter instructions are constructed assuming
-	 * a four-byte PPP header on each packet. we have to
-	 * temporarily remove part of the fake header stuck on
-	 * earlier.
-	 */
-	*(u8 *)skb_pull(skb, IPPP_MAX_HEADER - 4) = 1; /* indicate outbound */
-
-	{
-		__be16 *p = (__be16 *)skb->data;
-
-		p++;
-		*p = htons(proto);
-	}
-
-	drop |= is->pass_filter
-		&& BPF_PROG_RUN(is->pass_filter, skb) == 0;
-	drop |= is->active_filter
-		&& BPF_PROG_RUN(is->active_filter, skb) == 0;
-
-	skb_push(skb, IPPP_MAX_HEADER - 4);
-	return drop;
-}
-#endif
-#ifdef CONFIG_ISDN_MPP
-
-/* this is _not_ rfc1990 header, but something we convert both short and long
- * headers to for convinience's sake:
- *	byte 0 is flags as in rfc1990
- *	bytes 1...4 is 24-bit seqence number converted to host byte order
- */
-#define MP_HEADER_LEN	5
-
-#define MP_LONGSEQ_MASK		0x00ffffff
-#define MP_SHORTSEQ_MASK	0x00000fff
-#define MP_LONGSEQ_MAX		MP_LONGSEQ_MASK
-#define MP_SHORTSEQ_MAX		MP_SHORTSEQ_MASK
-#define MP_LONGSEQ_MAXBIT	((MP_LONGSEQ_MASK + 1) >> 1)
-#define MP_SHORTSEQ_MAXBIT	((MP_SHORTSEQ_MASK + 1) >> 1)
-
-/* sequence-wrap safe comparisons (for long sequence)*/
-#define MP_LT(a, b)	((a - b) & MP_LONGSEQ_MAXBIT)
-#define MP_LE(a, b)	!((b - a) & MP_LONGSEQ_MAXBIT)
-#define MP_GT(a, b)	((b - a) & MP_LONGSEQ_MAXBIT)
-#define MP_GE(a, b)	!((a - b) & MP_LONGSEQ_MAXBIT)
-
-#define MP_SEQ(f)	((*(u32 *)(f->data + 1)))
-#define MP_FLAGS(f)	(f->data[0])
-
-static int isdn_ppp_mp_bundle_array_init(void)
-{
-	int i;
-	int sz = ISDN_MAX_CHANNELS * sizeof(ippp_bundle);
-	if ((isdn_ppp_bundle_arr = kzalloc(sz, GFP_KERNEL)) == NULL)
-		return -ENOMEM;
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		spin_lock_init(&isdn_ppp_bundle_arr[i].lock);
-	return 0;
-}
-
-static ippp_bundle *isdn_ppp_mp_bundle_alloc(void)
-{
-	int i;
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		if (isdn_ppp_bundle_arr[i].ref_ct <= 0)
-			return (isdn_ppp_bundle_arr + i);
-	return NULL;
-}
-
-static int isdn_ppp_mp_init(isdn_net_local *lp, ippp_bundle *add_to)
-{
-	struct ippp_struct *is;
-
-	if (lp->ppp_slot < 0) {
-		printk(KERN_ERR "%s: lp->ppp_slot(%d) out of range\n",
-		       __func__, lp->ppp_slot);
-		return (-EINVAL);
-	}
-
-	is = ippp_table[lp->ppp_slot];
-	if (add_to) {
-		if (lp->netdev->pb)
-			lp->netdev->pb->ref_ct--;
-		lp->netdev->pb = add_to;
-	} else {		/* first link in a bundle */
-		is->mp_seqno = 0;
-		if ((lp->netdev->pb = isdn_ppp_mp_bundle_alloc()) == NULL)
-			return -ENOMEM;
-		lp->next = lp->last = lp;	/* nobody else in a queue */
-		lp->netdev->pb->frags = NULL;
-		lp->netdev->pb->frames = 0;
-		lp->netdev->pb->seq = UINT_MAX;
-	}
-	lp->netdev->pb->ref_ct++;
-
-	is->last_link_seqno = 0;
-	return 0;
-}
-
-static u32 isdn_ppp_mp_get_seq(int short_seq,
-			       struct sk_buff *skb, u32 last_seq);
-static struct sk_buff *isdn_ppp_mp_discard(ippp_bundle *mp,
-					   struct sk_buff *from, struct sk_buff *to);
-static void isdn_ppp_mp_reassembly(isdn_net_dev *net_dev, isdn_net_local *lp,
-				   struct sk_buff *from, struct sk_buff *to);
-static void isdn_ppp_mp_free_skb(ippp_bundle *mp, struct sk_buff *skb);
-static void isdn_ppp_mp_print_recv_pkt(int slot, struct sk_buff *skb);
-
-static void isdn_ppp_mp_receive(isdn_net_dev *net_dev, isdn_net_local *lp,
-				struct sk_buff *skb)
-{
-	struct ippp_struct *is;
-	isdn_net_local *lpq;
-	ippp_bundle *mp;
-	isdn_mppp_stats *stats;
-	struct sk_buff *newfrag, *frag, *start, *nextf;
-	u32 newseq, minseq, thisseq;
-	unsigned long flags;
-	int slot;
-
-	spin_lock_irqsave(&net_dev->pb->lock, flags);
-	mp = net_dev->pb;
-	stats = &mp->stats;
-	slot = lp->ppp_slot;
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: lp->ppp_slot(%d)\n",
-		       __func__, lp->ppp_slot);
-		stats->frame_drops++;
-		dev_kfree_skb(skb);
-		spin_unlock_irqrestore(&mp->lock, flags);
-		return;
-	}
-	is = ippp_table[slot];
-	if (++mp->frames > stats->max_queue_len)
-		stats->max_queue_len = mp->frames;
-
-	if (is->debug & 0x8)
-		isdn_ppp_mp_print_recv_pkt(lp->ppp_slot, skb);
-
-	newseq = isdn_ppp_mp_get_seq(is->mpppcfg & SC_IN_SHORT_SEQ,
-				     skb, is->last_link_seqno);
-
-
-	/* if this packet seq # is less than last already processed one,
-	 * toss it right away, but check for sequence start case first
-	 */
-	if (mp->seq > MP_LONGSEQ_MAX && (newseq & MP_LONGSEQ_MAXBIT)) {
-		mp->seq = newseq;	/* the first packet: required for
-					 * rfc1990 non-compliant clients --
-					 * prevents constant packet toss */
-	} else if (MP_LT(newseq, mp->seq)) {
-		stats->frame_drops++;
-		isdn_ppp_mp_free_skb(mp, skb);
-		spin_unlock_irqrestore(&mp->lock, flags);
-		return;
-	}
-
-	/* find the minimum received sequence number over all links */
-	is->last_link_seqno = minseq = newseq;
-	for (lpq = net_dev->queue;;) {
-		slot = lpq->ppp_slot;
-		if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-			printk(KERN_ERR "%s: lpq->ppp_slot(%d)\n",
-			       __func__, lpq->ppp_slot);
-		} else {
-			u32 lls = ippp_table[slot]->last_link_seqno;
-			if (MP_LT(lls, minseq))
-				minseq = lls;
-		}
-		if ((lpq = lpq->next) == net_dev->queue)
-			break;
-	}
-	if (MP_LT(minseq, mp->seq))
-		minseq = mp->seq;	/* can't go beyond already processed
-					 * packets */
-	newfrag = skb;
-
-	/* if this new fragment is before the first one, then enqueue it now. */
-	if ((frag = mp->frags) == NULL || MP_LT(newseq, MP_SEQ(frag))) {
-		newfrag->next = frag;
-		mp->frags = frag = newfrag;
-		newfrag = NULL;
-	}
-
-	start = MP_FLAGS(frag) & MP_BEGIN_FRAG &&
-		MP_SEQ(frag) == mp->seq ? frag : NULL;
-
-	/*
-	 * main fragment traversing loop
-	 *
-	 * try to accomplish several tasks:
-	 * - insert new fragment into the proper sequence slot (once that's done
-	 *   newfrag will be set to NULL)
-	 * - reassemble any complete fragment sequence (non-null 'start'
-	 *   indicates there is a contiguous sequence present)
-	 * - discard any incomplete sequences that are below minseq -- due
-	 *   to the fact that sender always increment sequence number, if there
-	 *   is an incomplete sequence below minseq, no new fragments would
-	 *   come to complete such sequence and it should be discarded
-	 *
-	 * loop completes when we accomplished the following tasks:
-	 * - new fragment is inserted in the proper sequence ('newfrag' is
-	 *   set to NULL)
-	 * - we hit a gap in the sequence, so no reassembly/processing is
-	 *   possible ('start' would be set to NULL)
-	 *
-	 * algorithm for this code is derived from code in the book
-	 * 'PPP Design And Debugging' by James Carlson (Addison-Wesley)
-	 */
-	while (start != NULL || newfrag != NULL) {
-
-		thisseq = MP_SEQ(frag);
-		nextf = frag->next;
-
-		/* drop any duplicate fragments */
-		if (newfrag != NULL && thisseq == newseq) {
-			isdn_ppp_mp_free_skb(mp, newfrag);
-			newfrag = NULL;
-		}
-
-		/* insert new fragment before next element if possible. */
-		if (newfrag != NULL && (nextf == NULL ||
-					MP_LT(newseq, MP_SEQ(nextf)))) {
-			newfrag->next = nextf;
-			frag->next = nextf = newfrag;
-			newfrag = NULL;
-		}
-
-		if (start != NULL) {
-			/* check for misplaced start */
-			if (start != frag && (MP_FLAGS(frag) & MP_BEGIN_FRAG)) {
-				printk(KERN_WARNING"isdn_mppp(seq %d): new "
-				       "BEGIN flag with no prior END", thisseq);
-				stats->seqerrs++;
-				stats->frame_drops++;
-				start = isdn_ppp_mp_discard(mp, start, frag);
-				nextf = frag->next;
-			}
-		} else if (MP_LE(thisseq, minseq)) {
-			if (MP_FLAGS(frag) & MP_BEGIN_FRAG)
-				start = frag;
-			else {
-				if (MP_FLAGS(frag) & MP_END_FRAG)
-					stats->frame_drops++;
-				if (mp->frags == frag)
-					mp->frags = nextf;
-				isdn_ppp_mp_free_skb(mp, frag);
-				frag = nextf;
-				continue;
-			}
-		}
-
-		/* if start is non-null and we have end fragment, then
-		 * we have full reassembly sequence -- reassemble
-		 * and process packet now
-		 */
-		if (start != NULL && (MP_FLAGS(frag) & MP_END_FRAG)) {
-			minseq = mp->seq = (thisseq + 1) & MP_LONGSEQ_MASK;
-			/* Reassemble the packet then dispatch it */
-			isdn_ppp_mp_reassembly(net_dev, lp, start, nextf);
-
-			start = NULL;
-			frag = NULL;
-
-			mp->frags = nextf;
-		}
-
-		/* check if need to update start pointer: if we just
-		 * reassembled the packet and sequence is contiguous
-		 * then next fragment should be the start of new reassembly
-		 * if sequence is contiguous, but we haven't reassembled yet,
-		 * keep going.
-		 * if sequence is not contiguous, either clear everything
-		 * below low watermark and set start to the next frag or
-		 * clear start ptr.
-		 */
-		if (nextf != NULL &&
-		    ((thisseq + 1) & MP_LONGSEQ_MASK) == MP_SEQ(nextf)) {
-			/* if we just reassembled and the next one is here,
-			 * then start another reassembly. */
-
-			if (frag == NULL) {
-				if (MP_FLAGS(nextf) & MP_BEGIN_FRAG)
-					start = nextf;
-				else
-				{
-					printk(KERN_WARNING"isdn_mppp(seq %d):"
-					       " END flag with no following "
-					       "BEGIN", thisseq);
-					stats->seqerrs++;
-				}
-			}
-
-		} else {
-			if (nextf != NULL && frag != NULL &&
-			    MP_LT(thisseq, minseq)) {
-				/* we've got a break in the sequence
-				 * and we not at the end yet
-				 * and we did not just reassembled
-				 *(if we did, there wouldn't be anything before)
-				 * and we below the low watermark
-				 * discard all the frames below low watermark
-				 * and start over */
-				stats->frame_drops++;
-				mp->frags = isdn_ppp_mp_discard(mp, start, nextf);
-			}
-			/* break in the sequence, no reassembly */
-			start = NULL;
-		}
-
-		frag = nextf;
-	}	/* while -- main loop */
-
-	if (mp->frags == NULL)
-		mp->frags = frag;
-
-	/* rather straighforward way to deal with (not very) possible
-	 * queue overflow */
-	if (mp->frames > MP_MAX_QUEUE_LEN) {
-		stats->overflows++;
-		while (mp->frames > MP_MAX_QUEUE_LEN) {
-			frag = mp->frags->next;
-			isdn_ppp_mp_free_skb(mp, mp->frags);
-			mp->frags = frag;
-		}
-	}
-	spin_unlock_irqrestore(&mp->lock, flags);
-}
-
-static void isdn_ppp_mp_cleanup(isdn_net_local *lp)
-{
-	struct sk_buff *frag = lp->netdev->pb->frags;
-	struct sk_buff *nextfrag;
-	while (frag) {
-		nextfrag = frag->next;
-		isdn_ppp_mp_free_skb(lp->netdev->pb, frag);
-		frag = nextfrag;
-	}
-	lp->netdev->pb->frags = NULL;
-}
-
-static u32 isdn_ppp_mp_get_seq(int short_seq,
-			       struct sk_buff *skb, u32 last_seq)
-{
-	u32 seq;
-	int flags = skb->data[0] & (MP_BEGIN_FRAG | MP_END_FRAG);
-
-	if (!short_seq)
-	{
-		seq = ntohl(*(__be32 *)skb->data) & MP_LONGSEQ_MASK;
-		skb_push(skb, 1);
-	}
-	else
-	{
-		/* convert 12-bit short seq number to 24-bit long one
-		 */
-		seq = ntohs(*(__be16 *)skb->data) & MP_SHORTSEQ_MASK;
-
-		/* check for seqence wrap */
-		if (!(seq &  MP_SHORTSEQ_MAXBIT) &&
-		    (last_seq &  MP_SHORTSEQ_MAXBIT) &&
-		    (unsigned long)last_seq <= MP_LONGSEQ_MAX)
-			seq |= (last_seq + MP_SHORTSEQ_MAX + 1) &
-				(~MP_SHORTSEQ_MASK & MP_LONGSEQ_MASK);
-		else
-			seq |= last_seq & (~MP_SHORTSEQ_MASK & MP_LONGSEQ_MASK);
-
-		skb_push(skb, 3);	/* put converted seqence back in skb */
-	}
-	*(u32 *)(skb->data + 1) = seq;	/* put seqence back in _host_ byte
-					 * order */
-	skb->data[0] = flags;	        /* restore flags */
-	return seq;
-}
-
-static struct sk_buff *isdn_ppp_mp_discard(ippp_bundle *mp,
-					   struct sk_buff *from,
-					   struct sk_buff *to)
-{
-	if (from)
-		while (from != to) {
-			struct sk_buff *next = from->next;
-			isdn_ppp_mp_free_skb(mp, from);
-			from = next;
-		}
-	return from;
-}
-
-static void isdn_ppp_mp_reassembly(isdn_net_dev *net_dev, isdn_net_local *lp,
-				   struct sk_buff *from, struct sk_buff *to)
-{
-	ippp_bundle *mp = net_dev->pb;
-	int proto;
-	struct sk_buff *skb;
-	unsigned int tot_len;
-
-	if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: lp->ppp_slot(%d) out of range\n",
-		       __func__, lp->ppp_slot);
-		return;
-	}
-	if (MP_FLAGS(from) == (MP_BEGIN_FRAG | MP_END_FRAG)) {
-		if (ippp_table[lp->ppp_slot]->debug & 0x40)
-			printk(KERN_DEBUG "isdn_mppp: reassembly: frame %d, "
-			       "len %d\n", MP_SEQ(from), from->len);
-		skb = from;
-		skb_pull(skb, MP_HEADER_LEN);
-		mp->frames--;
-	} else {
-		struct sk_buff *frag;
-		int n;
-
-		for (tot_len = n = 0, frag = from; frag != to; frag = frag->next, n++)
-			tot_len += frag->len - MP_HEADER_LEN;
-
-		if (ippp_table[lp->ppp_slot]->debug & 0x40)
-			printk(KERN_DEBUG"isdn_mppp: reassembling frames %d "
-			       "to %d, len %d\n", MP_SEQ(from),
-			       (MP_SEQ(from) + n - 1) & MP_LONGSEQ_MASK, tot_len);
-		if ((skb = dev_alloc_skb(tot_len)) == NULL) {
-			printk(KERN_ERR "isdn_mppp: cannot allocate sk buff "
-			       "of size %d\n", tot_len);
-			isdn_ppp_mp_discard(mp, from, to);
-			return;
-		}
-
-		while (from != to) {
-			unsigned int len = from->len - MP_HEADER_LEN;
-
-			skb_copy_from_linear_data_offset(from, MP_HEADER_LEN,
-							 skb_put(skb, len),
-							 len);
-			frag = from->next;
-			isdn_ppp_mp_free_skb(mp, from);
-			from = frag;
-		}
-	}
-	proto = isdn_ppp_strip_proto(skb);
-	isdn_ppp_push_higher(net_dev, lp, skb, proto);
-}
-
-static void isdn_ppp_mp_free_skb(ippp_bundle *mp, struct sk_buff *skb)
-{
-	dev_kfree_skb(skb);
-	mp->frames--;
-}
-
-static void isdn_ppp_mp_print_recv_pkt(int slot, struct sk_buff *skb)
-{
-	printk(KERN_DEBUG "mp_recv: %d/%d -> %02x %02x %02x %02x %02x %02x\n",
-	       slot, (int) skb->len,
-	       (int) skb->data[0], (int) skb->data[1], (int) skb->data[2],
-	       (int) skb->data[3], (int) skb->data[4], (int) skb->data[5]);
-}
-
-static int
-isdn_ppp_bundle(struct ippp_struct *is, int unit)
-{
-	char ifn[IFNAMSIZ + 1];
-	isdn_net_dev *p;
-	isdn_net_local *lp, *nlp;
-	int rc;
-	unsigned long flags;
-
-	sprintf(ifn, "ippp%d", unit);
-	p = isdn_net_findif(ifn);
-	if (!p) {
-		printk(KERN_ERR "ippp_bundle: cannot find %s\n", ifn);
-		return -EINVAL;
-	}
-
-	spin_lock_irqsave(&p->pb->lock, flags);
-
-	nlp = is->lp;
-	lp = p->queue;
-	if (nlp->ppp_slot < 0 || nlp->ppp_slot >= ISDN_MAX_CHANNELS ||
-	    lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "ippp_bundle: binding to invalid slot %d\n",
-		       nlp->ppp_slot < 0 || nlp->ppp_slot >= ISDN_MAX_CHANNELS ?
-		       nlp->ppp_slot : lp->ppp_slot);
-		rc = -EINVAL;
-		goto out;
-	}
-
-	isdn_net_add_to_bundle(p, nlp);
-
-	ippp_table[nlp->ppp_slot]->unit = ippp_table[lp->ppp_slot]->unit;
-
-	/* maybe also SC_CCP stuff */
-	ippp_table[nlp->ppp_slot]->pppcfg |= ippp_table[lp->ppp_slot]->pppcfg &
-		(SC_ENABLE_IP | SC_NO_TCP_CCID | SC_REJ_COMP_TCP);
-	ippp_table[nlp->ppp_slot]->mpppcfg |= ippp_table[lp->ppp_slot]->mpppcfg &
-		(SC_MP_PROT | SC_REJ_MP_PROT | SC_OUT_SHORT_SEQ | SC_IN_SHORT_SEQ);
-	rc = isdn_ppp_mp_init(nlp, p->pb);
-out:
-	spin_unlock_irqrestore(&p->pb->lock, flags);
-	return rc;
-}
-
-#endif /* CONFIG_ISDN_MPP */
-
-/*
- * network device ioctl handlers
- */
-
-static int
-isdn_ppp_dev_ioctl_stats(int slot, struct ifreq *ifr, struct net_device *dev)
-{
-	struct ppp_stats __user *res = ifr->ifr_data;
-	struct ppp_stats t;
-	isdn_net_local *lp = netdev_priv(dev);
-
-	/* build a temporary stat struct and copy it to user space */
-
-	memset(&t, 0, sizeof(struct ppp_stats));
-	if (dev->flags & IFF_UP) {
-		t.p.ppp_ipackets = lp->stats.rx_packets;
-		t.p.ppp_ibytes = lp->stats.rx_bytes;
-		t.p.ppp_ierrors = lp->stats.rx_errors;
-		t.p.ppp_opackets = lp->stats.tx_packets;
-		t.p.ppp_obytes = lp->stats.tx_bytes;
-		t.p.ppp_oerrors = lp->stats.tx_errors;
-#ifdef CONFIG_ISDN_PPP_VJ
-		if (slot >= 0 && ippp_table[slot]->slcomp) {
-			struct slcompress *slcomp = ippp_table[slot]->slcomp;
-			t.vj.vjs_packets = slcomp->sls_o_compressed + slcomp->sls_o_uncompressed;
-			t.vj.vjs_compressed = slcomp->sls_o_compressed;
-			t.vj.vjs_searches = slcomp->sls_o_searches;
-			t.vj.vjs_misses = slcomp->sls_o_misses;
-			t.vj.vjs_errorin = slcomp->sls_i_error;
-			t.vj.vjs_tossed = slcomp->sls_i_tossed;
-			t.vj.vjs_uncompressedin = slcomp->sls_i_uncompressed;
-			t.vj.vjs_compressedin = slcomp->sls_i_compressed;
-		}
-#endif
-	}
-	if (copy_to_user(res, &t, sizeof(struct ppp_stats)))
-		return -EFAULT;
-	return 0;
-}
-
-int
-isdn_ppp_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
-{
-	int error = 0;
-	int len;
-	isdn_net_local *lp = netdev_priv(dev);
-
-
-	if (lp->p_encap != ISDN_NET_ENCAP_SYNCPPP)
-		return -EINVAL;
-
-	switch (cmd) {
-#define PPP_VERSION "2.3.7"
-	case SIOCGPPPVER:
-		len = strlen(PPP_VERSION) + 1;
-		if (copy_to_user(ifr->ifr_data, PPP_VERSION, len))
-			error = -EFAULT;
-		break;
-
-	case SIOCGPPPSTATS:
-		error = isdn_ppp_dev_ioctl_stats(lp->ppp_slot, ifr, dev);
-		break;
-	default:
-		error = -EINVAL;
-		break;
-	}
-	return error;
-}
-
-static int
-isdn_ppp_if_get_unit(char *name)
-{
-	int len,
-		i,
-		unit = 0,
-		deci;
-
-	len = strlen(name);
-
-	if (strncmp("ippp", name, 4) || len > 8)
-		return -1;
-
-	for (i = 0, deci = 1; i < len; i++, deci *= 10) {
-		char a = name[len - i - 1];
-		if (a >= '0' && a <= '9')
-			unit += (a - '0') * deci;
-		else
-			break;
-	}
-	if (!i || len - i != 4)
-		unit = -1;
-
-	return unit;
-}
-
-
-int
-isdn_ppp_dial_slave(char *name)
-{
-#ifdef CONFIG_ISDN_MPP
-	isdn_net_dev *ndev;
-	isdn_net_local *lp;
-	struct net_device *sdev;
-
-	if (!(ndev = isdn_net_findif(name)))
-		return 1;
-	lp = ndev->local;
-	if (!(lp->flags & ISDN_NET_CONNECTED))
-		return 5;
-
-	sdev = lp->slave;
-	while (sdev) {
-		isdn_net_local *mlp = netdev_priv(sdev);
-		if (!(mlp->flags & ISDN_NET_CONNECTED))
-			break;
-		sdev = mlp->slave;
-	}
-	if (!sdev)
-		return 2;
-
-	isdn_net_dial_req(netdev_priv(sdev));
-	return 0;
-#else
-	return -1;
-#endif
-}
-
-int
-isdn_ppp_hangup_slave(char *name)
-{
-#ifdef CONFIG_ISDN_MPP
-	isdn_net_dev *ndev;
-	isdn_net_local *lp;
-	struct net_device *sdev;
-
-	if (!(ndev = isdn_net_findif(name)))
-		return 1;
-	lp = ndev->local;
-	if (!(lp->flags & ISDN_NET_CONNECTED))
-		return 5;
-
-	sdev = lp->slave;
-	while (sdev) {
-		isdn_net_local *mlp = netdev_priv(sdev);
-
-		if (mlp->slave) { /* find last connected link in chain */
-			isdn_net_local *nlp = ISDN_SLAVE_PRIV(mlp);
-
-			if (!(nlp->flags & ISDN_NET_CONNECTED))
-				break;
-		} else if (mlp->flags & ISDN_NET_CONNECTED)
-			break;
-
-		sdev = mlp->slave;
-	}
-	if (!sdev)
-		return 2;
-
-	isdn_net_hangup(sdev);
-	return 0;
-#else
-	return -1;
-#endif
-}
-
-/*
- * PPP compression stuff
- */
-
-
-/* Push an empty CCP Data Frame up to the daemon to wake it up and let it
-   generate a CCP Reset-Request or tear down CCP altogether */
-
-static void isdn_ppp_ccp_kickup(struct ippp_struct *is)
-{
-	isdn_ppp_fill_rq(NULL, 0, PPP_COMP, is->lp->ppp_slot);
-}
-
-/* In-kernel handling of CCP Reset-Request and Reset-Ack is necessary,
-   but absolutely nontrivial. The most abstruse problem we are facing is
-   that the generation, reception and all the handling of timeouts and
-   resends including proper request id management should be entirely left
-   to the (de)compressor, but indeed is not covered by the current API to
-   the (de)compressor. The API is a prototype version from PPP where only
-   some (de)compressors have yet been implemented and all of them are
-   rather simple in their reset handling. Especially, their is only one
-   outstanding ResetAck at a time with all of them and ResetReq/-Acks do
-   not have parameters. For this very special case it was sufficient to
-   just return an error code from the decompressor and have a single
-   reset() entry to communicate all the necessary information between
-   the framework and the (de)compressor. Bad enough, LZS is different
-   (and any other compressor may be different, too). It has multiple
-   histories (eventually) and needs to Reset each of them independently
-   and thus uses multiple outstanding Acks and history numbers as an
-   additional parameter to Reqs/Acks.
-   All that makes it harder to port the reset state engine into the
-   kernel because it is not just the same simple one as in (i)pppd but
-   it must be able to pass additional parameters and have multiple out-
-   standing Acks. We are trying to achieve the impossible by handling
-   reset transactions independent by their id. The id MUST change when
-   the data portion changes, thus any (de)compressor who uses more than
-   one resettable state must provide and recognize individual ids for
-   each individual reset transaction. The framework itself does _only_
-   differentiate them by id, because it has no other semantics like the
-   (de)compressor might.
-   This looks like a major redesign of the interface would be nice,
-   but I don't have an idea how to do it better. */
-
-/* Send a CCP Reset-Request or Reset-Ack directly from the kernel. This is
-   getting that lengthy because there is no simple "send-this-frame-out"
-   function above but every wrapper does a bit different. Hope I guess
-   correct in this hack... */
-
-static void isdn_ppp_ccp_xmit_reset(struct ippp_struct *is, int proto,
-				    unsigned char code, unsigned char id,
-				    unsigned char *data, int len)
-{
-	struct sk_buff *skb;
-	unsigned char *p;
-	int hl;
-	int cnt = 0;
-	isdn_net_local *lp = is->lp;
-
-	/* Alloc large enough skb */
-	hl = dev->drv[lp->isdn_device]->interface->hl_hdrlen;
-	skb = alloc_skb(len + hl + 16, GFP_ATOMIC);
-	if (!skb) {
-		printk(KERN_WARNING
-		       "ippp: CCP cannot send reset - out of memory\n");
-		return;
-	}
-	skb_reserve(skb, hl);
-
-	/* We may need to stuff an address and control field first */
-	if (!(is->pppcfg & SC_COMP_AC)) {
-		p = skb_put(skb, 2);
-		*p++ = 0xff;
-		*p++ = 0x03;
-	}
-
-	/* Stuff proto, code, id and length */
-	p = skb_put(skb, 6);
-	*p++ = (proto >> 8);
-	*p++ = (proto & 0xff);
-	*p++ = code;
-	*p++ = id;
-	cnt = 4 + len;
-	*p++ = (cnt >> 8);
-	*p++ = (cnt & 0xff);
-
-	/* Now stuff remaining bytes */
-	if (len) {
-		skb_put_data(skb, data, len);
-	}
-
-	/* skb is now ready for xmit */
-	printk(KERN_DEBUG "Sending CCP Frame:\n");
-	isdn_ppp_frame_log("ccp-xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot);
-
-	isdn_net_write_super(lp, skb);
-}
-
-/* Allocate the reset state vector */
-static struct ippp_ccp_reset *isdn_ppp_ccp_reset_alloc(struct ippp_struct *is)
-{
-	struct ippp_ccp_reset *r;
-	r = kzalloc(sizeof(struct ippp_ccp_reset), GFP_KERNEL);
-	if (!r) {
-		printk(KERN_ERR "ippp_ccp: failed to allocate reset data"
-		       " structure - no mem\n");
-		return NULL;
-	}
-	printk(KERN_DEBUG "ippp_ccp: allocated reset data structure %p\n", r);
-	is->reset = r;
-	return r;
-}
-
-/* Destroy the reset state vector. Kill all pending timers first. */
-static void isdn_ppp_ccp_reset_free(struct ippp_struct *is)
-{
-	unsigned int id;
-
-	printk(KERN_DEBUG "ippp_ccp: freeing reset data structure %p\n",
-	       is->reset);
-	for (id = 0; id < 256; id++) {
-		if (is->reset->rs[id]) {
-			isdn_ppp_ccp_reset_free_state(is, (unsigned char)id);
-		}
-	}
-	kfree(is->reset);
-	is->reset = NULL;
-}
-
-/* Free a given state and clear everything up for later reallocation */
-static void isdn_ppp_ccp_reset_free_state(struct ippp_struct *is,
-					  unsigned char id)
-{
-	struct ippp_ccp_reset_state *rs;
-
-	if (is->reset->rs[id]) {
-		printk(KERN_DEBUG "ippp_ccp: freeing state for id %d\n", id);
-		rs = is->reset->rs[id];
-		/* Make sure the kernel will not call back later */
-		if (rs->ta)
-			del_timer(&rs->timer);
-		is->reset->rs[id] = NULL;
-		kfree(rs);
-	} else {
-		printk(KERN_WARNING "ippp_ccp: id %d is not allocated\n", id);
-	}
-}
-
-/* The timer callback function which is called when a ResetReq has timed out,
-   aka has never been answered by a ResetAck */
-static void isdn_ppp_ccp_timer_callback(struct timer_list *t)
-{
-	struct ippp_ccp_reset_state *rs =
-		from_timer(rs, t, timer);
-
-	if (!rs) {
-		printk(KERN_ERR "ippp_ccp: timer cb with zero closure.\n");
-		return;
-	}
-	if (rs->ta && rs->state == CCPResetSentReq) {
-		/* We are correct here */
-		if (!rs->expra) {
-			/* Hmm, there is no Ack really expected. We can clean
-			   up the state now, it will be reallocated if the
-			   decompressor insists on another reset */
-			rs->ta = 0;
-			isdn_ppp_ccp_reset_free_state(rs->is, rs->id);
-			return;
-		}
-		printk(KERN_DEBUG "ippp_ccp: CCP Reset timed out for id %d\n",
-		       rs->id);
-		/* Push it again */
-		isdn_ppp_ccp_xmit_reset(rs->is, PPP_CCP, CCP_RESETREQ, rs->id,
-					rs->data, rs->dlen);
-		/* Restart timer */
-		rs->timer.expires = jiffies + HZ * 5;
-		add_timer(&rs->timer);
-	} else {
-		printk(KERN_WARNING "ippp_ccp: timer cb in wrong state %d\n",
-		       rs->state);
-	}
-}
-
-/* Allocate a new reset transaction state */
-static struct ippp_ccp_reset_state *isdn_ppp_ccp_reset_alloc_state(struct ippp_struct *is,
-								   unsigned char id)
-{
-	struct ippp_ccp_reset_state *rs;
-	if (is->reset->rs[id]) {
-		printk(KERN_WARNING "ippp_ccp: old state exists for id %d\n",
-		       id);
-		return NULL;
-	} else {
-		rs = kzalloc(sizeof(struct ippp_ccp_reset_state), GFP_ATOMIC);
-		if (!rs)
-			return NULL;
-		rs->state = CCPResetIdle;
-		rs->is = is;
-		rs->id = id;
-		timer_setup(&rs->timer, isdn_ppp_ccp_timer_callback, 0);
-		is->reset->rs[id] = rs;
-	}
-	return rs;
-}
-
-
-/* A decompressor wants a reset with a set of parameters - do what is
-   necessary to fulfill it */
-static void isdn_ppp_ccp_reset_trans(struct ippp_struct *is,
-				     struct isdn_ppp_resetparams *rp)
-{
-	struct ippp_ccp_reset_state *rs;
-
-	if (rp->valid) {
-		/* The decompressor defines parameters by itself */
-		if (rp->rsend) {
-			/* And he wants us to send a request */
-			if (!(rp->idval)) {
-				printk(KERN_ERR "ippp_ccp: decompressor must"
-				       " specify reset id\n");
-				return;
-			}
-			if (is->reset->rs[rp->id]) {
-				/* There is already a transaction in existence
-				   for this id. May be still waiting for a
-				   Ack or may be wrong. */
-				rs = is->reset->rs[rp->id];
-				if (rs->state == CCPResetSentReq && rs->ta) {
-					printk(KERN_DEBUG "ippp_ccp: reset"
-					       " trans still in progress"
-					       " for id %d\n", rp->id);
-				} else {
-					printk(KERN_WARNING "ippp_ccp: reset"
-					       " trans in wrong state %d for"
-					       " id %d\n", rs->state, rp->id);
-				}
-			} else {
-				/* Ok, this is a new transaction */
-				printk(KERN_DEBUG "ippp_ccp: new trans for id"
-				       " %d to be started\n", rp->id);
-				rs = isdn_ppp_ccp_reset_alloc_state(is, rp->id);
-				if (!rs) {
-					printk(KERN_ERR "ippp_ccp: out of mem"
-					       " allocing ccp trans\n");
-					return;
-				}
-				rs->state = CCPResetSentReq;
-				rs->expra = rp->expra;
-				if (rp->dtval) {
-					rs->dlen = rp->dlen;
-					memcpy(rs->data, rp->data, rp->dlen);
-				}
-				/* HACK TODO - add link comp here */
-				isdn_ppp_ccp_xmit_reset(is, PPP_CCP,
-							CCP_RESETREQ, rs->id,
-							rs->data, rs->dlen);
-				/* Start the timer */
-				rs->timer.expires = jiffies + 5 * HZ;
-				add_timer(&rs->timer);
-				rs->ta = 1;
-			}
-		} else {
-			printk(KERN_DEBUG "ippp_ccp: no reset sent\n");
-		}
-	} else {
-		/* The reset params are invalid. The decompressor does not
-		   care about them, so we just send the minimal requests
-		   and increase ids only when an Ack is received for a
-		   given id */
-		if (is->reset->rs[is->reset->lastid]) {
-			/* There is already a transaction in existence
-			   for this id. May be still waiting for a
-			   Ack or may be wrong. */
-			rs = is->reset->rs[is->reset->lastid];
-			if (rs->state == CCPResetSentReq && rs->ta) {
-				printk(KERN_DEBUG "ippp_ccp: reset"
-				       " trans still in progress"
-				       " for id %d\n", rp->id);
-			} else {
-				printk(KERN_WARNING "ippp_ccp: reset"
-				       " trans in wrong state %d for"
-				       " id %d\n", rs->state, rp->id);
-			}
-		} else {
-			printk(KERN_DEBUG "ippp_ccp: new trans for id"
-			       " %d to be started\n", is->reset->lastid);
-			rs = isdn_ppp_ccp_reset_alloc_state(is,
-							    is->reset->lastid);
-			if (!rs) {
-				printk(KERN_ERR "ippp_ccp: out of mem"
-				       " allocing ccp trans\n");
-				return;
-			}
-			rs->state = CCPResetSentReq;
-			/* We always expect an Ack if the decompressor doesn't
-			   know	better */
-			rs->expra = 1;
-			rs->dlen = 0;
-			/* HACK TODO - add link comp here */
-			isdn_ppp_ccp_xmit_reset(is, PPP_CCP, CCP_RESETREQ,
-						rs->id, NULL, 0);
-			/* Start the timer */
-			rs->timer.expires = jiffies + 5 * HZ;
-			add_timer(&rs->timer);
-			rs->ta = 1;
-		}
-	}
-}
-
-/* An Ack was received for this id. This means we stop the timer and clean
-   up the state prior to calling the decompressors reset routine. */
-static void isdn_ppp_ccp_reset_ack_rcvd(struct ippp_struct *is,
-					unsigned char id)
-{
-	struct ippp_ccp_reset_state *rs = is->reset->rs[id];
-
-	if (rs) {
-		if (rs->ta && rs->state == CCPResetSentReq) {
-			/* Great, we are correct */
-			if (!rs->expra)
-				printk(KERN_DEBUG "ippp_ccp: ResetAck received"
-				       " for id %d but not expected\n", id);
-		} else {
-			printk(KERN_INFO "ippp_ccp: ResetAck received out of"
-			       "sync for id %d\n", id);
-		}
-		if (rs->ta) {
-			rs->ta = 0;
-			del_timer(&rs->timer);
-		}
-		isdn_ppp_ccp_reset_free_state(is, id);
-	} else {
-		printk(KERN_INFO "ippp_ccp: ResetAck received for unknown id"
-		       " %d\n", id);
-	}
-	/* Make sure the simple reset stuff uses a new id next time */
-	is->reset->lastid++;
-}
-
-/*
- * decompress packet
- *
- * if master = 0, we're trying to uncompress an per-link compressed packet,
- * as opposed to an compressed reconstructed-from-MPPP packet.
- * proto is updated to protocol field of uncompressed packet.
- *
- * retval: decompressed packet,
- *         same packet if uncompressed,
- *	   NULL if decompression error
- */
-
-static struct sk_buff *isdn_ppp_decompress(struct sk_buff *skb, struct ippp_struct *is, struct ippp_struct *master,
-					   int *proto)
-{
-	void *stat = NULL;
-	struct isdn_ppp_compressor *ipc = NULL;
-	struct sk_buff *skb_out;
-	int len;
-	struct ippp_struct *ri;
-	struct isdn_ppp_resetparams rsparm;
-	unsigned char rsdata[IPPP_RESET_MAXDATABYTES];
-
-	if (!master) {
-		// per-link decompression
-		stat = is->link_decomp_stat;
-		ipc = is->link_decompressor;
-		ri = is;
-	} else {
-		stat = master->decomp_stat;
-		ipc = master->decompressor;
-		ri = master;
-	}
-
-	if (!ipc) {
-		// no decompressor -> we can't decompress.
-		printk(KERN_DEBUG "ippp: no decompressor defined!\n");
-		return skb;
-	}
-	BUG_ON(!stat); // if we have a compressor, stat has been set as well
-
-	if ((master && *proto == PPP_COMP) || (!master && *proto == PPP_COMPFRAG)) {
-		// compressed packets are compressed by their protocol type
-
-		// Set up reset params for the decompressor
-		memset(&rsparm, 0, sizeof(rsparm));
-		rsparm.data = rsdata;
-		rsparm.maxdlen = IPPP_RESET_MAXDATABYTES;
-
-		skb_out = dev_alloc_skb(is->mru + PPP_HDRLEN);
-		if (!skb_out) {
-			kfree_skb(skb);
-			printk(KERN_ERR "ippp: decomp memory allocation failure\n");
-			return NULL;
-		}
-		len = ipc->decompress(stat, skb, skb_out, &rsparm);
-		kfree_skb(skb);
-		if (len <= 0) {
-			switch (len) {
-			case DECOMP_ERROR:
-				printk(KERN_INFO "ippp: decomp wants reset %s params\n",
-				       rsparm.valid ? "with" : "without");
-
-				isdn_ppp_ccp_reset_trans(ri, &rsparm);
-				break;
-			case DECOMP_FATALERROR:
-				ri->pppcfg |= SC_DC_FERROR;
-				/* Kick ipppd to recognize the error */
-				isdn_ppp_ccp_kickup(ri);
-				break;
-			}
-			kfree_skb(skb_out);
-			return NULL;
-		}
-		*proto = isdn_ppp_strip_proto(skb_out);
-		if (*proto < 0) {
-			kfree_skb(skb_out);
-			return NULL;
-		}
-		return skb_out;
-	} else {
-		// uncompressed packets are fed through the decompressor to
-		// update the decompressor state
-		ipc->incomp(stat, skb, *proto);
-		return skb;
-	}
-}
-
-/*
- * compress a frame
- *   type=0: normal/bundle compression
- *       =1: link compression
- * returns original skb if we haven't compressed the frame
- * and a new skb pointer if we've done it
- */
-static struct sk_buff *isdn_ppp_compress(struct sk_buff *skb_in, int *proto,
-					 struct ippp_struct *is, struct ippp_struct *master, int type)
-{
-	int ret;
-	int new_proto;
-	struct isdn_ppp_compressor *compressor;
-	void *stat;
-	struct sk_buff *skb_out;
-
-	/* we do not compress control protocols */
-	if (*proto < 0 || *proto > 0x3fff) {
-		return skb_in;
-	}
-
-	if (type) { /* type=1 => Link compression */
-		return skb_in;
-	}
-	else {
-		if (!master) {
-			compressor = is->compressor;
-			stat = is->comp_stat;
-		}
-		else {
-			compressor = master->compressor;
-			stat = master->comp_stat;
-		}
-		new_proto = PPP_COMP;
-	}
-
-	if (!compressor) {
-		printk(KERN_ERR "isdn_ppp: No compressor set!\n");
-		return skb_in;
-	}
-	if (!stat) {
-		printk(KERN_ERR "isdn_ppp: Compressor not initialized?\n");
-		return skb_in;
-	}
-
-	/* Allow for at least 150 % expansion (for now) */
-	skb_out = alloc_skb(skb_in->len + skb_in->len / 2 + 32 +
-			    skb_headroom(skb_in), GFP_ATOMIC);
-	if (!skb_out)
-		return skb_in;
-	skb_reserve(skb_out, skb_headroom(skb_in));
-
-	ret = (compressor->compress)(stat, skb_in, skb_out, *proto);
-	if (!ret) {
-		dev_kfree_skb(skb_out);
-		return skb_in;
-	}
-
-	dev_kfree_skb(skb_in);
-	*proto = new_proto;
-	return skb_out;
-}
-
-/*
- * we received a CCP frame ..
- * not a clean solution, but we MUST handle a few cases in the kernel
- */
-static void isdn_ppp_receive_ccp(isdn_net_dev *net_dev, isdn_net_local *lp,
-				 struct sk_buff *skb, int proto)
-{
-	struct ippp_struct *is;
-	struct ippp_struct *mis;
-	int len;
-	struct isdn_ppp_resetparams rsparm;
-	unsigned char rsdata[IPPP_RESET_MAXDATABYTES];
-
-	printk(KERN_DEBUG "Received CCP frame from peer slot(%d)\n",
-	       lp->ppp_slot);
-	if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: lp->ppp_slot(%d) out of range\n",
-		       __func__, lp->ppp_slot);
-		return;
-	}
-	is = ippp_table[lp->ppp_slot];
-	isdn_ppp_frame_log("ccp-rcv", skb->data, skb->len, 32, is->unit, lp->ppp_slot);
-
-	if (lp->master) {
-		int slot = ISDN_MASTER_PRIV(lp)->ppp_slot;
-		if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-			printk(KERN_ERR "%s: slot(%d) out of range\n",
-			       __func__, slot);
-			return;
-		}
-		mis = ippp_table[slot];
-	} else
-		mis = is;
-
-	switch (skb->data[0]) {
-	case CCP_CONFREQ:
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Disable compression here!\n");
-		if (proto == PPP_CCP)
-			mis->compflags &= ~SC_COMP_ON;
-		else
-			is->compflags &= ~SC_LINK_COMP_ON;
-		break;
-	case CCP_TERMREQ:
-	case CCP_TERMACK:
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Disable (de)compression here!\n");
-		if (proto == PPP_CCP)
-			mis->compflags &= ~(SC_DECOMP_ON | SC_COMP_ON);
-		else
-			is->compflags &= ~(SC_LINK_DECOMP_ON | SC_LINK_COMP_ON);
-		break;
-	case CCP_CONFACK:
-		/* if we RECEIVE an ackowledge we enable the decompressor */
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Enable decompression here!\n");
-		if (proto == PPP_CCP) {
-			if (!mis->decompressor)
-				break;
-			mis->compflags |= SC_DECOMP_ON;
-		} else {
-			if (!is->decompressor)
-				break;
-			is->compflags |= SC_LINK_DECOMP_ON;
-		}
-		break;
-
-	case CCP_RESETACK:
-		printk(KERN_DEBUG "Received ResetAck from peer\n");
-		len = (skb->data[2] << 8) | skb->data[3];
-		len -= 4;
-
-		if (proto == PPP_CCP) {
-			/* If a reset Ack was outstanding for this id, then
-			   clean up the state engine */
-			isdn_ppp_ccp_reset_ack_rcvd(mis, skb->data[1]);
-			if (mis->decompressor && mis->decomp_stat)
-				mis->decompressor->
-					reset(mis->decomp_stat,
-					      skb->data[0],
-					      skb->data[1],
-					      len ? &skb->data[4] : NULL,
-					      len, NULL);
-			/* TODO: This is not easy to decide here */
-			mis->compflags &= ~SC_DECOMP_DISCARD;
-		}
-		else {
-			isdn_ppp_ccp_reset_ack_rcvd(is, skb->data[1]);
-			if (is->link_decompressor && is->link_decomp_stat)
-				is->link_decompressor->
-					reset(is->link_decomp_stat,
-					      skb->data[0],
-					      skb->data[1],
-					      len ? &skb->data[4] : NULL,
-					      len, NULL);
-			/* TODO: neither here */
-			is->compflags &= ~SC_LINK_DECOMP_DISCARD;
-		}
-		break;
-
-	case CCP_RESETREQ:
-		printk(KERN_DEBUG "Received ResetReq from peer\n");
-		/* Receiving a ResetReq means we must reset our compressor */
-		/* Set up reset params for the reset entry */
-		memset(&rsparm, 0, sizeof(rsparm));
-		rsparm.data = rsdata;
-		rsparm.maxdlen = IPPP_RESET_MAXDATABYTES;
-		/* Isolate data length */
-		len = (skb->data[2] << 8) | skb->data[3];
-		len -= 4;
-		if (proto == PPP_CCP) {
-			if (mis->compressor && mis->comp_stat)
-				mis->compressor->
-					reset(mis->comp_stat,
-					      skb->data[0],
-					      skb->data[1],
-					      len ? &skb->data[4] : NULL,
-					      len, &rsparm);
-		}
-		else {
-			if (is->link_compressor && is->link_comp_stat)
-				is->link_compressor->
-					reset(is->link_comp_stat,
-					      skb->data[0],
-					      skb->data[1],
-					      len ? &skb->data[4] : NULL,
-					      len, &rsparm);
-		}
-		/* Ack the Req as specified by rsparm */
-		if (rsparm.valid) {
-			/* Compressor reset handler decided how to answer */
-			if (rsparm.rsend) {
-				/* We should send a Frame */
-				isdn_ppp_ccp_xmit_reset(is, proto, CCP_RESETACK,
-							rsparm.idval ? rsparm.id
-							: skb->data[1],
-							rsparm.dtval ?
-							rsparm.data : NULL,
-							rsparm.dtval ?
-							rsparm.dlen : 0);
-			} else {
-				printk(KERN_DEBUG "ResetAck suppressed\n");
-			}
-		} else {
-			/* We answer with a straight reflected Ack */
-			isdn_ppp_ccp_xmit_reset(is, proto, CCP_RESETACK,
-						skb->data[1],
-						len ? &skb->data[4] : NULL,
-						len);
-		}
-		break;
-	}
-}
-
-
-/*
- * Daemon sends a CCP frame ...
- */
-
-/* TODO: Clean this up with new Reset semantics */
-
-/* I believe the CCP handling as-is is done wrong. Compressed frames
- * should only be sent/received after CCP reaches UP state, which means
- * both sides have sent CONF_ACK. Currently, we handle both directions
- * independently, which means we may accept compressed frames too early
- * (supposedly not a problem), but may also mean we send compressed frames
- * too early, which may turn out to be a problem.
- * This part of state machine should actually be handled by (i)pppd, but
- * that's too big of a change now. --kai
- */
-
-/* Actually, we might turn this into an advantage: deal with the RFC in
- * the old tradition of beeing generous on what we accept, but beeing
- * strict on what we send. Thus we should just
- * - accept compressed frames as soon as decompression is negotiated
- * - send compressed frames only when decomp *and* comp are negotiated
- * - drop rx compressed frames if we cannot decomp (instead of pushing them
- *   up to ipppd)
- * and I tried to modify this file according to that. --abp
- */
-
-static void isdn_ppp_send_ccp(isdn_net_dev *net_dev, isdn_net_local *lp, struct sk_buff *skb)
-{
-	struct ippp_struct *mis, *is;
-	int proto, slot = lp->ppp_slot;
-	unsigned char *data;
-
-	if (!skb || skb->len < 3)
-		return;
-	if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-		printk(KERN_ERR "%s: lp->ppp_slot(%d) out of range\n",
-		       __func__, slot);
-		return;
-	}
-	is = ippp_table[slot];
-	/* Daemon may send with or without address and control field comp */
-	data = skb->data;
-	if (!(is->pppcfg & SC_COMP_AC) && data[0] == 0xff && data[1] == 0x03) {
-		data += 2;
-		if (skb->len < 5)
-			return;
-	}
-
-	proto = ((int)data[0]<<8) + data[1];
-	if (proto != PPP_CCP && proto != PPP_CCPFRAG)
-		return;
-
-	printk(KERN_DEBUG "Received CCP frame from daemon:\n");
-	isdn_ppp_frame_log("ccp-xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot);
-
-	if (lp->master) {
-		slot = ISDN_MASTER_PRIV(lp)->ppp_slot;
-		if (slot < 0 || slot >= ISDN_MAX_CHANNELS) {
-			printk(KERN_ERR "%s: slot(%d) out of range\n",
-			       __func__, slot);
-			return;
-		}
-		mis = ippp_table[slot];
-	} else
-		mis = is;
-	if (mis != is)
-		printk(KERN_DEBUG "isdn_ppp: Ouch! Master CCP sends on slave slot!\n");
-
-	switch (data[2]) {
-	case CCP_CONFREQ:
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Disable decompression here!\n");
-		if (proto == PPP_CCP)
-			is->compflags &= ~SC_DECOMP_ON;
-		else
-			is->compflags &= ~SC_LINK_DECOMP_ON;
-		break;
-	case CCP_TERMREQ:
-	case CCP_TERMACK:
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Disable (de)compression here!\n");
-		if (proto == PPP_CCP)
-			is->compflags &= ~(SC_DECOMP_ON | SC_COMP_ON);
-		else
-			is->compflags &= ~(SC_LINK_DECOMP_ON | SC_LINK_COMP_ON);
-		break;
-	case CCP_CONFACK:
-		/* if we SEND an ackowledge we can/must enable the compressor */
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Enable compression here!\n");
-		if (proto == PPP_CCP) {
-			if (!is->compressor)
-				break;
-			is->compflags |= SC_COMP_ON;
-		} else {
-			if (!is->compressor)
-				break;
-			is->compflags |= SC_LINK_COMP_ON;
-		}
-		break;
-	case CCP_RESETACK:
-		/* If we send a ACK we should reset our compressor */
-		if (is->debug & 0x10)
-			printk(KERN_DEBUG "Reset decompression state here!\n");
-		printk(KERN_DEBUG "ResetAck from daemon passed by\n");
-		if (proto == PPP_CCP) {
-			/* link to master? */
-			if (is->compressor && is->comp_stat)
-				is->compressor->reset(is->comp_stat, 0, 0,
-						      NULL, 0, NULL);
-			is->compflags &= ~SC_COMP_DISCARD;
-		}
-		else {
-			if (is->link_compressor && is->link_comp_stat)
-				is->link_compressor->reset(is->link_comp_stat,
-							   0, 0, NULL, 0, NULL);
-			is->compflags &= ~SC_LINK_COMP_DISCARD;
-		}
-		break;
-	case CCP_RESETREQ:
-		/* Just let it pass by */
-		printk(KERN_DEBUG "ResetReq from daemon passed by\n");
-		break;
-	}
-}
-
-int isdn_ppp_register_compressor(struct isdn_ppp_compressor *ipc)
-{
-	ipc->next = ipc_head;
-	ipc->prev = NULL;
-	if (ipc_head) {
-		ipc_head->prev = ipc;
-	}
-	ipc_head = ipc;
-	return 0;
-}
-
-int isdn_ppp_unregister_compressor(struct isdn_ppp_compressor *ipc)
-{
-	if (ipc->prev)
-		ipc->prev->next = ipc->next;
-	else
-		ipc_head = ipc->next;
-	if (ipc->next)
-		ipc->next->prev = ipc->prev;
-	ipc->prev = ipc->next = NULL;
-	return 0;
-}
-
-static int isdn_ppp_set_compressor(struct ippp_struct *is, struct isdn_ppp_comp_data *data)
-{
-	struct isdn_ppp_compressor *ipc = ipc_head;
-	int ret;
-	void *stat;
-	int num = data->num;
-
-	if (is->debug & 0x10)
-		printk(KERN_DEBUG "[%d] Set %s type %d\n", is->unit,
-		       (data->flags & IPPP_COMP_FLAG_XMIT) ? "compressor" : "decompressor", num);
-
-	/* If is has no valid reset state vector, we cannot allocate a
-	   decompressor. The decompressor would cause reset transactions
-	   sooner or later, and they need that vector. */
-
-	if (!(data->flags & IPPP_COMP_FLAG_XMIT) && !is->reset) {
-		printk(KERN_ERR "ippp_ccp: no reset data structure - can't"
-		       " allow decompression.\n");
-		return -ENOMEM;
-	}
-
-	while (ipc) {
-		if (ipc->num == num) {
-			stat = ipc->alloc(data);
-			if (stat) {
-				ret = ipc->init(stat, data, is->unit, 0);
-				if (!ret) {
-					printk(KERN_ERR "Can't init (de)compression!\n");
-					ipc->free(stat);
-					stat = NULL;
-					break;
-				}
-			}
-			else {
-				printk(KERN_ERR "Can't alloc (de)compression!\n");
-				break;
-			}
-
-			if (data->flags & IPPP_COMP_FLAG_XMIT) {
-				if (data->flags & IPPP_COMP_FLAG_LINK) {
-					if (is->link_comp_stat)
-						is->link_compressor->free(is->link_comp_stat);
-					is->link_comp_stat = stat;
-					is->link_compressor = ipc;
-				}
-				else {
-					if (is->comp_stat)
-						is->compressor->free(is->comp_stat);
-					is->comp_stat = stat;
-					is->compressor = ipc;
-				}
-			}
-			else {
-				if (data->flags & IPPP_COMP_FLAG_LINK) {
-					if (is->link_decomp_stat)
-						is->link_decompressor->free(is->link_decomp_stat);
-					is->link_decomp_stat = stat;
-					is->link_decompressor = ipc;
-				}
-				else {
-					if (is->decomp_stat)
-						is->decompressor->free(is->decomp_stat);
-					is->decomp_stat = stat;
-					is->decompressor = ipc;
-				}
-			}
-			return 0;
-		}
-		ipc = ipc->next;
-	}
-	return -EINVAL;
-}
diff --git a/drivers/isdn/i4l/isdn_ppp.h b/drivers/isdn/i4l/isdn_ppp.h
deleted file mode 100644
index 34b8a2ce84f3..000000000000
--- a/drivers/isdn/i4l/isdn_ppp.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* $Id: isdn_ppp.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * header for Linux ISDN subsystem, functions for synchronous PPP (linklevel).
- *
- * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/ppp_defs.h>     /* for PPP_PROTOCOL */
-#include <linux/isdn_ppp.h>	/* for isdn_ppp info */
-
-extern int isdn_ppp_read(int, struct file *, char __user *, int);
-extern int isdn_ppp_write(int, struct file *, const char __user *, int);
-extern int isdn_ppp_open(int, struct file *);
-extern int isdn_ppp_init(void);
-extern void isdn_ppp_cleanup(void);
-extern int isdn_ppp_free(isdn_net_local *);
-extern int isdn_ppp_bind(isdn_net_local *);
-extern int isdn_ppp_autodial_filter(struct sk_buff *, isdn_net_local *);
-extern int isdn_ppp_xmit(struct sk_buff *, struct net_device *);
-extern void isdn_ppp_receive(isdn_net_dev *, isdn_net_local *, struct sk_buff *);
-extern int isdn_ppp_dev_ioctl(struct net_device *, struct ifreq *, int);
-extern __poll_t isdn_ppp_poll(struct file *, struct poll_table_struct *);
-extern int isdn_ppp_ioctl(int, struct file *, unsigned int, unsigned long);
-extern void isdn_ppp_release(int, struct file *);
-extern int isdn_ppp_dial_slave(char *);
-extern void isdn_ppp_wakeup_daemon(isdn_net_local *);
-
-extern int isdn_ppp_register_compressor(struct isdn_ppp_compressor *ipc);
-extern int isdn_ppp_unregister_compressor(struct isdn_ppp_compressor *ipc);
-
-#define IPPP_OPEN	0x01
-#define IPPP_CONNECT	0x02
-#define IPPP_CLOSEWAIT	0x04
-#define IPPP_NOBLOCK	0x08
-#define IPPP_ASSIGNED	0x10
-
-#define IPPP_MAX_HEADER 10
diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
deleted file mode 100644
index 43700fc19a31..000000000000
--- a/drivers/isdn/i4l/isdn_tty.c
+++ /dev/null
@@ -1,3756 +0,0 @@
-/*
- * Linux ISDN subsystem, tty functions and AT-command emulator (linklevel).
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-#undef ISDN_TTY_STAT_DEBUG
-
-#include <linux/isdn.h>
-#include <linux/serial.h> /* ASYNC_* flags */
-#include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/mutex.h>
-#include <linux/sched/signal.h>
-#include "isdn_common.h"
-#include "isdn_tty.h"
-#ifdef CONFIG_ISDN_AUDIO
-#include "isdn_audio.h"
-#define VBUF 0x3e0
-#define VBUFX (VBUF/16)
-#endif
-
-#define FIX_FILE_TRANSFER
-#define	DUMMY_HAYES_AT
-
-/* Prototypes */
-
-static DEFINE_MUTEX(modem_info_mutex);
-static int isdn_tty_edit_at(const char *, int, modem_info *);
-static void isdn_tty_check_esc(const u_char *, u_char, int, int *, u_long *);
-static void isdn_tty_modem_reset_regs(modem_info *, int);
-static void isdn_tty_cmd_ATA(modem_info *);
-static void isdn_tty_flush_buffer(struct tty_struct *);
-static void isdn_tty_modem_result(int, modem_info *);
-#ifdef CONFIG_ISDN_AUDIO
-static int isdn_tty_countDLE(unsigned char *, int);
-#endif
-
-/* Leave this unchanged unless you know what you do! */
-#define MODEM_PARANOIA_CHECK
-#define MODEM_DO_RESTART
-
-static int bit2si[8] =
-{1, 5, 7, 7, 7, 7, 7, 7};
-static int si2bit[8] =
-{4, 1, 4, 4, 4, 4, 4, 4};
-
-/* isdn_tty_try_read() is called from within isdn_tty_rcv_skb()
- * to stuff incoming data directly into a tty's flip-buffer. This
- * is done to speed up tty-receiving if the receive-queue is empty.
- * This routine MUST be called with interrupts off.
- * Return:
- *  1 = Success
- *  0 = Failure, data has to be buffered and later processed by
- *      isdn_tty_readmodem().
- */
-static int
-isdn_tty_try_read(modem_info *info, struct sk_buff *skb)
-{
-	struct tty_port *port = &info->port;
-	int c;
-	int len;
-	char last;
-
-	if (!info->online)
-		return 0;
-
-	if (!(info->mcr & UART_MCR_RTS))
-		return 0;
-
-	len = skb->len
-#ifdef CONFIG_ISDN_AUDIO
-		+ ISDN_AUDIO_SKB_DLECOUNT(skb)
-#endif
-		;
-
-	c = tty_buffer_request_room(port, len);
-	if (c < len)
-		return 0;
-
-#ifdef CONFIG_ISDN_AUDIO
-	if (ISDN_AUDIO_SKB_DLECOUNT(skb)) {
-		int l = skb->len;
-		unsigned char *dp = skb->data;
-		while (--l) {
-			if (*dp == DLE)
-				tty_insert_flip_char(port, DLE, 0);
-			tty_insert_flip_char(port, *dp++, 0);
-		}
-		if (*dp == DLE)
-			tty_insert_flip_char(port, DLE, 0);
-		last = *dp;
-	} else {
-#endif
-		if (len > 1)
-			tty_insert_flip_string(port, skb->data, len - 1);
-		last = skb->data[len - 1];
-#ifdef CONFIG_ISDN_AUDIO
-	}
-#endif
-	if (info->emu.mdmreg[REG_CPPP] & BIT_CPPP)
-		tty_insert_flip_char(port, last, 0xFF);
-	else
-		tty_insert_flip_char(port, last, TTY_NORMAL);
-	tty_flip_buffer_push(port);
-	kfree_skb(skb);
-
-	return 1;
-}
-
-/* isdn_tty_readmodem() is called periodically from within timer-interrupt.
- * It tries getting received data from the receive queue an stuff it into
- * the tty's flip-buffer.
- */
-void
-isdn_tty_readmodem(void)
-{
-	int resched = 0;
-	int midx;
-	int i;
-	int r;
-	modem_info *info;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		midx = dev->m_idx[i];
-		if (midx < 0)
-			continue;
-
-		info = &dev->mdm.info[midx];
-		if (!info->online)
-			continue;
-
-		r = 0;
-#ifdef CONFIG_ISDN_AUDIO
-		isdn_audio_eval_dtmf(info);
-		if ((info->vonline & 1) && (info->emu.vpar[1]))
-			isdn_audio_eval_silence(info);
-#endif
-		if (info->mcr & UART_MCR_RTS) {
-			/* CISCO AsyncPPP Hack */
-			if (!(info->emu.mdmreg[REG_CPPP] & BIT_CPPP))
-				r = isdn_readbchan_tty(info->isdn_driver,
-						info->isdn_channel,
-						&info->port, 0);
-			else
-				r = isdn_readbchan_tty(info->isdn_driver,
-						info->isdn_channel,
-						&info->port, 1);
-			if (r)
-				tty_flip_buffer_push(&info->port);
-		} else
-			r = 1;
-
-		if (r) {
-			info->rcvsched = 0;
-			resched = 1;
-		} else
-			info->rcvsched = 1;
-	}
-	if (!resched)
-		isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 0);
-}
-
-int
-isdn_tty_rcv_skb(int i, int di, int channel, struct sk_buff *skb)
-{
-	ulong flags;
-	int midx;
-#ifdef CONFIG_ISDN_AUDIO
-	int ifmt;
-#endif
-	modem_info *info;
-
-	if ((midx = dev->m_idx[i]) < 0) {
-		/* if midx is invalid, packet is not for tty */
-		return 0;
-	}
-	info = &dev->mdm.info[midx];
-#ifdef CONFIG_ISDN_AUDIO
-	ifmt = 1;
-
-	if ((info->vonline) && (!info->emu.vpar[4]))
-		isdn_audio_calc_dtmf(info, skb->data, skb->len, ifmt);
-	if ((info->vonline & 1) && (info->emu.vpar[1]))
-		isdn_audio_calc_silence(info, skb->data, skb->len, ifmt);
-#endif
-	if ((info->online < 2)
-#ifdef CONFIG_ISDN_AUDIO
-	    && (!(info->vonline & 1))
-#endif
-		) {
-		/* If Modem not listening, drop data */
-		kfree_skb(skb);
-		return 1;
-	}
-	if (info->emu.mdmreg[REG_T70] & BIT_T70) {
-		if (info->emu.mdmreg[REG_T70] & BIT_T70_EXT) {
-			/* T.70 decoding: throw away the T.70 header (2 or 4 bytes)   */
-			if (skb->data[0] == 3) /* pure data packet -> 4 byte headers  */
-				skb_pull(skb, 4);
-			else
-				if (skb->data[0] == 1) /* keepalive packet -> 2 byte hdr  */
-					skb_pull(skb, 2);
-		} else
-			/* T.70 decoding: Simply throw away the T.70 header (4 bytes) */
-			if ((skb->data[0] == 1) && ((skb->data[1] == 0) || (skb->data[1] == 1)))
-				skb_pull(skb, 4);
-	}
-#ifdef CONFIG_ISDN_AUDIO
-	ISDN_AUDIO_SKB_DLECOUNT(skb) = 0;
-	ISDN_AUDIO_SKB_LOCK(skb) = 0;
-	if (info->vonline & 1) {
-		/* voice conversion/compression */
-		switch (info->emu.vpar[3]) {
-		case 2:
-		case 3:
-		case 4:
-			/* adpcm
-			 * Since compressed data takes less
-			 * space, we can overwrite the buffer.
-			 */
-			skb_trim(skb, isdn_audio_xlaw2adpcm(info->adpcmr,
-							    ifmt,
-							    skb->data,
-							    skb->data,
-							    skb->len));
-			break;
-		case 5:
-			/* a-law */
-			if (!ifmt)
-				isdn_audio_ulaw2alaw(skb->data, skb->len);
-			break;
-		case 6:
-			/* u-law */
-			if (ifmt)
-				isdn_audio_alaw2ulaw(skb->data, skb->len);
-			break;
-		}
-		ISDN_AUDIO_SKB_DLECOUNT(skb) =
-			isdn_tty_countDLE(skb->data, skb->len);
-	}
-#ifdef CONFIG_ISDN_TTY_FAX
-	else {
-		if (info->faxonline & 2) {
-			isdn_tty_fax_bitorder(info, skb);
-			ISDN_AUDIO_SKB_DLECOUNT(skb) =
-				isdn_tty_countDLE(skb->data, skb->len);
-		}
-	}
-#endif
-#endif
-	/* Try to deliver directly via tty-buf if queue is empty */
-	spin_lock_irqsave(&info->readlock, flags);
-	if (skb_queue_empty(&dev->drv[di]->rpqueue[channel]))
-		if (isdn_tty_try_read(info, skb)) {
-			spin_unlock_irqrestore(&info->readlock, flags);
-			return 1;
-		}
-	/* Direct deliver failed or queue wasn't empty.
-	 * Queue up for later dequeueing via timer-irq.
-	 */
-	__skb_queue_tail(&dev->drv[di]->rpqueue[channel], skb);
-	dev->drv[di]->rcvcount[channel] +=
-		(skb->len
-#ifdef CONFIG_ISDN_AUDIO
-		 + ISDN_AUDIO_SKB_DLECOUNT(skb)
-#endif
-			);
-	spin_unlock_irqrestore(&info->readlock, flags);
-	/* Schedule dequeuing */
-	if ((dev->modempoll) && (info->rcvsched))
-		isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1);
-	return 1;
-}
-
-static void
-isdn_tty_cleanup_xmit(modem_info *info)
-{
-	skb_queue_purge(&info->xmit_queue);
-#ifdef CONFIG_ISDN_AUDIO
-	skb_queue_purge(&info->dtmf_queue);
-#endif
-}
-
-static void
-isdn_tty_tint(modem_info *info)
-{
-	struct sk_buff *skb = skb_dequeue(&info->xmit_queue);
-	int len, slen;
-
-	if (!skb)
-		return;
-	len = skb->len;
-	if ((slen = isdn_writebuf_skb_stub(info->isdn_driver,
-					   info->isdn_channel, 1, skb)) == len) {
-		struct tty_struct *tty = info->port.tty;
-		info->send_outstanding++;
-		info->msr &= ~UART_MSR_CTS;
-		info->lsr &= ~UART_LSR_TEMT;
-		tty_wakeup(tty);
-		return;
-	}
-	if (slen < 0) {
-		/* Error: no channel, already shutdown, or wrong parameter */
-		dev_kfree_skb(skb);
-		return;
-	}
-	skb_queue_head(&info->xmit_queue, skb);
-}
-
-#ifdef CONFIG_ISDN_AUDIO
-static int
-isdn_tty_countDLE(unsigned char *buf, int len)
-{
-	int count = 0;
-
-	while (len--)
-		if (*buf++ == DLE)
-			count++;
-	return count;
-}
-
-/* This routine is called from within isdn_tty_write() to perform
- * DLE-decoding when sending audio-data.
- */
-static int
-isdn_tty_handleDLEdown(modem_info *info, atemu *m, int len)
-{
-	unsigned char *p = &info->port.xmit_buf[info->xmit_count];
-	int count = 0;
-
-	while (len > 0) {
-		if (m->lastDLE) {
-			m->lastDLE = 0;
-			switch (*p) {
-			case DLE:
-				/* Escape code */
-				if (len > 1)
-					memmove(p, p + 1, len - 1);
-				p--;
-				count++;
-				break;
-			case ETX:
-				/* End of data */
-				info->vonline |= 4;
-				return count;
-			case DC4:
-				/* Abort RX */
-				info->vonline &= ~1;
-#ifdef ISDN_DEBUG_MODEM_VOICE
-				printk(KERN_DEBUG
-				       "DLEdown: got DLE-DC4, send DLE-ETX on ttyI%d\n",
-				       info->line);
-#endif
-				isdn_tty_at_cout("\020\003", info);
-				if (!info->vonline) {
-#ifdef ISDN_DEBUG_MODEM_VOICE
-					printk(KERN_DEBUG
-					       "DLEdown: send VCON on ttyI%d\n",
-					       info->line);
-#endif
-					isdn_tty_at_cout("\r\nVCON\r\n", info);
-				}
-				/* Fall through */
-			case 'q':
-			case 's':
-				/* Silence */
-				if (len > 1)
-					memmove(p, p + 1, len - 1);
-				p--;
-				break;
-			}
-		} else {
-			if (*p == DLE)
-				m->lastDLE = 1;
-			else
-				count++;
-		}
-		p++;
-		len--;
-	}
-	if (len < 0) {
-		printk(KERN_WARNING "isdn_tty: len<0 in DLEdown\n");
-		return 0;
-	}
-	return count;
-}
-
-/* This routine is called from within isdn_tty_write() when receiving
- * audio-data. It interrupts receiving, if an character other than
- * ^S or ^Q is sent.
- */
-static int
-isdn_tty_end_vrx(const char *buf, int c)
-{
-	char ch;
-
-	while (c--) {
-		ch = *buf;
-		if ((ch != 0x11) && (ch != 0x13))
-			return 1;
-		buf++;
-	}
-	return 0;
-}
-
-static int voice_cf[7] =
-{0, 0, 4, 3, 2, 0, 0};
-
-#endif                          /* CONFIG_ISDN_AUDIO */
-
-/* isdn_tty_senddown() is called either directly from within isdn_tty_write()
- * or via timer-interrupt from within isdn_tty_modem_xmit(). It pulls
- * outgoing data from the tty's xmit-buffer, handles voice-decompression or
- * T.70 if necessary, and finally queues it up for sending via isdn_tty_tint.
- */
-static void
-isdn_tty_senddown(modem_info *info)
-{
-	int buflen;
-	int skb_res;
-#ifdef CONFIG_ISDN_AUDIO
-	int audio_len;
-#endif
-	struct sk_buff *skb;
-
-#ifdef CONFIG_ISDN_AUDIO
-	if (info->vonline & 4) {
-		info->vonline &= ~6;
-		if (!info->vonline) {
-#ifdef ISDN_DEBUG_MODEM_VOICE
-			printk(KERN_DEBUG
-			       "senddown: send VCON on ttyI%d\n",
-			       info->line);
-#endif
-			isdn_tty_at_cout("\r\nVCON\r\n", info);
-		}
-	}
-#endif
-	if (!(buflen = info->xmit_count))
-		return;
-	if ((info->emu.mdmreg[REG_CTS] & BIT_CTS) != 0)
-		info->msr &= ~UART_MSR_CTS;
-	info->lsr &= ~UART_LSR_TEMT;
-	/* info->xmit_count is modified here and in isdn_tty_write().
-	 * So we return here if isdn_tty_write() is in the
-	 * critical section.
-	 */
-	atomic_inc(&info->xmit_lock);
-	if (!(atomic_dec_and_test(&info->xmit_lock)))
-		return;
-	if (info->isdn_driver < 0) {
-		info->xmit_count = 0;
-		return;
-	}
-	skb_res = dev->drv[info->isdn_driver]->interface->hl_hdrlen + 4;
-#ifdef CONFIG_ISDN_AUDIO
-	if (info->vonline & 2)
-		audio_len = buflen * voice_cf[info->emu.vpar[3]];
-	else
-		audio_len = 0;
-	skb = dev_alloc_skb(skb_res + buflen + audio_len);
-#else
-	skb = dev_alloc_skb(skb_res + buflen);
-#endif
-	if (!skb) {
-		printk(KERN_WARNING
-		       "isdn_tty: Out of memory in ttyI%d senddown\n",
-		       info->line);
-		return;
-	}
-	skb_reserve(skb, skb_res);
-	skb_put_data(skb, info->port.xmit_buf, buflen);
-	info->xmit_count = 0;
-#ifdef CONFIG_ISDN_AUDIO
-	if (info->vonline & 2) {
-		/* For now, ifmt is fixed to 1 (alaw), since this
-		 * is used with ISDN everywhere in the world, except
-		 * US, Canada and Japan.
-		 * Later, when US-ISDN protocols are implemented,
-		 * this setting will depend on the D-channel protocol.
-		 */
-		int ifmt = 1;
-
-		/* voice conversion/decompression */
-		switch (info->emu.vpar[3]) {
-		case 2:
-		case 3:
-		case 4:
-			/* adpcm, compatible to ZyXel 1496 modem
-			 * with ROM revision 6.01
-			 */
-			audio_len = isdn_audio_adpcm2xlaw(info->adpcms,
-							  ifmt,
-							  skb->data,
-							  skb_put(skb, audio_len),
-							  buflen);
-			skb_pull(skb, buflen);
-			skb_trim(skb, audio_len);
-			break;
-		case 5:
-			/* a-law */
-			if (!ifmt)
-				isdn_audio_alaw2ulaw(skb->data,
-						     buflen);
-			break;
-		case 6:
-			/* u-law */
-			if (ifmt)
-				isdn_audio_ulaw2alaw(skb->data,
-						     buflen);
-			break;
-		}
-	}
-#endif                          /* CONFIG_ISDN_AUDIO */
-	if (info->emu.mdmreg[REG_T70] & BIT_T70) {
-		/* Add T.70 simplified header */
-		if (info->emu.mdmreg[REG_T70] & BIT_T70_EXT)
-			memcpy(skb_push(skb, 2), "\1\0", 2);
-		else
-			memcpy(skb_push(skb, 4), "\1\0\1\0", 4);
-	}
-	skb_queue_tail(&info->xmit_queue, skb);
-}
-
-/************************************************************
- *
- * Modem-functions
- *
- * mostly "stolen" from original Linux-serial.c and friends.
- *
- ************************************************************/
-
-/* The next routine is called once from within timer-interrupt
- * triggered within isdn_tty_modem_ncarrier(). It calls
- * isdn_tty_modem_result() to stuff a "NO CARRIER" Message
- * into the tty's buffer.
- */
-static void
-isdn_tty_modem_do_ncarrier(struct timer_list *t)
-{
-	modem_info *info = from_timer(info, t, nc_timer);
-	isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-}
-
-/* Next routine is called, whenever the DTR-signal is raised.
- * It checks the ncarrier-flag, and triggers the above routine
- * when necessary. The ncarrier-flag is set, whenever DTR goes
- * low.
- */
-static void
-isdn_tty_modem_ncarrier(modem_info *info)
-{
-	if (info->ncarrier) {
-		info->nc_timer.expires = jiffies + HZ;
-		add_timer(&info->nc_timer);
-	}
-}
-
-/*
- * return the usage calculated by si and layer 2 protocol
- */
-static int
-isdn_calc_usage(int si, int l2)
-{
-	int usg = ISDN_USAGE_MODEM;
-
-#ifdef CONFIG_ISDN_AUDIO
-	if (si == 1) {
-		switch (l2) {
-		case ISDN_PROTO_L2_MODEM:
-			usg = ISDN_USAGE_MODEM;
-			break;
-#ifdef CONFIG_ISDN_TTY_FAX
-		case ISDN_PROTO_L2_FAX:
-			usg = ISDN_USAGE_FAX;
-			break;
-#endif
-		case ISDN_PROTO_L2_TRANS:
-		default:
-			usg = ISDN_USAGE_VOICE;
-			break;
-		}
-	}
-#endif
-	return (usg);
-}
-
-/* isdn_tty_dial() performs dialing of a tty an the necessary
- * setup of the lower levels before that.
- */
-static void
-isdn_tty_dial(char *n, modem_info *info, atemu *m)
-{
-	int usg = ISDN_USAGE_MODEM;
-	int si = 7;
-	int l2 = m->mdmreg[REG_L2PROT];
-	u_long flags;
-	isdn_ctrl cmd;
-	int i;
-	int j;
-
-	for (j = 7; j >= 0; j--)
-		if (m->mdmreg[REG_SI1] & (1 << j)) {
-			si = bit2si[j];
-			break;
-		}
-	usg = isdn_calc_usage(si, l2);
-#ifdef CONFIG_ISDN_AUDIO
-	if ((si == 1) &&
-	    (l2 != ISDN_PROTO_L2_MODEM)
-#ifdef CONFIG_ISDN_TTY_FAX
-	    && (l2 != ISDN_PROTO_L2_FAX)
-#endif
-		) {
-		l2 = ISDN_PROTO_L2_TRANS;
-		usg = ISDN_USAGE_VOICE;
-	}
-#endif
-	m->mdmreg[REG_SI1I] = si2bit[si];
-	spin_lock_irqsave(&dev->lock, flags);
-	i = isdn_get_free_channel(usg, l2, m->mdmreg[REG_L3PROT], -1, -1, m->msn);
-	if (i < 0) {
-		spin_unlock_irqrestore(&dev->lock, flags);
-		isdn_tty_modem_result(RESULT_NO_DIALTONE, info);
-	} else {
-		info->isdn_driver = dev->drvmap[i];
-		info->isdn_channel = dev->chanmap[i];
-		info->drv_index = i;
-		dev->m_idx[i] = info->line;
-		dev->usage[i] |= ISDN_USAGE_OUTGOING;
-		info->last_dir = 1;
-		strcpy(info->last_num, n);
-		isdn_info_update();
-		spin_unlock_irqrestore(&dev->lock, flags);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		cmd.command = ISDN_CMD_CLREAZ;
-		isdn_command(&cmd);
-		strcpy(cmd.parm.num, isdn_map_eaz2msn(m->msn, info->isdn_driver));
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETEAZ;
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL2;
-		info->last_l2 = l2;
-		cmd.arg = info->isdn_channel + (l2 << 8);
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL3;
-		cmd.arg = info->isdn_channel + (m->mdmreg[REG_L3PROT] << 8);
-#ifdef CONFIG_ISDN_TTY_FAX
-		if (l2 == ISDN_PROTO_L2_FAX) {
-			cmd.parm.fax = info->fax;
-			info->fax->direction = ISDN_TTY_FAX_CONN_OUT;
-		}
-#endif
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		sprintf(cmd.parm.setup.phone, "%s", n);
-		sprintf(cmd.parm.setup.eazmsn, "%s",
-			isdn_map_eaz2msn(m->msn, info->isdn_driver));
-		cmd.parm.setup.si1 = si;
-		cmd.parm.setup.si2 = m->mdmreg[REG_SI2];
-		cmd.command = ISDN_CMD_DIAL;
-		info->dialing = 1;
-		info->emu.carrierwait = 0;
-		strcpy(dev->num[i], n);
-		isdn_info_update();
-		isdn_command(&cmd);
-		isdn_timer_ctrl(ISDN_TIMER_CARRIER, 1);
-	}
-}
-
-/* isdn_tty_hangup() disassociates a tty from the real
- * ISDN-line (hangup). The usage-status is cleared
- * and some cleanup is done also.
- */
-void
-isdn_tty_modem_hup(modem_info *info, int local)
-{
-	isdn_ctrl cmd;
-	int di, ch;
-
-	if (!info)
-		return;
-
-	di = info->isdn_driver;
-	ch = info->isdn_channel;
-	if (di < 0 || ch < 0)
-		return;
-
-	info->isdn_driver = -1;
-	info->isdn_channel = -1;
-
-#ifdef ISDN_DEBUG_MODEM_HUP
-	printk(KERN_DEBUG "Mhup ttyI%d\n", info->line);
-#endif
-	info->rcvsched = 0;
-	isdn_tty_flush_buffer(info->port.tty);
-	if (info->online) {
-		info->last_lhup = local;
-		info->online = 0;
-		isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-	}
-#ifdef CONFIG_ISDN_AUDIO
-	info->vonline = 0;
-#ifdef CONFIG_ISDN_TTY_FAX
-	info->faxonline = 0;
-	info->fax->phase = ISDN_FAX_PHASE_IDLE;
-#endif
-	info->emu.vpar[4] = 0;
-	info->emu.vpar[5] = 8;
-	kfree(info->dtmf_state);
-	info->dtmf_state = NULL;
-	kfree(info->silence_state);
-	info->silence_state = NULL;
-	kfree(info->adpcms);
-	info->adpcms = NULL;
-	kfree(info->adpcmr);
-	info->adpcmr = NULL;
-#endif
-	if ((info->msr & UART_MSR_RI) &&
-	    (info->emu.mdmreg[REG_RUNG] & BIT_RUNG))
-		isdn_tty_modem_result(RESULT_RUNG, info);
-	info->msr &= ~(UART_MSR_DCD | UART_MSR_RI);
-	info->lsr |= UART_LSR_TEMT;
-
-	if (local) {
-		cmd.driver = di;
-		cmd.command = ISDN_CMD_HANGUP;
-		cmd.arg = ch;
-		isdn_command(&cmd);
-	}
-
-	isdn_all_eaz(di, ch);
-	info->emu.mdmreg[REG_RINGCNT] = 0;
-	isdn_free_channel(di, ch, 0);
-
-	if (info->drv_index >= 0) {
-		dev->m_idx[info->drv_index] = -1;
-		info->drv_index = -1;
-	}
-}
-
-/*
- * Begin of a CAPI like interface, currently used only for
- * supplementary service (CAPI 2.0 part III)
- */
-#include <linux/isdn/capicmd.h>
-#include <linux/module.h>
-
-int
-isdn_tty_capi_facility(capi_msg *cm) {
-	return (-1); /* dummy */
-}
-
-/* isdn_tty_suspend() tries to suspend the current tty connection
- */
-static void
-isdn_tty_suspend(char *id, modem_info *info, atemu *m)
-{
-	isdn_ctrl cmd;
-
-	int l;
-
-	if (!info)
-		return;
-
-#ifdef ISDN_DEBUG_MODEM_SERVICES
-	printk(KERN_DEBUG "Msusp ttyI%d\n", info->line);
-#endif
-	l = strlen(id);
-	if ((info->isdn_driver >= 0)) {
-		cmd.parm.cmsg.Length = l + 18;
-		cmd.parm.cmsg.Command = CAPI_FACILITY;
-		cmd.parm.cmsg.Subcommand = CAPI_REQ;
-		cmd.parm.cmsg.adr.Controller = info->isdn_driver + 1;
-		cmd.parm.cmsg.para[0] = 3; /* 16 bit 0x0003 suplementary service */
-		cmd.parm.cmsg.para[1] = 0;
-		cmd.parm.cmsg.para[2] = l + 3;
-		cmd.parm.cmsg.para[3] = 4; /* 16 bit 0x0004 Suspend */
-		cmd.parm.cmsg.para[4] = 0;
-		cmd.parm.cmsg.para[5] = l;
-		memcpy(&cmd.parm.cmsg.para[6], id, l);
-		cmd.command = CAPI_PUT_MESSAGE;
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		isdn_command(&cmd);
-	}
-}
-
-/* isdn_tty_resume() tries to resume a suspended call
- * setup of the lower levels before that. unfortunately here is no
- * checking for compatibility of used protocols implemented by Q931
- * It does the same things like isdn_tty_dial, the last command
- * is different, may be we can merge it.
- */
-
-static void
-isdn_tty_resume(char *id, modem_info *info, atemu *m)
-{
-	int usg = ISDN_USAGE_MODEM;
-	int si = 7;
-	int l2 = m->mdmreg[REG_L2PROT];
-	isdn_ctrl cmd;
-	ulong flags;
-	int i;
-	int j;
-	int l;
-
-	l = strlen(id);
-	for (j = 7; j >= 0; j--)
-		if (m->mdmreg[REG_SI1] & (1 << j)) {
-			si = bit2si[j];
-			break;
-		}
-	usg = isdn_calc_usage(si, l2);
-#ifdef CONFIG_ISDN_AUDIO
-	if ((si == 1) &&
-	    (l2 != ISDN_PROTO_L2_MODEM)
-#ifdef CONFIG_ISDN_TTY_FAX
-	    && (l2 != ISDN_PROTO_L2_FAX)
-#endif
-		) {
-		l2 = ISDN_PROTO_L2_TRANS;
-		usg = ISDN_USAGE_VOICE;
-	}
-#endif
-	m->mdmreg[REG_SI1I] = si2bit[si];
-	spin_lock_irqsave(&dev->lock, flags);
-	i = isdn_get_free_channel(usg, l2, m->mdmreg[REG_L3PROT], -1, -1, m->msn);
-	if (i < 0) {
-		spin_unlock_irqrestore(&dev->lock, flags);
-		isdn_tty_modem_result(RESULT_NO_DIALTONE, info);
-	} else {
-		info->isdn_driver = dev->drvmap[i];
-		info->isdn_channel = dev->chanmap[i];
-		info->drv_index = i;
-		dev->m_idx[i] = info->line;
-		dev->usage[i] |= ISDN_USAGE_OUTGOING;
-		info->last_dir = 1;
-//		strcpy(info->last_num, n);
-		isdn_info_update();
-		spin_unlock_irqrestore(&dev->lock, flags);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		cmd.command = ISDN_CMD_CLREAZ;
-		isdn_command(&cmd);
-		strcpy(cmd.parm.num, isdn_map_eaz2msn(m->msn, info->isdn_driver));
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETEAZ;
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL2;
-		info->last_l2 = l2;
-		cmd.arg = info->isdn_channel + (l2 << 8);
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL3;
-		cmd.arg = info->isdn_channel + (m->mdmreg[REG_L3PROT] << 8);
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		cmd.parm.cmsg.Length = l + 18;
-		cmd.parm.cmsg.Command = CAPI_FACILITY;
-		cmd.parm.cmsg.Subcommand = CAPI_REQ;
-		cmd.parm.cmsg.adr.Controller = info->isdn_driver + 1;
-		cmd.parm.cmsg.para[0] = 3; /* 16 bit 0x0003 suplementary service */
-		cmd.parm.cmsg.para[1] = 0;
-		cmd.parm.cmsg.para[2] = l + 3;
-		cmd.parm.cmsg.para[3] = 5; /* 16 bit 0x0005 Resume */
-		cmd.parm.cmsg.para[4] = 0;
-		cmd.parm.cmsg.para[5] = l;
-		memcpy(&cmd.parm.cmsg.para[6], id, l);
-		cmd.command = CAPI_PUT_MESSAGE;
-		info->dialing = 1;
-//		strcpy(dev->num[i], n);
-		isdn_info_update();
-		isdn_command(&cmd);
-		isdn_timer_ctrl(ISDN_TIMER_CARRIER, 1);
-	}
-}
-
-/* isdn_tty_send_msg() sends a message to a HL driver
- * This is used for hybrid modem cards to send AT commands to it
- */
-
-static void
-isdn_tty_send_msg(modem_info *info, atemu *m, char *msg)
-{
-	int usg = ISDN_USAGE_MODEM;
-	int si = 7;
-	int l2 = m->mdmreg[REG_L2PROT];
-	isdn_ctrl cmd;
-	ulong flags;
-	int i;
-	int j;
-	int l;
-
-	l = min(strlen(msg), sizeof(cmd.parm) - sizeof(cmd.parm.cmsg)
-		+ sizeof(cmd.parm.cmsg.para) - 2);
-
-	if (!l) {
-		isdn_tty_modem_result(RESULT_ERROR, info);
-		return;
-	}
-	for (j = 7; j >= 0; j--)
-		if (m->mdmreg[REG_SI1] & (1 << j)) {
-			si = bit2si[j];
-			break;
-		}
-	usg = isdn_calc_usage(si, l2);
-#ifdef CONFIG_ISDN_AUDIO
-	if ((si == 1) &&
-	    (l2 != ISDN_PROTO_L2_MODEM)
-#ifdef CONFIG_ISDN_TTY_FAX
-	    && (l2 != ISDN_PROTO_L2_FAX)
-#endif
-		) {
-		l2 = ISDN_PROTO_L2_TRANS;
-		usg = ISDN_USAGE_VOICE;
-	}
-#endif
-	m->mdmreg[REG_SI1I] = si2bit[si];
-	spin_lock_irqsave(&dev->lock, flags);
-	i = isdn_get_free_channel(usg, l2, m->mdmreg[REG_L3PROT], -1, -1, m->msn);
-	if (i < 0) {
-		spin_unlock_irqrestore(&dev->lock, flags);
-		isdn_tty_modem_result(RESULT_NO_DIALTONE, info);
-	} else {
-		info->isdn_driver = dev->drvmap[i];
-		info->isdn_channel = dev->chanmap[i];
-		info->drv_index = i;
-		dev->m_idx[i] = info->line;
-		dev->usage[i] |= ISDN_USAGE_OUTGOING;
-		info->last_dir = 1;
-		isdn_info_update();
-		spin_unlock_irqrestore(&dev->lock, flags);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		cmd.command = ISDN_CMD_CLREAZ;
-		isdn_command(&cmd);
-		strcpy(cmd.parm.num, isdn_map_eaz2msn(m->msn, info->isdn_driver));
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETEAZ;
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL2;
-		info->last_l2 = l2;
-		cmd.arg = info->isdn_channel + (l2 << 8);
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL3;
-		cmd.arg = info->isdn_channel + (m->mdmreg[REG_L3PROT] << 8);
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		cmd.parm.cmsg.Length = l + 14;
-		cmd.parm.cmsg.Command = CAPI_MANUFACTURER;
-		cmd.parm.cmsg.Subcommand = CAPI_REQ;
-		cmd.parm.cmsg.adr.Controller = info->isdn_driver + 1;
-		cmd.parm.cmsg.para[0] = l + 1;
-		strncpy(&cmd.parm.cmsg.para[1], msg, l);
-		cmd.parm.cmsg.para[l + 1] = 0xd;
-		cmd.command = CAPI_PUT_MESSAGE;
-/*		info->dialing = 1;
-		strcpy(dev->num[i], n);
-		isdn_info_update();
-*/
-		isdn_command(&cmd);
-	}
-}
-
-static inline int
-isdn_tty_paranoia_check(modem_info *info, char *name, const char *routine)
-{
-#ifdef MODEM_PARANOIA_CHECK
-	if (!info) {
-		printk(KERN_WARNING "isdn_tty: null info_struct for %s in %s\n",
-		       name, routine);
-		return 1;
-	}
-	if (info->magic != ISDN_ASYNC_MAGIC) {
-		printk(KERN_WARNING "isdn_tty: bad magic for modem struct %s in %s\n",
-		       name, routine);
-		return 1;
-	}
-#endif
-	return 0;
-}
-
-/*
- * This routine is called to set the UART divisor registers to match
- * the specified baud rate for a serial port.
- */
-static void
-isdn_tty_change_speed(modem_info *info)
-{
-	struct tty_port *port = &info->port;
-	uint cflag,
-		cval,
-		quot;
-	int i;
-
-	if (!port->tty)
-		return;
-	cflag = port->tty->termios.c_cflag;
-
-	quot = i = cflag & CBAUD;
-	if (i & CBAUDEX) {
-		i &= ~CBAUDEX;
-		if (i < 1 || i > 2)
-			port->tty->termios.c_cflag &= ~CBAUDEX;
-		else
-			i += 15;
-	}
-	if (quot) {
-		info->mcr |= UART_MCR_DTR;
-		isdn_tty_modem_ncarrier(info);
-	} else {
-		info->mcr &= ~UART_MCR_DTR;
-		if (info->emu.mdmreg[REG_DTRHUP] & BIT_DTRHUP) {
-#ifdef ISDN_DEBUG_MODEM_HUP
-			printk(KERN_DEBUG "Mhup in changespeed\n");
-#endif
-			if (info->online)
-				info->ncarrier = 1;
-			isdn_tty_modem_reset_regs(info, 0);
-			isdn_tty_modem_hup(info, 1);
-		}
-		return;
-	}
-	/* byte size and parity */
-	cval = cflag & (CSIZE | CSTOPB);
-	cval >>= 4;
-	if (cflag & PARENB)
-		cval |= UART_LCR_PARITY;
-	if (!(cflag & PARODD))
-		cval |= UART_LCR_EPAR;
-
-	tty_port_set_check_carrier(port, ~cflag & CLOCAL);
-}
-
-static int
-isdn_tty_startup(modem_info *info)
-{
-	if (tty_port_initialized(&info->port))
-		return 0;
-	isdn_lock_drivers();
-#ifdef ISDN_DEBUG_MODEM_OPEN
-	printk(KERN_DEBUG "starting up ttyi%d ...\n", info->line);
-#endif
-	/*
-	 * Now, initialize the UART
-	 */
-	info->mcr = UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2;
-	if (info->port.tty)
-		clear_bit(TTY_IO_ERROR, &info->port.tty->flags);
-	/*
-	 * and set the speed of the serial port
-	 */
-	isdn_tty_change_speed(info);
-
-	tty_port_set_initialized(&info->port, 1);
-	info->msr |= (UART_MSR_DSR | UART_MSR_CTS);
-	info->send_outstanding = 0;
-	return 0;
-}
-
-/*
- * This routine will shutdown a serial port; interrupts are disabled, and
- * DTR is dropped if the hangup on close termio flag is on.
- */
-static void
-isdn_tty_shutdown(modem_info *info)
-{
-	if (!tty_port_initialized(&info->port))
-		return;
-#ifdef ISDN_DEBUG_MODEM_OPEN
-	printk(KERN_DEBUG "Shutting down isdnmodem port %d ....\n", info->line);
-#endif
-	isdn_unlock_drivers();
-	info->msr &= ~UART_MSR_RI;
-	if (!info->port.tty || (info->port.tty->termios.c_cflag & HUPCL)) {
-		info->mcr &= ~(UART_MCR_DTR | UART_MCR_RTS);
-		if (info->emu.mdmreg[REG_DTRHUP] & BIT_DTRHUP) {
-			isdn_tty_modem_reset_regs(info, 0);
-#ifdef ISDN_DEBUG_MODEM_HUP
-			printk(KERN_DEBUG "Mhup in isdn_tty_shutdown\n");
-#endif
-			isdn_tty_modem_hup(info, 1);
-		}
-	}
-	if (info->port.tty)
-		set_bit(TTY_IO_ERROR, &info->port.tty->flags);
-
-	tty_port_set_initialized(&info->port, 0);
-}
-
-/* isdn_tty_write() is the main send-routine. It is called from the upper
- * levels within the kernel to perform sending data. Depending on the
- * online-flag it either directs output to the at-command-interpreter or
- * to the lower level. Additional tasks done here:
- *  - If online, check for escape-sequence (+++)
- *  - If sending audio-data, call isdn_tty_DLEdown() to parse DLE-codes.
- *  - If receiving audio-data, call isdn_tty_end_vrx() to abort if needed.
- *  - If dialing, abort dial.
- */
-static int
-isdn_tty_write(struct tty_struct *tty, const u_char *buf, int count)
-{
-	int c;
-	int total = 0;
-	modem_info *info = (modem_info *) tty->driver_data;
-	atemu *m = &info->emu;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_write"))
-		return 0;
-	/* See isdn_tty_senddown() */
-	atomic_inc(&info->xmit_lock);
-	while (1) {
-		c = count;
-		if (c > info->xmit_size - info->xmit_count)
-			c = info->xmit_size - info->xmit_count;
-		if (info->isdn_driver >= 0 && c > dev->drv[info->isdn_driver]->maxbufsize)
-			c = dev->drv[info->isdn_driver]->maxbufsize;
-		if (c <= 0)
-			break;
-		if ((info->online > 1)
-#ifdef CONFIG_ISDN_AUDIO
-		    || (info->vonline & 3)
-#endif
-			) {
-#ifdef CONFIG_ISDN_AUDIO
-			if (!info->vonline)
-#endif
-				isdn_tty_check_esc(buf, m->mdmreg[REG_ESC], c,
-						   &(m->pluscount),
-						   &(m->lastplus));
-			memcpy(&info->port.xmit_buf[info->xmit_count], buf, c);
-#ifdef CONFIG_ISDN_AUDIO
-			if (info->vonline) {
-				int cc = isdn_tty_handleDLEdown(info, m, c);
-				if (info->vonline & 2) {
-					if (!cc) {
-						/* If DLE decoding results in zero-transmit, but
-						 * c originally was non-zero, do a wakeup.
-						 */
-						tty_wakeup(tty);
-						info->msr |= UART_MSR_CTS;
-						info->lsr |= UART_LSR_TEMT;
-					}
-					info->xmit_count += cc;
-				}
-				if ((info->vonline & 3) == 1) {
-					/* Do NOT handle Ctrl-Q or Ctrl-S
-					 * when in full-duplex audio mode.
-					 */
-					if (isdn_tty_end_vrx(buf, c)) {
-						info->vonline &= ~1;
-#ifdef ISDN_DEBUG_MODEM_VOICE
-						printk(KERN_DEBUG
-						       "got !^Q/^S, send DLE-ETX,VCON on ttyI%d\n",
-						       info->line);
-#endif
-						isdn_tty_at_cout("\020\003\r\nVCON\r\n", info);
-					}
-				}
-			} else
-				if (TTY_IS_FCLASS1(info)) {
-					int cc = isdn_tty_handleDLEdown(info, m, c);
-
-					if (info->vonline & 4) { /* ETX seen */
-						isdn_ctrl c;
-
-						c.command = ISDN_CMD_FAXCMD;
-						c.driver = info->isdn_driver;
-						c.arg = info->isdn_channel;
-						c.parm.aux.cmd = ISDN_FAX_CLASS1_CTRL;
-						c.parm.aux.subcmd = ETX;
-						isdn_command(&c);
-					}
-					info->vonline = 0;
-#ifdef ISDN_DEBUG_MODEM_VOICE
-					printk(KERN_DEBUG "fax dle cc/c %d/%d\n", cc, c);
-#endif
-					info->xmit_count += cc;
-				} else
-#endif
-					info->xmit_count += c;
-		} else {
-			info->msr |= UART_MSR_CTS;
-			info->lsr |= UART_LSR_TEMT;
-			if (info->dialing) {
-				info->dialing = 0;
-#ifdef ISDN_DEBUG_MODEM_HUP
-				printk(KERN_DEBUG "Mhup in isdn_tty_write\n");
-#endif
-				isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-				isdn_tty_modem_hup(info, 1);
-			} else
-				c = isdn_tty_edit_at(buf, c, info);
-		}
-		buf += c;
-		count -= c;
-		total += c;
-	}
-	atomic_dec(&info->xmit_lock);
-	if ((info->xmit_count) || !skb_queue_empty(&info->xmit_queue)) {
-		if (m->mdmreg[REG_DXMT] & BIT_DXMT) {
-			isdn_tty_senddown(info);
-			isdn_tty_tint(info);
-		}
-		isdn_timer_ctrl(ISDN_TIMER_MODEMXMIT, 1);
-	}
-	return total;
-}
-
-static int
-isdn_tty_write_room(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-	int ret;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_write_room"))
-		return 0;
-	if (!info->online)
-		return info->xmit_size;
-	ret = info->xmit_size - info->xmit_count;
-	return (ret < 0) ? 0 : ret;
-}
-
-static int
-isdn_tty_chars_in_buffer(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_chars_in_buffer"))
-		return 0;
-	if (!info->online)
-		return 0;
-	return (info->xmit_count);
-}
-
-static void
-isdn_tty_flush_buffer(struct tty_struct *tty)
-{
-	modem_info *info;
-
-	if (!tty) {
-		return;
-	}
-	info = (modem_info *) tty->driver_data;
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_flush_buffer")) {
-		return;
-	}
-	isdn_tty_cleanup_xmit(info);
-	info->xmit_count = 0;
-	tty_wakeup(tty);
-}
-
-static void
-isdn_tty_flush_chars(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_flush_chars"))
-		return;
-	if ((info->xmit_count) || !skb_queue_empty(&info->xmit_queue))
-		isdn_timer_ctrl(ISDN_TIMER_MODEMXMIT, 1);
-}
-
-/*
- * ------------------------------------------------------------
- * isdn_tty_throttle()
- *
- * This routine is called by the upper-layer tty layer to signal that
- * incoming characters should be throttled.
- * ------------------------------------------------------------
- */
-static void
-isdn_tty_throttle(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_throttle"))
-		return;
-	if (I_IXOFF(tty))
-		info->x_char = STOP_CHAR(tty);
-	info->mcr &= ~UART_MCR_RTS;
-}
-
-static void
-isdn_tty_unthrottle(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_unthrottle"))
-		return;
-	if (I_IXOFF(tty)) {
-		if (info->x_char)
-			info->x_char = 0;
-		else
-			info->x_char = START_CHAR(tty);
-	}
-	info->mcr |= UART_MCR_RTS;
-}
-
-/*
- * ------------------------------------------------------------
- * isdn_tty_ioctl() and friends
- * ------------------------------------------------------------
- */
-
-/*
- * isdn_tty_get_lsr_info - get line status register info
- *
- * Purpose: Let user call ioctl() to get info when the UART physically
- *          is emptied.  On bus types like RS485, the transmitter must
- *          release the bus after transmitting. This must be done when
- *          the transmit shift register is empty, not be done when the
- *          transmit holding register is empty.  This functionality
- *          allows RS485 driver to be written in user space.
- */
-static int
-isdn_tty_get_lsr_info(modem_info *info, uint __user *value)
-{
-	u_char status;
-	uint result;
-
-	status = info->lsr;
-	result = ((status & UART_LSR_TEMT) ? TIOCSER_TEMT : 0);
-	return put_user(result, value);
-}
-
-
-static int
-isdn_tty_tiocmget(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-	u_char control, status;
-
-	if (isdn_tty_paranoia_check(info, tty->name, __func__))
-		return -ENODEV;
-	if (tty_io_error(tty))
-		return -EIO;
-
-	mutex_lock(&modem_info_mutex);
-#ifdef ISDN_DEBUG_MODEM_IOCTL
-	printk(KERN_DEBUG "ttyI%d ioctl TIOCMGET\n", info->line);
-#endif
-
-	control = info->mcr;
-	status = info->msr;
-	mutex_unlock(&modem_info_mutex);
-	return ((control & UART_MCR_RTS) ? TIOCM_RTS : 0)
-		| ((control & UART_MCR_DTR) ? TIOCM_DTR : 0)
-		| ((status & UART_MSR_DCD) ? TIOCM_CAR : 0)
-		| ((status & UART_MSR_RI) ? TIOCM_RNG : 0)
-		| ((status & UART_MSR_DSR) ? TIOCM_DSR : 0)
-		| ((status & UART_MSR_CTS) ? TIOCM_CTS : 0);
-}
-
-static int
-isdn_tty_tiocmset(struct tty_struct *tty,
-		  unsigned int set, unsigned int clear)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	if (isdn_tty_paranoia_check(info, tty->name, __func__))
-		return -ENODEV;
-	if (tty_io_error(tty))
-		return -EIO;
-
-#ifdef ISDN_DEBUG_MODEM_IOCTL
-	printk(KERN_DEBUG "ttyI%d ioctl TIOCMxxx: %x %x\n", info->line, set, clear);
-#endif
-
-	mutex_lock(&modem_info_mutex);
-	if (set & TIOCM_RTS)
-		info->mcr |= UART_MCR_RTS;
-	if (set & TIOCM_DTR) {
-		info->mcr |= UART_MCR_DTR;
-		isdn_tty_modem_ncarrier(info);
-	}
-
-	if (clear & TIOCM_RTS)
-		info->mcr &= ~UART_MCR_RTS;
-	if (clear & TIOCM_DTR) {
-		info->mcr &= ~UART_MCR_DTR;
-		if (info->emu.mdmreg[REG_DTRHUP] & BIT_DTRHUP) {
-			isdn_tty_modem_reset_regs(info, 0);
-#ifdef ISDN_DEBUG_MODEM_HUP
-			printk(KERN_DEBUG "Mhup in TIOCMSET\n");
-#endif
-			if (info->online)
-				info->ncarrier = 1;
-			isdn_tty_modem_hup(info, 1);
-		}
-	}
-	mutex_unlock(&modem_info_mutex);
-	return 0;
-}
-
-static int
-isdn_tty_ioctl(struct tty_struct *tty, uint cmd, ulong arg)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_ioctl"))
-		return -ENODEV;
-	if (tty_io_error(tty))
-		return -EIO;
-	switch (cmd) {
-	case TIOCSERGETLSR:	/* Get line status register */
-#ifdef ISDN_DEBUG_MODEM_IOCTL
-		printk(KERN_DEBUG "ttyI%d ioctl TIOCSERGETLSR\n", info->line);
-#endif
-		return isdn_tty_get_lsr_info(info, (uint __user *) arg);
-	default:
-#ifdef ISDN_DEBUG_MODEM_IOCTL
-		printk(KERN_DEBUG "UNKNOWN ioctl 0x%08x on ttyi%d\n", cmd, info->line);
-#endif
-		return -ENOIOCTLCMD;
-	}
-	return 0;
-}
-
-static void
-isdn_tty_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-
-	mutex_lock(&modem_info_mutex);
-	if (!old_termios)
-		isdn_tty_change_speed(info);
-	else {
-		if (tty->termios.c_cflag == old_termios->c_cflag &&
-		    tty->termios.c_ispeed == old_termios->c_ispeed &&
-		    tty->termios.c_ospeed == old_termios->c_ospeed) {
-			mutex_unlock(&modem_info_mutex);
-			return;
-		}
-		isdn_tty_change_speed(info);
-	}
-	mutex_unlock(&modem_info_mutex);
-}
-
-/*
- * ------------------------------------------------------------
- * isdn_tty_open() and friends
- * ------------------------------------------------------------
- */
-
-static int isdn_tty_install(struct tty_driver *driver, struct tty_struct *tty)
-{
-	modem_info *info = &dev->mdm.info[tty->index];
-
-	if (isdn_tty_paranoia_check(info, tty->name, __func__))
-		return -ENODEV;
-
-	tty->driver_data = info;
-
-	return tty_port_install(&info->port, driver, tty);
-}
-
-/*
- * This routine is called whenever a serial port is opened.  It
- * enables interrupts for a serial port, linking in its async structure into
- * the IRQ chain.   It also performs the serial-specific
- * initialization for the tty structure.
- */
-static int
-isdn_tty_open(struct tty_struct *tty, struct file *filp)
-{
-	modem_info *info = tty->driver_data;
-	struct tty_port *port = &info->port;
-	int retval;
-
-#ifdef ISDN_DEBUG_MODEM_OPEN
-	printk(KERN_DEBUG "isdn_tty_open %s, count = %d\n", tty->name,
-	       port->count);
-#endif
-	port->count++;
-	port->tty = tty;
-	/*
-	 * Start up serial port
-	 */
-	retval = isdn_tty_startup(info);
-	if (retval) {
-#ifdef ISDN_DEBUG_MODEM_OPEN
-		printk(KERN_DEBUG "isdn_tty_open return after startup\n");
-#endif
-		return retval;
-	}
-	retval = tty_port_block_til_ready(port, tty, filp);
-	if (retval) {
-#ifdef ISDN_DEBUG_MODEM_OPEN
-		printk(KERN_DEBUG "isdn_tty_open return after isdn_tty_block_til_ready \n");
-#endif
-		return retval;
-	}
-#ifdef ISDN_DEBUG_MODEM_OPEN
-	printk(KERN_DEBUG "isdn_tty_open ttyi%d successful...\n", info->line);
-#endif
-	dev->modempoll++;
-#ifdef ISDN_DEBUG_MODEM_OPEN
-	printk(KERN_DEBUG "isdn_tty_open normal exit\n");
-#endif
-	return 0;
-}
-
-static void
-isdn_tty_close(struct tty_struct *tty, struct file *filp)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-	struct tty_port *port = &info->port;
-	ulong timeout;
-
-	if (!info || isdn_tty_paranoia_check(info, tty->name, "isdn_tty_close"))
-		return;
-	if (tty_hung_up_p(filp)) {
-#ifdef ISDN_DEBUG_MODEM_OPEN
-		printk(KERN_DEBUG "isdn_tty_close return after tty_hung_up_p\n");
-#endif
-		return;
-	}
-	if ((tty->count == 1) && (port->count != 1)) {
-		/*
-		 * Uh, oh.  tty->count is 1, which means that the tty
-		 * structure will be freed.  Info->count should always
-		 * be one in these conditions.  If it's greater than
-		 * one, we've got real problems, since it means the
-		 * serial port won't be shutdown.
-		 */
-		printk(KERN_ERR "isdn_tty_close: bad port count; tty->count is 1, "
-		       "info->count is %d\n", port->count);
-		port->count = 1;
-	}
-	if (--port->count < 0) {
-		printk(KERN_ERR "isdn_tty_close: bad port count for ttyi%d: %d\n",
-		       info->line, port->count);
-		port->count = 0;
-	}
-	if (port->count) {
-#ifdef ISDN_DEBUG_MODEM_OPEN
-		printk(KERN_DEBUG "isdn_tty_close after info->count != 0\n");
-#endif
-		return;
-	}
-	info->closing = 1;
-
-	tty->closing = 1;
-	/*
-	 * At this point we stop accepting input.  To do this, we
-	 * disable the receive line status interrupts, and tell the
-	 * interrupt driver to stop checking the data ready bit in the
-	 * line status register.
-	 */
-	if (tty_port_initialized(port)) {
-		tty_wait_until_sent(tty, 3000);	/* 30 seconds timeout */
-		/*
-		 * Before we drop DTR, make sure the UART transmitter
-		 * has completely drained; this is especially
-		 * important if there is a transmit FIFO!
-		 */
-		timeout = jiffies + HZ;
-		while (!(info->lsr & UART_LSR_TEMT)) {
-			schedule_timeout_interruptible(20);
-			if (time_after(jiffies, timeout))
-				break;
-		}
-	}
-	dev->modempoll--;
-	isdn_tty_shutdown(info);
-	isdn_tty_flush_buffer(tty);
-	tty_ldisc_flush(tty);
-	port->tty = NULL;
-	info->ncarrier = 0;
-
-	tty_port_close_end(port, tty);
-	info->closing = 0;
-#ifdef ISDN_DEBUG_MODEM_OPEN
-	printk(KERN_DEBUG "isdn_tty_close normal exit\n");
-#endif
-}
-
-/*
- * isdn_tty_hangup() --- called by tty_hangup() when a hangup is signaled.
- */
-static void
-isdn_tty_hangup(struct tty_struct *tty)
-{
-	modem_info *info = (modem_info *) tty->driver_data;
-	struct tty_port *port = &info->port;
-
-	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_hangup"))
-		return;
-	isdn_tty_shutdown(info);
-	port->count = 0;
-	tty_port_set_active(port, 0);
-	port->tty = NULL;
-	wake_up_interruptible(&port->open_wait);
-}
-
-/* This routine initializes all emulator-data.
- */
-static void
-isdn_tty_reset_profile(atemu *m)
-{
-	m->profile[0] = 0;
-	m->profile[1] = 0;
-	m->profile[2] = 43;
-	m->profile[3] = 13;
-	m->profile[4] = 10;
-	m->profile[5] = 8;
-	m->profile[6] = 3;
-	m->profile[7] = 60;
-	m->profile[8] = 2;
-	m->profile[9] = 6;
-	m->profile[10] = 7;
-	m->profile[11] = 70;
-	m->profile[12] = 0x45;
-	m->profile[13] = 4;
-	m->profile[14] = ISDN_PROTO_L2_X75I;
-	m->profile[15] = ISDN_PROTO_L3_TRANS;
-	m->profile[16] = ISDN_SERIAL_XMIT_SIZE / 16;
-	m->profile[17] = ISDN_MODEM_WINSIZE;
-	m->profile[18] = 4;
-	m->profile[19] = 0;
-	m->profile[20] = 0;
-	m->profile[23] = 0;
-	m->pmsn[0] = '\0';
-	m->plmsn[0] = '\0';
-}
-
-#ifdef CONFIG_ISDN_AUDIO
-static void
-isdn_tty_modem_reset_vpar(atemu *m)
-{
-	m->vpar[0] = 2;         /* Voice-device            (2 = phone line) */
-	m->vpar[1] = 0;         /* Silence detection level (0 = none      ) */
-	m->vpar[2] = 70;        /* Silence interval        (7 sec.        ) */
-	m->vpar[3] = 2;         /* Compression type        (1 = ADPCM-2   ) */
-	m->vpar[4] = 0;         /* DTMF detection level    (0 = softcode  ) */
-	m->vpar[5] = 8;         /* DTMF interval           (8 * 5 ms.     ) */
-}
-#endif
-
-#ifdef CONFIG_ISDN_TTY_FAX
-static void
-isdn_tty_modem_reset_faxpar(modem_info *info)
-{
-	T30_s *f = info->fax;
-
-	f->code = 0;
-	f->phase = ISDN_FAX_PHASE_IDLE;
-	f->direction = 0;
-	f->resolution = 1;	/* fine */
-	f->rate = 5;		/* 14400 bit/s */
-	f->width = 0;
-	f->length = 0;
-	f->compression = 0;
-	f->ecm = 0;
-	f->binary = 0;
-	f->scantime = 0;
-	memset(&f->id[0], 32, FAXIDLEN - 1);
-	f->id[FAXIDLEN - 1] = 0;
-	f->badlin = 0;
-	f->badmul = 0;
-	f->bor = 0;
-	f->nbc = 0;
-	f->cq = 0;
-	f->cr = 0;
-	f->ctcrty = 0;
-	f->minsp = 0;
-	f->phcto = 30;
-	f->rel = 0;
-	memset(&f->pollid[0], 32, FAXIDLEN - 1);
-	f->pollid[FAXIDLEN - 1] = 0;
-}
-#endif
-
-static void
-isdn_tty_modem_reset_regs(modem_info *info, int force)
-{
-	atemu *m = &info->emu;
-	if ((m->mdmreg[REG_DTRR] & BIT_DTRR) || force) {
-		memcpy(m->mdmreg, m->profile, ISDN_MODEM_NUMREG);
-		memcpy(m->msn, m->pmsn, ISDN_MSNLEN);
-		memcpy(m->lmsn, m->plmsn, ISDN_LMSNLEN);
-		info->xmit_size = m->mdmreg[REG_PSIZE] * 16;
-	}
-#ifdef CONFIG_ISDN_AUDIO
-	isdn_tty_modem_reset_vpar(m);
-#endif
-#ifdef CONFIG_ISDN_TTY_FAX
-	isdn_tty_modem_reset_faxpar(info);
-#endif
-	m->mdmcmdl = 0;
-}
-
-static void
-modem_write_profile(atemu *m)
-{
-	memcpy(m->profile, m->mdmreg, ISDN_MODEM_NUMREG);
-	memcpy(m->pmsn, m->msn, ISDN_MSNLEN);
-	memcpy(m->plmsn, m->lmsn, ISDN_LMSNLEN);
-	if (dev->profd)
-		send_sig(SIGIO, dev->profd, 1);
-}
-
-static const struct tty_operations modem_ops = {
-	.install = isdn_tty_install,
-	.open = isdn_tty_open,
-	.close = isdn_tty_close,
-	.write = isdn_tty_write,
-	.flush_chars = isdn_tty_flush_chars,
-	.write_room = isdn_tty_write_room,
-	.chars_in_buffer = isdn_tty_chars_in_buffer,
-	.flush_buffer = isdn_tty_flush_buffer,
-	.ioctl = isdn_tty_ioctl,
-	.throttle = isdn_tty_throttle,
-	.unthrottle = isdn_tty_unthrottle,
-	.set_termios = isdn_tty_set_termios,
-	.hangup = isdn_tty_hangup,
-	.tiocmget = isdn_tty_tiocmget,
-	.tiocmset = isdn_tty_tiocmset,
-};
-
-static int isdn_tty_carrier_raised(struct tty_port *port)
-{
-	modem_info *info = container_of(port, modem_info, port);
-	return info->msr & UART_MSR_DCD;
-}
-
-static const struct tty_port_operations isdn_tty_port_ops = {
-	.carrier_raised = isdn_tty_carrier_raised,
-};
-
-int
-isdn_tty_modem_init(void)
-{
-	isdn_modem_t	*m;
-	int		i, retval;
-	modem_info	*info;
-
-	m = &dev->mdm;
-	m->tty_modem = alloc_tty_driver(ISDN_MAX_CHANNELS);
-	if (!m->tty_modem)
-		return -ENOMEM;
-	m->tty_modem->name = "ttyI";
-	m->tty_modem->major = ISDN_TTY_MAJOR;
-	m->tty_modem->minor_start = 0;
-	m->tty_modem->type = TTY_DRIVER_TYPE_SERIAL;
-	m->tty_modem->subtype = SERIAL_TYPE_NORMAL;
-	m->tty_modem->init_termios = tty_std_termios;
-	m->tty_modem->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;
-	m->tty_modem->flags = TTY_DRIVER_REAL_RAW;
-	m->tty_modem->driver_name = "isdn_tty";
-	tty_set_operations(m->tty_modem, &modem_ops);
-	retval = tty_register_driver(m->tty_modem);
-	if (retval) {
-		printk(KERN_WARNING "isdn_tty: Couldn't register modem-device\n");
-		goto err;
-	}
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		info = &m->info[i];
-#ifdef CONFIG_ISDN_TTY_FAX
-		if (!(info->fax = kmalloc(sizeof(T30_s), GFP_KERNEL))) {
-			printk(KERN_ERR "Could not allocate fax t30-buffer\n");
-			retval = -ENOMEM;
-			goto err_unregister;
-		}
-#endif
-		tty_port_init(&info->port);
-		info->port.ops = &isdn_tty_port_ops;
-		spin_lock_init(&info->readlock);
-		sprintf(info->last_cause, "0000");
-		sprintf(info->last_num, "none");
-		info->last_dir = 0;
-		info->last_lhup = 1;
-		info->last_l2 = -1;
-		info->last_si = 0;
-		isdn_tty_reset_profile(&info->emu);
-		isdn_tty_modem_reset_regs(info, 1);
-		info->magic = ISDN_ASYNC_MAGIC;
-		info->line = i;
-		info->x_char = 0;
-		info->isdn_driver = -1;
-		info->isdn_channel = -1;
-		info->drv_index = -1;
-		info->xmit_size = ISDN_SERIAL_XMIT_SIZE;
-		timer_setup(&info->nc_timer, isdn_tty_modem_do_ncarrier, 0);
-		skb_queue_head_init(&info->xmit_queue);
-#ifdef CONFIG_ISDN_AUDIO
-		skb_queue_head_init(&info->dtmf_queue);
-#endif
-		info->port.xmit_buf = kmalloc(ISDN_SERIAL_XMIT_MAX + 5,
-				GFP_KERNEL);
-		if (!info->port.xmit_buf) {
-			printk(KERN_ERR "Could not allocate modem xmit-buffer\n");
-			retval = -ENOMEM;
-			goto err_unregister;
-		}
-		/* Make room for T.70 header */
-		info->port.xmit_buf += 4;
-	}
-	return 0;
-err_unregister:
-	for (i--; i >= 0; i--) {
-		info = &m->info[i];
-#ifdef CONFIG_ISDN_TTY_FAX
-		kfree(info->fax);
-#endif
-		kfree(info->port.xmit_buf - 4);
-		info->port.xmit_buf = NULL;
-		tty_port_destroy(&info->port);
-	}
-	tty_unregister_driver(m->tty_modem);
-err:
-	put_tty_driver(m->tty_modem);
-	m->tty_modem = NULL;
-	return retval;
-}
-
-void
-isdn_tty_exit(void)
-{
-	modem_info *info;
-	int i;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		info = &dev->mdm.info[i];
-		isdn_tty_cleanup_xmit(info);
-#ifdef CONFIG_ISDN_TTY_FAX
-		kfree(info->fax);
-#endif
-		kfree(info->port.xmit_buf - 4);
-		info->port.xmit_buf = NULL;
-		tty_port_destroy(&info->port);
-	}
-	tty_unregister_driver(dev->mdm.tty_modem);
-	put_tty_driver(dev->mdm.tty_modem);
-	dev->mdm.tty_modem = NULL;
-}
-
-
-/*
- * isdn_tty_match_icall(char *MSN, atemu *tty_emulator, int dev_idx)
- *      match the MSN against the MSNs (glob patterns) defined for tty_emulator,
- *      and return 0 for match, 1 for no match, 2 if MSN could match if longer.
- */
-
-static int
-isdn_tty_match_icall(char *cid, atemu *emu, int di)
-{
-#ifdef ISDN_DEBUG_MODEM_ICALL
-	printk(KERN_DEBUG "m_fi: msn=%s lmsn=%s mmsn=%s mreg[SI1]=%d mreg[SI2]=%d\n",
-	       emu->msn, emu->lmsn, isdn_map_eaz2msn(emu->msn, di),
-	       emu->mdmreg[REG_SI1], emu->mdmreg[REG_SI2]);
-#endif
-	if (strlen(emu->lmsn)) {
-		char *p = emu->lmsn;
-		char *q;
-		int  tmp;
-		int  ret = 0;
-
-		while (1) {
-			if ((q = strchr(p, ';')))
-				*q = '\0';
-			if ((tmp = isdn_msncmp(cid, isdn_map_eaz2msn(p, di))) > ret)
-				ret = tmp;
-#ifdef ISDN_DEBUG_MODEM_ICALL
-			printk(KERN_DEBUG "m_fi: lmsnX=%s mmsn=%s -> tmp=%d\n",
-			       p, isdn_map_eaz2msn(emu->msn, di), tmp);
-#endif
-			if (q) {
-				*q = ';';
-				p = q;
-				p++;
-			}
-			if (!tmp)
-				return 0;
-			if (!q)
-				break;
-		}
-		return ret;
-	} else {
-		int tmp;
-		tmp = isdn_msncmp(cid, isdn_map_eaz2msn(emu->msn, di));
-#ifdef ISDN_DEBUG_MODEM_ICALL
-		printk(KERN_DEBUG "m_fi: mmsn=%s -> tmp=%d\n",
-		       isdn_map_eaz2msn(emu->msn, di), tmp);
-#endif
-		return tmp;
-	}
-}
-
-/*
- * An incoming call-request has arrived.
- * Search the tty-devices for an appropriate device and bind
- * it to the ISDN-Channel.
- * Return:
- *
- *  0 = No matching device found.
- *  1 = A matching device found.
- *  3 = No match found, but eventually would match, if
- *      CID is longer.
- */
-int
-isdn_tty_find_icall(int di, int ch, setup_parm *setup)
-{
-	char *eaz;
-	int i;
-	int wret;
-	int idx;
-	int si1;
-	int si2;
-	char *nr;
-	ulong flags;
-
-	if (!setup->phone[0]) {
-		nr = "0";
-		printk(KERN_INFO "isdn_tty: Incoming call without OAD, assuming '0'\n");
-	} else
-		nr = setup->phone;
-	si1 = (int) setup->si1;
-	si2 = (int) setup->si2;
-	if (!setup->eazmsn[0]) {
-		printk(KERN_WARNING "isdn_tty: Incoming call without CPN, assuming '0'\n");
-		eaz = "0";
-	} else
-		eaz = setup->eazmsn;
-#ifdef ISDN_DEBUG_MODEM_ICALL
-	printk(KERN_DEBUG "m_fi: eaz=%s si1=%d si2=%d\n", eaz, si1, si2);
-#endif
-	wret = 0;
-	spin_lock_irqsave(&dev->lock, flags);
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		modem_info *info = &dev->mdm.info[i];
-
-		if (info->port.count == 0)
-			continue;
-		if ((info->emu.mdmreg[REG_SI1] & si2bit[si1]) &&  /* SI1 is matching */
-		    (info->emu.mdmreg[REG_SI2] == si2))	{         /* SI2 is matching */
-			idx = isdn_dc2minor(di, ch);
-#ifdef ISDN_DEBUG_MODEM_ICALL
-			printk(KERN_DEBUG "m_fi: match1 wret=%d\n", wret);
-			printk(KERN_DEBUG "m_fi: idx=%d flags=%08lx drv=%d ch=%d usg=%d\n", idx,
-			       info->port.flags, info->isdn_driver,
-			       info->isdn_channel, dev->usage[idx]);
-#endif
-			if (
-#ifndef FIX_FILE_TRANSFER
-			    tty_port_active(&info->port) &&
-#endif
-				(info->isdn_driver == -1) &&
-				(info->isdn_channel == -1) &&
-				(USG_NONE(dev->usage[idx]))) {
-				int matchret;
-
-				if ((matchret = isdn_tty_match_icall(eaz, &info->emu, di)) > wret)
-					wret = matchret;
-				if (!matchret) {                  /* EAZ is matching */
-					info->isdn_driver = di;
-					info->isdn_channel = ch;
-					info->drv_index = idx;
-					dev->m_idx[idx] = info->line;
-					dev->usage[idx] &= ISDN_USAGE_EXCLUSIVE;
-					dev->usage[idx] |= isdn_calc_usage(si1, info->emu.mdmreg[REG_L2PROT]);
-					strcpy(dev->num[idx], nr);
-					strcpy(info->emu.cpn, eaz);
-					info->emu.mdmreg[REG_SI1I] = si2bit[si1];
-					info->emu.mdmreg[REG_PLAN] = setup->plan;
-					info->emu.mdmreg[REG_SCREEN] = setup->screen;
-					isdn_info_update();
-					spin_unlock_irqrestore(&dev->lock, flags);
-					printk(KERN_INFO "isdn_tty: call from %s, -> RING on ttyI%d\n", nr,
-					       info->line);
-					info->msr |= UART_MSR_RI;
-					isdn_tty_modem_result(RESULT_RING, info);
-					isdn_timer_ctrl(ISDN_TIMER_MODEMRING, 1);
-					return 1;
-				}
-			}
-		}
-	}
-	spin_unlock_irqrestore(&dev->lock, flags);
-	printk(KERN_INFO "isdn_tty: call from %s -> %s %s\n", nr, eaz,
-	       ((dev->drv[di]->flags & DRV_FLAG_REJBUS) && (wret != 2)) ? "rejected" : "ignored");
-	return (wret == 2) ? 3 : 0;
-}
-
-int
-isdn_tty_stat_callback(int i, isdn_ctrl *c)
-{
-	int mi;
-	modem_info *info;
-	char *e;
-
-	if (i < 0)
-		return 0;
-	if ((mi = dev->m_idx[i]) >= 0) {
-		info = &dev->mdm.info[mi];
-		switch (c->command) {
-		case ISDN_STAT_CINF:
-			printk(KERN_DEBUG "CHARGEINFO on ttyI%d: %ld %s\n", info->line, c->arg, c->parm.num);
-			info->emu.charge = (unsigned) simple_strtoul(c->parm.num, &e, 10);
-			if (e == (char *)c->parm.num)
-				info->emu.charge = 0;
-
-			break;
-		case ISDN_STAT_BSENT:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_BSENT ttyI%d\n", info->line);
-#endif
-			if ((info->isdn_driver == c->driver) &&
-			    (info->isdn_channel == c->arg)) {
-				info->msr |= UART_MSR_CTS;
-				if (info->send_outstanding)
-					if (!(--info->send_outstanding))
-						info->lsr |= UART_LSR_TEMT;
-				isdn_tty_tint(info);
-				return 1;
-			}
-			break;
-		case ISDN_STAT_CAUSE:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_CAUSE ttyI%d\n", info->line);
-#endif
-			/* Signal cause to tty-device */
-			strncpy(info->last_cause, c->parm.num, 5);
-			return 1;
-		case ISDN_STAT_DISPLAY:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_DISPLAY ttyI%d\n", info->line);
-#endif
-			/* Signal display to tty-device */
-			if ((info->emu.mdmreg[REG_DISPLAY] & BIT_DISPLAY) &&
-			    !(info->emu.mdmreg[REG_RESPNUM] & BIT_RESPNUM)) {
-				isdn_tty_at_cout("\r\n", info);
-				isdn_tty_at_cout("DISPLAY: ", info);
-				isdn_tty_at_cout(c->parm.display, info);
-				isdn_tty_at_cout("\r\n", info);
-			}
-			return 1;
-		case ISDN_STAT_DCONN:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_DCONN ttyI%d\n", info->line);
-#endif
-			if (tty_port_active(&info->port)) {
-				if (info->dialing == 1) {
-					info->dialing = 2;
-					return 1;
-				}
-			}
-			break;
-		case ISDN_STAT_DHUP:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_DHUP ttyI%d\n", info->line);
-#endif
-			if (tty_port_active(&info->port)) {
-				if (info->dialing == 1)
-					isdn_tty_modem_result(RESULT_BUSY, info);
-				if (info->dialing > 1)
-					isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-				info->dialing = 0;
-#ifdef ISDN_DEBUG_MODEM_HUP
-				printk(KERN_DEBUG "Mhup in ISDN_STAT_DHUP\n");
-#endif
-				isdn_tty_modem_hup(info, 0);
-				return 1;
-			}
-			break;
-		case ISDN_STAT_BCONN:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_BCONN ttyI%d\n", info->line);
-#endif
-			/* Wake up any processes waiting
-			 * for incoming call of this device when
-			 * DCD follow the state of incoming carrier
-			 */
-			if (info->port.blocked_open &&
-			    (info->emu.mdmreg[REG_DCD] & BIT_DCD)) {
-				wake_up_interruptible(&info->port.open_wait);
-			}
-
-			/* Schedule CONNECT-Message to any tty
-			 * waiting for it and
-			 * set DCD-bit of its modem-status.
-			 */
-			if (tty_port_active(&info->port) ||
-			    (info->port.blocked_open &&
-			     (info->emu.mdmreg[REG_DCD] & BIT_DCD))) {
-				info->msr |= UART_MSR_DCD;
-				info->emu.charge = 0;
-				if (info->dialing & 0xf)
-					info->last_dir = 1;
-				else
-					info->last_dir = 0;
-				info->dialing = 0;
-				info->rcvsched = 1;
-				if (USG_MODEM(dev->usage[i])) {
-					if (info->emu.mdmreg[REG_L2PROT] == ISDN_PROTO_L2_MODEM) {
-						strcpy(info->emu.connmsg, c->parm.num);
-						isdn_tty_modem_result(RESULT_CONNECT, info);
-					} else
-						isdn_tty_modem_result(RESULT_CONNECT64000, info);
-				}
-				if (USG_VOICE(dev->usage[i]))
-					isdn_tty_modem_result(RESULT_VCON, info);
-				return 1;
-			}
-			break;
-		case ISDN_STAT_BHUP:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_BHUP ttyI%d\n", info->line);
-#endif
-			if (tty_port_active(&info->port)) {
-#ifdef ISDN_DEBUG_MODEM_HUP
-				printk(KERN_DEBUG "Mhup in ISDN_STAT_BHUP\n");
-#endif
-				isdn_tty_modem_hup(info, 0);
-				return 1;
-			}
-			break;
-		case ISDN_STAT_NODCH:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_NODCH ttyI%d\n", info->line);
-#endif
-			if (tty_port_active(&info->port)) {
-				if (info->dialing) {
-					info->dialing = 0;
-					info->last_l2 = -1;
-					info->last_si = 0;
-					sprintf(info->last_cause, "0000");
-					isdn_tty_modem_result(RESULT_NO_DIALTONE, info);
-				}
-				isdn_tty_modem_hup(info, 0);
-				return 1;
-			}
-			break;
-		case ISDN_STAT_UNLOAD:
-#ifdef ISDN_TTY_STAT_DEBUG
-			printk(KERN_DEBUG "tty_STAT_UNLOAD ttyI%d\n", info->line);
-#endif
-			for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-				info = &dev->mdm.info[i];
-				if (info->isdn_driver == c->driver) {
-					if (info->online)
-						isdn_tty_modem_hup(info, 1);
-				}
-			}
-			return 1;
-#ifdef CONFIG_ISDN_TTY_FAX
-		case ISDN_STAT_FAXIND:
-			if (tty_port_active(&info->port)) {
-				isdn_tty_fax_command(info, c);
-			}
-			break;
-#endif
-#ifdef CONFIG_ISDN_AUDIO
-		case ISDN_STAT_AUDIO:
-			if (tty_port_active(&info->port)) {
-				switch (c->parm.num[0]) {
-				case ISDN_AUDIO_DTMF:
-					if (info->vonline) {
-						isdn_audio_put_dle_code(info,
-									c->parm.num[1]);
-					}
-					break;
-				}
-			}
-			break;
-#endif
-		}
-	}
-	return 0;
-}
-
-/*********************************************************************
- Modem-Emulator-Routines
-*********************************************************************/
-
-#define cmdchar(c) ((c >= ' ') && (c <= 0x7f))
-
-/*
- * Put a message from the AT-emulator into receive-buffer of tty,
- * convert CR, LF, and BS to values in modem-registers 3, 4 and 5.
- */
-void
-isdn_tty_at_cout(char *msg, modem_info *info)
-{
-	struct tty_port *port = &info->port;
-	atemu *m = &info->emu;
-	char *p;
-	char c;
-	u_long flags;
-	struct sk_buff *skb = NULL;
-	char *sp = NULL;
-	int l;
-
-	if (!msg) {
-		printk(KERN_WARNING "isdn_tty: Null-Message in isdn_tty_at_cout\n");
-		return;
-	}
-
-	l = strlen(msg);
-
-	spin_lock_irqsave(&info->readlock, flags);
-	if (info->closing) {
-		spin_unlock_irqrestore(&info->readlock, flags);
-		return;
-	}
-
-	/* use queue instead of direct, if online and */
-	/* data is in queue or buffer is full */
-	if (info->online && ((tty_buffer_request_room(port, l) < l) ||
-			     !skb_queue_empty(&dev->drv[info->isdn_driver]->rpqueue[info->isdn_channel]))) {
-		skb = alloc_skb(l, GFP_ATOMIC);
-		if (!skb) {
-			spin_unlock_irqrestore(&info->readlock, flags);
-			return;
-		}
-		sp = skb_put(skb, l);
-#ifdef CONFIG_ISDN_AUDIO
-		ISDN_AUDIO_SKB_DLECOUNT(skb) = 0;
-		ISDN_AUDIO_SKB_LOCK(skb) = 0;
-#endif
-	}
-
-	for (p = msg; *p; p++) {
-		switch (*p) {
-		case '\r':
-			c = m->mdmreg[REG_CR];
-			break;
-		case '\n':
-			c = m->mdmreg[REG_LF];
-			break;
-		case '\b':
-			c = m->mdmreg[REG_BS];
-			break;
-		default:
-			c = *p;
-		}
-		if (skb) {
-			*sp++ = c;
-		} else {
-			if (tty_insert_flip_char(port, c, TTY_NORMAL) == 0)
-				break;
-		}
-	}
-	if (skb) {
-		__skb_queue_tail(&dev->drv[info->isdn_driver]->rpqueue[info->isdn_channel], skb);
-		dev->drv[info->isdn_driver]->rcvcount[info->isdn_channel] += skb->len;
-		spin_unlock_irqrestore(&info->readlock, flags);
-		/* Schedule dequeuing */
-		if (dev->modempoll && info->rcvsched)
-			isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1);
-
-	} else {
-		spin_unlock_irqrestore(&info->readlock, flags);
-		tty_flip_buffer_push(port);
-	}
-}
-
-/*
- * Perform ATH Hangup
- */
-static void
-isdn_tty_on_hook(modem_info *info)
-{
-	if (info->isdn_channel >= 0) {
-#ifdef ISDN_DEBUG_MODEM_HUP
-		printk(KERN_DEBUG "Mhup in isdn_tty_on_hook\n");
-#endif
-		isdn_tty_modem_hup(info, 1);
-	}
-}
-
-static void
-isdn_tty_off_hook(void)
-{
-	printk(KERN_DEBUG "isdn_tty_off_hook\n");
-}
-
-#define PLUSWAIT1 (HZ / 2)      /* 0.5 sec. */
-#define PLUSWAIT2 (HZ * 3 / 2)  /* 1.5 sec */
-
-/*
- * Check Buffer for Modem-escape-sequence, activate timer-callback to
- * isdn_tty_modem_escape() if sequence found.
- *
- * Parameters:
- *   p          pointer to databuffer
- *   plus       escape-character
- *   count      length of buffer
- *   pluscount  count of valid escape-characters so far
- *   lastplus   timestamp of last character
- */
-static void
-isdn_tty_check_esc(const u_char *p, u_char plus, int count, int *pluscount,
-		   u_long *lastplus)
-{
-	if (plus > 127)
-		return;
-	if (count > 3) {
-		p += count - 3;
-		count = 3;
-		*pluscount = 0;
-	}
-	while (count > 0) {
-		if (*(p++) == plus) {
-			if ((*pluscount)++) {
-				/* Time since last '+' > 0.5 sec. ? */
-				if (time_after(jiffies, *lastplus + PLUSWAIT1))
-					*pluscount = 1;
-			} else {
-				/* Time since last non-'+' < 1.5 sec. ? */
-				if (time_before(jiffies, *lastplus + PLUSWAIT2))
-					*pluscount = 0;
-			}
-			if ((*pluscount == 3) && (count == 1))
-				isdn_timer_ctrl(ISDN_TIMER_MODEMPLUS, 1);
-			if (*pluscount > 3)
-				*pluscount = 1;
-		} else
-			*pluscount = 0;
-		*lastplus = jiffies;
-		count--;
-	}
-}
-
-/*
- * Return result of AT-emulator to tty-receive-buffer, depending on
- * modem-register 12, bit 0 and 1.
- * For CONNECT-messages also switch to online-mode.
- * For RING-message handle auto-ATA if register 0 != 0
- */
-
-static void
-isdn_tty_modem_result(int code, modem_info *info)
-{
-	atemu *m = &info->emu;
-	static char *msg[] =
-		{"OK", "CONNECT", "RING", "NO CARRIER", "ERROR",
-		 "CONNECT 64000", "NO DIALTONE", "BUSY", "NO ANSWER",
-		 "RINGING", "NO MSN/EAZ", "VCON", "RUNG"};
-	char s[ISDN_MSNLEN + 10];
-
-	switch (code) {
-	case RESULT_RING:
-		m->mdmreg[REG_RINGCNT]++;
-		if (m->mdmreg[REG_RINGCNT] == m->mdmreg[REG_RINGATA])
-			/* Automatically accept incoming call */
-			isdn_tty_cmd_ATA(info);
-		break;
-	case RESULT_NO_CARRIER:
-#ifdef ISDN_DEBUG_MODEM_HUP
-		printk(KERN_DEBUG "modem_result: NO CARRIER %d %d\n",
-		       info->closing, !info->port.tty);
-#endif
-		m->mdmreg[REG_RINGCNT] = 0;
-		del_timer(&info->nc_timer);
-		info->ncarrier = 0;
-		if (info->closing || !info->port.tty)
-			return;
-
-#ifdef CONFIG_ISDN_AUDIO
-		if (info->vonline & 1) {
-#ifdef ISDN_DEBUG_MODEM_VOICE
-			printk(KERN_DEBUG "res3: send DLE-ETX on ttyI%d\n",
-			       info->line);
-#endif
-			/* voice-recording, add DLE-ETX */
-			isdn_tty_at_cout("\020\003", info);
-		}
-		if (info->vonline & 2) {
-#ifdef ISDN_DEBUG_MODEM_VOICE
-			printk(KERN_DEBUG "res3: send DLE-DC4 on ttyI%d\n",
-			       info->line);
-#endif
-			/* voice-playing, add DLE-DC4 */
-			isdn_tty_at_cout("\020\024", info);
-		}
-#endif
-		break;
-	case RESULT_CONNECT:
-	case RESULT_CONNECT64000:
-		sprintf(info->last_cause, "0000");
-		if (!info->online)
-			info->online = 2;
-		break;
-	case RESULT_VCON:
-#ifdef ISDN_DEBUG_MODEM_VOICE
-		printk(KERN_DEBUG "res3: send VCON on ttyI%d\n",
-		       info->line);
-#endif
-		sprintf(info->last_cause, "0000");
-		if (!info->online)
-			info->online = 1;
-		break;
-	} /* switch (code) */
-
-	if (m->mdmreg[REG_RESP] & BIT_RESP) {
-		/* Show results */
-		if (m->mdmreg[REG_RESPNUM] & BIT_RESPNUM) {
-			/* Show numeric results only */
-			sprintf(s, "\r\n%d\r\n", code);
-			isdn_tty_at_cout(s, info);
-		} else {
-			if (code == RESULT_RING) {
-				/* return if "show RUNG" and ringcounter>1 */
-				if ((m->mdmreg[REG_RUNG] & BIT_RUNG) &&
-				    (m->mdmreg[REG_RINGCNT] > 1))
-					return;
-				/* print CID, _before_ _every_ ring */
-				if (!(m->mdmreg[REG_CIDONCE] & BIT_CIDONCE)) {
-					isdn_tty_at_cout("\r\nCALLER NUMBER: ", info);
-					isdn_tty_at_cout(dev->num[info->drv_index], info);
-					if (m->mdmreg[REG_CDN] & BIT_CDN) {
-						isdn_tty_at_cout("\r\nCALLED NUMBER: ", info);
-						isdn_tty_at_cout(info->emu.cpn, info);
-					}
-				}
-			}
-			isdn_tty_at_cout("\r\n", info);
-			isdn_tty_at_cout(msg[code], info);
-			switch (code) {
-			case RESULT_CONNECT:
-				switch (m->mdmreg[REG_L2PROT]) {
-				case ISDN_PROTO_L2_MODEM:
-					isdn_tty_at_cout(" ", info);
-					isdn_tty_at_cout(m->connmsg, info);
-					break;
-				}
-				break;
-			case RESULT_RING:
-				/* Append CPN, if enabled */
-				if ((m->mdmreg[REG_CPN] & BIT_CPN)) {
-					sprintf(s, "/%s", m->cpn);
-					isdn_tty_at_cout(s, info);
-				}
-				/* Print CID only once, _after_ 1st RING */
-				if ((m->mdmreg[REG_CIDONCE] & BIT_CIDONCE) &&
-				    (m->mdmreg[REG_RINGCNT] == 1)) {
-					isdn_tty_at_cout("\r\n", info);
-					isdn_tty_at_cout("CALLER NUMBER: ", info);
-					isdn_tty_at_cout(dev->num[info->drv_index], info);
-					if (m->mdmreg[REG_CDN] & BIT_CDN) {
-						isdn_tty_at_cout("\r\nCALLED NUMBER: ", info);
-						isdn_tty_at_cout(info->emu.cpn, info);
-					}
-				}
-				break;
-			case RESULT_NO_CARRIER:
-			case RESULT_NO_DIALTONE:
-			case RESULT_BUSY:
-			case RESULT_NO_ANSWER:
-				m->mdmreg[REG_RINGCNT] = 0;
-				/* Append Cause-Message if enabled */
-				if (m->mdmreg[REG_RESPXT] & BIT_RESPXT) {
-					sprintf(s, "/%s", info->last_cause);
-					isdn_tty_at_cout(s, info);
-				}
-				break;
-			case RESULT_CONNECT64000:
-				/* Append Protocol to CONNECT message */
-				switch (m->mdmreg[REG_L2PROT]) {
-				case ISDN_PROTO_L2_X75I:
-				case ISDN_PROTO_L2_X75UI:
-				case ISDN_PROTO_L2_X75BUI:
-					isdn_tty_at_cout("/X.75", info);
-					break;
-				case ISDN_PROTO_L2_HDLC:
-					isdn_tty_at_cout("/HDLC", info);
-					break;
-				case ISDN_PROTO_L2_V11096:
-					isdn_tty_at_cout("/V110/9600", info);
-					break;
-				case ISDN_PROTO_L2_V11019:
-					isdn_tty_at_cout("/V110/19200", info);
-					break;
-				case ISDN_PROTO_L2_V11038:
-					isdn_tty_at_cout("/V110/38400", info);
-					break;
-				}
-				if (m->mdmreg[REG_T70] & BIT_T70) {
-					isdn_tty_at_cout("/T.70", info);
-					if (m->mdmreg[REG_T70] & BIT_T70_EXT)
-						isdn_tty_at_cout("+", info);
-				}
-				break;
-			}
-			isdn_tty_at_cout("\r\n", info);
-		}
-	}
-	if (code == RESULT_NO_CARRIER) {
-		if (info->closing || (!info->port.tty))
-			return;
-
-		if (tty_port_check_carrier(&info->port))
-			tty_hangup(info->port.tty);
-	}
-}
-
-
-/*
- * Display a modem-register-value.
- */
-static void
-isdn_tty_show_profile(int ridx, modem_info *info)
-{
-	char v[6];
-
-	sprintf(v, "\r\n%d", info->emu.mdmreg[ridx]);
-	isdn_tty_at_cout(v, info);
-}
-
-/*
- * Get MSN-string from char-pointer, set pointer to end of number
- */
-static void
-isdn_tty_get_msnstr(char *n, char **p)
-{
-	int limit = ISDN_MSNLEN - 1;
-
-	while (((*p[0] >= '0' && *p[0] <= '9') ||
-		/* Why a comma ??? */
-		(*p[0] == ',') || (*p[0] == ':')) &&
-	       (limit--))
-		*n++ = *p[0]++;
-	*n = '\0';
-}
-
-/*
- * Get phone-number from modem-commandbuffer
- */
-static void
-isdn_tty_getdial(char *p, char *q, int cnt)
-{
-	int first = 1;
-	int limit = ISDN_MSNLEN - 1;	/* MUST match the size of interface var to avoid
-					   buffer overflow */
-
-	while (strchr(" 0123456789,#.*WPTSR-", *p) && *p && --cnt > 0) {
-		if ((*p >= '0' && *p <= '9') || ((*p == 'S') && first) ||
-		    ((*p == 'R') && first) ||
-		    (*p == '*') || (*p == '#')) {
-			*q++ = *p;
-			limit--;
-		}
-		if (!limit)
-			break;
-		p++;
-		first = 0;
-	}
-	*q = 0;
-}
-
-#define PARSE_ERROR { isdn_tty_modem_result(RESULT_ERROR, info); return; }
-#define PARSE_ERROR1 { isdn_tty_modem_result(RESULT_ERROR, info); return 1; }
-
-static void
-isdn_tty_report(modem_info *info)
-{
-	atemu *m = &info->emu;
-	char s[80];
-
-	isdn_tty_at_cout("\r\nStatistics of last connection:\r\n\r\n", info);
-	sprintf(s, "    Remote Number:    %s\r\n", info->last_num);
-	isdn_tty_at_cout(s, info);
-	sprintf(s, "    Direction:        %s\r\n", info->last_dir ? "outgoing" : "incoming");
-	isdn_tty_at_cout(s, info);
-	isdn_tty_at_cout("    Layer-2 Protocol: ", info);
-	switch (info->last_l2) {
-	case ISDN_PROTO_L2_X75I:
-		isdn_tty_at_cout("X.75i", info);
-		break;
-	case ISDN_PROTO_L2_X75UI:
-		isdn_tty_at_cout("X.75ui", info);
-		break;
-	case ISDN_PROTO_L2_X75BUI:
-		isdn_tty_at_cout("X.75bui", info);
-		break;
-	case ISDN_PROTO_L2_HDLC:
-		isdn_tty_at_cout("HDLC", info);
-		break;
-	case ISDN_PROTO_L2_V11096:
-		isdn_tty_at_cout("V.110 9600 Baud", info);
-		break;
-	case ISDN_PROTO_L2_V11019:
-		isdn_tty_at_cout("V.110 19200 Baud", info);
-		break;
-	case ISDN_PROTO_L2_V11038:
-		isdn_tty_at_cout("V.110 38400 Baud", info);
-		break;
-	case ISDN_PROTO_L2_TRANS:
-		isdn_tty_at_cout("transparent", info);
-		break;
-	case ISDN_PROTO_L2_MODEM:
-		isdn_tty_at_cout("modem", info);
-		break;
-	case ISDN_PROTO_L2_FAX:
-		isdn_tty_at_cout("fax", info);
-		break;
-	default:
-		isdn_tty_at_cout("unknown", info);
-		break;
-	}
-	if (m->mdmreg[REG_T70] & BIT_T70) {
-		isdn_tty_at_cout("/T.70", info);
-		if (m->mdmreg[REG_T70] & BIT_T70_EXT)
-			isdn_tty_at_cout("+", info);
-	}
-	isdn_tty_at_cout("\r\n", info);
-	isdn_tty_at_cout("    Service:          ", info);
-	switch (info->last_si) {
-	case 1:
-		isdn_tty_at_cout("audio\r\n", info);
-		break;
-	case 5:
-		isdn_tty_at_cout("btx\r\n", info);
-		break;
-	case 7:
-		isdn_tty_at_cout("data\r\n", info);
-		break;
-	default:
-		sprintf(s, "%d\r\n", info->last_si);
-		isdn_tty_at_cout(s, info);
-		break;
-	}
-	sprintf(s, "    Hangup location:  %s\r\n", info->last_lhup ? "local" : "remote");
-	isdn_tty_at_cout(s, info);
-	sprintf(s, "    Last cause:       %s\r\n", info->last_cause);
-	isdn_tty_at_cout(s, info);
-}
-
-/*
- * Parse AT&.. commands.
- */
-static int
-isdn_tty_cmd_ATand(char **p, modem_info *info)
-{
-	atemu *m = &info->emu;
-	int i;
-	char rb[100];
-
-#define MAXRB (sizeof(rb) - 1)
-
-	switch (*p[0]) {
-	case 'B':
-		/* &B - Set Buffersize */
-		p[0]++;
-		i = isdn_getnum(p);
-		if ((i < 0) || (i > ISDN_SERIAL_XMIT_MAX))
-			PARSE_ERROR1;
-#ifdef CONFIG_ISDN_AUDIO
-		if ((m->mdmreg[REG_SI1] & 1) && (i > VBUF))
-			PARSE_ERROR1;
-#endif
-		m->mdmreg[REG_PSIZE] = i / 16;
-		info->xmit_size = m->mdmreg[REG_PSIZE] * 16;
-		switch (m->mdmreg[REG_L2PROT]) {
-		case ISDN_PROTO_L2_V11096:
-		case ISDN_PROTO_L2_V11019:
-		case ISDN_PROTO_L2_V11038:
-			info->xmit_size /= 10;
-		}
-		break;
-	case 'C':
-		/* &C - DCD Status */
-		p[0]++;
-		switch (isdn_getnum(p)) {
-		case 0:
-			m->mdmreg[REG_DCD] &= ~BIT_DCD;
-			break;
-		case 1:
-			m->mdmreg[REG_DCD] |= BIT_DCD;
-			break;
-		default:
-			PARSE_ERROR1
-				}
-		break;
-	case 'D':
-		/* &D - Set DTR-Low-behavior */
-		p[0]++;
-		switch (isdn_getnum(p)) {
-		case 0:
-			m->mdmreg[REG_DTRHUP] &= ~BIT_DTRHUP;
-			m->mdmreg[REG_DTRR] &= ~BIT_DTRR;
-			break;
-		case 2:
-			m->mdmreg[REG_DTRHUP] |= BIT_DTRHUP;
-			m->mdmreg[REG_DTRR] &= ~BIT_DTRR;
-			break;
-		case 3:
-			m->mdmreg[REG_DTRHUP] |= BIT_DTRHUP;
-			m->mdmreg[REG_DTRR] |= BIT_DTRR;
-			break;
-		default:
-			PARSE_ERROR1
-				}
-		break;
-	case 'E':
-		/* &E -Set EAZ/MSN */
-		p[0]++;
-		isdn_tty_get_msnstr(m->msn, p);
-		break;
-	case 'F':
-		/* &F -Set Factory-Defaults */
-		p[0]++;
-		if (info->msr & UART_MSR_DCD)
-			PARSE_ERROR1;
-		isdn_tty_reset_profile(m);
-		isdn_tty_modem_reset_regs(info, 1);
-		break;
-#ifdef DUMMY_HAYES_AT
-	case 'K':
-		/* only for be compilant with common scripts */
-		/* &K Flowcontrol - no function */
-		p[0]++;
-		isdn_getnum(p);
-		break;
-#endif
-	case 'L':
-		/* &L -Set Numbers to listen on */
-		p[0]++;
-		i = 0;
-		while (*p[0] && (strchr("0123456789,-*[]?;", *p[0])) &&
-		       (i < ISDN_LMSNLEN - 1))
-			m->lmsn[i++] = *p[0]++;
-		m->lmsn[i] = '\0';
-		break;
-	case 'R':
-		/* &R - Set V.110 bitrate adaption */
-		p[0]++;
-		i = isdn_getnum(p);
-		switch (i) {
-		case 0:
-			/* Switch off V.110, back to X.75 */
-			m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I;
-			m->mdmreg[REG_SI2] = 0;
-			info->xmit_size = m->mdmreg[REG_PSIZE] * 16;
-			break;
-		case 9600:
-			m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_V11096;
-			m->mdmreg[REG_SI2] = 197;
-			info->xmit_size = m->mdmreg[REG_PSIZE] * 16 / 10;
-			break;
-		case 19200:
-			m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_V11019;
-			m->mdmreg[REG_SI2] = 199;
-			info->xmit_size = m->mdmreg[REG_PSIZE] * 16 / 10;
-			break;
-		case 38400:
-			m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_V11038;
-			m->mdmreg[REG_SI2] = 198; /* no existing standard for this */
-			info->xmit_size = m->mdmreg[REG_PSIZE] * 16 / 10;
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		/* Switch off T.70 */
-		m->mdmreg[REG_T70] &= ~(BIT_T70 | BIT_T70_EXT);
-		/* Set Service 7 */
-		m->mdmreg[REG_SI1] |= 4;
-		break;
-	case 'S':
-		/* &S - Set Windowsize */
-		p[0]++;
-		i = isdn_getnum(p);
-		if ((i > 0) && (i < 9))
-			m->mdmreg[REG_WSIZE] = i;
-		else
-			PARSE_ERROR1;
-		break;
-	case 'V':
-		/* &V - Show registers */
-		p[0]++;
-		isdn_tty_at_cout("\r\n", info);
-		for (i = 0; i < ISDN_MODEM_NUMREG; i++) {
-			sprintf(rb, "S%02d=%03d%s", i,
-				m->mdmreg[i], ((i + 1) % 10) ? " " : "\r\n");
-			isdn_tty_at_cout(rb, info);
-		}
-		sprintf(rb, "\r\nEAZ/MSN: %.50s\r\n",
-			strlen(m->msn) ? m->msn : "None");
-		isdn_tty_at_cout(rb, info);
-		if (strlen(m->lmsn)) {
-			isdn_tty_at_cout("\r\nListen: ", info);
-			isdn_tty_at_cout(m->lmsn, info);
-			isdn_tty_at_cout("\r\n", info);
-		}
-		break;
-	case 'W':
-		/* &W - Write Profile */
-		p[0]++;
-		switch (*p[0]) {
-		case '0':
-			p[0]++;
-			modem_write_profile(m);
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	case 'X':
-		/* &X - Switch to BTX-Mode and T.70 */
-		p[0]++;
-		switch (isdn_getnum(p)) {
-		case 0:
-			m->mdmreg[REG_T70] &= ~(BIT_T70 | BIT_T70_EXT);
-			info->xmit_size = m->mdmreg[REG_PSIZE] * 16;
-			break;
-		case 1:
-			m->mdmreg[REG_T70] |= BIT_T70;
-			m->mdmreg[REG_T70] &= ~BIT_T70_EXT;
-			m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I;
-			info->xmit_size = 112;
-			m->mdmreg[REG_SI1] = 4;
-			m->mdmreg[REG_SI2] = 0;
-			break;
-		case 2:
-			m->mdmreg[REG_T70] |= (BIT_T70 | BIT_T70_EXT);
-			m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I;
-			info->xmit_size = 112;
-			m->mdmreg[REG_SI1] = 4;
-			m->mdmreg[REG_SI2] = 0;
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	default:
-		PARSE_ERROR1;
-	}
-	return 0;
-}
-
-static int
-isdn_tty_check_ats(int mreg, int mval, modem_info *info, atemu *m)
-{
-	/* Some plausibility checks */
-	switch (mreg) {
-	case REG_L2PROT:
-		if (mval > ISDN_PROTO_L2_MAX)
-			return 1;
-		break;
-	case REG_PSIZE:
-		if ((mval * 16) > ISDN_SERIAL_XMIT_MAX)
-			return 1;
-#ifdef CONFIG_ISDN_AUDIO
-		if ((m->mdmreg[REG_SI1] & 1) && (mval > VBUFX))
-			return 1;
-#endif
-		info->xmit_size = mval * 16;
-		switch (m->mdmreg[REG_L2PROT]) {
-		case ISDN_PROTO_L2_V11096:
-		case ISDN_PROTO_L2_V11019:
-		case ISDN_PROTO_L2_V11038:
-			info->xmit_size /= 10;
-		}
-		break;
-	case REG_SI1I:
-	case REG_PLAN:
-	case REG_SCREEN:
-		/* readonly registers */
-		return 1;
-	}
-	return 0;
-}
-
-/*
- * Perform ATS command
- */
-static int
-isdn_tty_cmd_ATS(char **p, modem_info *info)
-{
-	atemu *m = &info->emu;
-	int bitpos;
-	int mreg;
-	int mval;
-	int bval;
-
-	mreg = isdn_getnum(p);
-	if (mreg < 0 || mreg >= ISDN_MODEM_NUMREG)
-		PARSE_ERROR1;
-	switch (*p[0]) {
-	case '=':
-		p[0]++;
-		mval = isdn_getnum(p);
-		if (mval < 0 || mval > 255)
-			PARSE_ERROR1;
-		if (isdn_tty_check_ats(mreg, mval, info, m))
-			PARSE_ERROR1;
-		m->mdmreg[mreg] = mval;
-		break;
-	case '.':
-		/* Set/Clear a single bit */
-		p[0]++;
-		bitpos = isdn_getnum(p);
-		if ((bitpos < 0) || (bitpos > 7))
-			PARSE_ERROR1;
-		switch (*p[0]) {
-		case '=':
-			p[0]++;
-			bval = isdn_getnum(p);
-			if (bval < 0 || bval > 1)
-				PARSE_ERROR1;
-			if (bval)
-				mval = m->mdmreg[mreg] | (1 << bitpos);
-			else
-				mval = m->mdmreg[mreg] & ~(1 << bitpos);
-			if (isdn_tty_check_ats(mreg, mval, info, m))
-				PARSE_ERROR1;
-			m->mdmreg[mreg] = mval;
-			break;
-		case '?':
-			p[0]++;
-			isdn_tty_at_cout("\r\n", info);
-			isdn_tty_at_cout((m->mdmreg[mreg] & (1 << bitpos)) ? "1" : "0",
-					 info);
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	case '?':
-		p[0]++;
-		isdn_tty_show_profile(mreg, info);
-		break;
-	default:
-		PARSE_ERROR1;
-		break;
-	}
-	return 0;
-}
-
-/*
- * Perform ATA command
- */
-static void
-isdn_tty_cmd_ATA(modem_info *info)
-{
-	atemu *m = &info->emu;
-	isdn_ctrl cmd;
-	int l2;
-
-	if (info->msr & UART_MSR_RI) {
-		/* Accept incoming call */
-		info->last_dir = 0;
-		strcpy(info->last_num, dev->num[info->drv_index]);
-		m->mdmreg[REG_RINGCNT] = 0;
-		info->msr &= ~UART_MSR_RI;
-		l2 = m->mdmreg[REG_L2PROT];
-#ifdef CONFIG_ISDN_AUDIO
-		/* If more than one bit set in reg18, autoselect Layer2 */
-		if ((m->mdmreg[REG_SI1] & m->mdmreg[REG_SI1I]) != m->mdmreg[REG_SI1]) {
-			if (m->mdmreg[REG_SI1I] == 1) {
-				if ((l2 != ISDN_PROTO_L2_MODEM) && (l2 != ISDN_PROTO_L2_FAX))
-					l2 = ISDN_PROTO_L2_TRANS;
-			} else
-				l2 = ISDN_PROTO_L2_X75I;
-		}
-#endif
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL2;
-		cmd.arg = info->isdn_channel + (l2 << 8);
-		info->last_l2 = l2;
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.command = ISDN_CMD_SETL3;
-		cmd.arg = info->isdn_channel + (m->mdmreg[REG_L3PROT] << 8);
-#ifdef CONFIG_ISDN_TTY_FAX
-		if (l2 == ISDN_PROTO_L2_FAX) {
-			cmd.parm.fax = info->fax;
-			info->fax->direction = ISDN_TTY_FAX_CONN_IN;
-		}
-#endif
-		isdn_command(&cmd);
-		cmd.driver = info->isdn_driver;
-		cmd.arg = info->isdn_channel;
-		cmd.command = ISDN_CMD_ACCEPTD;
-		info->dialing = 16;
-		info->emu.carrierwait = 0;
-		isdn_command(&cmd);
-		isdn_timer_ctrl(ISDN_TIMER_CARRIER, 1);
-	} else
-		isdn_tty_modem_result(RESULT_NO_ANSWER, info);
-}
-
-#ifdef CONFIG_ISDN_AUDIO
-/*
- * Parse AT+F.. commands
- */
-static int
-isdn_tty_cmd_PLUSF(char **p, modem_info *info)
-{
-	atemu *m = &info->emu;
-	char rs[20];
-
-	if (!strncmp(p[0], "CLASS", 5)) {
-		p[0] += 5;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d",
-				(m->mdmreg[REG_SI1] & 1) ? 8 : 0);
-#ifdef CONFIG_ISDN_TTY_FAX
-			if (TTY_IS_FCLASS2(info))
-				sprintf(rs, "\r\n2");
-			else if (TTY_IS_FCLASS1(info))
-				sprintf(rs, "\r\n1");
-#endif
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			switch (*p[0]) {
-			case '0':
-				p[0]++;
-				m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I;
-				m->mdmreg[REG_L3PROT] = ISDN_PROTO_L3_TRANS;
-				m->mdmreg[REG_SI1] = 4;
-				info->xmit_size =
-					m->mdmreg[REG_PSIZE] * 16;
-				break;
-#ifdef CONFIG_ISDN_TTY_FAX
-			case '1':
-				p[0]++;
-				if (!(dev->global_features &
-				      ISDN_FEATURE_L3_FCLASS1))
-					PARSE_ERROR1;
-				m->mdmreg[REG_SI1] = 1;
-				m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_FAX;
-				m->mdmreg[REG_L3PROT] = ISDN_PROTO_L3_FCLASS1;
-				info->xmit_size =
-					m->mdmreg[REG_PSIZE] * 16;
-				break;
-			case '2':
-				p[0]++;
-				if (!(dev->global_features &
-				      ISDN_FEATURE_L3_FCLASS2))
-					PARSE_ERROR1;
-				m->mdmreg[REG_SI1] = 1;
-				m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_FAX;
-				m->mdmreg[REG_L3PROT] = ISDN_PROTO_L3_FCLASS2;
-				info->xmit_size =
-					m->mdmreg[REG_PSIZE] * 16;
-				break;
-#endif
-			case '8':
-				p[0]++;
-				/* L2 will change on dialout with si=1 */
-				m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I;
-				m->mdmreg[REG_L3PROT] = ISDN_PROTO_L3_TRANS;
-				m->mdmreg[REG_SI1] = 5;
-				info->xmit_size = VBUF;
-				break;
-			case '?':
-				p[0]++;
-				strcpy(rs, "\r\n0,");
-#ifdef CONFIG_ISDN_TTY_FAX
-				if (dev->global_features &
-				    ISDN_FEATURE_L3_FCLASS1)
-					strcat(rs, "1,");
-				if (dev->global_features &
-				    ISDN_FEATURE_L3_FCLASS2)
-					strcat(rs, "2,");
-#endif
-				strcat(rs, "8");
-				isdn_tty_at_cout(rs, info);
-				break;
-			default:
-				PARSE_ERROR1;
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-#ifdef CONFIG_ISDN_TTY_FAX
-	return (isdn_tty_cmd_PLUSF_FAX(p, info));
-#else
-	PARSE_ERROR1;
-#endif
-}
-
-/*
- * Parse AT+V.. commands
- */
-static int
-isdn_tty_cmd_PLUSV(char **p, modem_info *info)
-{
-	atemu *m = &info->emu;
-	isdn_ctrl cmd;
-	static char *vcmd[] =
-		{"NH", "IP", "LS", "RX", "SD", "SM", "TX", "DD", NULL};
-	int i;
-	int par1;
-	int par2;
-	char rs[20];
-
-	i = 0;
-	while (vcmd[i]) {
-		if (!strncmp(vcmd[i], p[0], 2)) {
-			p[0] += 2;
-			break;
-		}
-		i++;
-	}
-	switch (i) {
-	case 0:
-		/* AT+VNH - Auto hangup feature */
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			isdn_tty_at_cout("\r\n1", info);
-			break;
-		case '=':
-			p[0]++;
-			switch (*p[0]) {
-			case '1':
-				p[0]++;
-				break;
-			case '?':
-				p[0]++;
-				isdn_tty_at_cout("\r\n1", info);
-				break;
-			default:
-				PARSE_ERROR1;
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	case 1:
-		/* AT+VIP - Reset all voice parameters */
-		isdn_tty_modem_reset_vpar(m);
-		break;
-	case 2:
-		/* AT+VLS - Select device, accept incoming call */
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", m->vpar[0]);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			switch (*p[0]) {
-			case '0':
-				p[0]++;
-				m->vpar[0] = 0;
-				break;
-			case '2':
-				p[0]++;
-				m->vpar[0] = 2;
-				break;
-			case '?':
-				p[0]++;
-				isdn_tty_at_cout("\r\n0,2", info);
-				break;
-			default:
-				PARSE_ERROR1;
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	case 3:
-		/* AT+VRX - Start recording */
-		if (!m->vpar[0])
-			PARSE_ERROR1;
-		if (info->online != 1) {
-			isdn_tty_modem_result(RESULT_NO_ANSWER, info);
-			return 1;
-		}
-		info->dtmf_state = isdn_audio_dtmf_init(info->dtmf_state);
-		if (!info->dtmf_state) {
-			printk(KERN_WARNING "isdn_tty: Couldn't malloc dtmf state\n");
-			PARSE_ERROR1;
-		}
-		info->silence_state = isdn_audio_silence_init(info->silence_state);
-		if (!info->silence_state) {
-			printk(KERN_WARNING "isdn_tty: Couldn't malloc silence state\n");
-			PARSE_ERROR1;
-		}
-		if (m->vpar[3] < 5) {
-			info->adpcmr = isdn_audio_adpcm_init(info->adpcmr, m->vpar[3]);
-			if (!info->adpcmr) {
-				printk(KERN_WARNING "isdn_tty: Couldn't malloc adpcm state\n");
-				PARSE_ERROR1;
-			}
-		}
-#ifdef ISDN_DEBUG_AT
-		printk(KERN_DEBUG "AT: +VRX\n");
-#endif
-		info->vonline |= 1;
-		isdn_tty_modem_result(RESULT_CONNECT, info);
-		return 0;
-		break;
-	case 4:
-		/* AT+VSD - Silence detection */
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n<%d>,<%d>",
-				m->vpar[1],
-				m->vpar[2]);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if ((*p[0] >= '0') && (*p[0] <= '9')) {
-				par1 = isdn_getnum(p);
-				if ((par1 < 0) || (par1 > 31))
-					PARSE_ERROR1;
-				if (*p[0] != ',')
-					PARSE_ERROR1;
-				p[0]++;
-				par2 = isdn_getnum(p);
-				if ((par2 < 0) || (par2 > 255))
-					PARSE_ERROR1;
-				m->vpar[1] = par1;
-				m->vpar[2] = par2;
-				break;
-			} else
-				if (*p[0] == '?') {
-					p[0]++;
-					isdn_tty_at_cout("\r\n<0-31>,<0-255>",
-							 info);
-					break;
-				} else
-					PARSE_ERROR1;
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	case 5:
-		/* AT+VSM - Select compression */
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n<%d>,<%d><8000>",
-				m->vpar[3],
-				m->vpar[1]);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			switch (*p[0]) {
-			case '2':
-			case '3':
-			case '4':
-			case '5':
-			case '6':
-				par1 = isdn_getnum(p);
-				if ((par1 < 2) || (par1 > 6))
-					PARSE_ERROR1;
-				m->vpar[3] = par1;
-				break;
-			case '?':
-				p[0]++;
-				isdn_tty_at_cout("\r\n2;ADPCM;2;0;(8000)\r\n",
-						 info);
-				isdn_tty_at_cout("3;ADPCM;3;0;(8000)\r\n",
-						 info);
-				isdn_tty_at_cout("4;ADPCM;4;0;(8000)\r\n",
-						 info);
-				isdn_tty_at_cout("5;ALAW;8;0;(8000)\r\n",
-						 info);
-				isdn_tty_at_cout("6;ULAW;8;0;(8000)\r\n",
-						 info);
-				break;
-			default:
-				PARSE_ERROR1;
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	case 6:
-		/* AT+VTX - Start sending */
-		if (!m->vpar[0])
-			PARSE_ERROR1;
-		if (info->online != 1) {
-			isdn_tty_modem_result(RESULT_NO_ANSWER, info);
-			return 1;
-		}
-		info->dtmf_state = isdn_audio_dtmf_init(info->dtmf_state);
-		if (!info->dtmf_state) {
-			printk(KERN_WARNING "isdn_tty: Couldn't malloc dtmf state\n");
-			PARSE_ERROR1;
-		}
-		if (m->vpar[3] < 5) {
-			info->adpcms = isdn_audio_adpcm_init(info->adpcms, m->vpar[3]);
-			if (!info->adpcms) {
-				printk(KERN_WARNING "isdn_tty: Couldn't malloc adpcm state\n");
-				PARSE_ERROR1;
-			}
-		}
-#ifdef ISDN_DEBUG_AT
-		printk(KERN_DEBUG "AT: +VTX\n");
-#endif
-		m->lastDLE = 0;
-		info->vonline |= 2;
-		isdn_tty_modem_result(RESULT_CONNECT, info);
-		return 0;
-		break;
-	case 7:
-		/* AT+VDD - DTMF detection */
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n<%d>,<%d>",
-				m->vpar[4],
-				m->vpar[5]);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if ((*p[0] >= '0') && (*p[0] <= '9')) {
-				if (info->online != 1)
-					PARSE_ERROR1;
-				par1 = isdn_getnum(p);
-				if ((par1 < 0) || (par1 > 15))
-					PARSE_ERROR1;
-				if (*p[0] != ',')
-					PARSE_ERROR1;
-				p[0]++;
-				par2 = isdn_getnum(p);
-				if ((par2 < 0) || (par2 > 255))
-					PARSE_ERROR1;
-				m->vpar[4] = par1;
-				m->vpar[5] = par2;
-				cmd.driver = info->isdn_driver;
-				cmd.command = ISDN_CMD_AUDIO;
-				cmd.arg = info->isdn_channel + (ISDN_AUDIO_SETDD << 8);
-				cmd.parm.num[0] = par1;
-				cmd.parm.num[1] = par2;
-				isdn_command(&cmd);
-				break;
-			} else
-				if (*p[0] == '?') {
-					p[0]++;
-					isdn_tty_at_cout("\r\n<0-15>,<0-255>",
-							 info);
-					break;
-				} else
-					PARSE_ERROR1;
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		break;
-	default:
-		PARSE_ERROR1;
-	}
-	return 0;
-}
-#endif                          /* CONFIG_ISDN_AUDIO */
-
-/*
- * Parse and perform an AT-command-line.
- */
-static void
-isdn_tty_parse_at(modem_info *info)
-{
-	atemu *m = &info->emu;
-	char *p;
-	char ds[ISDN_MSNLEN];
-
-#ifdef ISDN_DEBUG_AT
-	printk(KERN_DEBUG "AT: '%s'\n", m->mdmcmd);
-#endif
-	for (p = &m->mdmcmd[2]; *p;) {
-		switch (*p) {
-		case ' ':
-			p++;
-			break;
-		case 'A':
-			/* A - Accept incoming call */
-			p++;
-			isdn_tty_cmd_ATA(info);
-			return;
-		case 'D':
-			/* D - Dial */
-			if (info->msr & UART_MSR_DCD)
-				PARSE_ERROR;
-			if (info->msr & UART_MSR_RI) {
-				isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-				return;
-			}
-			isdn_tty_getdial(++p, ds, sizeof ds);
-			p += strlen(p);
-			if (!strlen(m->msn))
-				isdn_tty_modem_result(RESULT_NO_MSN_EAZ, info);
-			else if (strlen(ds))
-				isdn_tty_dial(ds, info, m);
-			else
-				PARSE_ERROR;
-			return;
-		case 'E':
-			/* E - Turn Echo on/off */
-			p++;
-			switch (isdn_getnum(&p)) {
-			case 0:
-				m->mdmreg[REG_ECHO] &= ~BIT_ECHO;
-				break;
-			case 1:
-				m->mdmreg[REG_ECHO] |= BIT_ECHO;
-				break;
-			default:
-				PARSE_ERROR;
-			}
-			break;
-		case 'H':
-			/* H - On/Off-hook */
-			p++;
-			switch (*p) {
-			case '0':
-				p++;
-				isdn_tty_on_hook(info);
-				break;
-			case '1':
-				p++;
-				isdn_tty_off_hook();
-				break;
-			default:
-				isdn_tty_on_hook(info);
-				break;
-			}
-			break;
-		case 'I':
-			/* I - Information */
-			p++;
-			isdn_tty_at_cout("\r\nLinux ISDN", info);
-			switch (*p) {
-			case '0':
-			case '1':
-				p++;
-				break;
-			case '2':
-				p++;
-				isdn_tty_report(info);
-				break;
-			case '3':
-				p++;
-				snprintf(ds, sizeof(ds), "\r\n%d", info->emu.charge);
-				isdn_tty_at_cout(ds, info);
-				break;
-			default:;
-			}
-			break;
-#ifdef DUMMY_HAYES_AT
-		case 'L':
-		case 'M':
-			/* only for be compilant with common scripts */
-			/* no function */
-			p++;
-			isdn_getnum(&p);
-			break;
-#endif
-		case 'O':
-			/* O - Go online */
-			p++;
-			if (info->msr & UART_MSR_DCD)
-				/* if B-Channel is up */
-				isdn_tty_modem_result((m->mdmreg[REG_L2PROT] == ISDN_PROTO_L2_MODEM) ? RESULT_CONNECT : RESULT_CONNECT64000, info);
-			else
-				isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-			return;
-		case 'Q':
-			/* Q - Turn Emulator messages on/off */
-			p++;
-			switch (isdn_getnum(&p)) {
-			case 0:
-				m->mdmreg[REG_RESP] |= BIT_RESP;
-				break;
-			case 1:
-				m->mdmreg[REG_RESP] &= ~BIT_RESP;
-				break;
-			default:
-				PARSE_ERROR;
-			}
-			break;
-		case 'S':
-			/* S - Set/Get Register */
-			p++;
-			if (isdn_tty_cmd_ATS(&p, info))
-				return;
-			break;
-		case 'V':
-			/* V - Numeric or ASCII Emulator-messages */
-			p++;
-			switch (isdn_getnum(&p)) {
-			case 0:
-				m->mdmreg[REG_RESP] |= BIT_RESPNUM;
-				break;
-			case 1:
-				m->mdmreg[REG_RESP] &= ~BIT_RESPNUM;
-				break;
-			default:
-				PARSE_ERROR;
-			}
-			break;
-		case 'Z':
-			/* Z - Load Registers from Profile */
-			p++;
-			if (info->msr & UART_MSR_DCD) {
-				info->online = 0;
-				isdn_tty_on_hook(info);
-			}
-			isdn_tty_modem_reset_regs(info, 1);
-			break;
-		case '+':
-			p++;
-			switch (*p) {
-#ifdef CONFIG_ISDN_AUDIO
-			case 'F':
-				p++;
-				if (isdn_tty_cmd_PLUSF(&p, info))
-					return;
-				break;
-			case 'V':
-				if ((!(m->mdmreg[REG_SI1] & 1)) ||
-				    (m->mdmreg[REG_L2PROT] == ISDN_PROTO_L2_MODEM))
-					PARSE_ERROR;
-				p++;
-				if (isdn_tty_cmd_PLUSV(&p, info))
-					return;
-				break;
-#endif                          /* CONFIG_ISDN_AUDIO */
-			case 'S':	/* SUSPEND */
-				p++;
-				isdn_tty_get_msnstr(ds, &p);
-				isdn_tty_suspend(ds, info, m);
-				break;
-			case 'R':	/* RESUME */
-				p++;
-				isdn_tty_get_msnstr(ds, &p);
-				isdn_tty_resume(ds, info, m);
-				break;
-			case 'M':	/* MESSAGE */
-				p++;
-				isdn_tty_send_msg(info, m, p);
-				break;
-			default:
-				PARSE_ERROR;
-			}
-			break;
-		case '&':
-			p++;
-			if (isdn_tty_cmd_ATand(&p, info))
-				return;
-			break;
-		default:
-			PARSE_ERROR;
-		}
-	}
-#ifdef CONFIG_ISDN_AUDIO
-	if (!info->vonline)
-#endif
-		isdn_tty_modem_result(RESULT_OK, info);
-}
-
-/* Need own toupper() because standard-toupper is not available
- * within modules.
- */
-#define my_toupper(c) (((c >= 'a') && (c <= 'z')) ? (c & 0xdf) : c)
-
-/*
- * Perform line-editing of AT-commands
- *
- * Parameters:
- *   p        inputbuffer
- *   count    length of buffer
- *   channel  index to line (minor-device)
- */
-static int
-isdn_tty_edit_at(const char *p, int count, modem_info *info)
-{
-	atemu *m = &info->emu;
-	int total = 0;
-	u_char c;
-	char eb[2];
-	int cnt;
-
-	for (cnt = count; cnt > 0; p++, cnt--) {
-		c = *p;
-		total++;
-		if (c == m->mdmreg[REG_CR] || c == m->mdmreg[REG_LF]) {
-			/* Separator (CR or LF) */
-			m->mdmcmd[m->mdmcmdl] = 0;
-			if (m->mdmreg[REG_ECHO] & BIT_ECHO) {
-				eb[0] = c;
-				eb[1] = 0;
-				isdn_tty_at_cout(eb, info);
-			}
-			if ((m->mdmcmdl >= 2) && (!(strncmp(m->mdmcmd, "AT", 2))))
-				isdn_tty_parse_at(info);
-			m->mdmcmdl = 0;
-			continue;
-		}
-		if (c == m->mdmreg[REG_BS] && m->mdmreg[REG_BS] < 128) {
-			/* Backspace-Function */
-			if ((m->mdmcmdl > 2) || (!m->mdmcmdl)) {
-				if (m->mdmcmdl)
-					m->mdmcmdl--;
-				if (m->mdmreg[REG_ECHO] & BIT_ECHO)
-					isdn_tty_at_cout("\b", info);
-			}
-			continue;
-		}
-		if (cmdchar(c)) {
-			if (m->mdmreg[REG_ECHO] & BIT_ECHO) {
-				eb[0] = c;
-				eb[1] = 0;
-				isdn_tty_at_cout(eb, info);
-			}
-			if (m->mdmcmdl < 255) {
-				c = my_toupper(c);
-				switch (m->mdmcmdl) {
-				case 1:
-					if (c == 'T') {
-						m->mdmcmd[m->mdmcmdl] = c;
-						m->mdmcmd[++m->mdmcmdl] = 0;
-						break;
-					} else
-						m->mdmcmdl = 0;
-					/* Fall through - check for 'A' */
-				case 0:
-					if (c == 'A') {
-						m->mdmcmd[m->mdmcmdl] = c;
-						m->mdmcmd[++m->mdmcmdl] = 0;
-					}
-					break;
-				default:
-					m->mdmcmd[m->mdmcmdl] = c;
-					m->mdmcmd[++m->mdmcmdl] = 0;
-				}
-			}
-		}
-	}
-	return total;
-}
-
-/*
- * Switch all modem-channels who are online and got a valid
- * escape-sequence 1.5 seconds ago, to command-mode.
- * This function is called every second via timer-interrupt from within
- * timer-dispatcher isdn_timer_function()
- */
-void
-isdn_tty_modem_escape(void)
-{
-	int ton = 0;
-	int i;
-	int midx;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++)
-		if (USG_MODEM(dev->usage[i]) && (midx = dev->m_idx[i]) >= 0) {
-			modem_info *info = &dev->mdm.info[midx];
-			if (info->online) {
-				ton = 1;
-				if ((info->emu.pluscount == 3) &&
-				    time_after(jiffies,
-					    info->emu.lastplus + PLUSWAIT2)) {
-					info->emu.pluscount = 0;
-					info->online = 0;
-					isdn_tty_modem_result(RESULT_OK, info);
-				}
-			}
-		}
-	isdn_timer_ctrl(ISDN_TIMER_MODEMPLUS, ton);
-}
-
-/*
- * Put a RING-message to all modem-channels who have the RI-bit set.
- * This function is called every second via timer-interrupt from within
- * timer-dispatcher isdn_timer_function()
- */
-void
-isdn_tty_modem_ring(void)
-{
-	int ton = 0;
-	int i;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		modem_info *info = &dev->mdm.info[i];
-		if (info->msr & UART_MSR_RI) {
-			ton = 1;
-			isdn_tty_modem_result(RESULT_RING, info);
-		}
-	}
-	isdn_timer_ctrl(ISDN_TIMER_MODEMRING, ton);
-}
-
-/*
- * For all online tty's, try sending data to
- * the lower levels.
- */
-void
-isdn_tty_modem_xmit(void)
-{
-	int ton = 1;
-	int i;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		modem_info *info = &dev->mdm.info[i];
-		if (info->online) {
-			ton = 1;
-			isdn_tty_senddown(info);
-			isdn_tty_tint(info);
-		}
-	}
-	isdn_timer_ctrl(ISDN_TIMER_MODEMXMIT, ton);
-}
-
-/*
- * Check all channels if we have a 'no carrier' timeout.
- * Timeout value is set by Register S7.
- */
-void
-isdn_tty_carrier_timeout(void)
-{
-	int ton = 0;
-	int i;
-
-	for (i = 0; i < ISDN_MAX_CHANNELS; i++) {
-		modem_info *info = &dev->mdm.info[i];
-		if (!info->dialing)
-			continue;
-		if (info->emu.carrierwait++ > info->emu.mdmreg[REG_WAITC]) {
-			info->dialing = 0;
-			isdn_tty_modem_result(RESULT_NO_CARRIER, info);
-			isdn_tty_modem_hup(info, 1);
-		} else
-			ton = 1;
-	}
-	isdn_timer_ctrl(ISDN_TIMER_CARRIER, ton);
-}
diff --git a/drivers/isdn/i4l/isdn_tty.h b/drivers/isdn/i4l/isdn_tty.h
deleted file mode 100644
index a6f801d2263b..000000000000
--- a/drivers/isdn/i4l/isdn_tty.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/* $Id: isdn_tty.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * header for Linux ISDN subsystem, tty related functions (linklevel).
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-
-#define DLE 0x10
-#define ETX 0x03
-#define DC4 0x14
-
-
-/*
- * Definition of some special Registers of AT-Emulator
- */
-#define REG_RINGATA   0
-#define REG_RINGCNT   1  /* ring counter register */
-#define REG_ESC       2
-#define REG_CR        3
-#define REG_LF        4
-#define REG_BS        5
-
-#define REG_WAITC     7
-
-#define REG_RESP     12  /* show response messages register */
-#define BIT_RESP      1  /* show response messages bit      */
-#define REG_RESPNUM  12  /* show numeric responses register */
-#define BIT_RESPNUM   2  /* show numeric responses bit      */
-#define REG_ECHO     12
-#define BIT_ECHO      4
-#define REG_DCD      12
-#define BIT_DCD       8
-#define REG_CTS      12
-#define BIT_CTS      16
-#define REG_DTRR     12
-#define BIT_DTRR     32
-#define REG_DSR      12
-#define BIT_DSR      64
-#define REG_CPPP     12
-#define BIT_CPPP    128
-
-#define REG_DXMT     13
-#define BIT_DXMT      1
-#define REG_T70      13
-#define BIT_T70       2
-#define BIT_T70_EXT  32
-#define REG_DTRHUP   13
-#define BIT_DTRHUP    4
-#define REG_RESPXT   13
-#define BIT_RESPXT    8
-#define REG_CIDONCE  13
-#define BIT_CIDONCE  16
-#define REG_RUNG     13  /* show RUNG message register      */
-#define BIT_RUNG     64  /* show RUNG message bit           */
-#define REG_DISPLAY  13
-#define BIT_DISPLAY 128
-
-#define REG_L2PROT   14
-#define REG_L3PROT   15
-#define REG_PSIZE    16
-#define REG_WSIZE    17
-#define REG_SI1      18
-#define REG_SI2      19
-#define REG_SI1I     20
-#define REG_PLAN     21
-#define REG_SCREEN   22
-
-#define REG_CPN      23
-#define BIT_CPN       1
-#define REG_CPNFCON  23
-#define BIT_CPNFCON   2
-#define REG_CDN      23
-#define BIT_CDN       4
-
-/* defines for result codes */
-#define RESULT_OK		0
-#define RESULT_CONNECT		1
-#define RESULT_RING		2
-#define RESULT_NO_CARRIER	3
-#define RESULT_ERROR		4
-#define RESULT_CONNECT64000	5
-#define RESULT_NO_DIALTONE	6
-#define RESULT_BUSY		7
-#define RESULT_NO_ANSWER	8
-#define RESULT_RINGING		9
-#define RESULT_NO_MSN_EAZ	10
-#define RESULT_VCON		11
-#define RESULT_RUNG		12
-
-#define TTY_IS_FCLASS1(info)						\
-	((info->emu.mdmreg[REG_L2PROT] == ISDN_PROTO_L2_FAX) &&		\
-	 (info->emu.mdmreg[REG_L3PROT] == ISDN_PROTO_L3_FCLASS1))
-#define TTY_IS_FCLASS2(info)						\
-	((info->emu.mdmreg[REG_L2PROT] == ISDN_PROTO_L2_FAX) &&		\
-	 (info->emu.mdmreg[REG_L3PROT] == ISDN_PROTO_L3_FCLASS2))
-
-extern void isdn_tty_modem_escape(void);
-extern void isdn_tty_modem_ring(void);
-extern void isdn_tty_carrier_timeout(void);
-extern void isdn_tty_modem_xmit(void);
-extern int  isdn_tty_modem_init(void);
-extern void isdn_tty_exit(void);
-extern void isdn_tty_readmodem(void);
-extern int  isdn_tty_find_icall(int, int, setup_parm *);
-extern int  isdn_tty_stat_callback(int, isdn_ctrl *);
-extern int  isdn_tty_rcv_skb(int, int, int, struct sk_buff *);
-extern int  isdn_tty_capi_facility(capi_msg *cm);
-extern void isdn_tty_at_cout(char *, modem_info *);
-extern void isdn_tty_modem_hup(modem_info *, int);
-#ifdef CONFIG_ISDN_TTY_FAX
-extern int  isdn_tty_cmd_PLUSF_FAX(char **, modem_info *);
-extern int  isdn_tty_fax_command(modem_info *, isdn_ctrl *);
-extern void isdn_tty_fax_bitorder(modem_info *, struct sk_buff *);
-#endif
diff --git a/drivers/isdn/i4l/isdn_ttyfax.c b/drivers/isdn/i4l/isdn_ttyfax.c
deleted file mode 100644
index 47aae4916730..000000000000
--- a/drivers/isdn/i4l/isdn_ttyfax.c
+++ /dev/null
@@ -1,1123 +0,0 @@
-/* $Id: isdn_ttyfax.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, tty_fax AT-command emulator (linklevel).
- *
- * Copyright 1999    by Armin Schindler (mac@melware.de)
- * Copyright 1999    by Ralf Spachmann (mel@melware.de)
- * Copyright 1999    by Cytronics & Melware
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#undef ISDN_TTY_FAX_STAT_DEBUG
-#undef ISDN_TTY_FAX_CMD_DEBUG
-
-#include <linux/isdn.h>
-#include "isdn_common.h"
-#include "isdn_tty.h"
-#include "isdn_ttyfax.h"
-
-
-static char *isdn_tty_fax_revision = "$Revision: 1.1.2.2 $";
-
-#define PARSE_ERROR1 { isdn_tty_fax_modem_result(1, info); return 1; }
-
-static char *
-isdn_getrev(const char *revision)
-{
-	char *rev;
-	char *p;
-
-	if ((p = strchr(revision, ':'))) {
-		rev = p + 2;
-		p = strchr(rev, '$');
-		*--p = 0;
-	} else
-		rev = "???";
-	return rev;
-}
-
-/*
- * Fax Class 2 Modem results
- *
- */
-
-static void
-isdn_tty_fax_modem_result(int code, modem_info *info)
-{
-	atemu *m = &info->emu;
-	T30_s *f = info->fax;
-	char rs[50];
-	char rss[50];
-	char *rp;
-	int i;
-	static char *msg[] =
-		{"OK", "ERROR", "+FCON", "+FCSI:", "+FDIS:",
-		 "+FHNG:", "+FDCS:", "CONNECT", "+FTSI:",
-		 "+FCFR", "+FPTS:", "+FET:"};
-
-
-	isdn_tty_at_cout("\r\n", info);
-	isdn_tty_at_cout(msg[code], info);
-
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-	printk(KERN_DEBUG "isdn_tty: Fax send %s on ttyI%d\n",
-	       msg[code], info->line);
-#endif
-	switch (code) {
-	case 0: /* OK */
-		break;
-	case 1: /* ERROR */
-		break;
-	case 2:	/* +FCON */
-		/* Append CPN, if enabled */
-		if ((m->mdmreg[REG_CPNFCON] & BIT_CPNFCON) &&
-		    (!(dev->usage[info->isdn_channel] & ISDN_USAGE_OUTGOING))) {
-			sprintf(rs, "/%s", m->cpn);
-			isdn_tty_at_cout(rs, info);
-		}
-		info->online = 1;
-		f->fet = 0;
-		if (f->phase == ISDN_FAX_PHASE_A)
-			f->phase = ISDN_FAX_PHASE_B;
-		break;
-	case 3:	/* +FCSI */
-	case 8:	/* +FTSI */
-		sprintf(rs, "\"%s\"", f->r_id);
-		isdn_tty_at_cout(rs, info);
-		break;
-	case 4:	/* +FDIS */
-		rs[0] = 0;
-		rp = &f->r_resolution;
-		for (i = 0; i < 8; i++) {
-			sprintf(rss, "%c%s", rp[i] + 48,
-				(i < 7) ? "," : "");
-			strcat(rs, rss);
-		}
-		isdn_tty_at_cout(rs, info);
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-		printk(KERN_DEBUG "isdn_tty: Fax DIS=%s on ttyI%d\n",
-		       rs, info->line);
-#endif
-		break;
-	case 5:	/* +FHNG */
-		sprintf(rs, "%d", f->code);
-		isdn_tty_at_cout(rs, info);
-		info->faxonline = 0;
-		break;
-	case 6:	/* +FDCS */
-		rs[0] = 0;
-		rp = &f->r_resolution;
-		for (i = 0; i < 8; i++) {
-			sprintf(rss, "%c%s", rp[i] + 48,
-				(i < 7) ? "," : "");
-			strcat(rs, rss);
-		}
-		isdn_tty_at_cout(rs, info);
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-		printk(KERN_DEBUG "isdn_tty: Fax DCS=%s on ttyI%d\n",
-		       rs, info->line);
-#endif
-		break;
-	case 7:	/* CONNECT */
-		info->faxonline |= 2;
-		break;
-	case 9:	/* FCFR */
-		break;
-	case 10:	/* FPTS */
-		isdn_tty_at_cout("1", info);
-		break;
-	case 11:	/* FET */
-		sprintf(rs, "%d", f->fet);
-		isdn_tty_at_cout(rs, info);
-		break;
-	}
-
-	isdn_tty_at_cout("\r\n", info);
-
-	switch (code) {
-	case 7:	/* CONNECT */
-		info->online = 2;
-		if (info->faxonline & 1) {
-			sprintf(rs, "%c", XON);
-			isdn_tty_at_cout(rs, info);
-		}
-		break;
-	}
-}
-
-static int
-isdn_tty_fax_command1(modem_info *info, isdn_ctrl *c)
-{
-	static char *msg[] =
-		{"OK", "CONNECT", "NO CARRIER", "ERROR", "FCERROR"};
-
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-	printk(KERN_DEBUG "isdn_tty: FCLASS1 cmd(%d)\n", c->parm.aux.cmd);
-#endif
-	if (c->parm.aux.cmd < ISDN_FAX_CLASS1_QUERY) {
-		if (info->online)
-			info->online = 1;
-		isdn_tty_at_cout("\r\n", info);
-		isdn_tty_at_cout(msg[c->parm.aux.cmd], info);
-		isdn_tty_at_cout("\r\n", info);
-	}
-	switch (c->parm.aux.cmd) {
-	case ISDN_FAX_CLASS1_CONNECT:
-		info->online = 2;
-		break;
-	case ISDN_FAX_CLASS1_OK:
-	case ISDN_FAX_CLASS1_FCERROR:
-	case ISDN_FAX_CLASS1_ERROR:
-	case ISDN_FAX_CLASS1_NOCARR:
-		break;
-	case ISDN_FAX_CLASS1_QUERY:
-		isdn_tty_at_cout("\r\n", info);
-		if (!c->parm.aux.para[0]) {
-			isdn_tty_at_cout(msg[ISDN_FAX_CLASS1_ERROR], info);
-			isdn_tty_at_cout("\r\n", info);
-		} else {
-			isdn_tty_at_cout(c->parm.aux.para, info);
-			isdn_tty_at_cout("\r\nOK\r\n", info);
-		}
-		break;
-	}
-	return (0);
-}
-
-int
-isdn_tty_fax_command(modem_info *info, isdn_ctrl *c)
-{
-	T30_s *f = info->fax;
-	char rs[10];
-
-	if (TTY_IS_FCLASS1(info))
-		return (isdn_tty_fax_command1(info, c));
-
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-	printk(KERN_DEBUG "isdn_tty: Fax cmd %d on ttyI%d\n",
-	       f->r_code, info->line);
-#endif
-	switch (f->r_code) {
-	case ISDN_TTY_FAX_FCON:
-		info->faxonline = 1;
-		isdn_tty_fax_modem_result(2, info);	/* +FCON */
-		return (0);
-	case ISDN_TTY_FAX_FCON_I:
-		info->faxonline = 16;
-		isdn_tty_fax_modem_result(2, info);	/* +FCON */
-		return (0);
-	case ISDN_TTY_FAX_RID:
-		if (info->faxonline & 1)
-			isdn_tty_fax_modem_result(3, info);	/* +FCSI */
-		if (info->faxonline & 16)
-			isdn_tty_fax_modem_result(8, info);	/* +FTSI */
-		return (0);
-	case ISDN_TTY_FAX_DIS:
-		isdn_tty_fax_modem_result(4, info);	/* +FDIS */
-		return (0);
-	case ISDN_TTY_FAX_HNG:
-		if (f->phase == ISDN_FAX_PHASE_C) {
-			if (f->direction == ISDN_TTY_FAX_CONN_IN) {
-				sprintf(rs, "%c%c", DLE, ETX);
-				isdn_tty_at_cout(rs, info);
-			} else {
-				sprintf(rs, "%c", 0x18);
-				isdn_tty_at_cout(rs, info);
-			}
-			info->faxonline &= ~2;	/* leave data mode */
-			info->online = 1;
-		}
-		f->phase = ISDN_FAX_PHASE_E;
-		isdn_tty_fax_modem_result(5, info);	/* +FHNG */
-		isdn_tty_fax_modem_result(0, info);	/* OK */
-		return (0);
-	case ISDN_TTY_FAX_DCS:
-		isdn_tty_fax_modem_result(6, info);	/* +FDCS */
-		isdn_tty_fax_modem_result(7, info);	/* CONNECT */
-		f->phase = ISDN_FAX_PHASE_C;
-		return (0);
-	case ISDN_TTY_FAX_TRAIN_OK:
-		isdn_tty_fax_modem_result(6, info);	/* +FDCS */
-		isdn_tty_fax_modem_result(0, info);	/* OK */
-		return (0);
-	case ISDN_TTY_FAX_SENT:
-		isdn_tty_fax_modem_result(0, info);	/* OK */
-		return (0);
-	case ISDN_TTY_FAX_CFR:
-		isdn_tty_fax_modem_result(9, info);	/* +FCFR */
-		return (0);
-	case ISDN_TTY_FAX_ET:
-		sprintf(rs, "%c%c", DLE, ETX);
-		isdn_tty_at_cout(rs, info);
-		isdn_tty_fax_modem_result(10, info);	/* +FPTS */
-		isdn_tty_fax_modem_result(11, info);	/* +FET */
-		isdn_tty_fax_modem_result(0, info);	/* OK */
-		info->faxonline &= ~2;	/* leave data mode */
-		info->online = 1;
-		f->phase = ISDN_FAX_PHASE_D;
-		return (0);
-	case ISDN_TTY_FAX_PTS:
-		isdn_tty_fax_modem_result(10, info);	/* +FPTS */
-		if (f->direction == ISDN_TTY_FAX_CONN_OUT) {
-			if (f->fet == 1)
-				f->phase = ISDN_FAX_PHASE_B;
-			if (f->fet == 0)
-				isdn_tty_fax_modem_result(0, info);	/* OK */
-		}
-		return (0);
-	case ISDN_TTY_FAX_EOP:
-		info->faxonline &= ~2;	/* leave data mode */
-		info->online = 1;
-		f->phase = ISDN_FAX_PHASE_D;
-		return (0);
-
-	}
-	return (-1);
-}
-
-
-void
-isdn_tty_fax_bitorder(modem_info *info, struct sk_buff *skb)
-{
-	__u8 LeftMask;
-	__u8 RightMask;
-	__u8 fBit;
-	__u8 Data;
-	int i;
-
-	if (!info->fax->bor) {
-		for (i = 0; i < skb->len; i++) {
-			Data = skb->data[i];
-			for (
-				LeftMask = 0x80, RightMask = 0x01;
-				LeftMask > RightMask;
-				LeftMask >>= 1, RightMask <<= 1
-				) {
-				fBit = (Data & LeftMask);
-				if (Data & RightMask)
-					Data |= LeftMask;
-				else
-					Data &= ~LeftMask;
-				if (fBit)
-					Data |= RightMask;
-				else
-					Data &= ~RightMask;
-
-			}
-			skb->data[i] = Data;
-		}
-	}
-}
-
-/*
- * Parse AT+F.. FAX class 1 commands
- */
-
-static int
-isdn_tty_cmd_FCLASS1(char **p, modem_info *info)
-{
-	static char *cmd[] =
-		{"AE", "TS", "RS", "TM", "RM", "TH", "RH"};
-	isdn_ctrl c;
-	int par, i;
-	u_long flags;
-
-	for (c.parm.aux.cmd = 0; c.parm.aux.cmd < 7; c.parm.aux.cmd++)
-		if (!strncmp(p[0], cmd[c.parm.aux.cmd], 2))
-			break;
-
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-	printk(KERN_DEBUG "isdn_tty_cmd_FCLASS1 (%s,%d)\n", p[0], c.parm.aux.cmd);
-#endif
-	if (c.parm.aux.cmd == 7)
-		PARSE_ERROR1;
-
-	p[0] += 2;
-	switch (*p[0]) {
-	case '?':
-		p[0]++;
-		c.parm.aux.subcmd = AT_QUERY;
-		break;
-	case '=':
-		p[0]++;
-		if (*p[0] == '?') {
-			p[0]++;
-			c.parm.aux.subcmd = AT_EQ_QUERY;
-		} else {
-			par = isdn_getnum(p);
-			if ((par < 0) || (par > 255))
-				PARSE_ERROR1;
-			c.parm.aux.subcmd = AT_EQ_VALUE;
-			c.parm.aux.para[0] = par;
-		}
-		break;
-	case 0:
-		c.parm.aux.subcmd = AT_COMMAND;
-		break;
-	default:
-		PARSE_ERROR1;
-	}
-	c.command = ISDN_CMD_FAXCMD;
-#ifdef ISDN_TTY_FAX_CMD_DEBUG
-	printk(KERN_DEBUG "isdn_tty_cmd_FCLASS1 %d/%d/%d)\n",
-	       c.parm.aux.cmd, c.parm.aux.subcmd, c.parm.aux.para[0]);
-#endif
-	if (info->isdn_driver < 0) {
-		if ((c.parm.aux.subcmd == AT_EQ_VALUE) ||
-		    (c.parm.aux.subcmd == AT_COMMAND)) {
-			PARSE_ERROR1;
-		}
-		spin_lock_irqsave(&dev->lock, flags);
-		/* get a temporary connection to the first free fax driver */
-		i = isdn_get_free_channel(ISDN_USAGE_FAX, ISDN_PROTO_L2_FAX,
-					  ISDN_PROTO_L3_FCLASS1, -1, -1, "00");
-		if (i < 0) {
-			spin_unlock_irqrestore(&dev->lock, flags);
-			PARSE_ERROR1;
-		}
-		info->isdn_driver = dev->drvmap[i];
-		info->isdn_channel = dev->chanmap[i];
-		info->drv_index = i;
-		dev->m_idx[i] = info->line;
-		spin_unlock_irqrestore(&dev->lock, flags);
-		c.driver = info->isdn_driver;
-		c.arg = info->isdn_channel;
-		isdn_command(&c);
-		spin_lock_irqsave(&dev->lock, flags);
-		isdn_free_channel(info->isdn_driver, info->isdn_channel,
-				  ISDN_USAGE_FAX);
-		info->isdn_driver = -1;
-		info->isdn_channel = -1;
-		if (info->drv_index >= 0) {
-			dev->m_idx[info->drv_index] = -1;
-			info->drv_index = -1;
-		}
-		spin_unlock_irqrestore(&dev->lock, flags);
-	} else {
-		c.driver = info->isdn_driver;
-		c.arg = info->isdn_channel;
-		isdn_command(&c);
-	}
-	return 1;
-}
-
-/*
- * Parse AT+F.. FAX class 2 commands
- */
-
-static int
-isdn_tty_cmd_FCLASS2(char **p, modem_info *info)
-{
-	atemu *m = &info->emu;
-	T30_s *f = info->fax;
-	isdn_ctrl cmd;
-	int par;
-	char rs[50];
-	char rss[50];
-	int maxdccval[] =
-		{1, 5, 2, 2, 3, 2, 0, 7};
-
-	/* FAA still unchanged */
-	if (!strncmp(p[0], "AA", 2)) {	/* TODO */
-		p[0] += 2;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", 0);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			par = isdn_getnum(p);
-			if ((par < 0) || (par > 255))
-				PARSE_ERROR1;
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* BADLIN=value - dummy 0=disable errorchk disabled, 1-255 nr. of lines for making page bad */
-	if (!strncmp(p[0], "BADLIN", 6)) {
-		p[0] += 6;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->badlin);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0-255");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 255))
-					PARSE_ERROR1;
-				f->badlin = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FBADLIN=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* BADMUL=value - dummy 0=disable errorchk disabled (threshold multiplier) */
-	if (!strncmp(p[0], "BADMUL", 6)) {
-		p[0] += 6;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->badmul);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0-255");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 255))
-					PARSE_ERROR1;
-				f->badmul = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FBADMUL=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* BOR=n - Phase C bit order, 0=direct, 1=reverse */
-	if (!strncmp(p[0], "BOR", 3)) {
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->bor);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0,1");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 1))
-					PARSE_ERROR1;
-				f->bor = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FBOR=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* NBC=n - No Best Capabilities */
-	if (!strncmp(p[0], "NBC", 3)) {
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->nbc);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0,1");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 1))
-					PARSE_ERROR1;
-				f->nbc = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FNBC=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* BUF? - Readonly buffersize readout  */
-	if (!strncmp(p[0], "BUF?", 4)) {
-		p[0] += 4;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-		printk(KERN_DEBUG "isdn_tty: Fax FBUF? (%d) \n", (16 * m->mdmreg[REG_PSIZE]));
-#endif
-		p[0]++;
-		sprintf(rs, "\r\n %d ", (16 * m->mdmreg[REG_PSIZE]));
-		isdn_tty_at_cout(rs, info);
-		return 0;
-	}
-	/* CIG=string - local fax station id string for polling rx */
-	if (!strncmp(p[0], "CIG", 3)) {
-		int i, r;
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n\"%s\"", f->pollid);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n\"STRING\"");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				if (*p[0] == '"')
-					p[0]++;
-				for (i = 0; (*p[0]) && i < (FAXIDLEN - 1) && (*p[0] != '"'); i++) {
-					f->pollid[i] = *p[0]++;
-				}
-				if (*p[0] == '"')
-					p[0]++;
-				for (r = i; r < FAXIDLEN; r++) {
-					f->pollid[r] = 32;
-				}
-				f->pollid[FAXIDLEN - 1] = 0;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax local poll ID rx \"%s\"\n", f->pollid);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* CQ=n - copy qlty chk, 0= no chk, 1=only 1D chk, 2=1D+2D chk */
-	if (!strncmp(p[0], "CQ", 2)) {
-		p[0] += 2;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->cq);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0,1,2");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 2))
-					PARSE_ERROR1;
-				f->cq = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FCQ=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* CR=n - can receive? 0= no data rx or poll remote dev, 1=do receive data or poll remote dev */
-	if (!strncmp(p[0], "CR", 2)) {
-		p[0] += 2;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->cr);	/* read actual value from struct and print */
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0,1");		/* display online help */
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 1))
-					PARSE_ERROR1;
-				f->cr = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FCR=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* CTCRTY=value - ECM retry count */
-	if (!strncmp(p[0], "CTCRTY", 6)) {
-		p[0] += 6;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->ctcrty);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0-255");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 255))
-					PARSE_ERROR1;
-				f->ctcrty = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FCTCRTY=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* DCC=vr,br,wd,ln,df,ec,bf,st - DCE capabilities parms */
-	if (!strncmp(p[0], "DCC", 3)) {
-		char *rp = &f->resolution;
-		int i;
-
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			strcpy(rs, "\r\n");
-			for (i = 0; i < 8; i++) {
-				sprintf(rss, "%c%s", rp[i] + 48,
-					(i < 7) ? "," : "");
-				strcat(rs, rss);
-			}
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				isdn_tty_at_cout("\r\n(0,1),(0-5),(0-2),(0-2),(0-3),(0-2),(0),(0-7)", info);
-				p[0]++;
-			} else {
-				for (i = 0; (((*p[0] >= '0') && (*p[0] <= '9')) || (*p[0] == ',')) && (i < 8); i++) {
-					if (*p[0] != ',') {
-						if ((*p[0] - 48) > maxdccval[i]) {
-							PARSE_ERROR1;
-						}
-						rp[i] = *p[0] - 48;
-						p[0]++;
-						if (*p[0] == ',')
-							p[0]++;
-					} else
-						p[0]++;
-				}
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FDCC capabilities DCE=%d,%d,%d,%d,%d,%d,%d,%d\n",
-				       rp[0], rp[1], rp[2], rp[3], rp[4], rp[5], rp[6], rp[7]);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* DIS=vr,br,wd,ln,df,ec,bf,st - current session parms */
-	if (!strncmp(p[0], "DIS", 3)) {
-		char *rp = &f->resolution;
-		int i;
-
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			strcpy(rs, "\r\n");
-			for (i = 0; i < 8; i++) {
-				sprintf(rss, "%c%s", rp[i] + 48,
-					(i < 7) ? "," : "");
-				strcat(rs, rss);
-			}
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				isdn_tty_at_cout("\r\n(0,1),(0-5),(0-2),(0-2),(0-3),(0-2),(0),(0-7)", info);
-				p[0]++;
-			} else {
-				for (i = 0; (((*p[0] >= '0') && (*p[0] <= '9')) || (*p[0] == ',')) && (i < 8); i++) {
-					if (*p[0] != ',') {
-						if ((*p[0] - 48) > maxdccval[i]) {
-							PARSE_ERROR1;
-						}
-						rp[i] = *p[0] - 48;
-						p[0]++;
-						if (*p[0] == ',')
-							p[0]++;
-					} else
-						p[0]++;
-				}
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FDIS session parms=%d,%d,%d,%d,%d,%d,%d,%d\n",
-				       rp[0], rp[1], rp[2], rp[3], rp[4], rp[5], rp[6], rp[7]);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* DR - Receive Phase C data command, initiates document reception */
-	if (!strncmp(p[0], "DR", 2)) {
-		p[0] += 2;
-		if ((info->faxonline & 16) &&	/* incoming connection */
-		    ((f->phase == ISDN_FAX_PHASE_B) || (f->phase == ISDN_FAX_PHASE_D))) {
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-			printk(KERN_DEBUG "isdn_tty: Fax FDR\n");
-#endif
-			f->code = ISDN_TTY_FAX_DR;
-			cmd.driver = info->isdn_driver;
-			cmd.arg = info->isdn_channel;
-			cmd.command = ISDN_CMD_FAXCMD;
-			isdn_command(&cmd);
-			if (f->phase == ISDN_FAX_PHASE_B) {
-				f->phase = ISDN_FAX_PHASE_C;
-			} else if (f->phase == ISDN_FAX_PHASE_D) {
-				switch (f->fet) {
-				case 0:	/* next page will be received */
-					f->phase = ISDN_FAX_PHASE_C;
-					isdn_tty_fax_modem_result(7, info);	/* CONNECT */
-					break;
-				case 1:	/* next doc will be received */
-					f->phase = ISDN_FAX_PHASE_B;
-					break;
-				case 2:	/* fax session is terminating */
-					f->phase = ISDN_FAX_PHASE_E;
-					break;
-				default:
-					PARSE_ERROR1;
-				}
-			}
-		} else {
-			PARSE_ERROR1;
-		}
-		return 1;
-	}
-	/* DT=df,vr,wd,ln - TX phase C data command (release DCE to proceed with negotiation) */
-	if (!strncmp(p[0], "DT", 2)) {
-		int i, val[] =
-			{4, 0, 2, 3};
-		char *rp = &f->resolution;
-
-		p[0] += 2;
-		if (!(info->faxonline & 1))	/* not outgoing connection */
-			PARSE_ERROR1;
-
-		for (i = 0; (((*p[0] >= '0') && (*p[0] <= '9')) || (*p[0] == ',')) && (i < 4); i++) {
-			if (*p[0] != ',') {
-				if ((*p[0] - 48) > maxdccval[val[i]]) {
-					PARSE_ERROR1;
-				}
-				rp[val[i]] = *p[0] - 48;
-				p[0]++;
-				if (*p[0] == ',')
-					p[0]++;
-			} else
-				p[0]++;
-		}
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-		printk(KERN_DEBUG "isdn_tty: Fax FDT tx data command parms=%d,%d,%d,%d\n",
-		       rp[4], rp[0], rp[2], rp[3]);
-#endif
-		if ((f->phase == ISDN_FAX_PHASE_B) || (f->phase == ISDN_FAX_PHASE_D)) {
-			f->code = ISDN_TTY_FAX_DT;
-			cmd.driver = info->isdn_driver;
-			cmd.arg = info->isdn_channel;
-			cmd.command = ISDN_CMD_FAXCMD;
-			isdn_command(&cmd);
-			if (f->phase == ISDN_FAX_PHASE_D) {
-				f->phase = ISDN_FAX_PHASE_C;
-				isdn_tty_fax_modem_result(7, info);	/* CONNECT */
-			}
-		} else {
-			PARSE_ERROR1;
-		}
-		return 1;
-	}
-	/* ECM=n - Error mode control 0=disabled, 2=enabled, handled by DCE alone incl. buff of partial pages */
-	if (!strncmp(p[0], "ECM", 3)) {
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->ecm);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0,2");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par != 0) && (par != 2))
-					PARSE_ERROR1;
-				f->ecm = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FECM=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* ET=n - End of page or document */
-	if (!strncmp(p[0], "ET=", 3)) {
-		p[0] += 3;
-		if (*p[0] == '?') {
-			p[0]++;
-			sprintf(rs, "\r\n0-2");
-			isdn_tty_at_cout(rs, info);
-		} else {
-			if ((f->phase != ISDN_FAX_PHASE_D) ||
-			    (!(info->faxonline & 1)))
-				PARSE_ERROR1;
-			par = isdn_getnum(p);
-			if ((par < 0) || (par > 2))
-				PARSE_ERROR1;
-			f->fet = par;
-			f->code = ISDN_TTY_FAX_ET;
-			cmd.driver = info->isdn_driver;
-			cmd.arg = info->isdn_channel;
-			cmd.command = ISDN_CMD_FAXCMD;
-			isdn_command(&cmd);
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-			printk(KERN_DEBUG "isdn_tty: Fax FET=%d\n", par);
-#endif
-			return 1;
-		}
-		return 0;
-	}
-	/* K - terminate */
-	if (!strncmp(p[0], "K", 1)) {
-		p[0] += 1;
-		if ((f->phase == ISDN_FAX_PHASE_IDLE) || (f->phase == ISDN_FAX_PHASE_E))
-			PARSE_ERROR1;
-		isdn_tty_modem_hup(info, 1);
-		return 1;
-	}
-	/* LID=string - local fax ID */
-	if (!strncmp(p[0], "LID", 3)) {
-		int i, r;
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n\"%s\"", f->id);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n\"STRING\"");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				if (*p[0] == '"')
-					p[0]++;
-				for (i = 0; (*p[0]) && i < (FAXIDLEN - 1) && (*p[0] != '"'); i++) {
-					f->id[i] = *p[0]++;
-				}
-				if (*p[0] == '"')
-					p[0]++;
-				for (r = i; r < FAXIDLEN; r++) {
-					f->id[r] = 32;
-				}
-				f->id[FAXIDLEN - 1] = 0;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax local ID \"%s\"\n", f->id);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-
-	/* MDL? - DCE Model       */
-	if (!strncmp(p[0], "MDL?", 4)) {
-		p[0] += 4;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-		printk(KERN_DEBUG "isdn_tty: FMDL?\n");
-#endif
-		isdn_tty_at_cout("\r\nisdn4linux", info);
-		return 0;
-	}
-	/* MFR? - DCE Manufacturer */
-	if (!strncmp(p[0], "MFR?", 4)) {
-		p[0] += 4;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-		printk(KERN_DEBUG "isdn_tty: FMFR?\n");
-#endif
-		isdn_tty_at_cout("\r\nisdn4linux", info);
-		return 0;
-	}
-	/* MINSP=n - Minimum Speed for Phase C */
-	if (!strncmp(p[0], "MINSP", 5)) {
-		p[0] += 5;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->minsp);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0-5");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 5))
-					PARSE_ERROR1;
-				f->minsp = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FMINSP=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* PHCTO=value - DTE phase C timeout */
-	if (!strncmp(p[0], "PHCTO", 5)) {
-		p[0] += 5;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->phcto);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0-255");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 255))
-					PARSE_ERROR1;
-				f->phcto = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FPHCTO=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-
-	/* REL=n - Phase C received EOL alignment */
-	if (!strncmp(p[0], "REL", 3)) {
-		p[0] += 3;
-		switch (*p[0]) {
-		case '?':
-			p[0]++;
-			sprintf(rs, "\r\n%d", f->rel);
-			isdn_tty_at_cout(rs, info);
-			break;
-		case '=':
-			p[0]++;
-			if (*p[0] == '?') {
-				p[0]++;
-				sprintf(rs, "\r\n0,1");
-				isdn_tty_at_cout(rs, info);
-			} else {
-				par = isdn_getnum(p);
-				if ((par < 0) || (par > 1))
-					PARSE_ERROR1;
-				f->rel = par;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-				printk(KERN_DEBUG "isdn_tty: Fax FREL=%d\n", par);
-#endif
-			}
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	/* REV? - DCE Revision */
-	if (!strncmp(p[0], "REV?", 4)) {
-		p[0] += 4;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-		printk(KERN_DEBUG "isdn_tty: FREV?\n");
-#endif
-		strcpy(rss, isdn_tty_fax_revision);
-		sprintf(rs, "\r\nRev: %s", isdn_getrev(rss));
-		isdn_tty_at_cout(rs, info);
-		return 0;
-	}
-
-	/* Phase C Transmit Data Block Size */
-	if (!strncmp(p[0], "TBC=", 4)) {	/* dummy, not used */
-		p[0] += 4;
-#ifdef ISDN_TTY_FAX_STAT_DEBUG
-		printk(KERN_DEBUG "isdn_tty: Fax FTBC=%c\n", *p[0]);
-#endif
-		switch (*p[0]) {
-		case '0':
-			p[0]++;
-			break;
-		default:
-			PARSE_ERROR1;
-		}
-		return 0;
-	}
-	printk(KERN_DEBUG "isdn_tty: unknown token=>AT+F%s<\n", p[0]);
-	PARSE_ERROR1;
-}
-
-int
-isdn_tty_cmd_PLUSF_FAX(char **p, modem_info *info)
-{
-	if (TTY_IS_FCLASS2(info))
-		return (isdn_tty_cmd_FCLASS2(p, info));
-	else if (TTY_IS_FCLASS1(info))
-		return (isdn_tty_cmd_FCLASS1(p, info));
-	PARSE_ERROR1;
-}
diff --git a/drivers/isdn/i4l/isdn_ttyfax.h b/drivers/isdn/i4l/isdn_ttyfax.h
deleted file mode 100644
index ccda4fcf8f7b..000000000000
--- a/drivers/isdn/i4l/isdn_ttyfax.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* $Id: isdn_ttyfax.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * header for Linux ISDN subsystem, tty_fax related functions (linklevel).
- *
- * Copyright 1999   by Armin Schindler (mac@melware.de)
- * Copyright 1999   by Ralf Spachmann (mel@melware.de)
- * Copyright 1999   by Cytronics & Melware
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-
-#define XON	0x11
-#define XOFF	0x13
-#define DC2	0x12
diff --git a/drivers/isdn/i4l/isdn_v110.c b/drivers/isdn/i4l/isdn_v110.c
deleted file mode 100644
index d11fe76f138f..000000000000
--- a/drivers/isdn/i4l/isdn_v110.c
+++ /dev/null
@@ -1,625 +0,0 @@
-/* $Id: isdn_v110.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, V.110 related functions (linklevel).
- *
- * Copyright by Thomas Pfeiffer (pfeiffer@pds.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-
-#include <linux/isdn.h>
-#include "isdn_v110.h"
-
-#undef ISDN_V110_DEBUG
-
-char *isdn_v110_revision = "$Revision: 1.1.2.2 $";
-
-#define V110_38400 255
-#define V110_19200  15
-#define V110_9600    3
-
-/*
- * The following data are precoded matrices, online and offline matrix
- * for 9600, 19200 und 38400, respectively
- */
-static unsigned char V110_OnMatrix_9600[] =
-{0xfc, 0xfc, 0xfc, 0xfc, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff,
- 0xff, 0xfd, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0xfd,
- 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff,
- 0xff, 0xfd, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0xfd};
-
-static unsigned char V110_OffMatrix_9600[] =
-{0xfc, 0xfc, 0xfc, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-
-static unsigned char V110_OnMatrix_19200[] =
-{0xf0, 0xf0, 0xff, 0xf7, 0xff, 0xf7, 0xff, 0xf7, 0xff, 0xf7,
- 0xfd, 0xff, 0xff, 0xf7, 0xff, 0xf7, 0xff, 0xf7, 0xff, 0xf7};
-
-static unsigned char V110_OffMatrix_19200[] =
-{0xf0, 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-
-static unsigned char V110_OnMatrix_38400[] =
-{0x00, 0x7f, 0x7f, 0x7f, 0x7f, 0xfd, 0x7f, 0x7f, 0x7f, 0x7f};
-
-static unsigned char V110_OffMatrix_38400[] =
-{0x00, 0xff, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0xff};
-
-/*
- * FlipBits reorders sequences of keylen bits in one byte.
- * E.g. source order 7654321 will be converted to 45670123 when keylen = 4,
- * and to 67452301 when keylen = 2. This is necessary because ordering on
- * the isdn line is the other way.
- */
-static inline unsigned char
-FlipBits(unsigned char c, int keylen)
-{
-	unsigned char b = c;
-	unsigned char bit = 128;
-	int i;
-	int j;
-	int hunks = (8 / keylen);
-
-	c = 0;
-	for (i = 0; i < hunks; i++) {
-		for (j = 0; j < keylen; j++) {
-			if (b & (bit >> j))
-				c |= bit >> (keylen - j - 1);
-		}
-		bit >>= keylen;
-	}
-	return c;
-}
-
-
-/* isdn_v110_open allocates and initializes private V.110 data
- * structures and returns a pointer to these.
- */
-static isdn_v110_stream *
-isdn_v110_open(unsigned char key, int hdrlen, int maxsize)
-{
-	int i;
-	isdn_v110_stream *v;
-
-	if ((v = kzalloc(sizeof(isdn_v110_stream), GFP_ATOMIC)) == NULL)
-		return NULL;
-	v->key = key;
-	v->nbits = 0;
-	for (i = 0; key & (1 << i); i++)
-		v->nbits++;
-
-	v->nbytes = 8 / v->nbits;
-	v->decodelen = 0;
-
-	switch (key) {
-	case V110_38400:
-		v->OnlineFrame = V110_OnMatrix_38400;
-		v->OfflineFrame = V110_OffMatrix_38400;
-		break;
-	case V110_19200:
-		v->OnlineFrame = V110_OnMatrix_19200;
-		v->OfflineFrame = V110_OffMatrix_19200;
-		break;
-	default:
-		v->OnlineFrame = V110_OnMatrix_9600;
-		v->OfflineFrame = V110_OffMatrix_9600;
-		break;
-	}
-	v->framelen = v->nbytes * 10;
-	v->SyncInit = 5;
-	v->introducer = 0;
-	v->dbit = 1;
-	v->b = 0;
-	v->skbres = hdrlen;
-	v->maxsize = maxsize - hdrlen;
-	if ((v->encodebuf = kmalloc(maxsize, GFP_ATOMIC)) == NULL) {
-		kfree(v);
-		return NULL;
-	}
-	return v;
-}
-
-/* isdn_v110_close frees private V.110 data structures */
-void
-isdn_v110_close(isdn_v110_stream *v)
-{
-	if (v == NULL)
-		return;
-#ifdef ISDN_V110_DEBUG
-	printk(KERN_DEBUG "v110 close\n");
-#endif
-	kfree(v->encodebuf);
-	kfree(v);
-}
-
-
-/*
- * ValidHeaderBytes return the number of valid bytes in v->decodebuf
- */
-static int
-ValidHeaderBytes(isdn_v110_stream *v)
-{
-	int i;
-	for (i = 0; (i < v->decodelen) && (i < v->nbytes); i++)
-		if ((v->decodebuf[i] & v->key) != 0)
-			break;
-	return i;
-}
-
-/*
- * SyncHeader moves the decodebuf ptr to the next valid header
- */
-static void
-SyncHeader(isdn_v110_stream *v)
-{
-	unsigned char *rbuf = v->decodebuf;
-	int len = v->decodelen;
-
-	if (len == 0)
-		return;
-	for (rbuf++, len--; len > 0; len--, rbuf++)	/* such den SyncHeader in buf ! */
-		if ((*rbuf & v->key) == 0)	/* erstes byte gefunden ?       */
-			break;  /* jupp!                        */
-	if (len)
-		memcpy(v->decodebuf, rbuf, len);
-
-	v->decodelen = len;
-#ifdef ISDN_V110_DEBUG
-	printk(KERN_DEBUG "isdn_v110: Header resync\n");
-#endif
-}
-
-/* DecodeMatrix takes n (n>=1) matrices (v110 frames, 10 bytes) where
-   len is the number of matrix-lines. len must be a multiple of 10, i.e.
-   only complete matices must be given.
-   From these, netto data is extracted and returned in buf. The return-value
-   is the bytecount of the decoded data.
-*/
-static int
-DecodeMatrix(isdn_v110_stream *v, unsigned char *m, int len, unsigned char *buf)
-{
-	int line = 0;
-	int buflen = 0;
-	int mbit = 64;
-	int introducer = v->introducer;
-	int dbit = v->dbit;
-	unsigned char b = v->b;
-
-	while (line < len) {    /* Are we done with all lines of the matrix? */
-		if ((line % 10) == 0) {	/* the 0. line of the matrix is always 0 ! */
-			if (m[line] != 0x00) {	/* not 0 ? -> error! */
-#ifdef ISDN_V110_DEBUG
-				printk(KERN_DEBUG "isdn_v110: DecodeMatrix, V110 Bad Header\n");
-				/* returning now is not the right thing, though :-( */
-#endif
-			}
-			line++; /* next line of matrix */
-			continue;
-		} else if ((line % 10) == 5) {	/* in line 5 there's only e-bits ! */
-			if ((m[line] & 0x70) != 0x30) {	/* 011 has to be at the beginning! */
-#ifdef ISDN_V110_DEBUG
-				printk(KERN_DEBUG "isdn_v110: DecodeMatrix, V110 Bad 5th line\n");
-				/* returning now is not the right thing, though :-( */
-#endif
-			}
-			line++; /* next line */
-			continue;
-		} else if (!introducer) {	/* every byte starts with 10 (stopbit, startbit) */
-			introducer = (m[line] & mbit) ? 0 : 1;	/* current bit of the matrix */
-		next_byte:
-			if (mbit > 2) {	/* was it the last bit in this line ? */
-				mbit >>= 1;	/* no -> take next */
-				continue;
-			}       /* otherwise start with leftmost bit in the next line */
-			mbit = 64;
-			line++;
-			continue;
-		} else {        /* otherwise we need to set a data bit */
-			if (m[line] & mbit)	/* was that bit set in the matrix ? */
-				b |= dbit;	/* yes -> set it in the data byte */
-			else
-				b &= dbit - 1;	/* no -> clear it in the data byte */
-			if (dbit < 128)	/* is that data byte done ? */
-				dbit <<= 1;	/* no, got the next bit */
-			else {  /* data byte is done */
-				buf[buflen++] = b;	/* copy byte into the output buffer */
-				introducer = b = 0;	/* init of the intro sequence and of the data byte */
-				dbit = 1;	/* next we look for the 0th bit */
-			}
-			goto next_byte;	/* look for next bit in the matrix */
-		}
-	}
-	v->introducer = introducer;
-	v->dbit = dbit;
-	v->b = b;
-	return buflen;          /* return number of bytes in the output buffer */
-}
-
-/*
- * DecodeStream receives V.110 coded data from the input stream. It recovers the
- * original frames.
- * The input stream doesn't need to be framed
- */
-struct sk_buff *
-isdn_v110_decode(isdn_v110_stream *v, struct sk_buff *skb)
-{
-	int i;
-	int j;
-	int len;
-	unsigned char *v110_buf;
-	unsigned char *rbuf;
-
-	if (!skb) {
-		printk(KERN_WARNING "isdn_v110_decode called with NULL skb!\n");
-		return NULL;
-	}
-	rbuf = skb->data;
-	len = skb->len;
-	if (v == NULL) {
-		/* invalid handle, no chance to proceed */
-		printk(KERN_WARNING "isdn_v110_decode called with NULL stream!\n");
-		dev_kfree_skb(skb);
-		return NULL;
-	}
-	if (v->decodelen == 0)  /* cache empty?               */
-		for (; len > 0; len--, rbuf++)	/* scan for SyncHeader in buf */
-			if ((*rbuf & v->key) == 0)
-				break;	/* found first byte           */
-	if (len == 0) {
-		dev_kfree_skb(skb);
-		return NULL;
-	}
-	/* copy new data to decode-buffer */
-	memcpy(&(v->decodebuf[v->decodelen]), rbuf, len);
-	v->decodelen += len;
-ReSync:
-	if (v->decodelen < v->nbytes) {	/* got a new header ? */
-		dev_kfree_skb(skb);
-		return NULL;    /* no, try later      */
-	}
-	if (ValidHeaderBytes(v) != v->nbytes) {	/* is that a valid header? */
-		SyncHeader(v);  /* no -> look for header */
-		goto ReSync;
-	}
-	len = (v->decodelen - (v->decodelen % (10 * v->nbytes))) / v->nbytes;
-	if ((v110_buf = kmalloc(len, GFP_ATOMIC)) == NULL) {
-		printk(KERN_WARNING "isdn_v110_decode: Couldn't allocate v110_buf\n");
-		dev_kfree_skb(skb);
-		return NULL;
-	}
-	for (i = 0; i < len; i++) {
-		v110_buf[i] = 0;
-		for (j = 0; j < v->nbytes; j++)
-			v110_buf[i] |= (v->decodebuf[(i * v->nbytes) + j] & v->key) << (8 - ((j + 1) * v->nbits));
-		v110_buf[i] = FlipBits(v110_buf[i], v->nbits);
-	}
-	v->decodelen = (v->decodelen % (10 * v->nbytes));
-	memcpy(v->decodebuf, &(v->decodebuf[len * v->nbytes]), v->decodelen);
-
-	skb_trim(skb, DecodeMatrix(v, v110_buf, len, skb->data));
-	kfree(v110_buf);
-	if (skb->len)
-		return skb;
-	else {
-		kfree_skb(skb);
-		return NULL;
-	}
-}
-
-/* EncodeMatrix takes input data in buf, len is the bytecount.
-   Data is encoded into v110 frames in m. Return value is the number of
-   matrix-lines generated.
-*/
-static int
-EncodeMatrix(unsigned char *buf, int len, unsigned char *m, int mlen)
-{
-	int line = 0;
-	int i = 0;
-	int mbit = 128;
-	int dbit = 1;
-	int introducer = 3;
-	int ibit[] = {0, 1, 1};
-
-	while ((i < len) && (line < mlen)) {	/* while we still have input data */
-		switch (line % 10) {	/* in which line of the matrix are we? */
-		case 0:
-			m[line++] = 0x00;	/* line 0 is always 0 */
-			mbit = 128;	/* go on with the 7th bit */
-			break;
-		case 5:
-			m[line++] = 0xbf;	/* line 5 is always 10111111 */
-			mbit = 128;	/* go on with the 7th bit */
-			break;
-		}
-		if (line >= mlen) {
-			printk(KERN_WARNING "isdn_v110 (EncodeMatrix): buffer full!\n");
-			return line;
-		}
-	next_bit:
-		switch (mbit) { /* leftmost or rightmost bit ? */
-		case 1:
-			line++;	/* rightmost -> go to next line */
-			if (line >= mlen) {
-				printk(KERN_WARNING "isdn_v110 (EncodeMatrix): buffer full!\n");
-				return line;
-			}
-			/* fall through */
-		case 128:
-			m[line] = 128;	/* leftmost -> set byte to 1000000 */
-			mbit = 64;	/* current bit in the matrix line */
-			continue;
-		}
-		if (introducer) {	/* set 110 sequence ? */
-			introducer--;	/* set on digit less */
-			m[line] |= ibit[introducer] ? mbit : 0;	/* set corresponding bit */
-			mbit >>= 1;	/* bit of matrix line  >> 1 */
-			goto next_bit;	/* and go on there */
-		}               /* else push data bits into the matrix! */
-		m[line] |= (buf[i] & dbit) ? mbit : 0;	/* set data bit in matrix */
-		if (dbit == 128) {	/* was it the last one? */
-			dbit = 1;	/* then go on with first bit of  */
-			i++;            /* next byte in input buffer */
-			if (i < len)	/* input buffer done ? */
-				introducer = 3;	/* no, write introducer 110 */
-			else {  /* input buffer done ! */
-				m[line] |= (mbit - 1) & 0xfe;	/* set remaining bits in line to 1 */
-				break;
-			}
-		} else          /* not the last data bit */
-			dbit <<= 1;	/* then go to next data bit */
-		mbit >>= 1;     /* go to next bit of matrix */
-		goto next_bit;
-
-	}
-	/* if necessary, generate remaining lines of the matrix... */
-	if ((line) && ((line + 10) < mlen))
-		switch (++line % 10) {
-		case 1:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 2:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 3:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 4:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 5:
-			m[line++] = 0xbf;
-			/* fall through */
-		case 6:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 7:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 8:
-			m[line++] = 0xfe;
-			/* fall through */
-		case 9:
-			m[line++] = 0xfe;
-		}
-	return line;            /* that's how many lines we have */
-}
-
-/*
- * Build a sync frame.
- */
-static struct sk_buff *
-isdn_v110_sync(isdn_v110_stream *v)
-{
-	struct sk_buff *skb;
-
-	if (v == NULL) {
-		/* invalid handle, no chance to proceed */
-		printk(KERN_WARNING "isdn_v110_sync called with NULL stream!\n");
-		return NULL;
-	}
-	if ((skb = dev_alloc_skb(v->framelen + v->skbres))) {
-		skb_reserve(skb, v->skbres);
-		skb_put_data(skb, v->OfflineFrame, v->framelen);
-	}
-	return skb;
-}
-
-/*
- * Build an idle frame.
- */
-static struct sk_buff *
-isdn_v110_idle(isdn_v110_stream *v)
-{
-	struct sk_buff *skb;
-
-	if (v == NULL) {
-		/* invalid handle, no chance to proceed */
-		printk(KERN_WARNING "isdn_v110_sync called with NULL stream!\n");
-		return NULL;
-	}
-	if ((skb = dev_alloc_skb(v->framelen + v->skbres))) {
-		skb_reserve(skb, v->skbres);
-		skb_put_data(skb, v->OnlineFrame, v->framelen);
-	}
-	return skb;
-}
-
-struct sk_buff *
-isdn_v110_encode(isdn_v110_stream *v, struct sk_buff *skb)
-{
-	int i;
-	int j;
-	int rlen;
-	int mlen;
-	int olen;
-	int size;
-	int sval1;
-	int sval2;
-	int nframes;
-	unsigned char *v110buf;
-	unsigned char *rbuf;
-	struct sk_buff *nskb;
-
-	if (v == NULL) {
-		/* invalid handle, no chance to proceed */
-		printk(KERN_WARNING "isdn_v110_encode called with NULL stream!\n");
-		return NULL;
-	}
-	if (!skb) {
-		/* invalid skb, no chance to proceed */
-		printk(KERN_WARNING "isdn_v110_encode called with NULL skb!\n");
-		return NULL;
-	}
-	rlen = skb->len;
-	nframes = (rlen + 3) / 4;
-	v110buf = v->encodebuf;
-	if ((nframes * 40) > v->maxsize) {
-		size = v->maxsize;
-		rlen = v->maxsize / 40;
-	} else
-		size = nframes * 40;
-	if (!(nskb = dev_alloc_skb(size + v->skbres + sizeof(int)))) {
-		printk(KERN_WARNING "isdn_v110_encode: Couldn't alloc skb\n");
-		return NULL;
-	}
-	skb_reserve(nskb, v->skbres + sizeof(int));
-	if (skb->len == 0) {
-		skb_put_data(nskb, v->OnlineFrame, v->framelen);
-		*((int *)skb_push(nskb, sizeof(int))) = 0;
-		return nskb;
-	}
-	mlen = EncodeMatrix(skb->data, rlen, v110buf, size);
-	/* now distribute 2 or 4 bits each to the output stream! */
-	rbuf = skb_put(nskb, size);
-	olen = 0;
-	sval1 = 8 - v->nbits;
-	sval2 = v->key << sval1;
-	for (i = 0; i < mlen; i++) {
-		v110buf[i] = FlipBits(v110buf[i], v->nbits);
-		for (j = 0; j < v->nbytes; j++) {
-			if (size--)
-				*rbuf++ = ~v->key | (((v110buf[i] << (j * v->nbits)) & sval2) >> sval1);
-			else {
-				printk(KERN_WARNING "isdn_v110_encode: buffers full!\n");
-				goto buffer_full;
-			}
-			olen++;
-		}
-	}
-buffer_full:
-	skb_trim(nskb, olen);
-	*((int *)skb_push(nskb, sizeof(int))) = rlen;
-	return nskb;
-}
-
-int
-isdn_v110_stat_callback(int idx, isdn_ctrl *c)
-{
-	isdn_v110_stream *v = NULL;
-	int i;
-	int ret = 0;
-
-	if (idx < 0)
-		return 0;
-	switch (c->command) {
-	case ISDN_STAT_BSENT:
-		/* Keep the send-queue of the driver filled
-		 * with frames:
-		 * If number of outstanding frames < 3,
-		 * send down an Idle-Frame (or an Sync-Frame, if
-		 * v->SyncInit != 0).
-		 */
-		if (!(v = dev->v110[idx]))
-			return 0;
-		atomic_inc(&dev->v110use[idx]);
-		for (i = 0; i * v->framelen < c->parm.length; i++) {
-			if (v->skbidle > 0) {
-				v->skbidle--;
-				ret = 1;
-			} else {
-				if (v->skbuser > 0)
-					v->skbuser--;
-				ret = 0;
-			}
-		}
-		for (i = v->skbuser + v->skbidle; i < 2; i++) {
-			struct sk_buff *skb;
-			if (v->SyncInit > 0)
-				skb = isdn_v110_sync(v);
-			else
-				skb = isdn_v110_idle(v);
-			if (skb) {
-				if (dev->drv[c->driver]->interface->writebuf_skb(c->driver, c->arg, 1, skb) <= 0) {
-					dev_kfree_skb(skb);
-					break;
-				} else {
-					if (v->SyncInit)
-						v->SyncInit--;
-					v->skbidle++;
-				}
-			} else
-				break;
-		}
-		atomic_dec(&dev->v110use[idx]);
-		return ret;
-	case ISDN_STAT_DHUP:
-	case ISDN_STAT_BHUP:
-		while (1) {
-			atomic_inc(&dev->v110use[idx]);
-			if (atomic_dec_and_test(&dev->v110use[idx])) {
-				isdn_v110_close(dev->v110[idx]);
-				dev->v110[idx] = NULL;
-				break;
-			}
-			mdelay(1);
-		}
-		break;
-	case ISDN_STAT_BCONN:
-		if (dev->v110emu[idx] && (dev->v110[idx] == NULL)) {
-			int hdrlen = dev->drv[c->driver]->interface->hl_hdrlen;
-			int maxsize = dev->drv[c->driver]->interface->maxbufsize;
-			atomic_inc(&dev->v110use[idx]);
-			switch (dev->v110emu[idx]) {
-			case ISDN_PROTO_L2_V11096:
-				dev->v110[idx] = isdn_v110_open(V110_9600, hdrlen, maxsize);
-				break;
-			case ISDN_PROTO_L2_V11019:
-				dev->v110[idx] = isdn_v110_open(V110_19200, hdrlen, maxsize);
-				break;
-			case ISDN_PROTO_L2_V11038:
-				dev->v110[idx] = isdn_v110_open(V110_38400, hdrlen, maxsize);
-				break;
-			default:;
-			}
-			if ((v = dev->v110[idx])) {
-				while (v->SyncInit) {
-					struct sk_buff *skb = isdn_v110_sync(v);
-					if (dev->drv[c->driver]->interface->writebuf_skb(c->driver, c->arg, 1, skb) <= 0) {
-						dev_kfree_skb(skb);
-						/* Unable to send, try later */
-						break;
-					}
-					v->SyncInit--;
-					v->skbidle++;
-				}
-			} else
-				printk(KERN_WARNING "isdn_v110: Couldn't open stream for chan %d\n", idx);
-			atomic_dec(&dev->v110use[idx]);
-		}
-		break;
-	default:
-		return 0;
-	}
-	return 0;
-}
diff --git a/drivers/isdn/i4l/isdn_v110.h b/drivers/isdn/i4l/isdn_v110.h
deleted file mode 100644
index de774ab598c9..000000000000
--- a/drivers/isdn/i4l/isdn_v110.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* $Id: isdn_v110.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, V.110 related functions (linklevel).
- *
- * Copyright by Thomas Pfeiffer (pfeiffer@pds.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef _isdn_v110_h_
-#define _isdn_v110_h_
-
-/*
- * isdn_v110_encode will take raw data and encode it using V.110
- */
-extern struct sk_buff *isdn_v110_encode(isdn_v110_stream *, struct sk_buff *);
-
-/*
- * isdn_v110_decode receives V.110 coded data from the stream and rebuilds
- * frames from them. The source stream doesn't need to be framed.
- */
-extern struct sk_buff *isdn_v110_decode(isdn_v110_stream *, struct sk_buff *);
-
-extern int isdn_v110_stat_callback(int, isdn_ctrl *);
-extern void isdn_v110_close(isdn_v110_stream *v);
-
-#endif
diff --git a/drivers/isdn/i4l/isdn_x25iface.c b/drivers/isdn/i4l/isdn_x25iface.c
deleted file mode 100644
index 48bfbcb4a09d..000000000000
--- a/drivers/isdn/i4l/isdn_x25iface.c
+++ /dev/null
@@ -1,332 +0,0 @@
-/* $Id: isdn_x25iface.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * Linux ISDN subsystem, X.25 related functions
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- * stuff needed to support the Linux X.25 PLP code on top of devices that
- * can provide a lab_b service using the concap_proto mechanism.
- * This module supports a network interface which provides lapb_sematics
- * -- as defined in Documentation/networking/x25-iface.txt -- to
- * the upper layer and assumes that the lower layer provides a reliable
- * data link service by means of the concap_device_ops callbacks.
- *
- * Only protocol specific stuff goes here. Device specific stuff
- * goes to another -- device related -- concap_proto support source file.
- *
- */
-
-/* #include <linux/isdn.h> */
-#include <linux/netdevice.h>
-#include <linux/concap.h>
-#include <linux/slab.h>
-#include <linux/wanrouter.h>
-#include <net/x25device.h>
-#include "isdn_x25iface.h"
-
-/* for debugging messages not to cause an oops when device pointer is NULL*/
-#define MY_DEVNAME(dev)  ((dev) ? (dev)->name : "DEVICE UNSPECIFIED")
-
-
-typedef struct isdn_x25iface_proto_data {
-	int magic;
-	enum wan_states state;
-	/* Private stuff, not to be accessed via proto_data. We provide the
-	   other storage for the concap_proto instance here as well,
-	   enabling us to allocate both with just one kmalloc(): */
-	struct concap_proto priv;
-} ix25_pdata_t;
-
-
-
-/* is now in header file (extern): struct concap_proto * isdn_x25iface_proto_new(void); */
-static void isdn_x25iface_proto_del(struct concap_proto *);
-static int isdn_x25iface_proto_close(struct concap_proto *);
-static int isdn_x25iface_proto_restart(struct concap_proto *,
-				       struct net_device *,
-				       struct concap_device_ops *);
-static int isdn_x25iface_xmit(struct concap_proto *, struct sk_buff *);
-static int isdn_x25iface_receive(struct concap_proto *, struct sk_buff *);
-static int isdn_x25iface_connect_ind(struct concap_proto *);
-static int isdn_x25iface_disconn_ind(struct concap_proto *);
-
-
-static struct concap_proto_ops ix25_pops = {
-	.proto_new = &isdn_x25iface_proto_new,
-	.proto_del = &isdn_x25iface_proto_del,
-	.restart = &isdn_x25iface_proto_restart,
-	.close = &isdn_x25iface_proto_close,
-	.encap_and_xmit = &isdn_x25iface_xmit,
-	.data_ind = &isdn_x25iface_receive,
-	.connect_ind = &isdn_x25iface_connect_ind,
-	.disconn_ind = &isdn_x25iface_disconn_ind
-};
-
-/* error message helper function */
-static void illegal_state_warn(unsigned state, unsigned char firstbyte)
-{
-	printk(KERN_WARNING "isdn_x25iface: firstbyte %x illegal in"
-	       "current state %d\n", firstbyte, state);
-}
-
-/* check protocol data field for consistency */
-static int pdata_is_bad(ix25_pdata_t *pda) {
-
-	if (pda  &&  pda->magic == ISDN_X25IFACE_MAGIC) return 0;
-	printk(KERN_WARNING
-	       "isdn_x25iface_xxx: illegal pointer to proto data\n");
-	return 1;
-}
-
-/* create a new x25 interface protocol instance
- */
-struct concap_proto *isdn_x25iface_proto_new(void)
-{
-	ix25_pdata_t *tmp = kmalloc(sizeof(ix25_pdata_t), GFP_KERNEL);
-	IX25DEBUG("isdn_x25iface_proto_new\n");
-	if (tmp) {
-		tmp->magic = ISDN_X25IFACE_MAGIC;
-		tmp->state = WAN_UNCONFIGURED;
-		/* private data space used to hold the concap_proto data.
-		   Only to be accessed via the returned pointer */
-		spin_lock_init(&tmp->priv.lock);
-		tmp->priv.dops       = NULL;
-		tmp->priv.net_dev    = NULL;
-		tmp->priv.pops       = &ix25_pops;
-		tmp->priv.flags      = 0;
-		tmp->priv.proto_data = tmp;
-		return (&(tmp->priv));
-	}
-	return NULL;
-};
-
-/* close the x25iface encapsulation protocol
- */
-static int isdn_x25iface_proto_close(struct concap_proto *cprot) {
-
-	ix25_pdata_t *tmp;
-	int ret = 0;
-	ulong flags;
-
-	if (!cprot) {
-		printk(KERN_ERR "isdn_x25iface_proto_close: "
-		       "invalid concap_proto pointer\n");
-		return -1;
-	}
-	IX25DEBUG("isdn_x25iface_proto_close %s \n", MY_DEVNAME(cprot->net_dev));
-	spin_lock_irqsave(&cprot->lock, flags);
-	cprot->dops    = NULL;
-	cprot->net_dev = NULL;
-	tmp = cprot->proto_data;
-	if (pdata_is_bad(tmp)) {
-		ret = -1;
-	} else {
-		tmp->state = WAN_UNCONFIGURED;
-	}
-	spin_unlock_irqrestore(&cprot->lock, flags);
-	return ret;
-}
-
-/* Delete the x25iface encapsulation protocol instance
- */
-static void isdn_x25iface_proto_del(struct concap_proto *cprot) {
-
-	ix25_pdata_t *tmp;
-
-	IX25DEBUG("isdn_x25iface_proto_del \n");
-	if (!cprot) {
-		printk(KERN_ERR "isdn_x25iface_proto_del: "
-		       "concap_proto pointer is NULL\n");
-		return;
-	}
-	tmp = cprot->proto_data;
-	if (tmp == NULL) {
-		printk(KERN_ERR "isdn_x25iface_proto_del: inconsistent "
-		       "proto_data pointer (maybe already deleted?)\n");
-		return;
-	}
-	/* close if the protocol is still open */
-	if (cprot->dops) isdn_x25iface_proto_close(cprot);
-	/* freeing the storage should be sufficient now. But some additional
-	   settings might help to catch wild pointer bugs */
-	tmp->magic = 0;
-	cprot->proto_data = NULL;
-
-	kfree(tmp);
-	return;
-}
-
-/* (re-)initialize the data structures for x25iface encapsulation
- */
-static int isdn_x25iface_proto_restart(struct concap_proto *cprot,
-				       struct net_device *ndev,
-				       struct concap_device_ops *dops)
-{
-	ix25_pdata_t *pda = cprot->proto_data;
-	ulong flags;
-
-	IX25DEBUG("isdn_x25iface_proto_restart %s \n", MY_DEVNAME(ndev));
-
-	if (pdata_is_bad(pda)) return -1;
-
-	if (!(dops && dops->data_req && dops->connect_req
-	      && dops->disconn_req)) {
-		printk(KERN_WARNING "isdn_x25iface_restart: required dops"
-		       " missing\n");
-		isdn_x25iface_proto_close(cprot);
-		return -1;
-	}
-	spin_lock_irqsave(&cprot->lock, flags);
-	cprot->net_dev = ndev;
-	cprot->pops = &ix25_pops;
-	cprot->dops = dops;
-	pda->state = WAN_DISCONNECTED;
-	spin_unlock_irqrestore(&cprot->lock, flags);
-	return 0;
-}
-
-/* deliver a dl_data frame received from i4l HL driver to the network layer
- */
-static int isdn_x25iface_receive(struct concap_proto *cprot, struct sk_buff *skb)
-{
-	IX25DEBUG("isdn_x25iface_receive %s \n", MY_DEVNAME(cprot->net_dev));
-	if (((ix25_pdata_t *)(cprot->proto_data))
-	    ->state == WAN_CONNECTED) {
-		if (skb_push(skb, 1)) {
-			skb->data[0] = X25_IFACE_DATA;
-			skb->protocol = x25_type_trans(skb, cprot->net_dev);
-			netif_rx(skb);
-			return 0;
-		}
-	}
-	printk(KERN_WARNING "isdn_x25iface_receive %s: not connected, skb dropped\n", MY_DEVNAME(cprot->net_dev));
-	dev_kfree_skb(skb);
-	return -1;
-}
-
-/* a connection set up is indicated by lower layer
- */
-static int isdn_x25iface_connect_ind(struct concap_proto *cprot)
-{
-	struct sk_buff *skb;
-	enum wan_states *state_p
-		= &(((ix25_pdata_t *)(cprot->proto_data))->state);
-	IX25DEBUG("isdn_x25iface_connect_ind %s \n"
-		  , MY_DEVNAME(cprot->net_dev));
-	if (*state_p == WAN_UNCONFIGURED) {
-		printk(KERN_WARNING
-		       "isdn_x25iface_connect_ind while unconfigured %s\n"
-		       , MY_DEVNAME(cprot->net_dev));
-		return -1;
-	}
-	*state_p = WAN_CONNECTED;
-
-	skb = dev_alloc_skb(1);
-	if (skb) {
-		skb_put_u8(skb, X25_IFACE_CONNECT);
-		skb->protocol = x25_type_trans(skb, cprot->net_dev);
-		netif_rx(skb);
-		return 0;
-	} else {
-		printk(KERN_WARNING "isdn_x25iface_connect_ind: "
-		       " out of memory -- disconnecting\n");
-		cprot->dops->disconn_req(cprot);
-		return -1;
-	}
-}
-
-/* a disconnect is indicated by lower layer
- */
-static int isdn_x25iface_disconn_ind(struct concap_proto *cprot)
-{
-	struct sk_buff *skb;
-	enum wan_states *state_p
-		= &(((ix25_pdata_t *)(cprot->proto_data))->state);
-	IX25DEBUG("isdn_x25iface_disconn_ind %s \n", MY_DEVNAME(cprot->net_dev));
-	if (*state_p == WAN_UNCONFIGURED) {
-		printk(KERN_WARNING
-		       "isdn_x25iface_disconn_ind while unconfigured\n");
-		return -1;
-	}
-	if (!cprot->net_dev) return -1;
-	*state_p = WAN_DISCONNECTED;
-	skb = dev_alloc_skb(1);
-	if (skb) {
-		skb_put_u8(skb, X25_IFACE_DISCONNECT);
-		skb->protocol = x25_type_trans(skb, cprot->net_dev);
-		netif_rx(skb);
-		return 0;
-	} else {
-		printk(KERN_WARNING "isdn_x25iface_disconn_ind:"
-		       " out of memory\n");
-		return -1;
-	}
-}
-
-/* process a frame handed over to us from linux network layer. First byte
-   semantics as defined in Documentation/networking/x25-iface.txt
-*/
-static int isdn_x25iface_xmit(struct concap_proto *cprot, struct sk_buff *skb)
-{
-	unsigned char firstbyte = skb->data[0];
-	enum wan_states *state = &((ix25_pdata_t *)cprot->proto_data)->state;
-	int ret = 0;
-	IX25DEBUG("isdn_x25iface_xmit: %s first=%x state=%d\n",
-		  MY_DEVNAME(cprot->net_dev), firstbyte, *state);
-	switch (firstbyte) {
-	case X25_IFACE_DATA:
-		if (*state == WAN_CONNECTED) {
-			skb_pull(skb, 1);
-			netif_trans_update(cprot->net_dev);
-			ret = (cprot->dops->data_req(cprot, skb));
-			/* prepare for future retransmissions */
-			if (ret) skb_push(skb, 1);
-			return ret;
-		}
-		illegal_state_warn(*state, firstbyte);
-		break;
-	case X25_IFACE_CONNECT:
-		if (*state == WAN_DISCONNECTED) {
-			*state = WAN_CONNECTING;
-			ret = cprot->dops->connect_req(cprot);
-			if (ret) {
-				/* reset state and notify upper layer about
-				 * immidiatly failed attempts */
-				isdn_x25iface_disconn_ind(cprot);
-			}
-		} else {
-			illegal_state_warn(*state, firstbyte);
-		}
-		break;
-	case X25_IFACE_DISCONNECT:
-		switch (*state) {
-		case WAN_DISCONNECTED:
-			/* Should not happen. However, give upper layer a
-			   chance to recover from inconstistency  but don't
-			   trust the lower layer sending the disconn_confirm
-			   when already disconnected */
-			printk(KERN_WARNING "isdn_x25iface_xmit: disconnect "
-			       " requested while disconnected\n");
-			isdn_x25iface_disconn_ind(cprot);
-			break; /* prevent infinite loops */
-		case WAN_CONNECTING:
-		case WAN_CONNECTED:
-			*state = WAN_DISCONNECTED;
-			cprot->dops->disconn_req(cprot);
-			break;
-		default:
-			illegal_state_warn(*state, firstbyte);
-		}
-		break;
-	case X25_IFACE_PARAMS:
-		printk(KERN_WARNING "isdn_x25iface_xmit: setting of lapb"
-		       " options not yet supported\n");
-		break;
-	default:
-		printk(KERN_WARNING "isdn_x25iface_xmit: frame with illegal"
-		       " first byte %x ignored:\n", firstbyte);
-	}
-	dev_kfree_skb(skb);
-	return 0;
-}
diff --git a/drivers/isdn/i4l/isdn_x25iface.h b/drivers/isdn/i4l/isdn_x25iface.h
deleted file mode 100644
index ca08e082cf7c..000000000000
--- a/drivers/isdn/i4l/isdn_x25iface.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* $Id: isdn_x25iface.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $
- *
- * header for Linux ISDN subsystem, x.25 related functions
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef _LINUX_ISDN_X25IFACE_H
-#define _LINUX_ISDN_X25IFACE_H
-
-#define ISDN_X25IFACE_MAGIC 0x1e75a2b9
-/* #define DEBUG_ISDN_X25 if you want isdn_x25 debugging messages */
-#ifdef DEBUG_ISDN_X25
-#   define IX25DEBUG(fmt, args...) printk(KERN_DEBUG fmt, ##args)
-#else
-#   define IX25DEBUG(fmt, args...)
-#endif
-
-#include <linux/skbuff.h>
-#include <linux/isdn.h>
-#include <linux/concap.h>
-
-extern struct concap_proto_ops *isdn_x25iface_concap_proto_ops_pt;
-extern struct concap_proto *isdn_x25iface_proto_new(void);
-
-
-
-#endif
diff --git a/drivers/isdn/isdnloop/Makefile b/drivers/isdn/isdnloop/Makefile
deleted file mode 100644
index 5ff4c0e09768..000000000000
--- a/drivers/isdn/isdnloop/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-# Makefile for the isdnloop ISDN device driver
-
-# Each configuration option enables a list of files.
-
-obj-$(CONFIG_ISDN_DRV_LOOP)	+= isdnloop.o
diff --git a/drivers/isdn/isdnloop/isdnloop.c b/drivers/isdn/isdnloop/isdnloop.c
deleted file mode 100644
index 755c6bbc9553..000000000000
--- a/drivers/isdn/isdnloop/isdnloop.c
+++ /dev/null
@@ -1,1528 +0,0 @@
-/* $Id: isdnloop.c,v 1.11.6.7 2001/11/11 19:54:31 kai Exp $
- *
- * ISDN low-level module implementing a dummy loop driver.
- *
- * Copyright 1997 by Fritz Elfert (fritz@isdn4linux.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#include <linux/module.h>
-#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include "isdnloop.h"
-
-static char *isdnloop_id = "loop0";
-
-MODULE_DESCRIPTION("ISDN4Linux: Pseudo Driver that simulates an ISDN card");
-MODULE_AUTHOR("Fritz Elfert");
-MODULE_LICENSE("GPL");
-module_param(isdnloop_id, charp, 0);
-MODULE_PARM_DESC(isdnloop_id, "ID-String of first card");
-
-static int isdnloop_addcard(char *);
-
-/*
- * Free queue completely.
- *
- * Parameter:
- *   card    = pointer to card struct
- *   channel = channel number
- */
-static void
-isdnloop_free_queue(isdnloop_card *card, int channel)
-{
-	struct sk_buff_head *queue = &card->bqueue[channel];
-
-	skb_queue_purge(queue);
-	card->sndcount[channel] = 0;
-}
-
-/*
- * Send B-Channel data to another virtual card.
- * This routine is called via timer-callback from isdnloop_pollbchan().
- *
- * Parameter:
- *   card = pointer to card struct.
- *   ch   = channel number (0-based)
- */
-static void
-isdnloop_bchan_send(isdnloop_card *card, int ch)
-{
-	isdnloop_card *rcard = card->rcard[ch];
-	int rch = card->rch[ch], len, ack;
-	struct sk_buff *skb;
-	isdn_ctrl cmd;
-
-	while (card->sndcount[ch]) {
-		skb = skb_dequeue(&card->bqueue[ch]);
-		if (skb) {
-			len = skb->len;
-			card->sndcount[ch] -= len;
-			ack = *(skb->head); /* used as scratch area */
-			cmd.driver = card->myid;
-			cmd.arg = ch;
-			if (rcard) {
-				rcard->interface.rcvcallb_skb(rcard->myid, rch, skb);
-			} else {
-				printk(KERN_WARNING "isdnloop: no rcard, skb dropped\n");
-				dev_kfree_skb(skb);
-
-			}
-			cmd.command = ISDN_STAT_BSENT;
-			cmd.parm.length = len;
-			card->interface.statcallb(&cmd);
-		} else
-			card->sndcount[ch] = 0;
-	}
-}
-
-/*
- * Send/Receive Data to/from the B-Channel.
- * This routine is called via timer-callback.
- * It schedules itself while any B-Channel is open.
- *
- * Parameter:
- *   data = pointer to card struct, set by kernel timer.data
- */
-static void
-isdnloop_pollbchan(struct timer_list *t)
-{
-	isdnloop_card *card = from_timer(card, t, rb_timer);
-	unsigned long flags;
-
-	if (card->flags & ISDNLOOP_FLAGS_B1ACTIVE)
-		isdnloop_bchan_send(card, 0);
-	if (card->flags & ISDNLOOP_FLAGS_B2ACTIVE)
-		isdnloop_bchan_send(card, 1);
-	if (card->flags & (ISDNLOOP_FLAGS_B1ACTIVE | ISDNLOOP_FLAGS_B2ACTIVE)) {
-		/* schedule b-channel polling again */
-		spin_lock_irqsave(&card->isdnloop_lock, flags);
-		card->rb_timer.expires = jiffies + ISDNLOOP_TIMER_BCREAD;
-		add_timer(&card->rb_timer);
-		card->flags |= ISDNLOOP_FLAGS_RBTIMER;
-		spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-	} else
-		card->flags &= ~ISDNLOOP_FLAGS_RBTIMER;
-}
-
-/*
- * Parse ICN-type setup string and fill fields of setup-struct
- * with parsed data.
- *
- * Parameter:
- *   setup = setup string, format: [caller-id],si1,si2,[called-id]
- *   cmd   = pointer to struct to be filled.
- */
-static void
-isdnloop_parse_setup(char *setup, isdn_ctrl *cmd)
-{
-	char *t = setup;
-	char *s = strchr(t, ',');
-
-	*s++ = '\0';
-	strlcpy(cmd->parm.setup.phone, t, sizeof(cmd->parm.setup.phone));
-	s = strchr(t = s, ',');
-	*s++ = '\0';
-	if (!strlen(t))
-		cmd->parm.setup.si1 = 0;
-	else
-		cmd->parm.setup.si1 = simple_strtoul(t, NULL, 10);
-	s = strchr(t = s, ',');
-	*s++ = '\0';
-	if (!strlen(t))
-		cmd->parm.setup.si2 = 0;
-	else
-		cmd->parm.setup.si2 =
-			simple_strtoul(t, NULL, 10);
-	strlcpy(cmd->parm.setup.eazmsn, s, sizeof(cmd->parm.setup.eazmsn));
-	cmd->parm.setup.plan = 0;
-	cmd->parm.setup.screen = 0;
-}
-
-typedef struct isdnloop_stat {
-	char *statstr;
-	int command;
-	int action;
-} isdnloop_stat;
-/* *INDENT-OFF* */
-static isdnloop_stat isdnloop_stat_table[] = {
-	{"BCON_",          ISDN_STAT_BCONN, 1}, /* B-Channel connected        */
-	{"BDIS_",          ISDN_STAT_BHUP,  2}, /* B-Channel disconnected     */
-	{"DCON_",          ISDN_STAT_DCONN, 0}, /* D-Channel connected        */
-	{"DDIS_",          ISDN_STAT_DHUP,  0}, /* D-Channel disconnected     */
-	{"DCAL_I",         ISDN_STAT_ICALL, 3}, /* Incoming call dialup-line  */
-	{"DSCA_I",         ISDN_STAT_ICALL, 3}, /* Incoming call 1TR6-SPV     */
-	{"FCALL",          ISDN_STAT_ICALL, 4}, /* Leased line connection up  */
-	{"CIF",            ISDN_STAT_CINF,  5}, /* Charge-info, 1TR6-type     */
-	{"AOC",            ISDN_STAT_CINF,  6}, /* Charge-info, DSS1-type     */
-	{"CAU",            ISDN_STAT_CAUSE, 7}, /* Cause code                 */
-	{"TEI OK",         ISDN_STAT_RUN,   0}, /* Card connected to wallplug */
-	{"E_L1: ACT FAIL", ISDN_STAT_BHUP,  8}, /* Layer-1 activation failed  */
-	{"E_L2: DATA LIN", ISDN_STAT_BHUP,  8}, /* Layer-2 data link lost     */
-	{"E_L1: ACTIVATION FAILED",
-	 ISDN_STAT_BHUP,  8},         /* Layer-1 activation failed  */
-	{NULL, 0, -1}
-};
-/* *INDENT-ON* */
-
-
-/*
- * Parse Status message-strings from virtual card.
- * Depending on status, call statcallb for sending messages to upper
- * levels. Also set/reset B-Channel active-flags.
- *
- * Parameter:
- *   status  = status string to parse.
- *   channel = channel where message comes from.
- *   card    = card where message comes from.
- */
-static void
-isdnloop_parse_status(u_char *status, int channel, isdnloop_card *card)
-{
-	isdnloop_stat *s = isdnloop_stat_table;
-	int action = -1;
-	isdn_ctrl cmd;
-
-	while (s->statstr) {
-		if (!strncmp(status, s->statstr, strlen(s->statstr))) {
-			cmd.command = s->command;
-			action = s->action;
-			break;
-		}
-		s++;
-	}
-	if (action == -1)
-		return;
-	cmd.driver = card->myid;
-	cmd.arg = channel;
-	switch (action) {
-	case 1:
-		/* BCON_x */
-		card->flags |= (channel) ?
-			ISDNLOOP_FLAGS_B2ACTIVE : ISDNLOOP_FLAGS_B1ACTIVE;
-		break;
-	case 2:
-		/* BDIS_x */
-		card->flags &= ~((channel) ?
-				 ISDNLOOP_FLAGS_B2ACTIVE : ISDNLOOP_FLAGS_B1ACTIVE);
-		isdnloop_free_queue(card, channel);
-		break;
-	case 3:
-		/* DCAL_I and DSCA_I */
-		isdnloop_parse_setup(status + 6, &cmd);
-		break;
-	case 4:
-		/* FCALL */
-		sprintf(cmd.parm.setup.phone, "LEASED%d", card->myid);
-		sprintf(cmd.parm.setup.eazmsn, "%d", channel + 1);
-		cmd.parm.setup.si1 = 7;
-		cmd.parm.setup.si2 = 0;
-		cmd.parm.setup.plan = 0;
-		cmd.parm.setup.screen = 0;
-		break;
-	case 5:
-		/* CIF */
-		strlcpy(cmd.parm.num, status + 3, sizeof(cmd.parm.num));
-		break;
-	case 6:
-		/* AOC */
-		snprintf(cmd.parm.num, sizeof(cmd.parm.num), "%d",
-			 (int) simple_strtoul(status + 7, NULL, 16));
-		break;
-	case 7:
-		/* CAU */
-		status += 3;
-		if (strlen(status) == 4)
-			snprintf(cmd.parm.num, sizeof(cmd.parm.num), "%s%c%c",
-				 status + 2, *status, *(status + 1));
-		else
-			strlcpy(cmd.parm.num, status + 1, sizeof(cmd.parm.num));
-		break;
-	case 8:
-		/* Misc Errors on L1 and L2 */
-		card->flags &= ~ISDNLOOP_FLAGS_B1ACTIVE;
-		isdnloop_free_queue(card, 0);
-		cmd.arg = 0;
-		cmd.driver = card->myid;
-		card->interface.statcallb(&cmd);
-		cmd.command = ISDN_STAT_DHUP;
-		cmd.arg = 0;
-		cmd.driver = card->myid;
-		card->interface.statcallb(&cmd);
-		cmd.command = ISDN_STAT_BHUP;
-		card->flags &= ~ISDNLOOP_FLAGS_B2ACTIVE;
-		isdnloop_free_queue(card, 1);
-		cmd.arg = 1;
-		cmd.driver = card->myid;
-		card->interface.statcallb(&cmd);
-		cmd.command = ISDN_STAT_DHUP;
-		cmd.arg = 1;
-		cmd.driver = card->myid;
-		break;
-	}
-	card->interface.statcallb(&cmd);
-}
-
-/*
- * Store a cwcharacter into ringbuffer for reading from /dev/isdnctrl
- *
- * Parameter:
- *   card = pointer to card struct.
- *   c    = char to store.
- */
-static void
-isdnloop_putmsg(isdnloop_card *card, unsigned char c)
-{
-	ulong flags;
-
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	*card->msg_buf_write++ = (c == 0xff) ? '\n' : c;
-	if (card->msg_buf_write == card->msg_buf_read) {
-		if (++card->msg_buf_read > card->msg_buf_end)
-			card->msg_buf_read = card->msg_buf;
-	}
-	if (card->msg_buf_write > card->msg_buf_end)
-		card->msg_buf_write = card->msg_buf;
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-}
-
-/*
- * Poll a virtual cards message queue.
- * If there are new status-replies from the card, copy them to
- * ringbuffer for reading on /dev/isdnctrl and call
- * isdnloop_parse_status() for processing them. Watch for special
- * Firmware bootmessage and parse it, to get the D-Channel protocol.
- * If there are B-Channels open, initiate a timer-callback to
- * isdnloop_pollbchan().
- * This routine is called periodically via timer interrupt.
- *
- * Parameter:
- *   data = pointer to card struct
- */
-static void
-isdnloop_polldchan(struct timer_list *t)
-{
-	isdnloop_card *card = from_timer(card, t, st_timer);
-	struct sk_buff *skb;
-	int avail;
-	int left;
-	u_char c;
-	int ch;
-	unsigned long flags;
-	u_char *p;
-	isdn_ctrl cmd;
-
-	skb = skb_dequeue(&card->dqueue);
-	if (skb)
-		avail = skb->len;
-	else
-		avail = 0;
-	for (left = avail; left > 0; left--) {
-		c = *skb->data;
-		skb_pull(skb, 1);
-		isdnloop_putmsg(card, c);
-		card->imsg[card->iptr] = c;
-		if (card->iptr < 59)
-			card->iptr++;
-		if (!skb->len) {
-			avail++;
-			isdnloop_putmsg(card, '\n');
-			card->imsg[card->iptr] = 0;
-			card->iptr = 0;
-			if (card->imsg[0] == '0' && card->imsg[1] >= '0' &&
-			    card->imsg[1] <= '2' && card->imsg[2] == ';') {
-				ch = (card->imsg[1] - '0') - 1;
-				p = &card->imsg[3];
-				isdnloop_parse_status(p, ch, card);
-			} else {
-				p = card->imsg;
-				if (!strncmp(p, "DRV1.", 5)) {
-					printk(KERN_INFO "isdnloop: (%s) %s\n", CID, p);
-					if (!strncmp(p + 7, "TC", 2)) {
-						card->ptype = ISDN_PTYPE_1TR6;
-						card->interface.features |= ISDN_FEATURE_P_1TR6;
-						printk(KERN_INFO
-						       "isdnloop: (%s) 1TR6-Protocol loaded and running\n", CID);
-					}
-					if (!strncmp(p + 7, "EC", 2)) {
-						card->ptype = ISDN_PTYPE_EURO;
-						card->interface.features |= ISDN_FEATURE_P_EURO;
-						printk(KERN_INFO
-						       "isdnloop: (%s) Euro-Protocol loaded and running\n", CID);
-					}
-					continue;
-
-				}
-			}
-		}
-	}
-	if (avail) {
-		cmd.command = ISDN_STAT_STAVAIL;
-		cmd.driver = card->myid;
-		cmd.arg = avail;
-		card->interface.statcallb(&cmd);
-	}
-	if (card->flags & (ISDNLOOP_FLAGS_B1ACTIVE | ISDNLOOP_FLAGS_B2ACTIVE))
-		if (!(card->flags & ISDNLOOP_FLAGS_RBTIMER)) {
-			/* schedule b-channel polling */
-			card->flags |= ISDNLOOP_FLAGS_RBTIMER;
-			spin_lock_irqsave(&card->isdnloop_lock, flags);
-			del_timer(&card->rb_timer);
-			card->rb_timer.expires = jiffies + ISDNLOOP_TIMER_BCREAD;
-			add_timer(&card->rb_timer);
-			spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-		}
-	/* schedule again */
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	card->st_timer.expires = jiffies + ISDNLOOP_TIMER_DCREAD;
-	add_timer(&card->st_timer);
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-}
-
-/*
- * Append a packet to the transmit buffer-queue.
- *
- * Parameter:
- *   channel = Number of B-channel
- *   skb     = packet to send.
- *   card    = pointer to card-struct
- * Return:
- *   Number of bytes transferred, -E??? on error
- */
-static int
-isdnloop_sendbuf(int channel, struct sk_buff *skb, isdnloop_card *card)
-{
-	int len = skb->len;
-	unsigned long flags;
-	struct sk_buff *nskb;
-
-	if (len > 4000) {
-		printk(KERN_WARNING
-		       "isdnloop: Send packet too large\n");
-		return -EINVAL;
-	}
-	if (len) {
-		if (!(card->flags & (channel ? ISDNLOOP_FLAGS_B2ACTIVE : ISDNLOOP_FLAGS_B1ACTIVE)))
-			return 0;
-		if (card->sndcount[channel] > ISDNLOOP_MAX_SQUEUE)
-			return 0;
-		spin_lock_irqsave(&card->isdnloop_lock, flags);
-		nskb = dev_alloc_skb(skb->len);
-		if (nskb) {
-			skb_copy_from_linear_data(skb,
-						  skb_put(nskb, len), len);
-			skb_queue_tail(&card->bqueue[channel], nskb);
-			dev_kfree_skb(skb);
-		} else
-			len = 0;
-		card->sndcount[channel] += len;
-		spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-	}
-	return len;
-}
-
-/*
- * Read the messages from the card's ringbuffer
- *
- * Parameter:
- *   buf  = pointer to buffer.
- *   len  = number of bytes to read.
- *   user = flag, 1: called from userlevel 0: called from kernel.
- *   card = pointer to card struct.
- * Return:
- *   number of bytes actually transferred.
- */
-static int
-isdnloop_readstatus(u_char __user *buf, int len, isdnloop_card *card)
-{
-	int count;
-	u_char __user *p;
-
-	for (p = buf, count = 0; count < len; p++, count++) {
-		if (card->msg_buf_read == card->msg_buf_write)
-			return count;
-		if (put_user(*card->msg_buf_read++, p))
-			return -EFAULT;
-		if (card->msg_buf_read > card->msg_buf_end)
-			card->msg_buf_read = card->msg_buf;
-	}
-	return count;
-}
-
-/*
- * Simulate a card's response by appending it to the cards
- * message queue.
- *
- * Parameter:
- *   card = pointer to card struct.
- *   s    = pointer to message-string.
- *   ch   = channel: 0 = generic messages, 1 and 2 = D-channel messages.
- * Return:
- *   0 on success, 1 on memory squeeze.
- */
-static int
-isdnloop_fake(isdnloop_card *card, char *s, int ch)
-{
-	struct sk_buff *skb;
-	int len = strlen(s) + ((ch >= 0) ? 3 : 0);
-	skb = dev_alloc_skb(len);
-	if (!skb) {
-		printk(KERN_WARNING "isdnloop: Out of memory in isdnloop_fake\n");
-		return 1;
-	}
-	if (ch >= 0)
-		sprintf(skb_put(skb, 3), "%02d;", ch);
-	skb_put_data(skb, s, strlen(s));
-	skb_queue_tail(&card->dqueue, skb);
-	return 0;
-}
-/* *INDENT-OFF* */
-static isdnloop_stat isdnloop_cmd_table[] = {
-	{"BCON_R",         0,  1},	/* B-Channel connect        */
-	{"BCON_I",         0, 17},	/* B-Channel connect ind    */
-	{"BDIS_R",         0,  2},	/* B-Channel disconnect     */
-	{"DDIS_R",         0,  3},	/* D-Channel disconnect     */
-	{"DCON_R",         0, 16},	/* D-Channel connect        */
-	{"DSCA_R",         0,  4},	/* Dial 1TR6-SPV     */
-	{"DCAL_R",         0,  5},	/* Dial */
-	{"EAZC",           0,  6},	/* Clear EAZ listener */
-	{"EAZ",            0,  7},	/* Set EAZ listener */
-	{"SEEAZ",          0,  8},	/* Get EAZ listener */
-	{"MSN",            0,  9},	/* Set/Clear MSN listener */
-	{"MSALL",          0, 10},	/* Set multi MSN listeners */
-	{"SETSIL",         0, 11},	/* Set SI list     */
-	{"SEESIL",         0, 12},	/* Get SI list     */
-	{"SILC",           0, 13},	/* Clear SI list     */
-	{"LOCK",           0, -1},	/* LOCK channel     */
-	{"UNLOCK",         0, -1},	/* UNLOCK channel     */
-	{"FV2ON",          1, 14},	/* Leased mode on               */
-	{"FV2OFF",         1, 15},	/* Leased mode off              */
-	{NULL, 0, -1}
-};
-/* *INDENT-ON* */
-
-
-/*
- * Simulate an error-response from a card.
- *
- * Parameter:
- *   card = pointer to card struct.
- */
-static void
-isdnloop_fake_err(isdnloop_card *card)
-{
-	char buf[64];
-
-	snprintf(buf, sizeof(buf), "E%s", card->omsg);
-	isdnloop_fake(card, buf, -1);
-	isdnloop_fake(card, "NAK", -1);
-}
-
-static u_char ctable_eu[] = {0x00, 0x11, 0x01, 0x12};
-static u_char ctable_1t[] = {0x00, 0x3b, 0x01, 0x3a};
-
-/*
- * Assemble a simplified cause message depending on the
- * D-channel protocol used.
- *
- * Parameter:
- *   card = pointer to card struct.
- *   loc  = location: 0 = local, 1 = remote.
- *   cau  = cause: 1 = busy, 2 = nonexistent callerid, 3 = no user responding.
- * Return:
- *   Pointer to buffer containing the assembled message.
- */
-static char *
-isdnloop_unicause(isdnloop_card *card, int loc, int cau)
-{
-	static char buf[6];
-
-	switch (card->ptype) {
-	case ISDN_PTYPE_EURO:
-		sprintf(buf, "E%02X%02X", (loc) ? 4 : 2, ctable_eu[cau]);
-		break;
-	case ISDN_PTYPE_1TR6:
-		sprintf(buf, "%02X44", ctable_1t[cau]);
-		break;
-	default:
-		return "0000";
-	}
-	return buf;
-}
-
-/*
- * Release a virtual connection. Called from timer interrupt, when
- * called party did not respond.
- *
- * Parameter:
- *   card = pointer to card struct.
- *   ch   = channel (0-based)
- */
-static void
-isdnloop_atimeout(isdnloop_card *card, int ch)
-{
-	unsigned long flags;
-	char buf[60];
-
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	if (card->rcard[ch]) {
-		isdnloop_fake(card->rcard[ch], "DDIS_I", card->rch[ch] + 1);
-		card->rcard[ch]->rcard[card->rch[ch]] = NULL;
-		card->rcard[ch] = NULL;
-	}
-	isdnloop_fake(card, "DDIS_I", ch + 1);
-	/* No user responding */
-	sprintf(buf, "CAU%s", isdnloop_unicause(card, 1, 3));
-	isdnloop_fake(card, buf, ch + 1);
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-}
-
-/*
- * Wrapper for isdnloop_atimeout().
- */
-static void
-isdnloop_atimeout0(struct timer_list *t)
-{
-	isdnloop_card *card = from_timer(card, t, c_timer[0]);
-
-	isdnloop_atimeout(card, 0);
-}
-
-/*
- * Wrapper for isdnloop_atimeout().
- */
-static void
-isdnloop_atimeout1(struct timer_list *t)
-{
-	isdnloop_card *card = from_timer(card, t, c_timer[1]);
-
-	isdnloop_atimeout(card, 1);
-}
-
-/*
- * Install a watchdog for a user, not responding.
- *
- * Parameter:
- *   card = pointer to card struct.
- *   ch   = channel to watch for.
- */
-static void
-isdnloop_start_ctimer(isdnloop_card *card, int ch)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	timer_setup(&card->c_timer[ch], ch ? isdnloop_atimeout1
-					   : isdnloop_atimeout0, 0);
-	card->c_timer[ch].expires = jiffies + ISDNLOOP_TIMER_ALERTWAIT;
-	add_timer(&card->c_timer[ch]);
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-}
-
-/*
- * Kill a pending channel watchdog.
- *
- * Parameter:
- *   card = pointer to card struct.
- *   ch   = channel (0-based).
- */
-static void
-isdnloop_kill_ctimer(isdnloop_card *card, int ch)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	del_timer(&card->c_timer[ch]);
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-}
-
-static u_char si2bit[] = {0, 1, 0, 0, 0, 2, 0, 4, 0, 0};
-static u_char bit2si[] = {1, 5, 7};
-
-/*
- * Try finding a listener for an outgoing call.
- *
- * Parameter:
- *   card = pointer to calling card.
- *   p    = pointer to ICN-type setup-string.
- *   lch  = channel of calling card.
- *   cmd  = pointer to struct to be filled when parsing setup.
- * Return:
- *   0 = found match, alerting should happen.
- *   1 = found matching number but it is busy.
- *   2 = no matching listener.
- *   3 = found matching number but SI does not match.
- */
-static int
-isdnloop_try_call(isdnloop_card *card, char *p, int lch, isdn_ctrl *cmd)
-{
-	isdnloop_card *cc = cards;
-	unsigned long flags;
-	int ch;
-	int num_match;
-	int i;
-	char *e;
-	char nbuf[32];
-
-	isdnloop_parse_setup(p, cmd);
-	while (cc) {
-		for (ch = 0; ch < 2; ch++) {
-			/* Exclude ourself */
-			if ((cc == card) && (ch == lch))
-				continue;
-			num_match = 0;
-			switch (cc->ptype) {
-			case ISDN_PTYPE_EURO:
-				for (i = 0; i < 3; i++)
-					if (!(strcmp(cc->s0num[i], cmd->parm.setup.phone)))
-						num_match = 1;
-				break;
-			case ISDN_PTYPE_1TR6:
-				e = cc->eazlist[ch];
-				while (*e) {
-					sprintf(nbuf, "%s%c", cc->s0num[0], *e);
-					if (!(strcmp(nbuf, cmd->parm.setup.phone)))
-						num_match = 1;
-					e++;
-				}
-			}
-			if (num_match) {
-				spin_lock_irqsave(&card->isdnloop_lock, flags);
-				/* channel idle? */
-				if (!(cc->rcard[ch])) {
-					/* Check SI */
-					if (!(si2bit[cmd->parm.setup.si1] & cc->sil[ch])) {
-						spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-						return 3;
-					}
-					/* ch is idle, si and number matches */
-					cc->rcard[ch] = card;
-					cc->rch[ch] = lch;
-					card->rcard[lch] = cc;
-					card->rch[lch] = ch;
-					spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-					return 0;
-				} else {
-					spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-					/* num matches, but busy */
-					if (ch == 1)
-						return 1;
-				}
-			}
-		}
-		cc = cc->next;
-	}
-	return 2;
-}
-
-/*
- * Depending on D-channel protocol and caller/called, modify
- * phone number.
- *
- * Parameter:
- *   card   = pointer to card struct.
- *   phone  = pointer phone number.
- *   caller = flag: 1 = caller, 0 = called.
- * Return:
- *   pointer to new phone number.
- */
-static char *
-isdnloop_vstphone(isdnloop_card *card, char *phone, int caller)
-{
-	int i;
-	static char nphone[30];
-
-	if (!card) {
-		printk("BUG!!!\n");
-		return "";
-	}
-	switch (card->ptype) {
-	case ISDN_PTYPE_EURO:
-		if (caller) {
-			for (i = 0; i < 2; i++)
-				if (!(strcmp(card->s0num[i], phone)))
-					return phone;
-			return card->s0num[0];
-		}
-		return phone;
-		break;
-	case ISDN_PTYPE_1TR6:
-		if (caller) {
-			sprintf(nphone, "%s%c", card->s0num[0], phone[0]);
-			return nphone;
-		} else
-			return &phone[strlen(phone) - 1];
-		break;
-	}
-	return "";
-}
-
-/*
- * Parse an ICN-type command string sent to the 'card'.
- * Perform misc. actions depending on the command.
- *
- * Parameter:
- *   card = pointer to card struct.
- */
-static void
-isdnloop_parse_cmd(isdnloop_card *card)
-{
-	char *p = card->omsg;
-	isdn_ctrl cmd;
-	char buf[60];
-	isdnloop_stat *s = isdnloop_cmd_table;
-	int action = -1;
-	int i;
-	int ch;
-
-	if ((card->omsg[0] != '0') && (card->omsg[2] != ';')) {
-		isdnloop_fake_err(card);
-		return;
-	}
-	ch = card->omsg[1] - '0';
-	if ((ch < 0) || (ch > 2)) {
-		isdnloop_fake_err(card);
-		return;
-	}
-	p += 3;
-	while (s->statstr) {
-		if (!strncmp(p, s->statstr, strlen(s->statstr))) {
-			action = s->action;
-			if (s->command && (ch != 0)) {
-				isdnloop_fake_err(card);
-				return;
-			}
-			break;
-		}
-		s++;
-	}
-	if (action == -1)
-		return;
-	switch (action) {
-	case 1:
-		/* 0x;BCON_R */
-		if (card->rcard[ch - 1]) {
-			isdnloop_fake(card->rcard[ch - 1], "BCON_I",
-				      card->rch[ch - 1] + 1);
-			isdnloop_fake(card, "BCON_C", ch);
-		}
-		break;
-	case 17:
-		/* 0x;BCON_I */
-		if (card->rcard[ch - 1]) {
-			isdnloop_fake(card->rcard[ch - 1], "BCON_C",
-				      card->rch[ch - 1] + 1);
-		}
-		break;
-	case 2:
-		/* 0x;BDIS_R */
-		isdnloop_fake(card, "BDIS_C", ch);
-		if (card->rcard[ch - 1]) {
-			isdnloop_fake(card->rcard[ch - 1], "BDIS_I",
-				      card->rch[ch - 1] + 1);
-		}
-		break;
-	case 16:
-		/* 0x;DCON_R */
-		isdnloop_kill_ctimer(card, ch - 1);
-		if (card->rcard[ch - 1]) {
-			isdnloop_kill_ctimer(card->rcard[ch - 1], card->rch[ch - 1]);
-			isdnloop_fake(card->rcard[ch - 1], "DCON_C",
-				      card->rch[ch - 1] + 1);
-			isdnloop_fake(card, "DCON_C", ch);
-		}
-		break;
-	case 3:
-		/* 0x;DDIS_R */
-		isdnloop_kill_ctimer(card, ch - 1);
-		if (card->rcard[ch - 1]) {
-			isdnloop_kill_ctimer(card->rcard[ch - 1], card->rch[ch - 1]);
-			isdnloop_fake(card->rcard[ch - 1], "DDIS_I",
-				      card->rch[ch - 1] + 1);
-			card->rcard[ch - 1] = NULL;
-		}
-		isdnloop_fake(card, "DDIS_C", ch);
-		break;
-	case 4:
-		/* 0x;DSCA_Rdd,yy,zz,oo */
-		if (card->ptype != ISDN_PTYPE_1TR6) {
-			isdnloop_fake_err(card);
-			return;
-		}
-		/* Fall through */
-	case 5:
-		/* 0x;DCAL_Rdd,yy,zz,oo */
-		p += 6;
-		switch (isdnloop_try_call(card, p, ch - 1, &cmd)) {
-		case 0:
-			/* Alerting */
-			sprintf(buf, "D%s_I%s,%02d,%02d,%s",
-				(action == 4) ? "SCA" : "CAL",
-				isdnloop_vstphone(card, cmd.parm.setup.eazmsn, 1),
-				cmd.parm.setup.si1,
-				cmd.parm.setup.si2,
-				isdnloop_vstphone(card->rcard[ch - 1],
-						  cmd.parm.setup.phone, 0));
-			isdnloop_fake(card->rcard[ch - 1], buf, card->rch[ch - 1] + 1);
-			/* Fall through */
-		case 3:
-			/* si1 does not match, don't alert but start timer */
-			isdnloop_start_ctimer(card, ch - 1);
-			break;
-		case 1:
-			/* Remote busy */
-			isdnloop_fake(card, "DDIS_I", ch);
-			sprintf(buf, "CAU%s", isdnloop_unicause(card, 1, 1));
-			isdnloop_fake(card, buf, ch);
-			break;
-		case 2:
-			/* No such user */
-			isdnloop_fake(card, "DDIS_I", ch);
-			sprintf(buf, "CAU%s", isdnloop_unicause(card, 1, 2));
-			isdnloop_fake(card, buf, ch);
-			break;
-		}
-		break;
-	case 6:
-		/* 0x;EAZC */
-		card->eazlist[ch - 1][0] = '\0';
-		break;
-	case 7:
-		/* 0x;EAZ */
-		p += 3;
-		if (strlen(p) >= sizeof(card->eazlist[0]))
-			break;
-		strcpy(card->eazlist[ch - 1], p);
-		break;
-	case 8:
-		/* 0x;SEEAZ */
-		sprintf(buf, "EAZ-LIST: %s", card->eazlist[ch - 1]);
-		isdnloop_fake(card, buf, ch + 1);
-		break;
-	case 9:
-		/* 0x;MSN */
-		break;
-	case 10:
-		/* 0x;MSNALL */
-		break;
-	case 11:
-		/* 0x;SETSIL */
-		p += 6;
-		i = 0;
-		while (strchr("0157", *p)) {
-			if (i)
-				card->sil[ch - 1] |= si2bit[*p - '0'];
-			i = (*p++ == '0');
-		}
-		if (*p)
-			isdnloop_fake_err(card);
-		break;
-	case 12:
-		/* 0x;SEESIL */
-		sprintf(buf, "SIN-LIST: ");
-		p = buf + 10;
-		for (i = 0; i < 3; i++)
-			if (card->sil[ch - 1] & (1 << i))
-				p += sprintf(p, "%02d", bit2si[i]);
-		isdnloop_fake(card, buf, ch + 1);
-		break;
-	case 13:
-		/* 0x;SILC */
-		card->sil[ch - 1] = 0;
-		break;
-	case 14:
-		/* 00;FV2ON */
-		break;
-	case 15:
-		/* 00;FV2OFF */
-		break;
-	}
-}
-
-/*
- * Put command-strings into the of the 'card'. In reality, execute them
- * right in place by calling isdnloop_parse_cmd(). Also copy every
- * command to the read message ringbuffer, preceding it with a '>'.
- * These mesagges can be read at /dev/isdnctrl.
- *
- * Parameter:
- *   buf  = pointer to command buffer.
- *   len  = length of buffer data.
- *   user = flag: 1 = called form userlevel, 0 called from kernel.
- *   card = pointer to card struct.
- * Return:
- *   number of bytes transferred (currently always equals len).
- */
-static int
-isdnloop_writecmd(const u_char *buf, int len, int user, isdnloop_card *card)
-{
-	int xcount = 0;
-	int ocount = 1;
-	isdn_ctrl cmd;
-
-	while (len) {
-		int count = len;
-		u_char *p;
-		u_char msg[0x100];
-
-		if (count > 255)
-			count = 255;
-		if (user) {
-			if (copy_from_user(msg, buf, count))
-				return -EFAULT;
-		} else
-			memcpy(msg, buf, count);
-		isdnloop_putmsg(card, '>');
-		for (p = msg; count > 0; count--, p++) {
-			len--;
-			xcount++;
-			isdnloop_putmsg(card, *p);
-			card->omsg[card->optr] = *p;
-			if (*p == '\n') {
-				card->omsg[card->optr] = '\0';
-				card->optr = 0;
-				isdnloop_parse_cmd(card);
-				if (len) {
-					isdnloop_putmsg(card, '>');
-					ocount++;
-				}
-			} else {
-				if (card->optr < 59)
-					card->optr++;
-			}
-			ocount++;
-		}
-	}
-	cmd.command = ISDN_STAT_STAVAIL;
-	cmd.driver = card->myid;
-	cmd.arg = ocount;
-	card->interface.statcallb(&cmd);
-	return xcount;
-}
-
-/*
- * Delete card's pending timers, send STOP to linklevel
- */
-static void
-isdnloop_stopcard(isdnloop_card *card)
-{
-	unsigned long flags;
-	isdn_ctrl cmd;
-
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	if (card->flags & ISDNLOOP_FLAGS_RUNNING) {
-		card->flags &= ~ISDNLOOP_FLAGS_RUNNING;
-		del_timer(&card->st_timer);
-		del_timer(&card->rb_timer);
-		del_timer(&card->c_timer[0]);
-		del_timer(&card->c_timer[1]);
-		cmd.command = ISDN_STAT_STOP;
-		cmd.driver = card->myid;
-		card->interface.statcallb(&cmd);
-	}
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-}
-
-/*
- * Stop all cards before unload.
- */
-static void
-isdnloop_stopallcards(void)
-{
-	isdnloop_card *p = cards;
-
-	while (p) {
-		isdnloop_stopcard(p);
-		p = p->next;
-	}
-}
-
-/*
- * Start a 'card'. Simulate card's boot message and set the phone
- * number(s) of the virtual 'S0-Interface'. Install D-channel
- * poll timer.
- *
- * Parameter:
- *   card  = pointer to card struct.
- *   sdefp = pointer to struct holding ioctl parameters.
- * Return:
- *   0 on success, -E??? otherwise.
- */
-static int
-isdnloop_start(isdnloop_card *card, isdnloop_sdef *sdefp)
-{
-	unsigned long flags;
-	isdnloop_sdef sdef;
-	int i;
-
-	if (card->flags & ISDNLOOP_FLAGS_RUNNING)
-		return -EBUSY;
-	if (copy_from_user((char *) &sdef, (char *) sdefp, sizeof(sdef)))
-		return -EFAULT;
-
-	for (i = 0; i < 3; i++) {
-		if (!memchr(sdef.num[i], 0, sizeof(sdef.num[i])))
-			return -EINVAL;
-	}
-
-	spin_lock_irqsave(&card->isdnloop_lock, flags);
-	switch (sdef.ptype) {
-	case ISDN_PTYPE_EURO:
-		if (isdnloop_fake(card, "DRV1.23EC-Q.931-CAPI-CNS-BASIS-20.02.96",
-				  -1)) {
-			spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-			return -ENOMEM;
-		}
-		card->sil[0] = card->sil[1] = 4;
-		if (isdnloop_fake(card, "TEI OK", 0)) {
-			spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-			return -ENOMEM;
-		}
-		for (i = 0; i < 3; i++) {
-			strlcpy(card->s0num[i], sdef.num[i],
-				sizeof(card->s0num[0]));
-		}
-		break;
-	case ISDN_PTYPE_1TR6:
-		if (isdnloop_fake(card, "DRV1.04TC-1TR6-CAPI-CNS-BASIS-29.11.95",
-				  -1)) {
-			spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-			return -ENOMEM;
-		}
-		card->sil[0] = card->sil[1] = 4;
-		if (isdnloop_fake(card, "TEI OK", 0)) {
-			spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-			return -ENOMEM;
-		}
-		strlcpy(card->s0num[0], sdef.num[0], sizeof(card->s0num[0]));
-		card->s0num[1][0] = '\0';
-		card->s0num[2][0] = '\0';
-		break;
-	default:
-		spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-		printk(KERN_WARNING "isdnloop: Illegal D-channel protocol %d\n",
-		       sdef.ptype);
-		return -EINVAL;
-	}
-	timer_setup(&card->rb_timer, isdnloop_pollbchan, 0);
-	timer_setup(&card->st_timer, isdnloop_polldchan, 0);
-	card->st_timer.expires = jiffies + ISDNLOOP_TIMER_DCREAD;
-	add_timer(&card->st_timer);
-	card->flags |= ISDNLOOP_FLAGS_RUNNING;
-	spin_unlock_irqrestore(&card->isdnloop_lock, flags);
-	return 0;
-}
-
-/*
- * Main handler for commands sent by linklevel.
- */
-static int
-isdnloop_command(isdn_ctrl *c, isdnloop_card *card)
-{
-	ulong a;
-	int i;
-	char cbuf[80];
-	isdn_ctrl cmd;
-	isdnloop_cdef cdef;
-
-	switch (c->command) {
-	case ISDN_CMD_IOCTL:
-		memcpy(&a, c->parm.num, sizeof(ulong));
-		switch (c->arg) {
-		case ISDNLOOP_IOCTL_DEBUGVAR:
-			return (ulong) card;
-		case ISDNLOOP_IOCTL_STARTUP:
-			return isdnloop_start(card, (isdnloop_sdef *) a);
-			break;
-		case ISDNLOOP_IOCTL_ADDCARD:
-			if (copy_from_user((char *)&cdef,
-					   (char *)a,
-					   sizeof(cdef)))
-				return -EFAULT;
-			return isdnloop_addcard(cdef.id1);
-			break;
-		case ISDNLOOP_IOCTL_LEASEDCFG:
-			if (a) {
-				if (!card->leased) {
-					card->leased = 1;
-					while (card->ptype == ISDN_PTYPE_UNKNOWN)
-						schedule_timeout_interruptible(10);
-					schedule_timeout_interruptible(10);
-					sprintf(cbuf, "00;FV2ON\n01;EAZ1\n02;EAZ2\n");
-					i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-					printk(KERN_INFO
-					       "isdnloop: (%s) Leased-line mode enabled\n",
-					       CID);
-					cmd.command = ISDN_STAT_RUN;
-					cmd.driver = card->myid;
-					cmd.arg = 0;
-					card->interface.statcallb(&cmd);
-				}
-			} else {
-				if (card->leased) {
-					card->leased = 0;
-					sprintf(cbuf, "00;FV2OFF\n");
-					i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-					printk(KERN_INFO
-					       "isdnloop: (%s) Leased-line mode disabled\n",
-					       CID);
-					cmd.command = ISDN_STAT_RUN;
-					cmd.driver = card->myid;
-					cmd.arg = 0;
-					card->interface.statcallb(&cmd);
-				}
-			}
-			return 0;
-		default:
-			return -EINVAL;
-		}
-		break;
-	case ISDN_CMD_DIAL:
-		if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-			return -ENODEV;
-		if (card->leased)
-			break;
-		if ((c->arg & 255) < ISDNLOOP_BCH) {
-			char *p;
-			char dcode[4];
-
-			a = c->arg;
-			p = c->parm.setup.phone;
-			if (*p == 's' || *p == 'S') {
-				/* Dial for SPV */
-				p++;
-				strcpy(dcode, "SCA");
-			} else
-				/* Normal Dial */
-				strcpy(dcode, "CAL");
-			snprintf(cbuf, sizeof(cbuf),
-				 "%02d;D%s_R%s,%02d,%02d,%s\n", (int) (a + 1),
-				 dcode, p, c->parm.setup.si1,
-				 c->parm.setup.si2, c->parm.setup.eazmsn);
-			i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-		}
-		break;
-	case ISDN_CMD_ACCEPTD:
-		if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-			return -ENODEV;
-		if (c->arg < ISDNLOOP_BCH) {
-			a = c->arg + 1;
-			cbuf[0] = 0;
-			switch (card->l2_proto[a - 1]) {
-			case ISDN_PROTO_L2_X75I:
-				sprintf(cbuf, "%02d;BX75\n", (int) a);
-				break;
-#ifdef CONFIG_ISDN_X25
-			case ISDN_PROTO_L2_X25DTE:
-				sprintf(cbuf, "%02d;BX2T\n", (int) a);
-				break;
-			case ISDN_PROTO_L2_X25DCE:
-				sprintf(cbuf, "%02d;BX2C\n", (int) a);
-				break;
-#endif
-			case ISDN_PROTO_L2_HDLC:
-				sprintf(cbuf, "%02d;BTRA\n", (int) a);
-				break;
-			}
-			if (strlen(cbuf))
-				i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-			sprintf(cbuf, "%02d;DCON_R\n", (int) a);
-			i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-		}
-		break;
-	case ISDN_CMD_ACCEPTB:
-		if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-			return -ENODEV;
-		if (c->arg < ISDNLOOP_BCH) {
-			a = c->arg + 1;
-			switch (card->l2_proto[a - 1]) {
-			case ISDN_PROTO_L2_X75I:
-				sprintf(cbuf, "%02d;BCON_R,BX75\n", (int) a);
-				break;
-#ifdef CONFIG_ISDN_X25
-			case ISDN_PROTO_L2_X25DTE:
-				sprintf(cbuf, "%02d;BCON_R,BX2T\n", (int) a);
-				break;
-			case ISDN_PROTO_L2_X25DCE:
-				sprintf(cbuf, "%02d;BCON_R,BX2C\n", (int) a);
-				break;
-#endif
-			case ISDN_PROTO_L2_HDLC:
-				sprintf(cbuf, "%02d;BCON_R,BTRA\n", (int) a);
-				break;
-			default:
-				sprintf(cbuf, "%02d;BCON_R\n", (int) a);
-			}
-			printk(KERN_DEBUG "isdnloop writecmd '%s'\n", cbuf);
-			i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-			break;
-		case ISDN_CMD_HANGUP:
-			if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-				return -ENODEV;
-			if (c->arg < ISDNLOOP_BCH) {
-				a = c->arg + 1;
-				sprintf(cbuf, "%02d;BDIS_R\n%02d;DDIS_R\n", (int) a, (int) a);
-				i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-			}
-			break;
-		case ISDN_CMD_SETEAZ:
-			if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-				return -ENODEV;
-			if (card->leased)
-				break;
-			if (c->arg < ISDNLOOP_BCH) {
-				a = c->arg + 1;
-				if (card->ptype == ISDN_PTYPE_EURO) {
-					sprintf(cbuf, "%02d;MS%s%s\n", (int) a,
-						c->parm.num[0] ? "N" : "ALL", c->parm.num);
-				} else
-					sprintf(cbuf, "%02d;EAZ%s\n", (int) a,
-						c->parm.num[0] ? c->parm.num : (u_char *) "0123456789");
-				i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-			}
-			break;
-		case ISDN_CMD_CLREAZ:
-			if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-				return -ENODEV;
-			if (card->leased)
-				break;
-			if (c->arg < ISDNLOOP_BCH) {
-				a = c->arg + 1;
-				if (card->ptype == ISDN_PTYPE_EURO)
-					sprintf(cbuf, "%02d;MSNC\n", (int) a);
-				else
-					sprintf(cbuf, "%02d;EAZC\n", (int) a);
-				i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-			}
-			break;
-		case ISDN_CMD_SETL2:
-			if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-				return -ENODEV;
-			if ((c->arg & 255) < ISDNLOOP_BCH) {
-				a = c->arg;
-				switch (a >> 8) {
-				case ISDN_PROTO_L2_X75I:
-					sprintf(cbuf, "%02d;BX75\n", (int) (a & 255) + 1);
-					break;
-#ifdef CONFIG_ISDN_X25
-				case ISDN_PROTO_L2_X25DTE:
-					sprintf(cbuf, "%02d;BX2T\n", (int) (a & 255) + 1);
-					break;
-				case ISDN_PROTO_L2_X25DCE:
-					sprintf(cbuf, "%02d;BX2C\n", (int) (a & 255) + 1);
-					break;
-#endif
-				case ISDN_PROTO_L2_HDLC:
-					sprintf(cbuf, "%02d;BTRA\n", (int) (a & 255) + 1);
-					break;
-				case ISDN_PROTO_L2_TRANS:
-					sprintf(cbuf, "%02d;BTRA\n", (int) (a & 255) + 1);
-					break;
-				default:
-					return -EINVAL;
-				}
-				i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card);
-				card->l2_proto[a & 255] = (a >> 8);
-			}
-			break;
-		case ISDN_CMD_SETL3:
-			if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-				return -ENODEV;
-			return 0;
-		default:
-			return -EINVAL;
-		}
-	}
-	return 0;
-}
-
-/*
- * Find card with given driverId
- */
-static inline isdnloop_card *
-isdnloop_findcard(int driverid)
-{
-	isdnloop_card *p = cards;
-
-	while (p) {
-		if (p->myid == driverid)
-			return p;
-		p = p->next;
-	}
-	return (isdnloop_card *) 0;
-}
-
-/*
- * Wrapper functions for interface to linklevel
- */
-static int
-if_command(isdn_ctrl *c)
-{
-	isdnloop_card *card = isdnloop_findcard(c->driver);
-
-	if (card)
-		return isdnloop_command(c, card);
-	printk(KERN_ERR
-	       "isdnloop: if_command called with invalid driverId!\n");
-	return -ENODEV;
-}
-
-static int
-if_writecmd(const u_char __user *buf, int len, int id, int channel)
-{
-	isdnloop_card *card = isdnloop_findcard(id);
-
-	if (card) {
-		if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-			return -ENODEV;
-		return isdnloop_writecmd(buf, len, 1, card);
-	}
-	printk(KERN_ERR
-	       "isdnloop: if_writecmd called with invalid driverId!\n");
-	return -ENODEV;
-}
-
-static int
-if_readstatus(u_char __user *buf, int len, int id, int channel)
-{
-	isdnloop_card *card = isdnloop_findcard(id);
-
-	if (card) {
-		if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-			return -ENODEV;
-		return isdnloop_readstatus(buf, len, card);
-	}
-	printk(KERN_ERR
-	       "isdnloop: if_readstatus called with invalid driverId!\n");
-	return -ENODEV;
-}
-
-static int
-if_sendbuf(int id, int channel, int ack, struct sk_buff *skb)
-{
-	isdnloop_card *card = isdnloop_findcard(id);
-
-	if (card) {
-		if (!(card->flags & ISDNLOOP_FLAGS_RUNNING))
-			return -ENODEV;
-		/* ack request stored in skb scratch area */
-		*(skb->head) = ack;
-		return isdnloop_sendbuf(channel, skb, card);
-	}
-	printk(KERN_ERR
-	       "isdnloop: if_sendbuf called with invalid driverId!\n");
-	return -ENODEV;
-}
-
-/*
- * Allocate a new card-struct, initialize it
- * link it into cards-list and register it at linklevel.
- */
-static isdnloop_card *
-isdnloop_initcard(char *id)
-{
-	isdnloop_card *card;
-	int i;
-	card = kzalloc(sizeof(isdnloop_card), GFP_KERNEL);
-	if (!card) {
-		printk(KERN_WARNING
-		       "isdnloop: (%s) Could not allocate card-struct.\n", id);
-		return (isdnloop_card *) 0;
-	}
-	card->interface.owner = THIS_MODULE;
-	card->interface.channels = ISDNLOOP_BCH;
-	card->interface.hl_hdrlen  = 1; /* scratch area for storing ack flag*/
-	card->interface.maxbufsize = 4000;
-	card->interface.command = if_command;
-	card->interface.writebuf_skb = if_sendbuf;
-	card->interface.writecmd = if_writecmd;
-	card->interface.readstat = if_readstatus;
-	card->interface.features = ISDN_FEATURE_L2_X75I |
-#ifdef CONFIG_ISDN_X25
-		ISDN_FEATURE_L2_X25DTE |
-		ISDN_FEATURE_L2_X25DCE |
-#endif
-		ISDN_FEATURE_L2_HDLC |
-		ISDN_FEATURE_L3_TRANS |
-		ISDN_FEATURE_P_UNKNOWN;
-	card->ptype = ISDN_PTYPE_UNKNOWN;
-	strlcpy(card->interface.id, id, sizeof(card->interface.id));
-	card->msg_buf_write = card->msg_buf;
-	card->msg_buf_read = card->msg_buf;
-	card->msg_buf_end = &card->msg_buf[sizeof(card->msg_buf) - 1];
-	for (i = 0; i < ISDNLOOP_BCH; i++) {
-		card->l2_proto[i] = ISDN_PROTO_L2_X75I;
-		skb_queue_head_init(&card->bqueue[i]);
-	}
-	skb_queue_head_init(&card->dqueue);
-	spin_lock_init(&card->isdnloop_lock);
-	card->next = cards;
-	cards = card;
-	if (!register_isdn(&card->interface)) {
-		cards = cards->next;
-		printk(KERN_WARNING
-		       "isdnloop: Unable to register %s\n", id);
-		kfree(card);
-		return (isdnloop_card *) 0;
-	}
-	card->myid = card->interface.channels;
-	return card;
-}
-
-static int
-isdnloop_addcard(char *id1)
-{
-	isdnloop_card *card;
-	card = isdnloop_initcard(id1);
-	if (!card) {
-		return -EIO;
-	}
-	printk(KERN_INFO
-	       "isdnloop: (%s) virtual card added\n",
-	       card->interface.id);
-	return 0;
-}
-
-static int __init
-isdnloop_init(void)
-{
-	if (isdnloop_id)
-		return isdnloop_addcard(isdnloop_id);
-
-	return 0;
-}
-
-static void __exit
-isdnloop_exit(void)
-{
-	isdn_ctrl cmd;
-	isdnloop_card *card = cards;
-	isdnloop_card *last;
-	int i;
-
-	isdnloop_stopallcards();
-	while (card) {
-		cmd.command = ISDN_STAT_UNLOAD;
-		cmd.driver = card->myid;
-		card->interface.statcallb(&cmd);
-		for (i = 0; i < ISDNLOOP_BCH; i++)
-			isdnloop_free_queue(card, i);
-		card = card->next;
-	}
-	card = cards;
-	while (card) {
-		last = card;
-		skb_queue_purge(&card->dqueue);
-		card = card->next;
-		kfree(last);
-	}
-	printk(KERN_NOTICE "isdnloop-ISDN-driver unloaded\n");
-}
-
-module_init(isdnloop_init);
-module_exit(isdnloop_exit);
diff --git a/drivers/isdn/isdnloop/isdnloop.h b/drivers/isdn/isdnloop/isdnloop.h
deleted file mode 100644
index e9e035552bb4..000000000000
--- a/drivers/isdn/isdnloop/isdnloop.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* $Id: isdnloop.h,v 1.5.6.3 2001/09/23 22:24:56 kai Exp $
- *
- * Loopback lowlevel module for testing of linklevel.
- *
- * Copyright 1997 by Fritz Elfert (fritz@isdn4linux.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef isdnloop_h
-#define isdnloop_h
-
-#define ISDNLOOP_IOCTL_DEBUGVAR  0
-#define ISDNLOOP_IOCTL_ADDCARD   1
-#define ISDNLOOP_IOCTL_LEASEDCFG 2
-#define ISDNLOOP_IOCTL_STARTUP   3
-
-/* Struct for adding new cards */
-typedef struct isdnloop_cdef {
-	char id1[10];
-} isdnloop_cdef;
-
-/* Struct for configuring cards */
-typedef struct isdnloop_sdef {
-	int ptype;
-	char num[3][20];
-} isdnloop_sdef;
-
-#if defined(__KERNEL__) || defined(__DEBUGVAR__)
-
-#ifdef __KERNEL__
-/* Kernel includes */
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/major.h>
-#include <asm/io.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/ioport.h>
-#include <linux/timer.h>
-#include <linux/wait.h>
-#include <linux/isdnif.h>
-
-#endif                          /* __KERNEL__ */
-
-#define ISDNLOOP_FLAGS_B1ACTIVE 1	/* B-Channel-1 is open           */
-#define ISDNLOOP_FLAGS_B2ACTIVE 2	/* B-Channel-2 is open           */
-#define ISDNLOOP_FLAGS_RUNNING  4	/* Cards driver activated        */
-#define ISDNLOOP_FLAGS_RBTIMER  8	/* scheduling of B-Channel-poll  */
-#define ISDNLOOP_TIMER_BCREAD 1 /* B-Channel poll-cycle          */
-#define ISDNLOOP_TIMER_DCREAD (HZ/2)	/* D-Channel poll-cycle          */
-#define ISDNLOOP_TIMER_ALERTWAIT (10 * HZ)	/* Alert timeout                 */
-#define ISDNLOOP_MAX_SQUEUE 65536	/* Max. outstanding send-data    */
-#define ISDNLOOP_BCH 2          /* channels per card             */
-
-/*
- * Per card driver data
- */
-typedef struct isdnloop_card {
-	struct isdnloop_card *next;	/* Pointer to next device struct    */
-	struct isdnloop_card
-	*rcard[ISDNLOOP_BCH];   /* Pointer to 'remote' card         */
-	int rch[ISDNLOOP_BCH];  /* 'remote' channel                 */
-	int myid;               /* Driver-Nr. assigned by linklevel */
-	int leased;             /* Flag: This Adapter is connected  */
-	/*       to a leased line           */
-	int sil[ISDNLOOP_BCH];  /* SI's to listen for               */
-	char eazlist[ISDNLOOP_BCH][11];
-	/* EAZ's to listen for              */
-	char s0num[3][20];      /* 1TR6 base-number or MSN's        */
-	unsigned short flags;   /* Statusflags                      */
-	int ptype;              /* Protocol type (1TR6 or Euro)     */
-	struct timer_list st_timer;	/* Timer for Status-Polls           */
-	struct timer_list rb_timer;	/* Timer for B-Channel-Polls        */
-	struct timer_list
-	c_timer[ISDNLOOP_BCH]; /* Timer for Alerting               */
-	int l2_proto[ISDNLOOP_BCH];	/* Current layer-2-protocol         */
-	isdn_if interface;      /* Interface to upper layer         */
-	int iptr;               /* Index to imsg-buffer             */
-	char imsg[60];          /* Internal buf for status-parsing  */
-	int optr;               /* Index to omsg-buffer             */
-	char omsg[60];          /* Internal buf for cmd-parsing     */
-	char msg_buf[2048];     /* Buffer for status-messages       */
-	char *msg_buf_write;    /* Writepointer for statusbuffer    */
-	char *msg_buf_read;     /* Readpointer for statusbuffer     */
-	char *msg_buf_end;      /* Pointer to end of statusbuffer   */
-	int sndcount[ISDNLOOP_BCH];	/* Byte-counters for B-Ch.-send     */
-	struct sk_buff_head
-	bqueue[ISDNLOOP_BCH];  /* B-Channel queues                 */
-	struct sk_buff_head dqueue;	/* D-Channel queue                  */
-	spinlock_t isdnloop_lock;
-} isdnloop_card;
-
-/*
- * Main driver data
- */
-#ifdef __KERNEL__
-static isdnloop_card *cards = (isdnloop_card *) 0;
-#endif                          /* __KERNEL__ */
-
-/* Utility-Macros */
-
-#define CID (card->interface.id)
-
-#endif                          /* defined(__KERNEL__) || defined(__DEBUGVAR__) */
-#endif                          /* isdnloop_h */
diff --git a/include/linux/concap.h b/include/linux/concap.h
deleted file mode 100644
index 977acb3d1fb2..000000000000
--- a/include/linux/concap.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* $Id: concap.h,v 1.3.2.2 2004/01/12 23:08:35 keil Exp $
- *
- * Copyright 1997 by Henner Eisen <eis@baty.hanse.de>
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- */
-
-#ifndef _LINUX_CONCAP_H
-#define _LINUX_CONCAP_H
-
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-
-/* Stuff to support encapsulation protocols genericly. The encapsulation
-   protocol is processed at the uppermost layer of the network interface.
-
-   Based on a ideas developed in a 'synchronous device' thread in the
-   linux-x25 mailing list contributed by Alan Cox, Thomasz Motylewski
-   and Jonathan Naylor.
-
-   For more documetation on this refer to Documentation/isdn/README.concap
-*/
-
-struct concap_proto_ops;
-struct concap_device_ops;
-
-/* this manages all data needed by the encapsulation protocol
- */
-struct concap_proto{
-	struct net_device *net_dev;	/* net device using our service  */
-	struct concap_device_ops *dops;	/* callbacks provided by device */
- 	struct concap_proto_ops  *pops;	/* callbacks provided by us */
- 	spinlock_t lock;
-	int flags;
-	void *proto_data;		/* protocol specific private data, to
-					   be accessed via *pops methods only*/
-	/*
-	  :
-	  whatever 
-	  :
-	  */
-};
-
-/* Operations to be supported by the net device. Called by the encapsulation
- * protocol entity. No receive method is offered because the encapsulation
- * protocol directly calls netif_rx().
- */
-struct concap_device_ops{
-
-	/* to request data is submitted by device*/ 
-	int (*data_req)(struct concap_proto *, struct sk_buff *);
-
-	/* Control methods must be set to NULL by devices which do not
-	   support connection control.*/
-	/* to request a connection is set up */ 
-	int (*connect_req)(struct concap_proto *);
-
-	/* to request a connection is released */
-	int (*disconn_req)(struct concap_proto *);	
-};
-
-/* Operations to be supported by the encapsulation protocol. Called by
- * device driver.
- */
-struct concap_proto_ops{
-
-	/* create a new encapsulation protocol instance of same type */
-	struct concap_proto *  (*proto_new) (void);
-
-	/* delete encapsulation protocol instance and free all its resources.
-	   cprot may no loger be referenced after calling this */
-	void (*proto_del)(struct concap_proto *cprot);
-
-	/* initialize the protocol's data. To be called at interface startup
-	   or when the device driver resets the interface. All services of the
-	   encapsulation protocol may be used after this*/
-	int (*restart)(struct concap_proto *cprot, 
-		       struct net_device *ndev,
-		       struct concap_device_ops *dops);
-
-	/* inactivate an encapsulation protocol instance. The encapsulation
-	   protocol may not call any *dops methods after this. */
-	int (*close)(struct concap_proto *cprot);
-
-	/* process a frame handed down to us by upper layer */
-	int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb);
-
-	/* to be called for each data entity received from lower layer*/ 
-	int (*data_ind)(struct concap_proto *cprot, struct sk_buff *skb);
-
-	/* to be called when a connection was set up/down.
-	   Protocols that don't process these primitives might fill in
-	   dummy methods here */
-	int (*connect_ind)(struct concap_proto *cprot);
-	int (*disconn_ind)(struct concap_proto *cprot);
-  /*
-    Some network device support functions, like net_header(), rebuild_header(),
-    and others, that depend solely on the encapsulation protocol, might
-    be provided here, too. The net device would just fill them in its
-    corresponding fields when it is opened.
-    */
-};
-
-/* dummy restart/close/connect/reset/disconn methods
- */
-extern int concap_nop(struct concap_proto *cprot); 
-
-/* dummy submit method
- */
-extern int concap_drop_skb(struct concap_proto *cprot, struct sk_buff *skb);
-#endif
diff --git a/include/linux/isdn.h b/include/linux/isdn.h
deleted file mode 100644
index df97c8444f5d..000000000000
--- a/include/linux/isdn.h
+++ /dev/null
@@ -1,473 +0,0 @@
-/* $Id: isdn.h,v 1.125.2.3 2004/02/10 01:07:14 keil Exp $
- *
- * Main header for the Linux ISDN subsystem (linklevel).
- *
- * Copyright 1994,95,96 by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- * 
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-#ifndef __ISDN_H__
-#define __ISDN_H__
-
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/major.h>
-#include <asm/io.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/slab.h>
-#include <linux/timer.h>
-#include <linux/wait.h>
-#include <linux/tty.h>
-#include <linux/tty_flip.h>
-#include <linux/serial_reg.h>
-#include <linux/fcntl.h>
-#include <linux/types.h>
-#include <linux/interrupt.h>
-#include <linux/ip.h>
-#include <linux/in.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/skbuff.h>
-#include <linux/tcp.h>
-#include <linux/mutex.h>
-#include <uapi/linux/isdn.h>
-
-#define ISDN_TTY_MAJOR    43
-#define ISDN_TTYAUX_MAJOR 44
-#define ISDN_MAJOR        45
-
-/* The minor-devicenumbers for Channel 0 and 1 are used as arguments for
- * physical Channel-Mapping, so they MUST NOT be changed without changing
- * the correspondent code in isdn.c
- */
-
-#define ISDN_MINOR_B        0
-#define ISDN_MINOR_BMAX     (ISDN_MAX_CHANNELS-1)
-#define ISDN_MINOR_CTRL     64
-#define ISDN_MINOR_CTRLMAX  (64 + (ISDN_MAX_CHANNELS-1))
-#define ISDN_MINOR_PPP      128
-#define ISDN_MINOR_PPPMAX   (128 + (ISDN_MAX_CHANNELS-1))
-#define ISDN_MINOR_STATUS   255
-
-#ifdef CONFIG_ISDN_PPP
-
-#ifdef CONFIG_ISDN_PPP_VJ
-#  include <net/slhc_vj.h>
-#endif
-
-#include <linux/ppp_defs.h>
-#include <linux/ppp-ioctl.h>
-
-#include <linux/isdn_ppp.h>
-#endif
-
-#ifdef CONFIG_ISDN_X25
-#  include <linux/concap.h>
-#endif
-
-#include <linux/isdnif.h>
-
-#define ISDN_DRVIOCTL_MASK       0x7f  /* Mask for Device-ioctl */
-
-/* Until now unused */
-#define ISDN_SERVICE_VOICE 1
-#define ISDN_SERVICE_AB    1<<1 
-#define ISDN_SERVICE_X21   1<<2
-#define ISDN_SERVICE_G4    1<<3
-#define ISDN_SERVICE_BTX   1<<4
-#define ISDN_SERVICE_DFUE  1<<5
-#define ISDN_SERVICE_X25   1<<6
-#define ISDN_SERVICE_TTX   1<<7
-#define ISDN_SERVICE_MIXED 1<<8
-#define ISDN_SERVICE_FW    1<<9
-#define ISDN_SERVICE_GTEL  1<<10
-#define ISDN_SERVICE_BTXN  1<<11
-#define ISDN_SERVICE_BTEL  1<<12
-
-/* Macros checking plain usage */
-#define USG_NONE(x)         ((x & ISDN_USAGE_MASK)==ISDN_USAGE_NONE)
-#define USG_RAW(x)          ((x & ISDN_USAGE_MASK)==ISDN_USAGE_RAW)
-#define USG_MODEM(x)        ((x & ISDN_USAGE_MASK)==ISDN_USAGE_MODEM)
-#define USG_VOICE(x)        ((x & ISDN_USAGE_MASK)==ISDN_USAGE_VOICE)
-#define USG_NET(x)          ((x & ISDN_USAGE_MASK)==ISDN_USAGE_NET)
-#define USG_FAX(x)          ((x & ISDN_USAGE_MASK)==ISDN_USAGE_FAX)
-#define USG_OUTGOING(x)     ((x & ISDN_USAGE_OUTGOING)==ISDN_USAGE_OUTGOING)
-#define USG_MODEMORVOICE(x) (((x & ISDN_USAGE_MASK)==ISDN_USAGE_MODEM) || \
-                             ((x & ISDN_USAGE_MASK)==ISDN_USAGE_VOICE)     )
-
-/* Timer-delays and scheduling-flags */
-#define ISDN_TIMER_RES         4                         /* Main Timer-Resolution   */
-#define ISDN_TIMER_02SEC       (HZ/ISDN_TIMER_RES/5)     /* Slow-Timer1 .2 sec      */
-#define ISDN_TIMER_1SEC        (HZ/ISDN_TIMER_RES)       /* Slow-Timer2 1 sec       */
-#define ISDN_TIMER_RINGING     5 /* tty RINGs = ISDN_TIMER_1SEC * this factor       */
-#define ISDN_TIMER_KEEPINT    10 /* Cisco-Keepalive = ISDN_TIMER_1SEC * this factor */
-#define ISDN_TIMER_MODEMREAD   1
-#define ISDN_TIMER_MODEMPLUS   2
-#define ISDN_TIMER_MODEMRING   4
-#define ISDN_TIMER_MODEMXMIT   8
-#define ISDN_TIMER_NETDIAL    16 
-#define ISDN_TIMER_NETHANGUP  32
-#define ISDN_TIMER_CARRIER   256 /* Wait for Carrier */
-#define ISDN_TIMER_FAST      (ISDN_TIMER_MODEMREAD | ISDN_TIMER_MODEMPLUS | \
-                              ISDN_TIMER_MODEMXMIT)
-#define ISDN_TIMER_SLOW      (ISDN_TIMER_MODEMRING | ISDN_TIMER_NETHANGUP | \
-                              ISDN_TIMER_NETDIAL | ISDN_TIMER_CARRIER)
-
-/* Timeout-Values for isdn_net_dial() */
-#define ISDN_TIMER_DTIMEOUT10 (10*HZ/(ISDN_TIMER_02SEC*(ISDN_TIMER_RES+1)))
-#define ISDN_TIMER_DTIMEOUT15 (15*HZ/(ISDN_TIMER_02SEC*(ISDN_TIMER_RES+1)))
-#define ISDN_TIMER_DTIMEOUT60 (60*HZ/(ISDN_TIMER_02SEC*(ISDN_TIMER_RES+1)))
-
-/* GLOBAL_FLAGS */
-#define ISDN_GLOBAL_STOPPED 1
-
-/*=================== Start of ip-over-ISDN stuff =========================*/
-
-/* Feature- and status-flags for a net-interface */
-#define ISDN_NET_CONNECTED  0x01       /* Bound to ISDN-Channel             */
-#define ISDN_NET_SECURE     0x02       /* Accept calls from phonelist only  */
-#define ISDN_NET_CALLBACK   0x04       /* activate callback                 */
-#define ISDN_NET_CBHUP      0x08       /* hangup before callback            */
-#define ISDN_NET_CBOUT      0x10       /* remote machine does callback      */
-
-#define ISDN_NET_MAGIC      0x49344C02 /* for paranoia-checking             */
-
-/* Phone-list-element */
-typedef struct {
-  void *next;
-  char num[ISDN_MSNLEN];
-} isdn_net_phone;
-
-/*
-   Principles when extending structures for generic encapsulation protocol
-   ("concap") support:
-   - Stuff which is hardware specific (here i4l-specific) goes in 
-     the netdev -> local structure (here: isdn_net_local)
-   - Stuff which is encapsulation protocol specific goes in the structure
-     which holds the linux device structure (here: isdn_net_device)
-*/
-
-/* Local interface-data */
-typedef struct isdn_net_local_s {
-  ulong                  magic;
-  struct net_device_stats stats;       /* Ethernet Statistics              */
-  int                    isdn_device;  /* Index to isdn-device             */
-  int                    isdn_channel; /* Index to isdn-channel            */
-  int			 ppp_slot;     /* PPPD device slot number          */
-  int                    pre_device;   /* Preselected isdn-device          */
-  int                    pre_channel;  /* Preselected isdn-channel         */
-  int                    exclusive;    /* If non-zero idx to reserved chan.*/
-  int                    flags;        /* Connection-flags                 */
-  int                    dialretry;    /* Counter for Dialout-retries      */
-  int                    dialmax;      /* Max. Number of Dial-retries      */
-  int                    cbdelay;      /* Delay before Callback starts     */
-  int                    dtimer;       /* Timeout-counter for dialing      */
-  char                   msn[ISDN_MSNLEN]; /* MSNs/EAZs for this interface */
-  u_char                 cbhup;        /* Flag: Reject Call before Callback*/
-  u_char                 dialstate;    /* State for dialing                */
-  u_char                 p_encap;      /* Packet encapsulation             */
-                                       /*   0 = Ethernet over ISDN         */
-				       /*   1 = RAW-IP                     */
-                                       /*   2 = IP with type field         */
-  u_char                 l2_proto;     /* Layer-2-protocol                 */
-				       /* See ISDN_PROTO_L2..-constants in */
-                                       /* isdnif.h                         */
-                                       /*   0 = X75/LAPB with I-Frames     */
-				       /*   1 = X75/LAPB with UI-Frames    */
-				       /*   2 = X75/LAPB with BUI-Frames   */
-				       /*   3 = HDLC                       */
-  u_char                 l3_proto;     /* Layer-3-protocol                 */
-				       /* See ISDN_PROTO_L3..-constants in */
-                                       /* isdnif.h                         */
-                                       /*   0 = Transparent                */
-  int                    huptimer;     /* Timeout-counter for auto-hangup  */
-  int                    charge;       /* Counter for charging units       */
-  ulong                  chargetime;   /* Timer for Charging info          */
-  int                    hupflags;     /* Flags for charge-unit-hangup:    */
-				       /* bit0: chargeint is invalid       */
-				       /* bit1: Getting charge-interval    */
-                                       /* bit2: Do charge-unit-hangup      */
-                                       /* bit3: Do hangup even on incoming */
-  int                    outgoing;     /* Flag: outgoing call              */
-  int                    onhtime;      /* Time to keep link up             */
-  int                    chargeint;    /* Interval between charge-infos    */
-  int                    onum;         /* Flag: at least 1 outgoing number */
-  int                    cps;          /* current speed of this interface  */
-  int                    transcount;   /* byte-counter for cps-calculation */
-  int                    sqfull;       /* Flag: netdev-queue overloaded    */
-  ulong                  sqfull_stamp; /* Start-Time of overload           */
-  ulong                  slavedelay;   /* Dynamic bundling delaytime       */
-  int                    triggercps;   /* BogoCPS needed for trigger slave */
-  isdn_net_phone         *phone[2];    /* List of remote-phonenumbers      */
-				       /* phone[0] = Incoming Numbers      */
-				       /* phone[1] = Outgoing Numbers      */
-  isdn_net_phone         *dial;        /* Pointer to dialed number         */
-  struct net_device      *master;      /* Ptr to Master device for slaves  */
-  struct net_device      *slave;       /* Ptr to Slave device for masters  */
-  struct isdn_net_local_s *next;       /* Ptr to next link in bundle       */
-  struct isdn_net_local_s *last;       /* Ptr to last link in bundle       */
-  struct isdn_net_dev_s  *netdev;      /* Ptr to netdev                    */
-  struct sk_buff_head    super_tx_queue; /* List of supervisory frames to  */
-	                               /* be transmitted asap              */
-  atomic_t frame_cnt;                  /* number of frames currently       */
-                        	       /* queued in HL driver              */    
-                                       /* Ptr to orig. hard_header_cache   */
-  spinlock_t             xmit_lock;    /* used to protect the xmit path of */
-                                       /* a particular channel (including  */
-                                       /* the frame_cnt                    */
-
-  int  pppbind;                        /* ippp device for bindings         */
-  int					dialtimeout;	/* How long shall we try on dialing? (jiffies) */
-  int					dialwait;		/* How long shall we wait after failed attempt? (jiffies) */
-  ulong					dialstarted;	/* jiffies of first dialing-attempt */
-  ulong					dialwait_timer;	/* jiffies of earliest next dialing-attempt */
-  int					huptimeout;		/* How long will the connection be up? (seconds) */
-#ifdef CONFIG_ISDN_X25
-  struct concap_device_ops *dops;      /* callbacks used by encapsulator   */
-#endif
-  /* use an own struct for that in later versions */
-  ulong cisco_myseq;                   /* Local keepalive seq. for Cisco   */
-  ulong cisco_mineseen;                /* returned keepalive seq. from remote */
-  ulong cisco_yourseq;                 /* Remote keepalive seq. for Cisco  */
-  int cisco_keepalive_period;		/* keepalive period */
-  ulong cisco_last_slarp_in;		/* jiffie of last keepalive packet we received */
-  char cisco_line_state;		/* state of line according to keepalive packets */
-  char cisco_debserint;			/* debugging flag of cisco hdlc with slarp */
-  struct timer_list cisco_timer;
-  struct work_struct tqueue;
-} isdn_net_local;
-
-/* the interface itself */
-typedef struct isdn_net_dev_s {
-  isdn_net_local *local;
-  isdn_net_local *queue;               /* circular list of all bundled
-					  channels, which are currently
-					  online                           */
-  spinlock_t queue_lock;               /* lock to protect queue            */
-  void *next;                          /* Pointer to next isdn-interface   */
-  struct net_device *dev;              /* interface to upper levels        */
-#ifdef CONFIG_ISDN_PPP
-  ippp_bundle * pb;		/* pointer to the common bundle structure
-   			         * with the per-bundle data */
-#endif
-#ifdef CONFIG_ISDN_X25
-  struct concap_proto  *cprot; /* connection oriented encapsulation protocol */
-#endif
-
-} isdn_net_dev;
-
-/*===================== End of ip-over-ISDN stuff ===========================*/
-
-/*======================= Start of ISDN-tty stuff ===========================*/
-
-#define ISDN_ASYNC_MAGIC          0x49344C01 /* for paranoia-checking        */
-#define ISDN_SERIAL_XMIT_SIZE           1024 /* Default bufsize for write    */
-#define ISDN_SERIAL_XMIT_MAX            4000 /* Maximum bufsize for write    */
-
-#ifdef CONFIG_ISDN_AUDIO
-/* For using sk_buffs with audio we need some private variables
- * within each sk_buff. For this purpose, we declare a struct here,
- * and put it always at the private skb->cb data array. A few macros help
- * accessing the variables.
- */
-typedef struct _isdn_audio_data {
-  unsigned short dle_count;
-  unsigned char  lock;
-} isdn_audio_data_t;
-
-#define ISDN_AUDIO_SKB_DLECOUNT(skb)	(((isdn_audio_data_t *)&skb->cb[0])->dle_count)
-#define ISDN_AUDIO_SKB_LOCK(skb)	(((isdn_audio_data_t *)&skb->cb[0])->lock)
-#endif
-
-/* Private data of AT-command-interpreter */
-typedef struct atemu {
-	u_char       profile[ISDN_MODEM_NUMREG]; /* Modem-Regs. Profile 0              */
-	u_char       mdmreg[ISDN_MODEM_NUMREG];  /* Modem-Registers                    */
-	char         pmsn[ISDN_MSNLEN];          /* EAZ/MSNs Profile 0                 */
-	char         msn[ISDN_MSNLEN];           /* EAZ/MSN                            */
-	char         plmsn[ISDN_LMSNLEN];        /* Listening MSNs Profile 0           */
-	char         lmsn[ISDN_LMSNLEN];         /* Listening MSNs                     */
-	char         cpn[ISDN_MSNLEN];           /* CalledPartyNumber on incoming call */
-	char         connmsg[ISDN_CMSGLEN];	 /* CONNECT-Msg from HL-Driver	       */
-#ifdef CONFIG_ISDN_AUDIO
-	u_char       vpar[10];                   /* Voice-parameters                   */
-	int          lastDLE;                    /* Flag for voice-coding: DLE seen    */
-#endif
-	int          mdmcmdl;                    /* Length of Modem-Commandbuffer      */
-	int          pluscount;                  /* Counter for +++ sequence           */
-	u_long       lastplus;                   /* Timestamp of last +                */
-	int	     carrierwait;                /* Seconds of carrier waiting         */
-	char         mdmcmd[255];                /* Modem-Commandbuffer                */
-	unsigned int charge;                     /* Charge units of current connection */
-} atemu;
-
-/* Private data (similar to async_struct in <linux/serial.h>) */
-typedef struct modem_info {
-  int			magic;
-  struct tty_port	port;
-  int			x_char;		 /* xon/xoff character             */
-  int			mcr;		 /* Modem control register         */
-  int                   msr;             /* Modem status register          */
-  int                   lsr;             /* Line status register           */
-  int			line;
-  int                   online;          /* 1 = B-Channel is up, drop data */
-					 /* 2 = B-Channel is up, deliver d.*/
-  int                   dialing;         /* Dial in progress or ATA        */
-  int                   closing;
-  int                   rcvsched;        /* Receive needs schedule         */
-  int                   isdn_driver;	 /* Index to isdn-driver           */
-  int                   isdn_channel;    /* Index to isdn-channel          */
-  int                   drv_index;       /* Index to dev->usage            */
-  int                   ncarrier;        /* Flag: schedule NO CARRIER      */
-  unsigned char         last_cause[8];   /* Last cause message             */
-  unsigned char         last_num[ISDN_MSNLEN];
-	                                 /* Last phone-number              */
-  unsigned char         last_l2;         /* Last layer-2 protocol          */
-  unsigned char         last_si;         /* Last service                   */
-  unsigned char         last_lhup;       /* Last hangup local?             */
-  unsigned char         last_dir;        /* Last direction (in or out)     */
-  struct timer_list     nc_timer;        /* Timer for delayed NO CARRIER   */
-  int                   send_outstanding;/* # of outstanding send-requests */
-  int                   xmit_size;       /* max. # of chars in xmit_buf    */
-  int                   xmit_count;      /* # of chars in xmit_buf         */
-  struct sk_buff_head   xmit_queue;      /* transmit queue                 */
-  atomic_t              xmit_lock;       /* Semaphore for isdn_tty_write   */
-#ifdef CONFIG_ISDN_AUDIO
-  int                   vonline;         /* Voice-channel status           */
-					 /* Bit 0 = recording              */
-					 /* Bit 1 = playback               */
-					 /* Bit 2 = playback, DLE-ETX seen */
-  struct sk_buff_head   dtmf_queue;      /* queue for dtmf results         */
-  void                  *adpcms;         /* state for adpcm decompression  */
-  void                  *adpcmr;         /* state for adpcm compression    */
-  void                  *dtmf_state;     /* state for dtmf decoder         */
-  void                  *silence_state;  /* state for silence detection    */
-#endif
-#ifdef CONFIG_ISDN_TTY_FAX
-  struct T30_s		*fax;		 /* T30 Fax Group 3 data/interface */
-  int			faxonline;	 /* Fax-channel status             */
-#endif
-  atemu                 emu;             /* AT-emulator data               */
-  spinlock_t	        readlock;
-} modem_info;
-
-#define ISDN_MODEM_WINSIZE 8
-
-/* Description of one ISDN-tty */
-typedef struct _isdn_modem {
-  int                refcount;				/* Number of opens        */
-  struct tty_driver  *tty_modem;			/* tty-device             */
-  struct tty_struct  *modem_table[ISDN_MAX_CHANNELS];	/* ?? copied from Orig    */
-  struct ktermios     *modem_termios[ISDN_MAX_CHANNELS];
-  struct ktermios     *modem_termios_locked[ISDN_MAX_CHANNELS];
-  modem_info         info[ISDN_MAX_CHANNELS];	   /* Private data           */
-} isdn_modem_t;
-
-/*======================= End of ISDN-tty stuff ============================*/
-
-/*======================== Start of V.110 stuff ============================*/
-#define V110_BUFSIZE 1024
-
-typedef struct {
-	int nbytes;                    /* 1 Matrixbyte -> nbytes in stream     */
-	int nbits;                     /* Number of used bits in streambyte    */
-	unsigned char key;             /* Bitmask in stream eg. 11 (nbits=2)   */
-	int decodelen;                 /* Amount of data in decodebuf          */
-	int SyncInit;                  /* Number of sync frames to send        */
-	unsigned char *OnlineFrame;    /* Precalculated V110 idle frame        */
-	unsigned char *OfflineFrame;   /* Precalculated V110 sync Frame        */
-	int framelen;                  /* Length of frames                     */
-	int skbuser;                   /* Number of unacked userdata skbs      */
-	int skbidle;                   /* Number of unacked idle/sync skbs     */
-	int introducer;                /* Local vars for decoder               */
-	int dbit;
-	unsigned char b;
-	int skbres;                    /* space to reserve in outgoing skb     */
-	int maxsize;                   /* maxbufsize of lowlevel driver        */
-	unsigned char *encodebuf;      /* temporary buffer for encoding        */
-	unsigned char decodebuf[V110_BUFSIZE]; /* incomplete V110 matrices     */
-} isdn_v110_stream;
-
-/*========================= End of V.110 stuff =============================*/
-
-/*======================= Start of general stuff ===========================*/
-
-typedef struct {
-	char *next;
-	char *private;
-} infostruct;
-
-#define DRV_FLAG_RUNNING 1
-#define DRV_FLAG_REJBUS  2
-#define DRV_FLAG_LOADED  4
-
-/* Description of hardware-level-driver */
-typedef struct _isdn_driver {
-	ulong               online;           /* Channel-Online flags             */
-	ulong               flags;            /* Misc driver Flags                */
-	int                 locks;            /* Number of locks for this driver  */
-	int                 channels;         /* Number of channels               */
-	wait_queue_head_t   st_waitq;         /* Wait-Queue for status-read's     */
-	int                 maxbufsize;       /* Maximum Buffersize supported     */
-	unsigned long       pktcount;         /* Until now: unused                */
-	int                 stavail;          /* Chars avail on Status-device     */
-	isdn_if            *interface;        /* Interface to driver              */
-	int                *rcverr;           /* Error-counters for B-Ch.-receive */
-	int                *rcvcount;         /* Byte-counters for B-Ch.-receive  */
-#ifdef CONFIG_ISDN_AUDIO
-	unsigned long      DLEflag;           /* Flags: Insert DLE at next read   */
-#endif
-	struct sk_buff_head *rpqueue;         /* Pointers to start of Rcv-Queue   */
-	wait_queue_head_t  *rcv_waitq;       /* Wait-Queues for B-Channel-Reads  */
-	wait_queue_head_t  *snd_waitq;       /* Wait-Queue for B-Channel-Send's  */
-	char               msn2eaz[10][ISDN_MSNLEN];  /* Mapping-Table MSN->EAZ   */
-} isdn_driver_t;
-
-/* Main driver-data */
-typedef struct isdn_devt {
-	struct module     *owner;
-	spinlock_t	  lock;
-	unsigned short    flags;		      /* Bitmapped Flags:           */
-	int               drivers;		      /* Current number of drivers  */
-	int               channels;		      /* Current number of channels */
-	int               net_verbose;                /* Verbose-Flag               */
-	int               modempoll;		      /* Flag: tty-read active      */
-	spinlock_t	  timerlock;
-	int               tflags;                     /* Timer-Flags:               */
-	/*  see ISDN_TIMER_..defines  */
-	int               global_flags;
-	infostruct        *infochain;                 /* List of open info-devs.    */
-	wait_queue_head_t info_waitq;                 /* Wait-Queue for isdninfo    */
-	struct timer_list timer;		      /* Misc.-function Timer       */
-	int               chanmap[ISDN_MAX_CHANNELS]; /* Map minor->device-channel  */
-	int               drvmap[ISDN_MAX_CHANNELS];  /* Map minor->driver-index    */
-	int               usage[ISDN_MAX_CHANNELS];   /* Used by tty/ip/voice       */
-	char              num[ISDN_MAX_CHANNELS][ISDN_MSNLEN];
-	/* Remote number of active ch.*/
-	int               m_idx[ISDN_MAX_CHANNELS];   /* Index for mdm....          */
-	isdn_driver_t     *drv[ISDN_MAX_DRIVERS];     /* Array of drivers           */
-	isdn_net_dev      *netdev;		      /* Linked list of net-if's    */
-	char              drvid[ISDN_MAX_DRIVERS][20];/* Driver-ID                 */
-	struct task_struct *profd;                    /* For iprofd                 */
-	isdn_modem_t      mdm;			      /* tty-driver-data            */
-	isdn_net_dev      *rx_netdev[ISDN_MAX_CHANNELS]; /* rx netdev-pointers     */
-	isdn_net_dev      *st_netdev[ISDN_MAX_CHANNELS]; /* stat netdev-pointers   */
-	ulong             ibytes[ISDN_MAX_CHANNELS];  /* Statistics incoming bytes  */
-	ulong             obytes[ISDN_MAX_CHANNELS];  /* Statistics outgoing bytes  */
-	int               v110emu[ISDN_MAX_CHANNELS]; /* V.110 emulator-mode 0=none */
-	atomic_t          v110use[ISDN_MAX_CHANNELS]; /* Usage-Semaphore for stream */
-	isdn_v110_stream  *v110[ISDN_MAX_CHANNELS];   /* V.110 private data         */
-	struct mutex      mtx;                        /* serialize list access*/
-	unsigned long     global_features;
-} isdn_dev;
-
-extern isdn_dev *dev;
-
-
-#endif /* __ISDN_H__ */
diff --git a/include/linux/isdn_divertif.h b/include/linux/isdn_divertif.h
deleted file mode 100644
index 19ab361f9f07..000000000000
--- a/include/linux/isdn_divertif.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* $Id: isdn_divertif.h,v 1.4.6.1 2001/09/23 22:25:05 kai Exp $
- *
- * Header for the diversion supplementary interface for i4l.
- *
- * Author    Werner Cornelius (werner@titro.de)
- * Copyright by Werner Cornelius (werner@titro.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-#ifndef _LINUX_ISDN_DIVERTIF_H
-#define _LINUX_ISDN_DIVERTIF_H
-
-#include <linux/isdnif.h>
-#include <linux/types.h>
-#include <uapi/linux/isdn_divertif.h>
-
-/***************************************************************/
-/* structure exchanging data between isdn hl and divert module */
-/***************************************************************/ 
-typedef struct
-  { ulong if_magic; /* magic info and version */
-    int cmd; /* command */
-    int (*stat_callback)(isdn_ctrl *); /* supplied by divert module when calling */
-    int (*ll_cmd)(isdn_ctrl *); /* supplied by hl on return */
-    char * (*drv_to_name)(int); /* map a driver id to name, supplied by hl */
-    int (*name_to_drv)(char *); /* map a driver id to name, supplied by hl */
-  } isdn_divert_if;
-
-/*********************/
-/* function register */
-/*********************/
-extern int DIVERT_REG_NAME(isdn_divert_if *);
-#endif /* _LINUX_ISDN_DIVERTIF_H */
diff --git a/include/linux/isdn_ppp.h b/include/linux/isdn_ppp.h
deleted file mode 100644
index a0070c6dfaf8..000000000000
--- a/include/linux/isdn_ppp.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Linux ISDN subsystem, sync PPP, interface to ipppd
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- * Copyright 2000-2002  by Kai Germaschewski (kai@germaschewski.name)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-#ifndef _LINUX_ISDN_PPP_H
-#define _LINUX_ISDN_PPP_H
-
-
-
-
-#ifdef CONFIG_IPPP_FILTER
-#include <linux/filter.h>
-#endif
-#include <uapi/linux/isdn_ppp.h>
-
-#define DECOMP_ERR_NOMEM	(-10)
-
-#define MP_END_FRAG    0x40
-#define MP_BEGIN_FRAG  0x80
-
-#define MP_MAX_QUEUE_LEN	16
-
-/*
- * We need a way for the decompressor to influence the generation of CCP
- * Reset-Requests in a variety of ways. The decompressor is already returning
- * a lot of information (generated skb length, error conditions) so we use
- * another parameter. This parameter is a pointer to a structure which is
- * to be marked valid by the decompressor and only in this case is ever used.
- * Furthermore, the only case where this data is used is when the decom-
- * pressor returns DECOMP_ERROR.
- *
- * We use this same struct for the reset entry of the compressor to commu-
- * nicate to its caller how to deal with sending of a Reset Ack. In this
- * case, expra is not used, but other options still apply (suppressing
- * sending with rsend, appending arbitrary data, etc).
- */
-
-#define IPPP_RESET_MAXDATABYTES	32
-
-struct isdn_ppp_resetparams {
-  unsigned char valid:1;	/* rw Is this structure filled at all ? */
-  unsigned char rsend:1;	/* rw Should we send one at all ? */
-  unsigned char idval:1;	/* rw Is the id field valid ? */
-  unsigned char dtval:1;	/* rw Is the data field valid ? */
-  unsigned char expra:1;	/* rw Is an Ack expected for this Req ? */
-  unsigned char id;		/* wo Send CCP ResetReq with this id */
-  unsigned short maxdlen;	/* ro Max bytes to be stored in data field */
-  unsigned short dlen;		/* rw Bytes stored in data field */
-  unsigned char *data;		/* wo Data for ResetReq info field */
-};
-
-/*
- * this is an 'old friend' from ppp-comp.h under a new name 
- * check the original include for more information
- */
-struct isdn_ppp_compressor {
-  struct isdn_ppp_compressor *next, *prev;
-  struct module *owner;
-  int num; /* CCP compression protocol number */
-  
-  void *(*alloc) (struct isdn_ppp_comp_data *);
-  void (*free) (void *state);
-  int  (*init) (void *state, struct isdn_ppp_comp_data *,
-		int unit,int debug);
-  
-  /* The reset entry needs to get more exact information about the
-     ResetReq or ResetAck it was called with. The parameters are
-     obvious. If reset is called without a Req or Ack frame which
-     could be handed into it, code MUST be set to 0. Using rsparm,
-     the reset entry can control if and how a ResetAck is returned. */
-  
-  void (*reset) (void *state, unsigned char code, unsigned char id,
-		 unsigned char *data, unsigned len,
-		 struct isdn_ppp_resetparams *rsparm);
-  
-  int  (*compress) (void *state, struct sk_buff *in,
-		    struct sk_buff *skb_out, int proto);
-  
-	int  (*decompress) (void *state,struct sk_buff *in,
-			    struct sk_buff *skb_out,
-			    struct isdn_ppp_resetparams *rsparm);
-  
-  void (*incomp) (void *state, struct sk_buff *in,int proto);
-  void (*stat) (void *state, struct compstat *stats);
-};
-
-extern int isdn_ppp_register_compressor(struct isdn_ppp_compressor *);
-extern int isdn_ppp_unregister_compressor(struct isdn_ppp_compressor *);
-extern int isdn_ppp_dial_slave(char *);
-extern int isdn_ppp_hangup_slave(char *);
-
-typedef struct {
-  unsigned long seqerrs;
-  unsigned long frame_drops;
-  unsigned long overflows;
-  unsigned long max_queue_len;
-} isdn_mppp_stats;
-
-typedef struct {
-  int mp_mrru;                        /* unused                             */
-  struct sk_buff * frags;	/* fragments sl list -- use skb->next */
-  long frames;			/* number of frames in the frame list */
-  unsigned int seq;		/* last processed packet seq #: any packets
-  				 * with smaller seq # will be dropped
-				 * unconditionally */
-  spinlock_t lock;
-  int ref_ct;				 
-  /* statistics */
-  isdn_mppp_stats stats;
-} ippp_bundle;
-
-#define NUM_RCV_BUFFS     64
-
-struct ippp_buf_queue {
-  struct ippp_buf_queue *next;
-  struct ippp_buf_queue *last;
-  char *buf;                 /* NULL here indicates end of queue */
-  int len;
-};
-
-/* The data structure for one CCP reset transaction */
-enum ippp_ccp_reset_states {
-  CCPResetIdle,
-  CCPResetSentReq,
-  CCPResetRcvdReq,
-  CCPResetSentAck,
-  CCPResetRcvdAck
-};
-
-struct ippp_ccp_reset_state {
-  enum ippp_ccp_reset_states state;	/* State of this transaction */
-  struct ippp_struct *is;		/* Backlink to device stuff */
-  unsigned char id;			/* Backlink id index */
-  unsigned char ta:1;			/* The timer is active (flag) */
-  unsigned char expra:1;		/* We expect a ResetAck at all */
-  int dlen;				/* Databytes stored in data */
-  struct timer_list timer;		/* For timeouts/retries */
-  /* This is a hack but seems sufficient for the moment. We do not want
-     to have this be yet another allocation for some bytes, it is more
-     memory management overhead than the whole mess is worth. */
-  unsigned char data[IPPP_RESET_MAXDATABYTES];
-};
-
-/* The data structure keeping track of the currently outstanding CCP Reset
-   transactions. */
-struct ippp_ccp_reset {
-  struct ippp_ccp_reset_state *rs[256];	/* One per possible id */
-  unsigned char lastid;			/* Last id allocated by the engine */
-};
-
-struct ippp_struct {
-  struct ippp_struct *next_link;
-  int state;
-  spinlock_t buflock;
-  struct ippp_buf_queue rq[NUM_RCV_BUFFS]; /* packet queue for isdn_ppp_read() */
-  struct ippp_buf_queue *first;  /* pointer to (current) first packet */
-  struct ippp_buf_queue *last;   /* pointer to (current) last used packet in queue */
-  wait_queue_head_t wq;
-  struct task_struct *tk;
-  unsigned int mpppcfg;
-  unsigned int pppcfg;
-  unsigned int mru;
-  unsigned int mpmru;
-  unsigned int mpmtu;
-  unsigned int maxcid;
-  struct isdn_net_local_s *lp;
-  int unit;
-  int minor;
-  unsigned int last_link_seqno;
-  long mp_seqno;
-#ifdef CONFIG_ISDN_PPP_VJ
-  unsigned char *cbuf;
-  struct slcompress *slcomp;
-#endif
-#ifdef CONFIG_IPPP_FILTER
-  struct bpf_prog *pass_filter;   /* filter for packets to pass */
-  struct bpf_prog *active_filter; /* filter for pkts to reset idle */
-#endif
-  unsigned long debug;
-  struct isdn_ppp_compressor *compressor,*decompressor;
-  struct isdn_ppp_compressor *link_compressor,*link_decompressor;
-  void *decomp_stat,*comp_stat,*link_decomp_stat,*link_comp_stat;
-  struct ippp_ccp_reset *reset;	/* Allocated on demand, may never be needed */
-  unsigned long compflags;
-};
-
-#endif /* _LINUX_ISDN_PPP_H */
diff --git a/include/linux/isdnif.h b/include/linux/isdnif.h
deleted file mode 100644
index 8d80fdc68647..000000000000
--- a/include/linux/isdnif.h
+++ /dev/null
@@ -1,505 +0,0 @@
-/* $Id: isdnif.h,v 1.43.2.2 2004/01/12 23:08:35 keil Exp $
- *
- * Linux ISDN subsystem
- * Definition of the interface between the subsystem and its low-level drivers.
- *
- * Copyright 1994,95,96 by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    Thinking Objects Software GmbH Wuerzburg
- * 
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-#ifndef __ISDNIF_H__
-#define __ISDNIF_H__
-
-
-#include <linux/skbuff.h>
-#include <uapi/linux/isdnif.h>
-
-/***************************************************************************/
-/* Extensions made by Werner Cornelius (werner@ikt.de)                     */
-/*                                                                         */ 
-/* The proceed command holds a incoming call in a state to leave processes */
-/* enough time to check whether ist should be accepted.                    */
-/* The PROT_IO Command extends the interface to make protocol dependent    */
-/* features available (call diversion, call waiting...).                   */
-/*                                                                         */ 
-/* The PROT_IO Command is executed with the desired driver id and the arg  */
-/* parameter coded as follows:                                             */
-/* The lower 8 bits of arg contain the desired protocol from ISDN_PTYPE    */
-/* definitions. The upper 24 bits represent the protocol specific cmd/stat.*/
-/* Any additional data is protocol and command specific.                   */
-/* This mechanism also applies to the statcallb callback STAT_PROT.        */    
-/*                                                                         */
-/* This suggested extension permits an easy expansion of protocol specific */
-/* handling. Extensions may be added at any time without changing the HL   */
-/* driver code and not getting conflicts without certifications.           */
-/* The well known CAPI 2.0 interface handles such extensions in a similar  */
-/* way. Perhaps a protocol specific module may be added and separately     */
-/* loaded and linked to the basic isdn module for handling.                */                    
-/***************************************************************************/
-
-/*****************/
-/* DSS1 commands */ 
-/*****************/
-#define DSS1_CMD_INVOKE       ((0x00 << 8) | ISDN_PTYPE_EURO)   /* invoke a supplementary service */
-#define DSS1_CMD_INVOKE_ABORT ((0x01 << 8) | ISDN_PTYPE_EURO)   /* abort a invoke cmd */
-
-/*******************************/
-/* DSS1 Status callback values */
-/*******************************/
-#define DSS1_STAT_INVOKE_RES  ((0x80 << 8) | ISDN_PTYPE_EURO)   /* Result for invocation */
-#define DSS1_STAT_INVOKE_ERR  ((0x81 << 8) | ISDN_PTYPE_EURO)   /* Error Return for invocation */
-#define DSS1_STAT_INVOKE_BRD  ((0x82 << 8) | ISDN_PTYPE_EURO)   /* Deliver invoke broadcast info */
-
-
-/*********************************************************************/
-/* structures for DSS1 commands and callback                         */
-/*                                                                   */
-/* An action is invoked by sending a DSS1_CMD_INVOKE. The ll_id, proc*/
-/* timeout, datalen and data fields must be set before calling.      */
-/*                                                                   */
-/* The return value is a positive hl_id value also delivered in the  */
-/* hl_id field. A value of zero signals no more left hl_id capacitys.*/
-/* A negative return value signals errors in LL. So if the return    */
-/* value is <= 0 no action in LL will be taken -> request ignored    */
-/*                                                                   */
-/* The timeout field must be filled with a positive value specifying */
-/* the amount of time the INVOKED process waits for a reaction from  */
-/* the network.                                                      */
-/* If a response (either error or result) is received during this    */
-/* intervall, a reporting callback is initiated and the process will */
-/* be deleted, the hl identifier will be freed.                      */
-/* If no response is received during the specified intervall, a error*/
-/* callback is initiated with timeout set to -1 and a datalen set    */
-/* to 0.                                                             */
-/* If timeout is set to a value <= 0 during INVOCATION the process is*/
-/* immediately deleted after sending the data. No callback occurs !  */
-/*                                                                   */
-/* A currently waiting process may be aborted with INVOKE_ABORT. No  */
-/* callback will occur when a process has been aborted.              */
-/*                                                                   */
-/* Broadcast invoke frames from the network are reported via the     */
-/* STAT_INVOKE_BRD callback. The ll_id is set to 0, the other fields */
-/* are supplied by the network and not by the HL.                    */   
-/*********************************************************************/
-
-/*****************/
-/* NI1 commands */ 
-/*****************/
-#define NI1_CMD_INVOKE       ((0x00 << 8) | ISDN_PTYPE_NI1)   /* invoke a supplementary service */
-#define NI1_CMD_INVOKE_ABORT ((0x01 << 8) | ISDN_PTYPE_NI1)   /* abort a invoke cmd */
-
-/*******************************/
-/* NI1 Status callback values */
-/*******************************/
-#define NI1_STAT_INVOKE_RES  ((0x80 << 8) | ISDN_PTYPE_NI1)   /* Result for invocation */
-#define NI1_STAT_INVOKE_ERR  ((0x81 << 8) | ISDN_PTYPE_NI1)   /* Error Return for invocation */
-#define NI1_STAT_INVOKE_BRD  ((0x82 << 8) | ISDN_PTYPE_NI1)   /* Deliver invoke broadcast info */
-
-typedef struct
-  { ulong ll_id; /* ID supplied by LL when executing    */
-		 /* a command and returned by HL for    */
-                 /* INVOKE_RES and INVOKE_ERR           */
-    int hl_id;   /* ID supplied by HL when called       */
-                 /* for executing a cmd and delivered   */
-                 /* for results and errors              */
-                 /* must be supplied by LL when aborting*/  
-    int proc;    /* invoke procedure used by CMD_INVOKE */
-                 /* returned by callback and broadcast  */ 
-    int timeout; /* timeout for INVOKE CMD in ms        */
-                 /* -1  in stat callback when timed out */
-                 /* error value when error callback     */
-    int datalen; /* length of cmd or stat data          */
-    u_char *data;/* pointer to data delivered or send   */
-  } isdn_cmd_stat;
-
-/*
- * Commands from linklevel to lowlevel
- *
- */
-#define ISDN_CMD_IOCTL    0       /* Perform ioctl                         */
-#define ISDN_CMD_DIAL     1       /* Dial out                              */
-#define ISDN_CMD_ACCEPTD  2       /* Accept an incoming call on D-Chan.    */
-#define ISDN_CMD_ACCEPTB  3       /* Request B-Channel connect.            */
-#define ISDN_CMD_HANGUP   4       /* Hangup                                */
-#define ISDN_CMD_CLREAZ   5       /* Clear EAZ(s) of channel               */
-#define ISDN_CMD_SETEAZ   6       /* Set EAZ(s) of channel                 */
-#define ISDN_CMD_GETEAZ   7       /* Get EAZ(s) of channel                 */
-#define ISDN_CMD_SETSIL   8       /* Set Service-Indicator-List of channel */
-#define ISDN_CMD_GETSIL   9       /* Get Service-Indicator-List of channel */
-#define ISDN_CMD_SETL2   10       /* Set B-Chan. Layer2-Parameter          */
-#define ISDN_CMD_GETL2   11       /* Get B-Chan. Layer2-Parameter          */
-#define ISDN_CMD_SETL3   12       /* Set B-Chan. Layer3-Parameter          */
-#define ISDN_CMD_GETL3   13       /* Get B-Chan. Layer3-Parameter          */
-// #define ISDN_CMD_LOCK    14       /* Signal usage by upper levels          */
-// #define ISDN_CMD_UNLOCK  15       /* Release usage-lock                    */
-#define ISDN_CMD_SUSPEND 16       /* Suspend connection                    */
-#define ISDN_CMD_RESUME  17       /* Resume connection                     */
-#define ISDN_CMD_PROCEED 18       /* Proceed with call establishment       */
-#define ISDN_CMD_ALERT   19       /* Alert after Proceeding                */
-#define ISDN_CMD_REDIR   20       /* Redir a incoming call                 */
-#define ISDN_CMD_PROT_IO 21       /* Protocol specific commands            */
-#define CAPI_PUT_MESSAGE 22       /* CAPI message send down or up          */
-#define ISDN_CMD_FAXCMD  23       /* FAX commands to HL-driver             */
-#define ISDN_CMD_AUDIO   24       /* DSP, DTMF, ... settings               */
-
-/*
- * Status-Values delivered from lowlevel to linklevel via
- * statcallb().
- *
- */
-#define ISDN_STAT_STAVAIL 256    /* Raw status-data available             */
-#define ISDN_STAT_ICALL   257    /* Incoming call detected                */
-#define ISDN_STAT_RUN     258    /* Signal protocol-code is running       */
-#define ISDN_STAT_STOP    259    /* Signal halt of protocol-code          */
-#define ISDN_STAT_DCONN   260    /* Signal D-Channel connect              */
-#define ISDN_STAT_BCONN   261    /* Signal B-Channel connect              */
-#define ISDN_STAT_DHUP    262    /* Signal D-Channel disconnect           */
-#define ISDN_STAT_BHUP    263    /* Signal B-Channel disconnect           */
-#define ISDN_STAT_CINF    264    /* Charge-Info                           */
-#define ISDN_STAT_LOAD    265    /* Signal new lowlevel-driver is loaded  */
-#define ISDN_STAT_UNLOAD  266    /* Signal unload of lowlevel-driver      */
-#define ISDN_STAT_BSENT   267    /* Signal packet sent                    */
-#define ISDN_STAT_NODCH   268    /* Signal no D-Channel                   */
-#define ISDN_STAT_ADDCH   269    /* Add more Channels                     */
-#define ISDN_STAT_CAUSE   270    /* Cause-Message                         */
-#define ISDN_STAT_ICALLW  271    /* Incoming call without B-chan waiting  */
-#define ISDN_STAT_REDIR   272    /* Redir result                          */
-#define ISDN_STAT_PROT    273    /* protocol IO specific callback         */
-#define ISDN_STAT_DISPLAY 274    /* deliver a received display message    */
-#define ISDN_STAT_L1ERR   275    /* Signal Layer-1 Error                  */
-#define ISDN_STAT_FAXIND  276    /* FAX indications from HL-driver        */
-#define ISDN_STAT_AUDIO   277    /* DTMF, DSP indications                 */
-#define ISDN_STAT_DISCH   278    /* Disable/Enable channel usage          */
-
-/*
- * Audio commands
- */
-#define ISDN_AUDIO_SETDD	0	/* Set DTMF detection           */
-#define ISDN_AUDIO_DTMF		1	/* Rx/Tx DTMF                   */
-
-/*
- * Values for errcode field
- */
-#define ISDN_STAT_L1ERR_SEND 1
-#define ISDN_STAT_L1ERR_RECV 2
-
-/*
- * Values for feature-field of interface-struct.
- */
-/* Layer 2 */
-#define ISDN_FEATURE_L2_X75I    (0x0001 << ISDN_PROTO_L2_X75I)
-#define ISDN_FEATURE_L2_X75UI   (0x0001 << ISDN_PROTO_L2_X75UI)
-#define ISDN_FEATURE_L2_X75BUI  (0x0001 << ISDN_PROTO_L2_X75BUI)
-#define ISDN_FEATURE_L2_HDLC    (0x0001 << ISDN_PROTO_L2_HDLC)
-#define ISDN_FEATURE_L2_TRANS   (0x0001 << ISDN_PROTO_L2_TRANS)
-#define ISDN_FEATURE_L2_X25DTE  (0x0001 << ISDN_PROTO_L2_X25DTE)
-#define ISDN_FEATURE_L2_X25DCE  (0x0001 << ISDN_PROTO_L2_X25DCE)
-#define ISDN_FEATURE_L2_V11096  (0x0001 << ISDN_PROTO_L2_V11096)
-#define ISDN_FEATURE_L2_V11019  (0x0001 << ISDN_PROTO_L2_V11019)
-#define ISDN_FEATURE_L2_V11038  (0x0001 << ISDN_PROTO_L2_V11038)
-#define ISDN_FEATURE_L2_MODEM   (0x0001 << ISDN_PROTO_L2_MODEM)
-#define ISDN_FEATURE_L2_FAX	(0x0001 << ISDN_PROTO_L2_FAX)
-#define ISDN_FEATURE_L2_HDLC_56K (0x0001 << ISDN_PROTO_L2_HDLC_56K)
-
-#define ISDN_FEATURE_L2_MASK    (0x0FFFF) /* Max. 16 protocols */
-#define ISDN_FEATURE_L2_SHIFT   (0)
-
-/* Layer 3 */
-#define ISDN_FEATURE_L3_TRANS   (0x10000 << ISDN_PROTO_L3_TRANS)
-#define ISDN_FEATURE_L3_TRANSDSP (0x10000 << ISDN_PROTO_L3_TRANSDSP)
-#define ISDN_FEATURE_L3_FCLASS2	(0x10000 << ISDN_PROTO_L3_FCLASS2)
-#define ISDN_FEATURE_L3_FCLASS1	(0x10000 << ISDN_PROTO_L3_FCLASS1)
-
-#define ISDN_FEATURE_L3_MASK    (0x0FF0000) /* Max. 8 Protocols */
-#define ISDN_FEATURE_L3_SHIFT   (16)
-
-/* Signaling */
-#define ISDN_FEATURE_P_UNKNOWN  (0x1000000 << ISDN_PTYPE_UNKNOWN)
-#define ISDN_FEATURE_P_1TR6     (0x1000000 << ISDN_PTYPE_1TR6)
-#define ISDN_FEATURE_P_EURO     (0x1000000 << ISDN_PTYPE_EURO)
-#define ISDN_FEATURE_P_NI1      (0x1000000 << ISDN_PTYPE_NI1)
-
-#define ISDN_FEATURE_P_MASK     (0x0FF000000) /* Max. 8 Protocols */
-#define ISDN_FEATURE_P_SHIFT    (24)
-
-typedef struct setup_parm {
-    unsigned char phone[32];	/* Remote Phone-Number */
-    unsigned char eazmsn[32];	/* Local EAZ or MSN    */
-    unsigned char si1;      /* Service Indicator 1 */
-    unsigned char si2;      /* Service Indicator 2 */
-    unsigned char plan;     /* Numbering plan      */
-    unsigned char screen;   /* Screening info      */
-} setup_parm;
-
-
-#ifdef CONFIG_ISDN_TTY_FAX
-/* T.30 Fax G3 */
-
-#define FAXIDLEN 21
-
-typedef struct T30_s {
-	/* session parameters */
-	__u8 resolution;
-	__u8 rate;
-	__u8 width;
-	__u8 length;
-	__u8 compression;
-	__u8 ecm;
-	__u8 binary;
-	__u8 scantime;
-	__u8 id[FAXIDLEN];
-	/* additional parameters */
-	__u8 phase;
-	__u8 direction;
-	__u8 code;
-	__u8 badlin;
-	__u8 badmul;
-	__u8 bor;
-	__u8 fet;
-	__u8 pollid[FAXIDLEN];
-	__u8 cq;
-	__u8 cr;
-	__u8 ctcrty;
-	__u8 minsp;
-	__u8 phcto;
-	__u8 rel;
-	__u8 nbc;
-	/* remote station parameters */
-	__u8 r_resolution;
-	__u8 r_rate;
-	__u8 r_width;
-	__u8 r_length;
-	__u8 r_compression;
-	__u8 r_ecm;
-	__u8 r_binary;
-	__u8 r_scantime;
-	__u8 r_id[FAXIDLEN];
-	__u8 r_code;
-} __packed T30_s;
-
-#define ISDN_TTY_FAX_CONN_IN	0
-#define ISDN_TTY_FAX_CONN_OUT	1
-
-#define ISDN_TTY_FAX_FCON	0
-#define ISDN_TTY_FAX_DIS 	1
-#define ISDN_TTY_FAX_FTT 	2
-#define ISDN_TTY_FAX_MCF 	3
-#define ISDN_TTY_FAX_DCS 	4
-#define ISDN_TTY_FAX_TRAIN_OK	5
-#define ISDN_TTY_FAX_EOP 	6
-#define ISDN_TTY_FAX_EOM 	7
-#define ISDN_TTY_FAX_MPS 	8
-#define ISDN_TTY_FAX_DTC 	9
-#define ISDN_TTY_FAX_RID 	10
-#define ISDN_TTY_FAX_HNG 	11
-#define ISDN_TTY_FAX_DT  	12
-#define ISDN_TTY_FAX_FCON_I	13
-#define ISDN_TTY_FAX_DR  	14
-#define ISDN_TTY_FAX_ET  	15
-#define ISDN_TTY_FAX_CFR 	16
-#define ISDN_TTY_FAX_PTS 	17
-#define ISDN_TTY_FAX_SENT	18
-
-#define ISDN_FAX_PHASE_IDLE	0
-#define ISDN_FAX_PHASE_A	1
-#define ISDN_FAX_PHASE_B   	2
-#define ISDN_FAX_PHASE_C   	3
-#define ISDN_FAX_PHASE_D   	4
-#define ISDN_FAX_PHASE_E   	5
-
-#endif /* TTY_FAX */
-
-#define ISDN_FAX_CLASS1_FAE	0
-#define ISDN_FAX_CLASS1_FTS	1
-#define ISDN_FAX_CLASS1_FRS	2
-#define ISDN_FAX_CLASS1_FTM	3
-#define ISDN_FAX_CLASS1_FRM	4
-#define ISDN_FAX_CLASS1_FTH	5
-#define ISDN_FAX_CLASS1_FRH	6
-#define ISDN_FAX_CLASS1_CTRL	7
-
-#define ISDN_FAX_CLASS1_OK	0
-#define ISDN_FAX_CLASS1_CONNECT	1
-#define ISDN_FAX_CLASS1_NOCARR	2
-#define ISDN_FAX_CLASS1_ERROR	3
-#define ISDN_FAX_CLASS1_FCERROR	4
-#define ISDN_FAX_CLASS1_QUERY	5
-
-typedef struct {
-	__u8	cmd;
-	__u8	subcmd;
-	__u8	para[50];
-} aux_s;
-
-#define AT_COMMAND	0
-#define AT_EQ_VALUE	1
-#define AT_QUERY	2
-#define AT_EQ_QUERY	3
-
-/* CAPI structs */
-
-/* this is compatible to the old union size */
-#define MAX_CAPI_PARA_LEN 50
-
-typedef struct {
-	/* Header */
-	__u16 Length;
-	__u16 ApplId;
-	__u8 Command;
-	__u8 Subcommand;
-	__u16 Messagenumber;
-
-	/* Parameter */
-	union {
-		__u32 Controller;
-		__u32 PLCI;
-		__u32 NCCI;
-	} adr;
-	__u8 para[MAX_CAPI_PARA_LEN];
-} capi_msg;
-
-/*
- * Structure for exchanging above infos
- *
- */
-typedef struct {
-	int   driver;		/* Lowlevel-Driver-ID            */
-	int   command;		/* Command or Status (see above) */
-	ulong arg;		/* Additional Data               */
-	union {
-		ulong errcode;	/* Type of error with STAT_L1ERR	*/
-		int length;	/* Amount of bytes sent with STAT_BSENT	*/
-		u_char num[50];	/* Additional Data			*/
-		setup_parm setup;/* For SETUP msg			*/
-		capi_msg cmsg;	/* For CAPI like messages		*/
-		char display[85];/* display message data		*/ 
-		isdn_cmd_stat isdn_io; /* ISDN IO-parameter/result	*/
-		aux_s aux;	/* for modem commands/indications	*/
-#ifdef CONFIG_ISDN_TTY_FAX
-		T30_s	*fax;	/* Pointer to ttys fax struct		*/
-#endif
-		ulong userdata;	/* User Data */
-	} parm;
-} isdn_ctrl;
-
-#define dss1_io    isdn_io
-#define ni1_io     isdn_io
-
-/*
- * The interface-struct itself (initialized at load-time of lowlevel-driver)
- *
- * See Documentation/isdn/INTERFACE for a description, how the communication
- * between the ISDN subsystem and its drivers is done.
- *
- */
-typedef struct {
-  struct module *owner;
-
-  /* Number of channels supported by this driver
-   */
-  int channels;
-
-  /* 
-   * Maximum Size of transmit/receive-buffer this driver supports.
-   */
-  int maxbufsize;
-
-  /* Feature-Flags for this driver.
-   * See defines ISDN_FEATURE_... for Values
-   */
-  unsigned long features;
-
-  /*
-   * Needed for calculating
-   * dev->hard_header_len = linklayer header + hl_hdrlen;
-   * Drivers, not supporting sk_buff's should set this to 0.
-   */
-  unsigned short hl_hdrlen;
-
-  /*
-   * Receive-Callback using sk_buff's
-   * Parameters:
-   *             int                    Driver-ID
-   *             int                    local channel-number (0 ...)
-   *             struct sk_buff *skb    received Data
-   */
-  void (*rcvcallb_skb)(int, int, struct sk_buff *);
-
-  /* Status-Callback
-   * Parameters:
-   *             isdn_ctrl*
-   *                   driver  = Driver ID.
-   *                   command = One of above ISDN_STAT_... constants.
-   *                   arg     = depending on status-type.
-   *                   num     = depending on status-type.
-   */
-  int (*statcallb)(isdn_ctrl*);
-
-  /* Send command
-   * Parameters:
-   *             isdn_ctrl*
-   *                   driver  = Driver ID.
-   *                   command = One of above ISDN_CMD_... constants.
-   *                   arg     = depending on command.
-   *                   num     = depending on command.
-   */
-  int (*command)(isdn_ctrl*);
-
-  /*
-   * Send data using sk_buff's
-   * Parameters:
-   *             int                    driverId
-   *             int                    local channel-number (0...)
-   *             int                    Flag: Need ACK for this packet.
-   *             struct sk_buff *skb    Data to send
-   */
-  int (*writebuf_skb) (int, int, int, struct sk_buff *);
-
-  /* Send raw D-Channel-Commands
-   * Parameters:
-   *             u_char pointer data
-   *             int    length of data
-   *             int    driverId
-   *             int    local channel-number (0 ...)
-   */
-  int (*writecmd)(const u_char __user *, int, int, int);
-
-  /* Read raw Status replies
-   *             u_char pointer data (volatile)
-   *             int    length of buffer
-   *             int    driverId
-   *             int    local channel-number (0 ...)
-   */
-  int (*readstat)(u_char __user *, int, int, int);
-
-  char id[20];
-} isdn_if;
-
-/*
- * Function which must be called by lowlevel-driver at loadtime with
- * the following fields of above struct set:
- *
- * channels     Number of channels that will be supported.
- * hl_hdrlen    Space to preserve in sk_buff's when sending. Drivers, not
- *              supporting sk_buff's should set this to 0.
- * command      Address of Command-Handler.
- * features     Bitwise coded Features of this driver. (use ISDN_FEATURE_...)
- * writebuf_skb Address of Skbuff-Send-Handler.
- * writecmd        "    "  D-Channel  " which accepts raw D-Ch-Commands.
- * readstat        "    "  D-Channel  " which delivers raw Status-Data.
- *
- * The linklevel-driver fills the following fields:
- *
- * channels      Driver-ID assigned to this driver. (Must be used on all
- *               subsequent callbacks.
- * rcvcallb_skb  Address of handler for received Skbuff's.
- * statcallb        "    "     "    for status-changes.
- *
- */
-extern int register_isdn(isdn_if*);
-#include <linux/uaccess.h>
-
-#endif /* __ISDNIF_H__ */
diff --git a/include/linux/wanrouter.h b/include/linux/wanrouter.h
deleted file mode 100644
index f6358558f9f5..000000000000
--- a/include/linux/wanrouter.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * wanrouter.h	Legacy declarations kept around until X25 is removed
- */
-
-#ifndef	_ROUTER_H
-#define	_ROUTER_H
-
-#include <uapi/linux/wanrouter.h>
-
-#endif	/* _ROUTER_H */
diff --git a/include/uapi/linux/isdn.h b/include/uapi/linux/isdn.h
deleted file mode 100644
index f371fd52ed75..000000000000
--- a/include/uapi/linux/isdn.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/* $Id: isdn.h,v 1.125.2.3 2004/02/10 01:07:14 keil Exp $
- *
- * Main header for the Linux ISDN subsystem (linklevel).
- *
- * Copyright 1994,95,96 by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    by Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- * 
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef _UAPI__ISDN_H__
-#define _UAPI__ISDN_H__
-
-#include <linux/ioctl.h>
-#include <linux/tty.h>
-
-#define ISDN_MAX_DRIVERS    32
-#define ISDN_MAX_CHANNELS   64
-
-/* New ioctl-codes */
-#define IIOCNETAIF  _IO('I',1)
-#define IIOCNETDIF  _IO('I',2)
-#define IIOCNETSCF  _IO('I',3)
-#define IIOCNETGCF  _IO('I',4)
-#define IIOCNETANM  _IO('I',5)
-#define IIOCNETDNM  _IO('I',6)
-#define IIOCNETGNM  _IO('I',7)
-#define IIOCGETSET  _IO('I',8) /* no longer supported */
-#define IIOCSETSET  _IO('I',9) /* no longer supported */
-#define IIOCSETVER  _IO('I',10)
-#define IIOCNETHUP  _IO('I',11)
-#define IIOCSETGST  _IO('I',12)
-#define IIOCSETBRJ  _IO('I',13)
-#define IIOCSIGPRF  _IO('I',14)
-#define IIOCGETPRF  _IO('I',15)
-#define IIOCSETPRF  _IO('I',16)
-#define IIOCGETMAP  _IO('I',17)
-#define IIOCSETMAP  _IO('I',18)
-#define IIOCNETASL  _IO('I',19)
-#define IIOCNETDIL  _IO('I',20)
-#define IIOCGETCPS  _IO('I',21)
-#define IIOCGETDVR  _IO('I',22)
-#define IIOCNETLCR  _IO('I',23) /* dwabc ioctl for LCR from isdnlog */
-#define IIOCNETDWRSET  _IO('I',24) /* dwabc ioctl to reset abc-values to default on a net-interface */
-
-#define IIOCNETALN  _IO('I',32)
-#define IIOCNETDLN  _IO('I',33)
-
-#define IIOCNETGPN  _IO('I',34)
-
-#define IIOCDBGVAR  _IO('I',127)
-
-#define IIOCDRVCTL  _IO('I',128)
-
-/* cisco hdlck device private ioctls */
-#define SIOCGKEEPPERIOD	(SIOCDEVPRIVATE + 0)
-#define SIOCSKEEPPERIOD	(SIOCDEVPRIVATE + 1)
-#define SIOCGDEBSERINT	(SIOCDEVPRIVATE + 2)
-#define SIOCSDEBSERINT	(SIOCDEVPRIVATE + 3)
-
-/* Packet encapsulations for net-interfaces */
-#define ISDN_NET_ENCAP_ETHER      0
-#define ISDN_NET_ENCAP_RAWIP      1
-#define ISDN_NET_ENCAP_IPTYP      2
-#define ISDN_NET_ENCAP_CISCOHDLC  3 /* Without SLARP and keepalive */
-#define ISDN_NET_ENCAP_SYNCPPP    4
-#define ISDN_NET_ENCAP_UIHDLC     5
-#define ISDN_NET_ENCAP_CISCOHDLCK 6 /* With SLARP and keepalive    */
-#define ISDN_NET_ENCAP_X25IFACE   7 /* Documentation/networking/x25-iface.txt */
-#define ISDN_NET_ENCAP_MAX_ENCAP  ISDN_NET_ENCAP_X25IFACE
-
-/* Facility which currently uses an ISDN-channel */
-#define ISDN_USAGE_NONE       0
-#define ISDN_USAGE_RAW        1
-#define ISDN_USAGE_MODEM      2
-#define ISDN_USAGE_NET        3
-#define ISDN_USAGE_VOICE      4
-#define ISDN_USAGE_FAX        5
-#define ISDN_USAGE_MASK       7 /* Mask to get plain usage */
-#define ISDN_USAGE_DISABLED  32 /* This bit is set, if channel is disabled */
-#define ISDN_USAGE_EXCLUSIVE 64 /* This bit is set, if channel is exclusive */
-#define ISDN_USAGE_OUTGOING 128 /* This bit is set, if channel is outgoing  */
-
-#define ISDN_MODEM_NUMREG    24        /* Number of Modem-Registers        */
-#define ISDN_LMSNLEN         255 /* Length of tty's Listen-MSN string */
-#define ISDN_CMSGLEN	     50	 /* Length of CONNECT-Message to add for Modem */
-
-#define ISDN_MSNLEN          32
-#define NET_DV 0x06  /* Data version for isdn_net_ioctl_cfg   */
-#define TTY_DV 0x06  /* Data version for iprofd etc.          */
-
-#define INF_DV 0x01  /* Data version for /dev/isdninfo        */
-
-typedef struct {
-  char drvid[25];
-  unsigned long arg;
-} isdn_ioctl_struct;
-
-typedef struct {
-  char name[10];
-  char phone[ISDN_MSNLEN];
-  int  outgoing;
-} isdn_net_ioctl_phone;
-
-typedef struct {
-  char name[10];     /* Name of interface                     */
-  char master[10];   /* Name of Master for Bundling           */
-  char slave[10];    /* Name of Slave for Bundling            */
-  char eaz[256];     /* EAZ/MSN                               */
-  char drvid[25];    /* DriverId for Bindings                 */
-  int  onhtime;      /* Hangup-Timeout                        */
-  int  charge;       /* Charge-Units                          */
-  int  l2_proto;     /* Layer-2 protocol                      */
-  int  l3_proto;     /* Layer-3 protocol                      */
-  int  p_encap;      /* Encapsulation                         */
-  int  exclusive;    /* Channel, if bound exclusive           */
-  int  dialmax;      /* Dial Retry-Counter                    */
-  int  slavedelay;   /* Delay until slave starts up           */
-  int  cbdelay;      /* Delay before Callback                 */
-  int  chargehup;    /* Flag: Charge-Hangup                   */
-  int  ihup;         /* Flag: Hangup-Timeout on incoming line */
-  int  secure;       /* Flag: Secure                          */
-  int  callback;     /* Flag: Callback                        */
-  int  cbhup;        /* Flag: Reject Call before Callback     */
-  int  pppbind;      /* ippp device for bindings              */
-  int  chargeint;    /* Use fixed charge interval length      */
-  int  triggercps;   /* BogoCPS needed for triggering slave   */
-  int  dialtimeout;  /* Dial-Timeout                          */
-  int  dialwait;     /* Time to wait after failed dial        */
-  int  dialmode;     /* Flag: off / on / auto                 */
-} isdn_net_ioctl_cfg;
-
-#define ISDN_NET_DIALMODE_MASK  0xC0    /* bits for status                */
-#define ISDN_NET_DM_OFF	        0x00    /* this interface is stopped      */
-#define ISDN_NET_DM_MANUAL	0x40    /* this interface is on (manual)  */
-#define ISDN_NET_DM_AUTO	0x80    /* this interface is autodial     */
-#define ISDN_NET_DIALMODE(x) ((&(x))->flags & ISDN_NET_DIALMODE_MASK)
-
-
-#endif /* _UAPI__ISDN_H__ */
diff --git a/include/uapi/linux/isdn_divertif.h b/include/uapi/linux/isdn_divertif.h
deleted file mode 100644
index 0a17bb1bcb1b..000000000000
--- a/include/uapi/linux/isdn_divertif.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/* $Id: isdn_divertif.h,v 1.4.6.1 2001/09/23 22:25:05 kai Exp $
- *
- * Header for the diversion supplementary interface for i4l.
- *
- * Author    Werner Cornelius (werner@titro.de)
- * Copyright by Werner Cornelius (werner@titro.de)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef _UAPI_LINUX_ISDN_DIVERTIF_H
-#define _UAPI_LINUX_ISDN_DIVERTIF_H
-
-/***********************************************************/
-/* magic value is also used to control version information */
-/***********************************************************/
-#define DIVERT_IF_MAGIC 0x25873401
-#define DIVERT_CMD_REG  0x00  /* register command */
-#define DIVERT_CMD_REL  0x01  /* release command */
-#define DIVERT_NO_ERR   0x00  /* return value no error */
-#define DIVERT_CMD_ERR  0x01  /* invalid cmd */
-#define DIVERT_VER_ERR  0x02  /* magic/version invalid */
-#define DIVERT_REG_ERR  0x03  /* module already registered */
-#define DIVERT_REL_ERR  0x04  /* module not registered */
-#define DIVERT_REG_NAME isdn_register_divert
-
-
-#endif /* _UAPI_LINUX_ISDN_DIVERTIF_H */
diff --git a/include/uapi/linux/isdn_ppp.h b/include/uapi/linux/isdn_ppp.h
deleted file mode 100644
index 0bdc4efaacb2..000000000000
--- a/include/uapi/linux/isdn_ppp.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* SPDX-License-Identifier: GPL-1.0+ WITH Linux-syscall-note */
-/* Linux ISDN subsystem, sync PPP, interface to ipppd
- *
- * Copyright 1994-1999  by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    Thinking Objects Software GmbH Wuerzburg
- * Copyright 1995,96    by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
- * Copyright 2000-2002  by Kai Germaschewski (kai@germaschewski.name)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef _UAPI_LINUX_ISDN_PPP_H
-#define _UAPI_LINUX_ISDN_PPP_H
-
-#define CALLTYPE_INCOMING 0x1
-#define CALLTYPE_OUTGOING 0x2
-#define CALLTYPE_CALLBACK 0x4
-
-#define IPPP_VERSION    "2.2.0"
-
-struct pppcallinfo
-{
-  int calltype;
-  unsigned char local_num[64];
-  unsigned char remote_num[64];
-  int charge_units;
-};
-
-#define PPPIOCGCALLINFO _IOWR('t',128,struct pppcallinfo)
-#define PPPIOCBUNDLE   _IOW('t',129,int)
-#define PPPIOCGMPFLAGS _IOR('t',130,int)
-#define PPPIOCSMPFLAGS _IOW('t',131,int)
-#define PPPIOCSMPMTU   _IOW('t',132,int)
-#define PPPIOCSMPMRU   _IOW('t',133,int)
-#define PPPIOCGCOMPRESSORS _IOR('t',134,unsigned long [8])
-#define PPPIOCSCOMPRESSOR _IOW('t',135,int)
-#define PPPIOCGIFNAME      _IOR('t',136, char [IFNAMSIZ] )
-
-
-#define SC_MP_PROT       0x00000200
-#define SC_REJ_MP_PROT   0x00000400
-#define SC_OUT_SHORT_SEQ 0x00000800
-#define SC_IN_SHORT_SEQ  0x00004000
-
-#define SC_DECOMP_ON		0x01
-#define SC_COMP_ON		0x02
-#define SC_DECOMP_DISCARD	0x04
-#define SC_COMP_DISCARD		0x08
-#define SC_LINK_DECOMP_ON	0x10
-#define SC_LINK_COMP_ON		0x20
-#define SC_LINK_DECOMP_DISCARD	0x40
-#define SC_LINK_COMP_DISCARD	0x80
-
-#define ISDN_PPP_COMP_MAX_OPTIONS 16
-
-#define IPPP_COMP_FLAG_XMIT 0x1
-#define IPPP_COMP_FLAG_LINK 0x2
-
-struct isdn_ppp_comp_data {
-  int num;
-  unsigned char options[ISDN_PPP_COMP_MAX_OPTIONS];
-  int optlen;
-  int flags;
-};
-
-#endif /* _UAPI_LINUX_ISDN_PPP_H */
diff --git a/include/uapi/linux/isdnif.h b/include/uapi/linux/isdnif.h
deleted file mode 100644
index 611a69196738..000000000000
--- a/include/uapi/linux/isdnif.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-1.0+ WITH Linux-syscall-note */
-/* $Id: isdnif.h,v 1.43.2.2 2004/01/12 23:08:35 keil Exp $
- *
- * Linux ISDN subsystem
- * Definition of the interface between the subsystem and its low-level drivers.
- *
- * Copyright 1994,95,96 by Fritz Elfert (fritz@isdn4linux.de)
- * Copyright 1995,96    Thinking Objects Software GmbH Wuerzburg
- * 
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- *
- */
-
-#ifndef _UAPI__ISDNIF_H__
-#define _UAPI__ISDNIF_H__
-
-
-/*
- * Values for general protocol-selection
- */
-#define ISDN_PTYPE_UNKNOWN   0   /* Protocol undefined   */
-#define ISDN_PTYPE_1TR6      1   /* german 1TR6-protocol */
-#define ISDN_PTYPE_EURO      2   /* EDSS1-protocol       */
-#define ISDN_PTYPE_LEASED    3   /* for leased lines     */
-#define ISDN_PTYPE_NI1       4   /* US NI-1 protocol     */
-#define ISDN_PTYPE_MAX       7   /* Max. 8 Protocols     */
-
-/*
- * Values for Layer-2-protocol-selection
- */
-#define ISDN_PROTO_L2_X75I   0   /* X75/LAPB with I-Frames            */
-#define ISDN_PROTO_L2_X75UI  1   /* X75/LAPB with UI-Frames           */
-#define ISDN_PROTO_L2_X75BUI 2   /* X75/LAPB with UI-Frames           */
-#define ISDN_PROTO_L2_HDLC   3   /* HDLC                              */
-#define ISDN_PROTO_L2_TRANS  4   /* Transparent (Voice)               */
-#define ISDN_PROTO_L2_X25DTE 5   /* X25/LAPB DTE mode                 */
-#define ISDN_PROTO_L2_X25DCE 6   /* X25/LAPB DCE mode                 */
-#define ISDN_PROTO_L2_V11096 7   /* V.110 bitrate adaption 9600 Baud  */
-#define ISDN_PROTO_L2_V11019 8   /* V.110 bitrate adaption 19200 Baud */
-#define ISDN_PROTO_L2_V11038 9   /* V.110 bitrate adaption 38400 Baud */
-#define ISDN_PROTO_L2_MODEM  10  /* Analog Modem on Board */
-#define ISDN_PROTO_L2_FAX    11  /* Fax Group 2/3         */
-#define ISDN_PROTO_L2_HDLC_56K 12   /* HDLC 56k                          */
-#define ISDN_PROTO_L2_MAX    15  /* Max. 16 Protocols                 */
-
-/*
- * Values for Layer-3-protocol-selection
- */
-#define ISDN_PROTO_L3_TRANS	0	/* Transparent */
-#define ISDN_PROTO_L3_TRANSDSP	1	/* Transparent with DSP */
-#define ISDN_PROTO_L3_FCLASS2	2	/* Fax Group 2/3 CLASS 2 */
-#define ISDN_PROTO_L3_FCLASS1	3	/* Fax Group 2/3 CLASS 1 */
-#define ISDN_PROTO_L3_MAX	7	/* Max. 8 Protocols */
-
-
-#endif /* _UAPI__ISDNIF_H__ */
diff --git a/include/uapi/linux/wanrouter.h b/include/uapi/linux/wanrouter.h
deleted file mode 100644
index 2f1216d00caa..000000000000
--- a/include/uapi/linux/wanrouter.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * wanrouter.h	Legacy declarations kept around until X25 is removed
- */
-
-#ifndef _UAPI_ROUTER_H
-#define _UAPI_ROUTER_H
-
-/* 'state' defines */
-enum wan_states
-{
-	WAN_UNCONFIGURED,	/* link/channel is not configured */
-	WAN_DISCONNECTED,	/* link/channel is disconnected */
-	WAN_CONNECTING,		/* connection is in progress */
-	WAN_CONNECTED		/* link/channel is operational */
-};
-
-#endif /* _UAPI_ROUTER_H */
-- 
cgit v1.2.3


From 99c2aa151a7182c58f9477a376304c538d9cc5ab Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 18 Apr 2019 22:57:08 +0200
Subject: isdn: hdlc: move into mISDN

The last remnant of the isdn4linux interface is now the isdnhdlc
support, used by the netjet driver. Move it next to that driver.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/isdn/Makefile                  |   1 -
 drivers/isdn/hardware/mISDN/Kconfig    |   7 +-
 drivers/isdn/hardware/mISDN/Makefile   |   2 +
 drivers/isdn/hardware/mISDN/isdnhdlc.c | 630 +++++++++++++++++++++++++++++++++
 drivers/isdn/hardware/mISDN/isdnhdlc.h |  82 +++++
 drivers/isdn/hardware/mISDN/netjet.c   |   2 +-
 drivers/isdn/i4l/Makefile              |   6 -
 drivers/isdn/i4l/isdnhdlc.c            | 630 ---------------------------------
 include/linux/isdn/hdlc.h              |  82 -----
 9 files changed, 720 insertions(+), 722 deletions(-)
 create mode 100644 drivers/isdn/hardware/mISDN/isdnhdlc.c
 create mode 100644 drivers/isdn/hardware/mISDN/isdnhdlc.h
 delete mode 100644 drivers/isdn/i4l/Makefile
 delete mode 100644 drivers/isdn/i4l/isdnhdlc.c
 delete mode 100644 include/linux/isdn/hdlc.h

(limited to 'include')

diff --git a/drivers/isdn/Makefile b/drivers/isdn/Makefile
index 379b4a03c321..f2a529c5a511 100644
--- a/drivers/isdn/Makefile
+++ b/drivers/isdn/Makefile
@@ -3,7 +3,6 @@
 
 # Object files in subdirectories
 
-obj-$(CONFIG_ISDN_I4L)			+= i4l/
 obj-$(CONFIG_ISDN_CAPI)			+= capi/
 obj-$(CONFIG_MISDN)			+= mISDN/
 obj-$(CONFIG_ISDN)			+= hardware/
diff --git a/drivers/isdn/hardware/mISDN/Kconfig b/drivers/isdn/hardware/mISDN/Kconfig
index a7a34a85b970..304f50c08da2 100644
--- a/drivers/isdn/hardware/mISDN/Kconfig
+++ b/drivers/isdn/hardware/mISDN/Kconfig
@@ -79,11 +79,14 @@ config MISDN_NETJET
 	depends on PCI
 	depends on TTY
 	select MISDN_IPAC
-	select ISDN_HDLC
-	select ISDN_I4L
+	select MISDN_HDLC
 	help
 	  Enable support for Traverse Technologies NETJet PCI cards.
 
+config MISDN_HDLC
+	tristate
+	select CRC_CCITT
+	select BITREVERSE
 
 config MISDN_IPAC
 	tristate
diff --git a/drivers/isdn/hardware/mISDN/Makefile b/drivers/isdn/hardware/mISDN/Makefile
index 422f9fd8ab9a..3f50f8c4753f 100644
--- a/drivers/isdn/hardware/mISDN/Makefile
+++ b/drivers/isdn/hardware/mISDN/Makefile
@@ -15,3 +15,5 @@ obj-$(CONFIG_MISDN_NETJET) += netjet.o
 # chip modules
 obj-$(CONFIG_MISDN_IPAC) += mISDNipac.o
 obj-$(CONFIG_MISDN_ISAR) += mISDNisar.o
+
+obj-$(CONFIG_MISDN_HDLC) += isdnhdlc.o
diff --git a/drivers/isdn/hardware/mISDN/isdnhdlc.c b/drivers/isdn/hardware/mISDN/isdnhdlc.c
new file mode 100644
index 000000000000..3a8b562e63b1
--- /dev/null
+++ b/drivers/isdn/hardware/mISDN/isdnhdlc.c
@@ -0,0 +1,630 @@
+/*
+ * isdnhdlc.c  --  General purpose ISDN HDLC decoder.
+ *
+ * Copyright (C)
+ *	2009	Karsten Keil		<keil@b1-systems.de>
+ *	2002	Wolfgang Mües		<wolfgang@iksw-muees.de>
+ *	2001	Frode Isaksen		<fisaksen@bewan.com>
+ *      2001	Kai Germaschewski	<kai.germaschewski@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/crc-ccitt.h>
+#include <linux/bitrev.h>
+#include "isdnhdlc.h"
+
+/*-------------------------------------------------------------------*/
+
+MODULE_AUTHOR("Wolfgang Mües <wolfgang@iksw-muees.de>, "
+	      "Frode Isaksen <fisaksen@bewan.com>, "
+	      "Kai Germaschewski <kai.germaschewski@gmx.de>");
+MODULE_DESCRIPTION("General purpose ISDN HDLC decoder");
+MODULE_LICENSE("GPL");
+
+/*-------------------------------------------------------------------*/
+
+enum {
+	HDLC_FAST_IDLE, HDLC_GET_FLAG_B0, HDLC_GETFLAG_B1A6, HDLC_GETFLAG_B7,
+	HDLC_GET_DATA, HDLC_FAST_FLAG
+};
+
+enum {
+	HDLC_SEND_DATA, HDLC_SEND_CRC1, HDLC_SEND_FAST_FLAG,
+	HDLC_SEND_FIRST_FLAG, HDLC_SEND_CRC2, HDLC_SEND_CLOSING_FLAG,
+	HDLC_SEND_IDLE1, HDLC_SEND_FAST_IDLE, HDLC_SENDFLAG_B0,
+	HDLC_SENDFLAG_B1A6, HDLC_SENDFLAG_B7, STOPPED, HDLC_SENDFLAG_ONE
+};
+
+void isdnhdlc_rcv_init(struct isdnhdlc_vars *hdlc, u32 features)
+{
+	memset(hdlc, 0, sizeof(struct isdnhdlc_vars));
+	hdlc->state = HDLC_GET_DATA;
+	if (features & HDLC_56KBIT)
+		hdlc->do_adapt56 = 1;
+	if (features & HDLC_BITREVERSE)
+		hdlc->do_bitreverse = 1;
+}
+EXPORT_SYMBOL(isdnhdlc_rcv_init);
+
+void isdnhdlc_out_init(struct isdnhdlc_vars *hdlc, u32 features)
+{
+	memset(hdlc, 0, sizeof(struct isdnhdlc_vars));
+	if (features & HDLC_DCHANNEL) {
+		hdlc->dchannel = 1;
+		hdlc->state = HDLC_SEND_FIRST_FLAG;
+	} else {
+		hdlc->dchannel = 0;
+		hdlc->state = HDLC_SEND_FAST_FLAG;
+		hdlc->ffvalue = 0x7e;	/* flag 01111110 */
+	}
+	hdlc->cbin = 0x7e;	/* flag 01111110 */
+	if (features & HDLC_56KBIT) {
+		hdlc->do_adapt56 = 1;
+		hdlc->state = HDLC_SENDFLAG_B0;
+	} else
+		hdlc->data_bits = 8;
+	if (features & HDLC_BITREVERSE)
+		hdlc->do_bitreverse = 1;
+}
+EXPORT_SYMBOL(isdnhdlc_out_init);
+
+static int
+check_frame(struct isdnhdlc_vars *hdlc)
+{
+	int status;
+
+	if (hdlc->dstpos < 2)	/* too small - framing error */
+		status = -HDLC_FRAMING_ERROR;
+	else if (hdlc->crc != 0xf0b8)	/* crc error */
+		status = -HDLC_CRC_ERROR;
+	else {
+		/* remove CRC */
+		hdlc->dstpos -= 2;
+		/* good frame */
+		status = hdlc->dstpos;
+	}
+	return status;
+}
+
+/*
+  isdnhdlc_decode - decodes HDLC frames from a transparent bit stream.
+
+  The source buffer is scanned for valid HDLC frames looking for
+  flags (01111110) to indicate the start of a frame. If the start of
+  the frame is found, the bit stuffing is removed (0 after 5 1's).
+  When a new flag is found, the complete frame has been received
+  and the CRC is checked.
+  If a valid frame is found, the function returns the frame length
+  excluding the CRC.
+  If the source buffer is exhausted before a frame is complete, the
+  function returns 0.
+  If a framing error is found (too many 1s and not a flag) the function
+  returns -HDLC_FRAMING_ERROR.
+  If a CRC error is found the function returns
+  -HDLC_CRC_ERROR.
+  If the frame length exceeds the destination buffer size, the function
+  discards the frame and reports -HDLC_LENGTH_ERROR.
+
+  src - source buffer
+  slen - source buffer length
+  count - number of bytes removed (decoded) from the source buffer
+  dst - destination buffer
+  dsize - destination buffer size
+  returns - number of decoded bytes in the destination buffer and status
+  flag.
+*/
+int isdnhdlc_decode(struct isdnhdlc_vars *hdlc, const u8 *src, int slen,
+		    int *count, u8 *dst, int dsize)
+{
+	int status = 0;
+
+	static const unsigned char fast_flag[] = {
+		0x00, 0x00, 0x00, 0x20, 0x30, 0x38, 0x3c, 0x3e, 0x3f
+	};
+
+	static const unsigned char fast_flag_value[] = {
+		0x00, 0x7e, 0xfc, 0xf9, 0xf3, 0xe7, 0xcf, 0x9f, 0x3f
+	};
+
+	static const unsigned char fast_abort[] = {
+		0x00, 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff
+	};
+
+#define handle_fast_flag(h)						\
+	do {								\
+		if (h->cbin == fast_flag[h->bit_shift]) {		\
+			h->ffvalue = fast_flag_value[h->bit_shift];	\
+			h->state = HDLC_FAST_FLAG;			\
+			h->ffbit_shift = h->bit_shift;			\
+			h->bit_shift = 1;				\
+		} else {						\
+			h->state = HDLC_GET_DATA;			\
+			h->data_received = 0;				\
+		}							\
+	} while (0)
+
+#define handle_abort(h)						\
+	do {							\
+		h->shift_reg = fast_abort[h->ffbit_shift - 1];	\
+		h->hdlc_bits1 = h->ffbit_shift - 2;		\
+		if (h->hdlc_bits1 < 0)				\
+			h->hdlc_bits1 = 0;			\
+		h->data_bits = h->ffbit_shift - 1;		\
+		h->state = HDLC_GET_DATA;			\
+		h->data_received = 0;				\
+	} while (0)
+
+	*count = slen;
+
+	while (slen > 0) {
+		if (hdlc->bit_shift == 0) {
+			/* the code is for bitreverse streams */
+			if (hdlc->do_bitreverse == 0)
+				hdlc->cbin = bitrev8(*src++);
+			else
+				hdlc->cbin = *src++;
+			slen--;
+			hdlc->bit_shift = 8;
+			if (hdlc->do_adapt56)
+				hdlc->bit_shift--;
+		}
+
+		switch (hdlc->state) {
+		case STOPPED:
+			return 0;
+		case HDLC_FAST_IDLE:
+			if (hdlc->cbin == 0xff) {
+				hdlc->bit_shift = 0;
+				break;
+			}
+			hdlc->state = HDLC_GET_FLAG_B0;
+			hdlc->hdlc_bits1 = 0;
+			hdlc->bit_shift = 8;
+			break;
+		case HDLC_GET_FLAG_B0:
+			if (!(hdlc->cbin & 0x80)) {
+				hdlc->state = HDLC_GETFLAG_B1A6;
+				hdlc->hdlc_bits1 = 0;
+			} else {
+				if ((!hdlc->do_adapt56) &&
+				    (++hdlc->hdlc_bits1 >= 8) &&
+				    (hdlc->bit_shift == 1))
+					hdlc->state = HDLC_FAST_IDLE;
+			}
+			hdlc->cbin <<= 1;
+			hdlc->bit_shift--;
+			break;
+		case HDLC_GETFLAG_B1A6:
+			if (hdlc->cbin & 0x80) {
+				hdlc->hdlc_bits1++;
+				if (hdlc->hdlc_bits1 == 6)
+					hdlc->state = HDLC_GETFLAG_B7;
+			} else
+				hdlc->hdlc_bits1 = 0;
+			hdlc->cbin <<= 1;
+			hdlc->bit_shift--;
+			break;
+		case HDLC_GETFLAG_B7:
+			if (hdlc->cbin & 0x80) {
+				hdlc->state = HDLC_GET_FLAG_B0;
+			} else {
+				hdlc->state = HDLC_GET_DATA;
+				hdlc->crc = 0xffff;
+				hdlc->shift_reg = 0;
+				hdlc->hdlc_bits1 = 0;
+				hdlc->data_bits = 0;
+				hdlc->data_received = 0;
+			}
+			hdlc->cbin <<= 1;
+			hdlc->bit_shift--;
+			break;
+		case HDLC_GET_DATA:
+			if (hdlc->cbin & 0x80) {
+				hdlc->hdlc_bits1++;
+				switch (hdlc->hdlc_bits1) {
+				case 6:
+					break;
+				case 7:
+					if (hdlc->data_received)
+						/* bad frame */
+						status = -HDLC_FRAMING_ERROR;
+					if (!hdlc->do_adapt56) {
+						if (hdlc->cbin == fast_abort
+						    [hdlc->bit_shift + 1]) {
+							hdlc->state =
+								HDLC_FAST_IDLE;
+							hdlc->bit_shift = 1;
+							break;
+						}
+					} else
+						hdlc->state = HDLC_GET_FLAG_B0;
+					break;
+				default:
+					hdlc->shift_reg >>= 1;
+					hdlc->shift_reg |= 0x80;
+					hdlc->data_bits++;
+					break;
+				}
+			} else {
+				switch (hdlc->hdlc_bits1) {
+				case 5:
+					break;
+				case 6:
+					if (hdlc->data_received)
+						status = check_frame(hdlc);
+					hdlc->crc = 0xffff;
+					hdlc->shift_reg = 0;
+					hdlc->data_bits = 0;
+					if (!hdlc->do_adapt56)
+						handle_fast_flag(hdlc);
+					else {
+						hdlc->state = HDLC_GET_DATA;
+						hdlc->data_received = 0;
+					}
+					break;
+				default:
+					hdlc->shift_reg >>= 1;
+					hdlc->data_bits++;
+					break;
+				}
+				hdlc->hdlc_bits1 = 0;
+			}
+			if (status) {
+				hdlc->dstpos = 0;
+				*count -= slen;
+				hdlc->cbin <<= 1;
+				hdlc->bit_shift--;
+				return status;
+			}
+			if (hdlc->data_bits == 8) {
+				hdlc->data_bits = 0;
+				hdlc->data_received = 1;
+				hdlc->crc = crc_ccitt_byte(hdlc->crc,
+							   hdlc->shift_reg);
+
+				/* good byte received */
+				if (hdlc->dstpos < dsize)
+					dst[hdlc->dstpos++] = hdlc->shift_reg;
+				else {
+					/* frame too long */
+					status = -HDLC_LENGTH_ERROR;
+					hdlc->dstpos = 0;
+				}
+			}
+			hdlc->cbin <<= 1;
+			hdlc->bit_shift--;
+			break;
+		case HDLC_FAST_FLAG:
+			if (hdlc->cbin == hdlc->ffvalue) {
+				hdlc->bit_shift = 0;
+				break;
+			} else {
+				if (hdlc->cbin == 0xff) {
+					hdlc->state = HDLC_FAST_IDLE;
+					hdlc->bit_shift = 0;
+				} else if (hdlc->ffbit_shift == 8) {
+					hdlc->state = HDLC_GETFLAG_B7;
+					break;
+				} else
+					handle_abort(hdlc);
+			}
+			break;
+		default:
+			break;
+		}
+	}
+	*count -= slen;
+	return 0;
+}
+EXPORT_SYMBOL(isdnhdlc_decode);
+/*
+  isdnhdlc_encode - encodes HDLC frames to a transparent bit stream.
+
+  The bit stream starts with a beginning flag (01111110). After
+  that each byte is added to the bit stream with bit stuffing added
+  (0 after 5 1's).
+  When the last byte has been removed from the source buffer, the
+  CRC (2 bytes) is added and the frame terminates with the ending flag.
+  For the dchannel, the idle character (all 1's) is also added at the end.
+  If this function is called with empty source buffer (slen=0), flags or
+  idle character will be generated.
+
+  src - source buffer
+  slen - source buffer length
+  count - number of bytes removed (encoded) from source buffer
+  dst - destination buffer
+  dsize - destination buffer size
+  returns - number of encoded bytes in the destination buffer
+*/
+int isdnhdlc_encode(struct isdnhdlc_vars *hdlc, const u8 *src, u16 slen,
+		    int *count, u8 *dst, int dsize)
+{
+	static const unsigned char xfast_flag_value[] = {
+		0x7e, 0x3f, 0x9f, 0xcf, 0xe7, 0xf3, 0xf9, 0xfc, 0x7e
+	};
+
+	int len = 0;
+
+	*count = slen;
+
+	/* special handling for one byte frames */
+	if ((slen == 1) && (hdlc->state == HDLC_SEND_FAST_FLAG))
+		hdlc->state = HDLC_SENDFLAG_ONE;
+	while (dsize > 0) {
+		if (hdlc->bit_shift == 0) {
+			if (slen && !hdlc->do_closing) {
+				hdlc->shift_reg = *src++;
+				slen--;
+				if (slen == 0)
+					/* closing sequence, CRC + flag(s) */
+					hdlc->do_closing = 1;
+				hdlc->bit_shift = 8;
+			} else {
+				if (hdlc->state == HDLC_SEND_DATA) {
+					if (hdlc->data_received) {
+						hdlc->state = HDLC_SEND_CRC1;
+						hdlc->crc ^= 0xffff;
+						hdlc->bit_shift = 8;
+						hdlc->shift_reg =
+							hdlc->crc & 0xff;
+					} else if (!hdlc->do_adapt56)
+						hdlc->state =
+							HDLC_SEND_FAST_FLAG;
+					else
+						hdlc->state =
+							HDLC_SENDFLAG_B0;
+				}
+
+			}
+		}
+
+		switch (hdlc->state) {
+		case STOPPED:
+			while (dsize--)
+				*dst++ = 0xff;
+			return dsize;
+		case HDLC_SEND_FAST_FLAG:
+			hdlc->do_closing = 0;
+			if (slen == 0) {
+				/* the code is for bitreverse streams */
+				if (hdlc->do_bitreverse == 0)
+					*dst++ = bitrev8(hdlc->ffvalue);
+				else
+					*dst++ = hdlc->ffvalue;
+				len++;
+				dsize--;
+				break;
+			}
+			/* fall through */
+		case HDLC_SENDFLAG_ONE:
+			if (hdlc->bit_shift == 8) {
+				hdlc->cbin = hdlc->ffvalue >>
+					(8 - hdlc->data_bits);
+				hdlc->state = HDLC_SEND_DATA;
+				hdlc->crc = 0xffff;
+				hdlc->hdlc_bits1 = 0;
+				hdlc->data_received = 1;
+			}
+			break;
+		case HDLC_SENDFLAG_B0:
+			hdlc->do_closing = 0;
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			hdlc->hdlc_bits1 = 0;
+			hdlc->state = HDLC_SENDFLAG_B1A6;
+			break;
+		case HDLC_SENDFLAG_B1A6:
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			hdlc->cbin++;
+			if (++hdlc->hdlc_bits1 == 6)
+				hdlc->state = HDLC_SENDFLAG_B7;
+			break;
+		case HDLC_SENDFLAG_B7:
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			if (slen == 0) {
+				hdlc->state = HDLC_SENDFLAG_B0;
+				break;
+			}
+			if (hdlc->bit_shift == 8) {
+				hdlc->state = HDLC_SEND_DATA;
+				hdlc->crc = 0xffff;
+				hdlc->hdlc_bits1 = 0;
+				hdlc->data_received = 1;
+			}
+			break;
+		case HDLC_SEND_FIRST_FLAG:
+			hdlc->data_received = 1;
+			if (hdlc->data_bits == 8) {
+				hdlc->state = HDLC_SEND_DATA;
+				hdlc->crc = 0xffff;
+				hdlc->hdlc_bits1 = 0;
+				break;
+			}
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			if (hdlc->shift_reg & 0x01)
+				hdlc->cbin++;
+			hdlc->shift_reg >>= 1;
+			hdlc->bit_shift--;
+			if (hdlc->bit_shift == 0) {
+				hdlc->state = HDLC_SEND_DATA;
+				hdlc->crc = 0xffff;
+				hdlc->hdlc_bits1 = 0;
+			}
+			break;
+		case HDLC_SEND_DATA:
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			if (hdlc->hdlc_bits1 == 5) {
+				hdlc->hdlc_bits1 = 0;
+				break;
+			}
+			if (hdlc->bit_shift == 8)
+				hdlc->crc = crc_ccitt_byte(hdlc->crc,
+							   hdlc->shift_reg);
+			if (hdlc->shift_reg & 0x01) {
+				hdlc->hdlc_bits1++;
+				hdlc->cbin++;
+				hdlc->shift_reg >>= 1;
+				hdlc->bit_shift--;
+			} else {
+				hdlc->hdlc_bits1 = 0;
+				hdlc->shift_reg >>= 1;
+				hdlc->bit_shift--;
+			}
+			break;
+		case HDLC_SEND_CRC1:
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			if (hdlc->hdlc_bits1 == 5) {
+				hdlc->hdlc_bits1 = 0;
+				break;
+			}
+			if (hdlc->shift_reg & 0x01) {
+				hdlc->hdlc_bits1++;
+				hdlc->cbin++;
+				hdlc->shift_reg >>= 1;
+				hdlc->bit_shift--;
+			} else {
+				hdlc->hdlc_bits1 = 0;
+				hdlc->shift_reg >>= 1;
+				hdlc->bit_shift--;
+			}
+			if (hdlc->bit_shift == 0) {
+				hdlc->shift_reg = (hdlc->crc >> 8);
+				hdlc->state = HDLC_SEND_CRC2;
+				hdlc->bit_shift = 8;
+			}
+			break;
+		case HDLC_SEND_CRC2:
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			if (hdlc->hdlc_bits1 == 5) {
+				hdlc->hdlc_bits1 = 0;
+				break;
+			}
+			if (hdlc->shift_reg & 0x01) {
+				hdlc->hdlc_bits1++;
+				hdlc->cbin++;
+				hdlc->shift_reg >>= 1;
+				hdlc->bit_shift--;
+			} else {
+				hdlc->hdlc_bits1 = 0;
+				hdlc->shift_reg >>= 1;
+				hdlc->bit_shift--;
+			}
+			if (hdlc->bit_shift == 0) {
+				hdlc->shift_reg = 0x7e;
+				hdlc->state = HDLC_SEND_CLOSING_FLAG;
+				hdlc->bit_shift = 8;
+			}
+			break;
+		case HDLC_SEND_CLOSING_FLAG:
+			hdlc->cbin <<= 1;
+			hdlc->data_bits++;
+			if (hdlc->hdlc_bits1 == 5) {
+				hdlc->hdlc_bits1 = 0;
+				break;
+			}
+			if (hdlc->shift_reg & 0x01)
+				hdlc->cbin++;
+			hdlc->shift_reg >>= 1;
+			hdlc->bit_shift--;
+			if (hdlc->bit_shift == 0) {
+				hdlc->ffvalue =
+					xfast_flag_value[hdlc->data_bits];
+				if (hdlc->dchannel) {
+					hdlc->ffvalue = 0x7e;
+					hdlc->state = HDLC_SEND_IDLE1;
+					hdlc->bit_shift = 8-hdlc->data_bits;
+					if (hdlc->bit_shift == 0)
+						hdlc->state =
+							HDLC_SEND_FAST_IDLE;
+				} else {
+					if (!hdlc->do_adapt56) {
+						hdlc->state =
+							HDLC_SEND_FAST_FLAG;
+						hdlc->data_received = 0;
+					} else {
+						hdlc->state = HDLC_SENDFLAG_B0;
+						hdlc->data_received = 0;
+					}
+					/* Finished this frame, send flags */
+					if (dsize > 1)
+						dsize = 1;
+				}
+			}
+			break;
+		case HDLC_SEND_IDLE1:
+			hdlc->do_closing = 0;
+			hdlc->cbin <<= 1;
+			hdlc->cbin++;
+			hdlc->data_bits++;
+			hdlc->bit_shift--;
+			if (hdlc->bit_shift == 0) {
+				hdlc->state = HDLC_SEND_FAST_IDLE;
+				hdlc->bit_shift = 0;
+			}
+			break;
+		case HDLC_SEND_FAST_IDLE:
+			hdlc->do_closing = 0;
+			hdlc->cbin = 0xff;
+			hdlc->data_bits = 8;
+			if (hdlc->bit_shift == 8) {
+				hdlc->cbin = 0x7e;
+				hdlc->state = HDLC_SEND_FIRST_FLAG;
+			} else {
+				/* the code is for bitreverse streams */
+				if (hdlc->do_bitreverse == 0)
+					*dst++ = bitrev8(hdlc->cbin);
+				else
+					*dst++ = hdlc->cbin;
+				hdlc->bit_shift = 0;
+				hdlc->data_bits = 0;
+				len++;
+				dsize = 0;
+			}
+			break;
+		default:
+			break;
+		}
+		if (hdlc->do_adapt56) {
+			if (hdlc->data_bits == 7) {
+				hdlc->cbin <<= 1;
+				hdlc->cbin++;
+				hdlc->data_bits++;
+			}
+		}
+		if (hdlc->data_bits == 8) {
+			/* the code is for bitreverse streams */
+			if (hdlc->do_bitreverse == 0)
+				*dst++ = bitrev8(hdlc->cbin);
+			else
+				*dst++ = hdlc->cbin;
+			hdlc->data_bits = 0;
+			len++;
+			dsize--;
+		}
+	}
+	*count -= slen;
+
+	return len;
+}
+EXPORT_SYMBOL(isdnhdlc_encode);
diff --git a/drivers/isdn/hardware/mISDN/isdnhdlc.h b/drivers/isdn/hardware/mISDN/isdnhdlc.h
new file mode 100644
index 000000000000..96521370c782
--- /dev/null
+++ b/drivers/isdn/hardware/mISDN/isdnhdlc.h
@@ -0,0 +1,82 @@
+/*
+ * isdnhdlc.h  --  General purpose ISDN HDLC decoder.
+ *
+ * Implementation of a HDLC decoder/encoder in software.
+ * Necessary because some ISDN devices don't have HDLC
+ * controllers.
+ *
+ * Copyright (C)
+ *	2009	Karsten Keil		<keil@b1-systems.de>
+ *	2002	Wolfgang Mües		<wolfgang@iksw-muees.de>
+ *	2001	Frode Isaksen		<fisaksen@bewan.com>
+ *	2001	Kai Germaschewski	<kai.germaschewski@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __ISDNHDLC_H__
+#define __ISDNHDLC_H__
+
+struct isdnhdlc_vars {
+	int bit_shift;
+	int hdlc_bits1;
+	int data_bits;
+	int ffbit_shift;	/* decoding only */
+	int state;
+	int dstpos;
+
+	u16 crc;
+
+	u8 cbin;
+	u8 shift_reg;
+	u8 ffvalue;
+
+	/* set if transferring data */
+	u32 data_received:1;
+	/* set if D channel (send idle instead of flags) */
+	u32 dchannel:1;
+	/* set if 56K adaptation */
+	u32 do_adapt56:1;
+	/* set if in closing phase (need to send CRC + flag) */
+	u32 do_closing:1;
+	/* set if data is bitreverse */
+	u32 do_bitreverse:1;
+};
+
+/* Feature Flags */
+#define HDLC_56KBIT	0x01
+#define HDLC_DCHANNEL	0x02
+#define HDLC_BITREVERSE	0x04
+
+/*
+  The return value from isdnhdlc_decode is
+  the frame length, 0 if no complete frame was decoded,
+  or a negative error number
+*/
+#define HDLC_FRAMING_ERROR     1
+#define HDLC_CRC_ERROR         2
+#define HDLC_LENGTH_ERROR      3
+
+extern void	isdnhdlc_rcv_init(struct isdnhdlc_vars *hdlc, u32 features);
+
+extern int	isdnhdlc_decode(struct isdnhdlc_vars *hdlc, const u8 *src,
+			int slen, int *count, u8 *dst, int dsize);
+
+extern void	isdnhdlc_out_init(struct isdnhdlc_vars *hdlc, u32 features);
+
+extern int	isdnhdlc_encode(struct isdnhdlc_vars *hdlc, const u8 *src,
+			u16 slen, int *count, u8 *dst, int dsize);
+
+#endif /* __ISDNHDLC_H__ */
diff --git a/drivers/isdn/hardware/mISDN/netjet.c b/drivers/isdn/hardware/mISDN/netjet.c
index 2b317cb63d06..93a2d361eda5 100644
--- a/drivers/isdn/hardware/mISDN/netjet.c
+++ b/drivers/isdn/hardware/mISDN/netjet.c
@@ -29,7 +29,7 @@
 #include "ipac.h"
 #include "iohelper.h"
 #include "netjet.h"
-#include <linux/isdn/hdlc.h>
+#include "isdnhdlc.h"
 
 #define NETJET_REV	"2.0"
 
diff --git a/drivers/isdn/i4l/Makefile b/drivers/isdn/i4l/Makefile
deleted file mode 100644
index 11fe697739d5..000000000000
--- a/drivers/isdn/i4l/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Makefile for the kernel ISDN subsystem and device drivers.
-
-# Each configuration option enables a list of files.
-
-obj-$(CONFIG_ISDN_HDLC)		+= isdnhdlc.o
diff --git a/drivers/isdn/i4l/isdnhdlc.c b/drivers/isdn/i4l/isdnhdlc.c
deleted file mode 100644
index 027d1c590679..000000000000
--- a/drivers/isdn/i4l/isdnhdlc.c
+++ /dev/null
@@ -1,630 +0,0 @@
-/*
- * isdnhdlc.c  --  General purpose ISDN HDLC decoder.
- *
- * Copyright (C)
- *	2009	Karsten Keil		<keil@b1-systems.de>
- *	2002	Wolfgang Mües		<wolfgang@iksw-muees.de>
- *	2001	Frode Isaksen		<fisaksen@bewan.com>
- *      2001	Kai Germaschewski	<kai.germaschewski@gmx.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/crc-ccitt.h>
-#include <linux/isdn/hdlc.h>
-#include <linux/bitrev.h>
-
-/*-------------------------------------------------------------------*/
-
-MODULE_AUTHOR("Wolfgang Mües <wolfgang@iksw-muees.de>, "
-	      "Frode Isaksen <fisaksen@bewan.com>, "
-	      "Kai Germaschewski <kai.germaschewski@gmx.de>");
-MODULE_DESCRIPTION("General purpose ISDN HDLC decoder");
-MODULE_LICENSE("GPL");
-
-/*-------------------------------------------------------------------*/
-
-enum {
-	HDLC_FAST_IDLE, HDLC_GET_FLAG_B0, HDLC_GETFLAG_B1A6, HDLC_GETFLAG_B7,
-	HDLC_GET_DATA, HDLC_FAST_FLAG
-};
-
-enum {
-	HDLC_SEND_DATA, HDLC_SEND_CRC1, HDLC_SEND_FAST_FLAG,
-	HDLC_SEND_FIRST_FLAG, HDLC_SEND_CRC2, HDLC_SEND_CLOSING_FLAG,
-	HDLC_SEND_IDLE1, HDLC_SEND_FAST_IDLE, HDLC_SENDFLAG_B0,
-	HDLC_SENDFLAG_B1A6, HDLC_SENDFLAG_B7, STOPPED, HDLC_SENDFLAG_ONE
-};
-
-void isdnhdlc_rcv_init(struct isdnhdlc_vars *hdlc, u32 features)
-{
-	memset(hdlc, 0, sizeof(struct isdnhdlc_vars));
-	hdlc->state = HDLC_GET_DATA;
-	if (features & HDLC_56KBIT)
-		hdlc->do_adapt56 = 1;
-	if (features & HDLC_BITREVERSE)
-		hdlc->do_bitreverse = 1;
-}
-EXPORT_SYMBOL(isdnhdlc_out_init);
-
-void isdnhdlc_out_init(struct isdnhdlc_vars *hdlc, u32 features)
-{
-	memset(hdlc, 0, sizeof(struct isdnhdlc_vars));
-	if (features & HDLC_DCHANNEL) {
-		hdlc->dchannel = 1;
-		hdlc->state = HDLC_SEND_FIRST_FLAG;
-	} else {
-		hdlc->dchannel = 0;
-		hdlc->state = HDLC_SEND_FAST_FLAG;
-		hdlc->ffvalue = 0x7e;
-	}
-	hdlc->cbin = 0x7e;
-	if (features & HDLC_56KBIT) {
-		hdlc->do_adapt56 = 1;
-		hdlc->state = HDLC_SENDFLAG_B0;
-	} else
-		hdlc->data_bits = 8;
-	if (features & HDLC_BITREVERSE)
-		hdlc->do_bitreverse = 1;
-}
-EXPORT_SYMBOL(isdnhdlc_rcv_init);
-
-static int
-check_frame(struct isdnhdlc_vars *hdlc)
-{
-	int status;
-
-	if (hdlc->dstpos < 2)	/* too small - framing error */
-		status = -HDLC_FRAMING_ERROR;
-	else if (hdlc->crc != 0xf0b8)	/* crc error */
-		status = -HDLC_CRC_ERROR;
-	else {
-		/* remove CRC */
-		hdlc->dstpos -= 2;
-		/* good frame */
-		status = hdlc->dstpos;
-	}
-	return status;
-}
-
-/*
-  isdnhdlc_decode - decodes HDLC frames from a transparent bit stream.
-
-  The source buffer is scanned for valid HDLC frames looking for
-  flags (01111110) to indicate the start of a frame. If the start of
-  the frame is found, the bit stuffing is removed (0 after 5 1's).
-  When a new flag is found, the complete frame has been received
-  and the CRC is checked.
-  If a valid frame is found, the function returns the frame length
-  excluding the CRC with the bit HDLC_END_OF_FRAME set.
-  If the beginning of a valid frame is found, the function returns
-  the length.
-  If a framing error is found (too many 1s and not a flag) the function
-  returns the length with the bit HDLC_FRAMING_ERROR set.
-  If a CRC error is found the function returns the length with the
-  bit HDLC_CRC_ERROR set.
-  If the frame length exceeds the destination buffer size, the function
-  returns the length with the bit HDLC_LENGTH_ERROR set.
-
-  src - source buffer
-  slen - source buffer length
-  count - number of bytes removed (decoded) from the source buffer
-  dst _ destination buffer
-  dsize - destination buffer size
-  returns - number of decoded bytes in the destination buffer and status
-  flag.
-*/
-int isdnhdlc_decode(struct isdnhdlc_vars *hdlc, const u8 *src, int slen,
-		    int *count, u8 *dst, int dsize)
-{
-	int status = 0;
-
-	static const unsigned char fast_flag[] = {
-		0x00, 0x00, 0x00, 0x20, 0x30, 0x38, 0x3c, 0x3e, 0x3f
-	};
-
-	static const unsigned char fast_flag_value[] = {
-		0x00, 0x7e, 0xfc, 0xf9, 0xf3, 0xe7, 0xcf, 0x9f, 0x3f
-	};
-
-	static const unsigned char fast_abort[] = {
-		0x00, 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff
-	};
-
-#define handle_fast_flag(h)						\
-	do {								\
-		if (h->cbin == fast_flag[h->bit_shift]) {		\
-			h->ffvalue = fast_flag_value[h->bit_shift];	\
-			h->state = HDLC_FAST_FLAG;			\
-			h->ffbit_shift = h->bit_shift;			\
-			h->bit_shift = 1;				\
-		} else {						\
-			h->state = HDLC_GET_DATA;			\
-			h->data_received = 0;				\
-		}							\
-	} while (0)
-
-#define handle_abort(h)						\
-	do {							\
-		h->shift_reg = fast_abort[h->ffbit_shift - 1];	\
-		h->hdlc_bits1 = h->ffbit_shift - 2;		\
-		if (h->hdlc_bits1 < 0)				\
-			h->hdlc_bits1 = 0;			\
-		h->data_bits = h->ffbit_shift - 1;		\
-		h->state = HDLC_GET_DATA;			\
-		h->data_received = 0;				\
-	} while (0)
-
-	*count = slen;
-
-	while (slen > 0) {
-		if (hdlc->bit_shift == 0) {
-			/* the code is for bitreverse streams */
-			if (hdlc->do_bitreverse == 0)
-				hdlc->cbin = bitrev8(*src++);
-			else
-				hdlc->cbin = *src++;
-			slen--;
-			hdlc->bit_shift = 8;
-			if (hdlc->do_adapt56)
-				hdlc->bit_shift--;
-		}
-
-		switch (hdlc->state) {
-		case STOPPED:
-			return 0;
-		case HDLC_FAST_IDLE:
-			if (hdlc->cbin == 0xff) {
-				hdlc->bit_shift = 0;
-				break;
-			}
-			hdlc->state = HDLC_GET_FLAG_B0;
-			hdlc->hdlc_bits1 = 0;
-			hdlc->bit_shift = 8;
-			break;
-		case HDLC_GET_FLAG_B0:
-			if (!(hdlc->cbin & 0x80)) {
-				hdlc->state = HDLC_GETFLAG_B1A6;
-				hdlc->hdlc_bits1 = 0;
-			} else {
-				if ((!hdlc->do_adapt56) &&
-				    (++hdlc->hdlc_bits1 >= 8) &&
-				    (hdlc->bit_shift == 1))
-					hdlc->state = HDLC_FAST_IDLE;
-			}
-			hdlc->cbin <<= 1;
-			hdlc->bit_shift--;
-			break;
-		case HDLC_GETFLAG_B1A6:
-			if (hdlc->cbin & 0x80) {
-				hdlc->hdlc_bits1++;
-				if (hdlc->hdlc_bits1 == 6)
-					hdlc->state = HDLC_GETFLAG_B7;
-			} else
-				hdlc->hdlc_bits1 = 0;
-			hdlc->cbin <<= 1;
-			hdlc->bit_shift--;
-			break;
-		case HDLC_GETFLAG_B7:
-			if (hdlc->cbin & 0x80) {
-				hdlc->state = HDLC_GET_FLAG_B0;
-			} else {
-				hdlc->state = HDLC_GET_DATA;
-				hdlc->crc = 0xffff;
-				hdlc->shift_reg = 0;
-				hdlc->hdlc_bits1 = 0;
-				hdlc->data_bits = 0;
-				hdlc->data_received = 0;
-			}
-			hdlc->cbin <<= 1;
-			hdlc->bit_shift--;
-			break;
-		case HDLC_GET_DATA:
-			if (hdlc->cbin & 0x80) {
-				hdlc->hdlc_bits1++;
-				switch (hdlc->hdlc_bits1) {
-				case 6:
-					break;
-				case 7:
-					if (hdlc->data_received)
-						/* bad frame */
-						status = -HDLC_FRAMING_ERROR;
-					if (!hdlc->do_adapt56) {
-						if (hdlc->cbin == fast_abort
-						    [hdlc->bit_shift + 1]) {
-							hdlc->state =
-								HDLC_FAST_IDLE;
-							hdlc->bit_shift = 1;
-							break;
-						}
-					} else
-						hdlc->state = HDLC_GET_FLAG_B0;
-					break;
-				default:
-					hdlc->shift_reg >>= 1;
-					hdlc->shift_reg |= 0x80;
-					hdlc->data_bits++;
-					break;
-				}
-			} else {
-				switch (hdlc->hdlc_bits1) {
-				case 5:
-					break;
-				case 6:
-					if (hdlc->data_received)
-						status = check_frame(hdlc);
-					hdlc->crc = 0xffff;
-					hdlc->shift_reg = 0;
-					hdlc->data_bits = 0;
-					if (!hdlc->do_adapt56)
-						handle_fast_flag(hdlc);
-					else {
-						hdlc->state = HDLC_GET_DATA;
-						hdlc->data_received = 0;
-					}
-					break;
-				default:
-					hdlc->shift_reg >>= 1;
-					hdlc->data_bits++;
-					break;
-				}
-				hdlc->hdlc_bits1 = 0;
-			}
-			if (status) {
-				hdlc->dstpos = 0;
-				*count -= slen;
-				hdlc->cbin <<= 1;
-				hdlc->bit_shift--;
-				return status;
-			}
-			if (hdlc->data_bits == 8) {
-				hdlc->data_bits = 0;
-				hdlc->data_received = 1;
-				hdlc->crc = crc_ccitt_byte(hdlc->crc,
-							   hdlc->shift_reg);
-
-				/* good byte received */
-				if (hdlc->dstpos < dsize)
-					dst[hdlc->dstpos++] = hdlc->shift_reg;
-				else {
-					/* frame too long */
-					status = -HDLC_LENGTH_ERROR;
-					hdlc->dstpos = 0;
-				}
-			}
-			hdlc->cbin <<= 1;
-			hdlc->bit_shift--;
-			break;
-		case HDLC_FAST_FLAG:
-			if (hdlc->cbin == hdlc->ffvalue) {
-				hdlc->bit_shift = 0;
-				break;
-			} else {
-				if (hdlc->cbin == 0xff) {
-					hdlc->state = HDLC_FAST_IDLE;
-					hdlc->bit_shift = 0;
-				} else if (hdlc->ffbit_shift == 8) {
-					hdlc->state = HDLC_GETFLAG_B7;
-					break;
-				} else
-					handle_abort(hdlc);
-			}
-			break;
-		default:
-			break;
-		}
-	}
-	*count -= slen;
-	return 0;
-}
-EXPORT_SYMBOL(isdnhdlc_decode);
-/*
-  isdnhdlc_encode - encodes HDLC frames to a transparent bit stream.
-
-  The bit stream starts with a beginning flag (01111110). After
-  that each byte is added to the bit stream with bit stuffing added
-  (0 after 5 1's).
-  When the last byte has been removed from the source buffer, the
-  CRC (2 bytes is added) and the frame terminates with the ending flag.
-  For the dchannel, the idle character (all 1's) is also added at the end.
-  If this function is called with empty source buffer (slen=0), flags or
-  idle character will be generated.
-
-  src - source buffer
-  slen - source buffer length
-  count - number of bytes removed (encoded) from source buffer
-  dst _ destination buffer
-  dsize - destination buffer size
-  returns - number of encoded bytes in the destination buffer
-*/
-int isdnhdlc_encode(struct isdnhdlc_vars *hdlc, const u8 *src, u16 slen,
-		    int *count, u8 *dst, int dsize)
-{
-	static const unsigned char xfast_flag_value[] = {
-		0x7e, 0x3f, 0x9f, 0xcf, 0xe7, 0xf3, 0xf9, 0xfc, 0x7e
-	};
-
-	int len = 0;
-
-	*count = slen;
-
-	/* special handling for one byte frames */
-	if ((slen == 1) && (hdlc->state == HDLC_SEND_FAST_FLAG))
-		hdlc->state = HDLC_SENDFLAG_ONE;
-	while (dsize > 0) {
-		if (hdlc->bit_shift == 0) {
-			if (slen && !hdlc->do_closing) {
-				hdlc->shift_reg = *src++;
-				slen--;
-				if (slen == 0)
-					/* closing sequence, CRC + flag(s) */
-					hdlc->do_closing = 1;
-				hdlc->bit_shift = 8;
-			} else {
-				if (hdlc->state == HDLC_SEND_DATA) {
-					if (hdlc->data_received) {
-						hdlc->state = HDLC_SEND_CRC1;
-						hdlc->crc ^= 0xffff;
-						hdlc->bit_shift = 8;
-						hdlc->shift_reg =
-							hdlc->crc & 0xff;
-					} else if (!hdlc->do_adapt56)
-						hdlc->state =
-							HDLC_SEND_FAST_FLAG;
-					else
-						hdlc->state =
-							HDLC_SENDFLAG_B0;
-				}
-
-			}
-		}
-
-		switch (hdlc->state) {
-		case STOPPED:
-			while (dsize--)
-				*dst++ = 0xff;
-			return dsize;
-		case HDLC_SEND_FAST_FLAG:
-			hdlc->do_closing = 0;
-			if (slen == 0) {
-				/* the code is for bitreverse streams */
-				if (hdlc->do_bitreverse == 0)
-					*dst++ = bitrev8(hdlc->ffvalue);
-				else
-					*dst++ = hdlc->ffvalue;
-				len++;
-				dsize--;
-				break;
-			}
-			/* fall through */
-		case HDLC_SENDFLAG_ONE:
-			if (hdlc->bit_shift == 8) {
-				hdlc->cbin = hdlc->ffvalue >>
-					(8 - hdlc->data_bits);
-				hdlc->state = HDLC_SEND_DATA;
-				hdlc->crc = 0xffff;
-				hdlc->hdlc_bits1 = 0;
-				hdlc->data_received = 1;
-			}
-			break;
-		case HDLC_SENDFLAG_B0:
-			hdlc->do_closing = 0;
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			hdlc->hdlc_bits1 = 0;
-			hdlc->state = HDLC_SENDFLAG_B1A6;
-			break;
-		case HDLC_SENDFLAG_B1A6:
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			hdlc->cbin++;
-			if (++hdlc->hdlc_bits1 == 6)
-				hdlc->state = HDLC_SENDFLAG_B7;
-			break;
-		case HDLC_SENDFLAG_B7:
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			if (slen == 0) {
-				hdlc->state = HDLC_SENDFLAG_B0;
-				break;
-			}
-			if (hdlc->bit_shift == 8) {
-				hdlc->state = HDLC_SEND_DATA;
-				hdlc->crc = 0xffff;
-				hdlc->hdlc_bits1 = 0;
-				hdlc->data_received = 1;
-			}
-			break;
-		case HDLC_SEND_FIRST_FLAG:
-			hdlc->data_received = 1;
-			if (hdlc->data_bits == 8) {
-				hdlc->state = HDLC_SEND_DATA;
-				hdlc->crc = 0xffff;
-				hdlc->hdlc_bits1 = 0;
-				break;
-			}
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			if (hdlc->shift_reg & 0x01)
-				hdlc->cbin++;
-			hdlc->shift_reg >>= 1;
-			hdlc->bit_shift--;
-			if (hdlc->bit_shift == 0) {
-				hdlc->state = HDLC_SEND_DATA;
-				hdlc->crc = 0xffff;
-				hdlc->hdlc_bits1 = 0;
-			}
-			break;
-		case HDLC_SEND_DATA:
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			if (hdlc->hdlc_bits1 == 5) {
-				hdlc->hdlc_bits1 = 0;
-				break;
-			}
-			if (hdlc->bit_shift == 8)
-				hdlc->crc = crc_ccitt_byte(hdlc->crc,
-							   hdlc->shift_reg);
-			if (hdlc->shift_reg & 0x01) {
-				hdlc->hdlc_bits1++;
-				hdlc->cbin++;
-				hdlc->shift_reg >>= 1;
-				hdlc->bit_shift--;
-			} else {
-				hdlc->hdlc_bits1 = 0;
-				hdlc->shift_reg >>= 1;
-				hdlc->bit_shift--;
-			}
-			break;
-		case HDLC_SEND_CRC1:
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			if (hdlc->hdlc_bits1 == 5) {
-				hdlc->hdlc_bits1 = 0;
-				break;
-			}
-			if (hdlc->shift_reg & 0x01) {
-				hdlc->hdlc_bits1++;
-				hdlc->cbin++;
-				hdlc->shift_reg >>= 1;
-				hdlc->bit_shift--;
-			} else {
-				hdlc->hdlc_bits1 = 0;
-				hdlc->shift_reg >>= 1;
-				hdlc->bit_shift--;
-			}
-			if (hdlc->bit_shift == 0) {
-				hdlc->shift_reg = (hdlc->crc >> 8);
-				hdlc->state = HDLC_SEND_CRC2;
-				hdlc->bit_shift = 8;
-			}
-			break;
-		case HDLC_SEND_CRC2:
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			if (hdlc->hdlc_bits1 == 5) {
-				hdlc->hdlc_bits1 = 0;
-				break;
-			}
-			if (hdlc->shift_reg & 0x01) {
-				hdlc->hdlc_bits1++;
-				hdlc->cbin++;
-				hdlc->shift_reg >>= 1;
-				hdlc->bit_shift--;
-			} else {
-				hdlc->hdlc_bits1 = 0;
-				hdlc->shift_reg >>= 1;
-				hdlc->bit_shift--;
-			}
-			if (hdlc->bit_shift == 0) {
-				hdlc->shift_reg = 0x7e;
-				hdlc->state = HDLC_SEND_CLOSING_FLAG;
-				hdlc->bit_shift = 8;
-			}
-			break;
-		case HDLC_SEND_CLOSING_FLAG:
-			hdlc->cbin <<= 1;
-			hdlc->data_bits++;
-			if (hdlc->hdlc_bits1 == 5) {
-				hdlc->hdlc_bits1 = 0;
-				break;
-			}
-			if (hdlc->shift_reg & 0x01)
-				hdlc->cbin++;
-			hdlc->shift_reg >>= 1;
-			hdlc->bit_shift--;
-			if (hdlc->bit_shift == 0) {
-				hdlc->ffvalue =
-					xfast_flag_value[hdlc->data_bits];
-				if (hdlc->dchannel) {
-					hdlc->ffvalue = 0x7e;
-					hdlc->state = HDLC_SEND_IDLE1;
-					hdlc->bit_shift = 8-hdlc->data_bits;
-					if (hdlc->bit_shift == 0)
-						hdlc->state =
-							HDLC_SEND_FAST_IDLE;
-				} else {
-					if (!hdlc->do_adapt56) {
-						hdlc->state =
-							HDLC_SEND_FAST_FLAG;
-						hdlc->data_received = 0;
-					} else {
-						hdlc->state = HDLC_SENDFLAG_B0;
-						hdlc->data_received = 0;
-					}
-					/* Finished this frame, send flags */
-					if (dsize > 1)
-						dsize = 1;
-				}
-			}
-			break;
-		case HDLC_SEND_IDLE1:
-			hdlc->do_closing = 0;
-			hdlc->cbin <<= 1;
-			hdlc->cbin++;
-			hdlc->data_bits++;
-			hdlc->bit_shift--;
-			if (hdlc->bit_shift == 0) {
-				hdlc->state = HDLC_SEND_FAST_IDLE;
-				hdlc->bit_shift = 0;
-			}
-			break;
-		case HDLC_SEND_FAST_IDLE:
-			hdlc->do_closing = 0;
-			hdlc->cbin = 0xff;
-			hdlc->data_bits = 8;
-			if (hdlc->bit_shift == 8) {
-				hdlc->cbin = 0x7e;
-				hdlc->state = HDLC_SEND_FIRST_FLAG;
-			} else {
-				/* the code is for bitreverse streams */
-				if (hdlc->do_bitreverse == 0)
-					*dst++ = bitrev8(hdlc->cbin);
-				else
-					*dst++ = hdlc->cbin;
-				hdlc->bit_shift = 0;
-				hdlc->data_bits = 0;
-				len++;
-				dsize = 0;
-			}
-			break;
-		default:
-			break;
-		}
-		if (hdlc->do_adapt56) {
-			if (hdlc->data_bits == 7) {
-				hdlc->cbin <<= 1;
-				hdlc->cbin++;
-				hdlc->data_bits++;
-			}
-		}
-		if (hdlc->data_bits == 8) {
-			/* the code is for bitreverse streams */
-			if (hdlc->do_bitreverse == 0)
-				*dst++ = bitrev8(hdlc->cbin);
-			else
-				*dst++ = hdlc->cbin;
-			hdlc->data_bits = 0;
-			len++;
-			dsize--;
-		}
-	}
-	*count -= slen;
-
-	return len;
-}
-EXPORT_SYMBOL(isdnhdlc_encode);
diff --git a/include/linux/isdn/hdlc.h b/include/linux/isdn/hdlc.h
deleted file mode 100644
index 96521370c782..000000000000
--- a/include/linux/isdn/hdlc.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * hdlc.h  --  General purpose ISDN HDLC decoder.
- *
- * Implementation of a HDLC decoder/encoder in software.
- * Necessary because some ISDN devices don't have HDLC
- * controllers.
- *
- * Copyright (C)
- *	2009	Karsten Keil		<keil@b1-systems.de>
- *	2002	Wolfgang Mües		<wolfgang@iksw-muees.de>
- *	2001	Frode Isaksen		<fisaksen@bewan.com>
- *	2001	Kai Germaschewski	<kai.germaschewski@gmx.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#ifndef __ISDNHDLC_H__
-#define __ISDNHDLC_H__
-
-struct isdnhdlc_vars {
-	int bit_shift;
-	int hdlc_bits1;
-	int data_bits;
-	int ffbit_shift;	/* encoding only */
-	int state;
-	int dstpos;
-
-	u16 crc;
-
-	u8 cbin;
-	u8 shift_reg;
-	u8 ffvalue;
-
-	/* set if transferring data */
-	u32 data_received:1;
-	/* set if D channel (send idle instead of flags) */
-	u32 dchannel:1;
-	/* set if 56K adaptation */
-	u32 do_adapt56:1;
-	/* set if in closing phase (need to send CRC + flag) */
-	u32 do_closing:1;
-	/* set if data is bitreverse */
-	u32 do_bitreverse:1;
-};
-
-/* Feature Flags */
-#define HDLC_56KBIT	0x01
-#define HDLC_DCHANNEL	0x02
-#define HDLC_BITREVERSE	0x04
-
-/*
-  The return value from isdnhdlc_decode is
-  the frame length, 0 if no complete frame was decoded,
-  or a negative error number
-*/
-#define HDLC_FRAMING_ERROR     1
-#define HDLC_CRC_ERROR         2
-#define HDLC_LENGTH_ERROR      3
-
-extern void	isdnhdlc_rcv_init(struct isdnhdlc_vars *hdlc, u32 features);
-
-extern int	isdnhdlc_decode(struct isdnhdlc_vars *hdlc, const u8 *src,
-			int slen, int *count, u8 *dst, int dsize);
-
-extern void	isdnhdlc_out_init(struct isdnhdlc_vars *hdlc, u32 features);
-
-extern int	isdnhdlc_encode(struct isdnhdlc_vars *hdlc, const u8 *src,
-			u16 slen, int *count, u8 *dst, int dsize);
-
-#endif /* __ISDNHDLC_H__ */
-- 
cgit v1.2.3


From 1da40ab6caf924633116582c4c86939c486f20db Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 5 May 2019 15:14:38 +0300
Subject: ipvs: allow rs_table to contain different real server types

Until now, rs_table was used only for NAT real servers.
Change it to allow TUN real servers of different types,
possibly hashed with a different port key.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h            |  3 +++
 net/netfilter/ipvs/ip_vs_ctl.c | 43 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 38 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 2ac40135b576..9a8ac8997e34 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1497,6 +1497,9 @@ static inline int ip_vs_todrop(struct netns_ipvs *ipvs)
 static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { return 0; }
 #endif
 
+#define IP_VS_DFWD_METHOD(dest) (atomic_read(&(dest)->conn_flags) & \
+				 IP_VS_CONN_F_FWD_MASK)
+
 /* ip_vs_fwd_tag returns the forwarding tag of the connection */
 #define IP_VS_FWD_METHOD(cp)  (cp->flags & IP_VS_CONN_F_FWD_MASK)
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 0e887159425c..30b1a9f9c2e3 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -515,15 +515,36 @@ static inline unsigned int ip_vs_rs_hashkey(int af,
 static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
 {
 	unsigned int hash;
+	__be16 port;
 
 	if (dest->in_rs_table)
 		return;
 
+	switch (IP_VS_DFWD_METHOD(dest)) {
+	case IP_VS_CONN_F_MASQ:
+		port = dest->port;
+		break;
+	case IP_VS_CONN_F_TUNNEL:
+		switch (dest->tun_type) {
+		case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+			port = dest->tun_port;
+			break;
+		case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
+			port = 0;
+			break;
+		default:
+			return;
+		}
+		break;
+	default:
+		return;
+	}
+
 	/*
 	 *	Hash by proto,addr,port,
 	 *	which are the parameters of the real service.
 	 */
-	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
+	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port);
 
 	hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
 	dest->in_rs_table = 1;
@@ -555,7 +576,8 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
 		if (dest->port == dport &&
 		    dest->af == af &&
 		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
-		    (dest->protocol == protocol || dest->vfwmark)) {
+		    (dest->protocol == protocol || dest->vfwmark) &&
+		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
 			/* HIT */
 			return true;
 		}
@@ -585,7 +607,8 @@ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
 		if (dest->port == dport &&
 		    dest->af == af &&
 		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
-			(dest->protocol == protocol || dest->vfwmark)) {
+		    (dest->protocol == protocol || dest->vfwmark) &&
+		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
 			/* HIT */
 			return dest;
 		}
@@ -831,6 +854,13 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
 	conn_flags |= IP_VS_CONN_F_INACTIVE;
 
+	/* Need to rehash? */
+	if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) !=
+	    IP_VS_DFWD_METHOD(dest) ||
+	    udest->tun_type != dest->tun_type ||
+	    udest->tun_port != dest->tun_port)
+		ip_vs_rs_unhash(dest);
+
 	/* set the tunnel info */
 	dest->tun_type = udest->tun_type;
 	dest->tun_port = udest->tun_port;
@@ -839,16 +869,13 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
 		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
 	} else {
-		/*
-		 *    Put the real service in rs_table if not present.
-		 *    For now only for NAT!
-		 */
-		ip_vs_rs_hash(ipvs, dest);
 		/* FTP-NAT requires conntrack for mangling */
 		if (svc->port == FTPPORT)
 			ip_vs_register_conntrack(svc);
 	}
 	atomic_set(&dest->conn_flags, conn_flags);
+	/* Put the real service in rs_table if not present. */
+	ip_vs_rs_hash(ipvs, dest);
 
 	/* bind the service */
 	old_svc = rcu_dereference_protected(dest->svc, 1);
-- 
cgit v1.2.3


From 2aa3c9f48bc28ca0effd9877e010ad54c8a630e5 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 5 May 2019 15:14:39 +0300
Subject: ipvs: add function to find tunnels

Add ip_vs_find_tunnel() to match tunnel headers
by family, address and optional port. Use it to
properly find the tunnel real server used in
received ICMP errors.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h             |  3 +++
 net/netfilter/ipvs/ip_vs_core.c |  8 ++++++++
 net/netfilter/ipvs/ip_vs_ctl.c  | 29 +++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 9a8ac8997e34..b01a94ebfc0e 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1404,6 +1404,9 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
 struct ip_vs_dest *
 ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
 			const union nf_inet_addr *daddr, __be16 dport);
+struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
+				     const union nf_inet_addr *daddr,
+				     __be16 tun_port);
 
 int ip_vs_use_count_inc(void);
 void ip_vs_use_count_dec(void);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 14457551bcb4..4447ee512b88 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1598,6 +1598,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
 	struct ip_vs_proto_data *pd;
 	unsigned int offset, offset2, ihl, verdict;
 	bool ipip, new_cp = false;
+	union nf_inet_addr *raddr;
 
 	*related = 1;
 
@@ -1636,15 +1637,22 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
 	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 	if (cih == NULL)
 		return NF_ACCEPT; /* The packet looks wrong, ignore */
+	raddr = (union nf_inet_addr *)&cih->daddr;
 
 	/* Special case for errors for IPIP packets */
 	ipip = false;
 	if (cih->protocol == IPPROTO_IPIP) {
+		struct ip_vs_dest *dest;
+
 		if (unlikely(cih->frag_off & htons(IP_OFFSET)))
 			return NF_ACCEPT;
 		/* Error for our IPIP must arrive at LOCAL_IN */
 		if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
 			return NF_ACCEPT;
+		dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0);
+		/* Only for known tunnel */
+		if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP)
+			return NF_ACCEPT;
 		offset += cih->ihl * 4;
 		cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 		if (cih == NULL)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 30b1a9f9c2e3..d5847e06350f 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -617,6 +617,35 @@ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
 	return NULL;
 }
 
+/* Find real service record by <af,addr,tun_port>.
+ * In case of multiple records with the same <af,addr,tun_port>, only
+ * the first found record is returned.
+ *
+ * To be called under RCU lock.
+ */
+struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
+				     const union nf_inet_addr *daddr,
+				     __be16 tun_port)
+{
+	struct ip_vs_dest *dest;
+	unsigned int hash;
+
+	/* Check for "full" addressed entries */
+	hash = ip_vs_rs_hashkey(af, daddr, tun_port);
+
+	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+		if (dest->tun_port == tun_port &&
+		    dest->af == af &&
+		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
+		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) {
+			/* HIT */
+			return dest;
+		}
+	}
+
+	return NULL;
+}
+
 /* Lookup destination by {addr,port} in the given service
  * Called under RCU lock.
  */
-- 
cgit v1.2.3


From ea6cc2fd8a2b89ab6dcd096ba6dbc1ecbdf26564 Mon Sep 17 00:00:00 2001
From: Lukasz Pawelczyk <l.pawelczyk@samsung.com>
Date: Fri, 10 May 2019 13:46:22 +0200
Subject: netfilter: xt_owner: Add supplementary groups option

The XT_OWNER_SUPPL_GROUPS flag causes GIDs specified with XT_OWNER_GID
to be also checked in the supplementary groups of a process.

f_cred->group_info cannot be modified during its lifetime and f_cred
holds a reference to it so it's safe to use.

Signed-off-by: Lukasz Pawelczyk <l.pawelczyk@samsung.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/xt_owner.h |  7 ++++---
 net/netfilter/xt_owner.c                | 23 ++++++++++++++++++++---
 2 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/xt_owner.h b/include/uapi/linux/netfilter/xt_owner.h
index fa3ad84957d5..9e98c09eda32 100644
--- a/include/uapi/linux/netfilter/xt_owner.h
+++ b/include/uapi/linux/netfilter/xt_owner.h
@@ -5,9 +5,10 @@
 #include <linux/types.h>
 
 enum {
-	XT_OWNER_UID    = 1 << 0,
-	XT_OWNER_GID    = 1 << 1,
-	XT_OWNER_SOCKET = 1 << 2,
+	XT_OWNER_UID          = 1 << 0,
+	XT_OWNER_GID          = 1 << 1,
+	XT_OWNER_SOCKET       = 1 << 2,
+	XT_OWNER_SUPPL_GROUPS = 1 << 3,
 };
 
 struct xt_owner_match_info {
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 46686fb73784..a8784502aca6 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -91,11 +91,28 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	}
 
 	if (info->match & XT_OWNER_GID) {
+		unsigned int i, match = false;
 		kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
 		kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
-		if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
-		     gid_lte(filp->f_cred->fsgid, gid_max)) ^
-		    !(info->invert & XT_OWNER_GID))
+		struct group_info *gi = filp->f_cred->group_info;
+
+		if (gid_gte(filp->f_cred->fsgid, gid_min) &&
+		    gid_lte(filp->f_cred->fsgid, gid_max))
+			match = true;
+
+		if (!match && (info->match & XT_OWNER_SUPPL_GROUPS) && gi) {
+			for (i = 0; i < gi->ngroups; ++i) {
+				kgid_t group = gi->gid[i];
+
+				if (gid_gte(group, gid_min) &&
+				    gid_lte(group, gid_max)) {
+					match = true;
+					break;
+				}
+			}
+		}
+
+		if (match ^ !(info->invert & XT_OWNER_GID))
 			return false;
 	}
 
-- 
cgit v1.2.3


From 5e2ad02e9001fd99cae3c14e52f67bb976e9bee3 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Thu, 16 May 2019 04:02:31 +0900
Subject: netfilter: nf_flow_table: remove unnecessary variable in
 flow_offload_tuple

The oifidx in the struct flow_offload_tuple is not used anymore.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h | 2 --
 net/netfilter/nf_flow_table_core.c    | 1 -
 2 files changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 3e370cb36263..d8c187936bec 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -53,8 +53,6 @@ struct flow_offload_tuple {
 	u8				l4proto;
 	u8				dir;
 
-	int				oifidx;
-
 	u16				mtu;
 
 	struct dst_entry		*dst_cache;
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 948b4ebbe3fb..e3d797252a98 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -53,7 +53,6 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 	ft->dst_port = ctt->dst.u.tcp.port;
 
 	ft->iifidx = other_dst->dev->ifindex;
-	ft->oifidx = dst->dev->ifindex;
 	ft->dst_cache = dst;
 }
 
-- 
cgit v1.2.3


From 2cf6bffc49dae26edd12af6b57c8c780590380bf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 May 2019 15:44:12 +0200
Subject: netfilter: replace skb_make_writable with skb_ensure_writable

This converts all remaining users and then removes skb_make_writable.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h        |  5 -----
 net/netfilter/core.c             | 22 ----------------------
 net/netfilter/nf_synproxy_core.c |  2 +-
 net/netfilter/nfnetlink_queue.c  |  2 +-
 net/netfilter/xt_DSCP.c          |  8 ++++----
 5 files changed, 6 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 996bc247ef6e..049aeb40fa35 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -336,11 +336,6 @@ int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, int optval,
 		char __user *opt, int *len);
 #endif
 
-/* Call this before modifying an existing packet: ensures it is
-   modifiable and linear to the point you care about (writable_len).
-   Returns true or false. */
-int skb_make_writable(struct sk_buff *skb, unsigned int writable_len);
-
 struct flowi;
 struct nf_queue_entry;
 
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index b96fd3f54705..817a9e5d16e4 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -536,28 +536,6 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 }
 EXPORT_SYMBOL(nf_hook_slow);
 
-
-int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)
-{
-	if (writable_len > skb->len)
-		return 0;
-
-	/* Not exclusive use of packet?  Must copy. */
-	if (!skb_cloned(skb)) {
-		if (writable_len <= skb_headlen(skb))
-			return 1;
-	} else if (skb_clone_writable(skb, writable_len))
-		return 1;
-
-	if (writable_len <= skb_headlen(skb))
-		writable_len = 0;
-	else
-		writable_len -= skb_headlen(skb);
-
-	return !!__pskb_pull_tail(skb, writable_len);
-}
-EXPORT_SYMBOL(skb_make_writable);
-
 /* This needs to be compiled in any case to avoid dependencies between the
  * nfnetlink_queue code and nf_conntrack.
  */
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 8ff4d22f10b2..3d58a9e93e5a 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -196,7 +196,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
 	optoff = protoff + sizeof(struct tcphdr);
 	optend = protoff + th->doff * 4;
 
-	if (!skb_make_writable(skb, optend))
+	if (skb_ensure_writable(skb, optend))
 		return 0;
 
 	while (optoff < optend) {
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 27dac47b29c2..831f57008d78 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -863,7 +863,7 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
 		}
 		skb_put(e->skb, diff);
 	}
-	if (!skb_make_writable(e->skb, data_len))
+	if (skb_ensure_writable(e->skb, data_len))
 		return -ENOMEM;
 	skb_copy_to_linear_data(e->skb, data, data_len);
 	e->skb->ip_summed = CHECKSUM_NONE;
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index 098ed851b7a7..30d554d6c213 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -34,7 +34,7 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
 
 	if (dscp != dinfo->dscp) {
-		if (!skb_make_writable(skb, sizeof(struct iphdr)))
+		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
 			return NF_DROP;
 
 		ipv4_change_dsfield(ip_hdr(skb),
@@ -52,7 +52,7 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 	u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
 
 	if (dscp != dinfo->dscp) {
-		if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+		if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
 			return NF_DROP;
 
 		ipv6_change_dsfield(ipv6_hdr(skb),
@@ -82,7 +82,7 @@ tos_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	nv   = (orig & ~info->tos_mask) ^ info->tos_value;
 
 	if (orig != nv) {
-		if (!skb_make_writable(skb, sizeof(struct iphdr)))
+		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
 			return NF_DROP;
 		iph = ip_hdr(skb);
 		ipv4_change_dsfield(iph, 0, nv);
@@ -102,7 +102,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 	nv   = (orig & ~info->tos_mask) ^ info->tos_value;
 
 	if (orig != nv) {
-		if (!skb_make_writable(skb, sizeof(struct iphdr)))
+		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
 			return NF_DROP;
 		iph = ipv6_hdr(skb);
 		ipv6_change_dsfield(iph, 0, nv);
-- 
cgit v1.2.3


From 29930e314da3833437a2ddc7b17f6a954f38d8fb Mon Sep 17 00:00:00 2001
From: Jacky Hu <hengqing.hu@gmail.com>
Date: Thu, 30 May 2019 08:16:40 +0800
Subject: ipvs: add checksum support for gue encapsulation

Add checksum support for gue encapsulation with the tun_flags parameter,
which could be one of the values below:
IP_VS_TUNNEL_ENCAP_FLAG_NOCSUM
IP_VS_TUNNEL_ENCAP_FLAG_CSUM
IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM

Signed-off-by: Jacky Hu <hengqing.hu@gmail.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h             |   2 +
 include/uapi/linux/ip_vs.h      |   7 ++
 net/netfilter/ipvs/ip_vs_ctl.c  |  11 +++-
 net/netfilter/ipvs/ip_vs_xmit.c | 143 +++++++++++++++++++++++++++++++++++-----
 4 files changed, 146 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index b01a94ebfc0e..cb1ad0cc5c7b 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -603,6 +603,7 @@ struct ip_vs_dest_user_kern {
 
 	u16			tun_type;	/* tunnel type */
 	__be16			tun_port;	/* tunnel port */
+	u16			tun_flags;	/* tunnel flags */
 };
 
 
@@ -665,6 +666,7 @@ struct ip_vs_dest {
 	atomic_t		last_weight;	/* server latest weight */
 	__u16			tun_type;	/* tunnel type */
 	__be16			tun_port;	/* tunnel port */
+	__u16			tun_flags;	/* tunnel flags */
 
 	refcount_t		refcnt;		/* reference counter */
 	struct ip_vs_stats      stats;          /* statistics */
diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index e34f436fc79d..e4f18061a4fd 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -131,6 +131,11 @@ enum {
 	IP_VS_CONN_F_TUNNEL_TYPE_MAX,
 };
 
+/* Tunnel encapsulation flags */
+#define IP_VS_TUNNEL_ENCAP_FLAG_NOCSUM		(0)
+#define IP_VS_TUNNEL_ENCAP_FLAG_CSUM		(1 << 0)
+#define IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM		(1 << 1)
+
 /*
  *	The struct ip_vs_service_user and struct ip_vs_dest_user are
  *	used to set IPVS rules through setsockopt.
@@ -403,6 +408,8 @@ enum {
 
 	IPVS_DEST_ATTR_TUN_PORT,	/* tunnel port */
 
+	IPVS_DEST_ATTR_TUN_FLAGS,	/* tunnel flags */
+
 	__IPVS_DEST_ATTR_MAX,
 };
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index d5847e06350f..ad19ac08622f 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -893,6 +893,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	/* set the tunnel info */
 	dest->tun_type = udest->tun_type;
 	dest->tun_port = udest->tun_port;
+	dest->tun_flags = udest->tun_flags;
 
 	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
 	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
@@ -2967,6 +2968,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
 	[IPVS_DEST_ATTR_ADDR_FAMILY]	= { .type = NLA_U16 },
 	[IPVS_DEST_ATTR_TUN_TYPE]	= { .type = NLA_U8 },
 	[IPVS_DEST_ATTR_TUN_PORT]	= { .type = NLA_U16 },
+	[IPVS_DEST_ATTR_TUN_FLAGS]	= { .type = NLA_U16 },
 };
 
 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
@@ -3273,6 +3275,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
 		       dest->tun_type) ||
 	    nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
 			 dest->tun_port) ||
+	    nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS,
+			dest->tun_flags) ||
 	    nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
 	    nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
 	    nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
@@ -3393,7 +3397,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
 	/* If a full entry was requested, check for the additional fields */
 	if (full_entry) {
 		struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
-			      *nla_l_thresh, *nla_tun_type, *nla_tun_port;
+			      *nla_l_thresh, *nla_tun_type, *nla_tun_port,
+			      *nla_tun_flags;
 
 		nla_fwd		= attrs[IPVS_DEST_ATTR_FWD_METHOD];
 		nla_weight	= attrs[IPVS_DEST_ATTR_WEIGHT];
@@ -3401,6 +3406,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
 		nla_l_thresh	= attrs[IPVS_DEST_ATTR_L_THRESH];
 		nla_tun_type	= attrs[IPVS_DEST_ATTR_TUN_TYPE];
 		nla_tun_port	= attrs[IPVS_DEST_ATTR_TUN_PORT];
+		nla_tun_flags	= attrs[IPVS_DEST_ATTR_TUN_FLAGS];
 
 		if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
 			return -EINVAL;
@@ -3416,6 +3422,9 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
 
 		if (nla_tun_port)
 			udest->tun_port = nla_get_be16(nla_tun_port);
+
+		if (nla_tun_flags)
+			udest->tun_flags = nla_get_u16(nla_tun_flags);
 	}
 
 	return 0;
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 0b41d0504429..af3379d5e5bc 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -40,6 +40,7 @@
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
 #include <net/ip_tunnels.h>
+#include <net/ip6_checksum.h>
 #include <net/addrconf.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
@@ -385,8 +386,13 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
 		if (!dest)
 			goto err_put;
-		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
 			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
+			if ((dest->tun_flags &
+			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+			    skb->ip_summed == CHECKSUM_PARTIAL)
+				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+		}
 		if (mtu < 68) {
 			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
 			goto err_put;
@@ -540,8 +546,13 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
 		if (!dest)
 			goto err_put;
-		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
+		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
 			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
+			if ((dest->tun_flags &
+			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+			    skb->ip_summed == CHECKSUM_PARTIAL)
+				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+		}
 		if (mtu < IPV6_MIN_MTU) {
 			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
 				     IPV6_MIN_MTU);
@@ -1006,17 +1017,56 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb,
 	__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
 	struct udphdr  *udph;	/* Our new UDP header */
 	struct guehdr  *gueh;	/* Our new GUE header */
+	size_t hdrlen, optlen = 0;
+	void *data;
+	bool need_priv = false;
+
+	if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+	    skb->ip_summed == CHECKSUM_PARTIAL) {
+		optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+		need_priv = true;
+	}
 
-	skb_push(skb, sizeof(struct guehdr));
+	hdrlen = sizeof(struct guehdr) + optlen;
+
+	skb_push(skb, hdrlen);
 
 	gueh = (struct guehdr *)skb->data;
 
 	gueh->control = 0;
 	gueh->version = 0;
-	gueh->hlen = 0;
+	gueh->hlen = optlen >> 2;
 	gueh->flags = 0;
 	gueh->proto_ctype = *next_protocol;
 
+	data = &gueh[1];
+
+	if (need_priv) {
+		__be32 *flags = data;
+		u16 csum_start = skb_checksum_start_offset(skb);
+		__be16 *pd;
+
+		gueh->flags |= GUE_FLAG_PRIV;
+		*flags = 0;
+		data += GUE_LEN_PRIV;
+
+		if (csum_start < hdrlen)
+			return -EINVAL;
+
+		csum_start -= hdrlen;
+		pd = data;
+		pd[0] = htons(csum_start);
+		pd[1] = htons(csum_start + skb->csum_offset);
+
+		if (!skb_is_gso(skb)) {
+			skb->ip_summed = CHECKSUM_NONE;
+			skb->encapsulation = 0;
+		}
+
+		*flags |= GUE_PFLAG_REMCSUM;
+		data += GUE_PLEN_REMCSUM;
+	}
+
 	skb_push(skb, sizeof(struct udphdr));
 	skb_reset_transport_header(skb);
 
@@ -1070,6 +1120,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	unsigned int max_headroom;		/* The extra header space needed */
 	int ret, local;
 	int tun_type, gso_type;
+	int tun_flags;
 
 	EnterFunction(10);
 
@@ -1092,9 +1143,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
 
 	tun_type = cp->dest->tun_type;
+	tun_flags = cp->dest->tun_flags;
 
-	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
-		max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
+	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+		size_t gue_hdrlen, gue_optlen = 0;
+
+		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+		    skb->ip_summed == CHECKSUM_PARTIAL) {
+			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+		}
+		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
+
+		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
+	}
 
 	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
 	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
@@ -1105,8 +1166,17 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error;
 
 	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
-	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
-		gso_type |= SKB_GSO_UDP_TUNNEL;
+	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+		else
+			gso_type |= SKB_GSO_UDP_TUNNEL;
+		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+		    skb->ip_summed == CHECKSUM_PARTIAL) {
+			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
+		}
+	}
 
 	if (iptunnel_handle_offloads(skb, gso_type))
 		goto tx_error;
@@ -1115,8 +1185,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	skb_set_inner_ipproto(skb, next_protocol);
 
-	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
-		ipvs_gue_encap(net, skb, cp, &next_protocol);
+	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+		bool check = false;
+
+		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
+			goto tx_error;
+
+		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+			check = true;
+
+		udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
+	}
+
 
 	skb_push(skb, sizeof(struct iphdr));
 	skb_reset_network_header(skb);
@@ -1174,6 +1255,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	unsigned int max_headroom;	/* The extra header space needed */
 	int ret, local;
 	int tun_type, gso_type;
+	int tun_flags;
 
 	EnterFunction(10);
 
@@ -1197,9 +1279,19 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
 
 	tun_type = cp->dest->tun_type;
+	tun_flags = cp->dest->tun_flags;
 
-	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
-		max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
+	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+		size_t gue_hdrlen, gue_optlen = 0;
+
+		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+		    skb->ip_summed == CHECKSUM_PARTIAL) {
+			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+		}
+		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
+
+		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
+	}
 
 	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
 					 &next_protocol, &payload_len,
@@ -1208,8 +1300,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error;
 
 	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
-	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
-		gso_type |= SKB_GSO_UDP_TUNNEL;
+	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+		else
+			gso_type |= SKB_GSO_UDP_TUNNEL;
+		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+		    skb->ip_summed == CHECKSUM_PARTIAL) {
+			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
+		}
+	}
 
 	if (iptunnel_handle_offloads(skb, gso_type))
 		goto tx_error;
@@ -1218,8 +1319,18 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	skb_set_inner_ipproto(skb, next_protocol);
 
-	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
-		ipvs_gue_encap(net, skb, cp, &next_protocol);
+	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+		bool check = false;
+
+		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
+			goto tx_error;
+
+		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+			check = true;
+
+		udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
+	}
 
 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
-- 
cgit v1.2.3


From c9bb6165a16e6d5498981a6c777b94a78e74462b Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 31 May 2019 11:15:26 +0200
Subject: netfilter: nf_conntrack_bridge: fix CONFIG_IPV6=y

This patch fixes a few problems with CONFIG_IPV6=y and
CONFIG_NF_CONNTRACK_BRIDGE=m:

In file included from net/netfilter/utils.c:5:
include/linux/netfilter_ipv6.h: In function 'nf_ipv6_br_defrag':
include/linux/netfilter_ipv6.h:110:9: error: implicit declaration of function 'nf_ct_frag6_gather'; did you mean 'nf_ct_attach'? [-Werror=implicit-function-declaration]

And these too:

net/ipv6/netfilter.c:242:2: error: unknown field 'br_defrag' specified in initializer
net/ipv6/netfilter.c:243:2: error: unknown field 'br_fragment' specified in initializer

This patch includes an original chunk from wenxu.

Fixes: 764dd163ac92 ("netfilter: nf_conntrack_bridge: add support for IPv6")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: Yuehaibing <yuehaibing@huawei.com>
Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: wenxu <wenxu@ucloud.cn>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: wenxu <wenxu@ucloud.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6.h | 2 ++
 net/ipv6/netfilter.c           | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index a21b8c9623ee..3a3dc4b1f0e7 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -96,6 +96,8 @@ static inline int nf_ip6_route(struct net *net, struct dst_entry **dst,
 #endif
 }
 
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
 static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb,
 				    u32 user)
 {
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index c6665382acb5..9530cc280953 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -238,7 +238,7 @@ static const struct nf_ipv6_ops ipv6ops = {
 	.route_input		= ip6_route_input,
 	.fragment		= ip6_fragment,
 	.reroute		= nf_ip6_reroute,
-#if IS_MODULE(CONFIG_NF_CONNTRACK_BRIDGE)
+#if IS_MODULE(CONFIG_IPV6)
 	.br_defrag		= nf_ct_frag6_gather,
 	.br_fragment		= br_ip6_fragment,
 #endif
-- 
cgit v1.2.3


From 0b9055a112fd86c07b9d4857b61019485ec6526f Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Wed, 29 May 2019 22:50:24 +0000
Subject: net/mlx5: Add core dump register access HW bits

Add Firmware core dump registers and HW definitions.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 17 ++++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5a27246db883..b5431f7d97cb 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -107,6 +107,7 @@ enum {
 	MLX5_REG_FPGA_CAP	 = 0x4022,
 	MLX5_REG_FPGA_CTRL	 = 0x4023,
 	MLX5_REG_FPGA_ACCESS_REG = 0x4024,
+	MLX5_REG_CORE_DUMP	 = 0x402e,
 	MLX5_REG_PCAP		 = 0x5001,
 	MLX5_REG_PMTU		 = 0x5003,
 	MLX5_REG_PTYS		 = 0x5004,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 5e74305e2e57..7ee422e38826 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -715,7 +715,9 @@ struct mlx5_ifc_qos_cap_bits {
 };
 
 struct mlx5_ifc_debug_cap_bits {
-	u8         reserved_at_0[0x20];
+	u8         core_dump_general[0x1];
+	u8         core_dump_qp[0x1];
+	u8         reserved_at_2[0x1e];
 
 	u8         reserved_at_20[0x2];
 	u8         stall_detect[0x1];
@@ -2531,6 +2533,7 @@ union mlx5_ifc_hca_cap_union_bits {
 	struct mlx5_ifc_e_switch_cap_bits e_switch_cap;
 	struct mlx5_ifc_vector_calc_cap_bits vector_calc_cap;
 	struct mlx5_ifc_qos_cap_bits qos_cap;
+	struct mlx5_ifc_debug_cap_bits debug_cap;
 	struct mlx5_ifc_fpga_cap_bits fpga_cap;
 	u8         reserved_at_0[0x8000];
 };
@@ -8546,6 +8549,18 @@ struct mlx5_ifc_qcam_reg_bits {
 	u8         reserved_at_1c0[0x80];
 };
 
+struct mlx5_ifc_core_dump_reg_bits {
+	u8         reserved_at_0[0x18];
+	u8         core_dump_type[0x8];
+
+	u8         reserved_at_20[0x30];
+	u8         vhca_id[0x10];
+
+	u8         reserved_at_60[0x8];
+	u8         qpn[0x18];
+	u8         reserved_at_80[0x180];
+};
+
 struct mlx5_ifc_pcap_reg_bits {
 	u8         reserved_at_0[0x8];
 	u8         local_port[0x8];
-- 
cgit v1.2.3


From c6d4e45d3b44b71227588c2f76615380b3961f96 Mon Sep 17 00:00:00 2001
From: Eli Britstein <elibr@mellanox.com>
Date: Wed, 29 May 2019 22:50:29 +0000
Subject: net/mlx5: Introduce termination table bits

Termination table is a flow table with a termination flag. The flag
allows the firmware to assume that the specified actions are the last
actions list. This assumption allows the FW to safely perform potential
looping logic (e.g. hairpin). Introduce the bits for this attribute.

Signed-off-by: Eli Britstein <elibr@mellanox.com>
Reviewed-by: Oz Shlomo <ozsh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 3 +++
 include/linux/mlx5/fs.h                          | 1 +
 include/linux/mlx5/mlx5_ifc.h                    | 6 ++++--
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 013b1ca4a791..bb24c3797218 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -147,6 +147,7 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 {
 	int en_encap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT);
 	int en_decap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
+	int term = !!(ft->flags & MLX5_FLOW_TABLE_TERMINATION);
 	u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {0};
 	u32 in[MLX5_ST_SZ_DW(create_flow_table_in)]   = {0};
 	struct mlx5_core_dev *dev = ns->dev;
@@ -167,6 +168,8 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 		 en_decap);
 	MLX5_SET(create_flow_table_in, in, flow_table_context.reformat_en,
 		 en_encap);
+	MLX5_SET(create_flow_table_in, in, flow_table_context.termination_table,
+		 term);
 
 	switch (ft->op_mod) {
 	case FS_FT_OP_MOD_NORMAL:
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index e690ba0f965c..2ddaa97f2179 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -47,6 +47,7 @@ enum {
 enum {
 	MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT = BIT(0),
 	MLX5_FLOW_TABLE_TUNNEL_EN_DECAP = BIT(1),
+	MLX5_FLOW_TABLE_TERMINATION = BIT(2),
 };
 
 #define LEFTOVERS_RULE_NUM	 2
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 7ee422e38826..feaa909bf14f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -382,7 +382,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8	   reformat_and_modify_action[0x1];
 	u8         reserved_at_15[0x2];
 	u8	   table_miss_action_domain[0x1];
-	u8         reserved_at_18[0x8];
+	u8         termination_table[0x1];
+	u8         reserved_at_19[0x7];
 	u8         reserved_at_20[0x2];
 	u8         log_max_ft_size[0x6];
 	u8         log_max_modify_header_context[0x8];
@@ -7239,7 +7240,8 @@ struct mlx5_ifc_create_flow_table_out_bits {
 struct mlx5_ifc_flow_table_context_bits {
 	u8         reformat_en[0x1];
 	u8         decap_en[0x1];
-	u8         reserved_at_2[0x2];
+	u8         reserved_at_2[0x1];
+	u8         termination_table[0x1];
 	u8         table_miss_action[0x4];
 	u8         level[0x8];
 	u8         reserved_at_10[0x8];
-- 
cgit v1.2.3


From cd56f929e6a547180f889a4def370bdd6d48d223 Mon Sep 17 00:00:00 2001
From: Vu Pham <vuhuong@mellanox.com>
Date: Wed, 29 May 2019 22:50:34 +0000
Subject: net/mlx5: E-Switch, Replace host_params event with functions_changed
 event

To support SR-IOV on an E-Switch manager, num_vfs is queried from the
firmware whenever the E-Switch manager is notified by an
esw_functions_changed event.

Replace the host_params event with the esw_functions_changed event,
which is a more appropriate name.

While at it, also correct num_vfs type from int to u16 as expected by
the function mlx5_esw_query_functions().

Signed-off-by: Vu Pham <vuhuong@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Bodong Wang <bodong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/ecpf.c     | 27 ---------
 drivers/net/ethernet/mellanox/mlx5/core/ecpf.h     |  4 --
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 32 +++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  6 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 69 +++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/events.c   |  4 +-
 include/linux/mlx5/device.h                        |  2 +-
 include/linux/mlx5/mlx5_ifc.h                      |  6 +-
 10 files changed, 86 insertions(+), 71 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 937ba4bcb056..7d3aec98e31f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -316,7 +316,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_DESTROY_GENERAL_OBJECT:
 	case MLX5_CMD_OP_DEALLOC_MEMIC:
 	case MLX5_CMD_OP_PAGE_FAULT_RESUME:
-	case MLX5_CMD_OP_QUERY_HOST_PARAMS:
+	case MLX5_CMD_OP_QUERY_ESW_FUNCTIONS:
 		return MLX5_CMD_STAT_OK;
 
 	case MLX5_CMD_OP_QUERY_HCA_CAP:
@@ -628,7 +628,7 @@ const char *mlx5_command_str(int command)
 	MLX5_COMMAND_STR_CASE(QUERY_MODIFY_HEADER_CONTEXT);
 	MLX5_COMMAND_STR_CASE(ALLOC_MEMIC);
 	MLX5_COMMAND_STR_CASE(DEALLOC_MEMIC);
-	MLX5_COMMAND_STR_CASE(QUERY_HOST_PARAMS);
+	MLX5_COMMAND_STR_CASE(QUERY_ESW_FUNCTIONS);
 	default: return "unknown command opcode";
 	}
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
index 4746f2d28fb6..1bcf8b8f9713 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
@@ -83,30 +83,3 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev)
 
 	mlx5_peer_pf_cleanup(dev);
 }
-
-static int mlx5_query_host_params_context(struct mlx5_core_dev *dev,
-					  u32 *out, int outlen)
-{
-	u32 in[MLX5_ST_SZ_DW(query_host_params_in)] = {};
-
-	MLX5_SET(query_host_params_in, in, opcode,
-		 MLX5_CMD_OP_QUERY_HOST_PARAMS);
-
-	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
-}
-
-int mlx5_query_host_params_num_vfs(struct mlx5_core_dev *dev, int *num_vf)
-{
-	u32 out[MLX5_ST_SZ_DW(query_host_params_out)] = {};
-	int err;
-
-	err = mlx5_query_host_params_context(dev, out, sizeof(out));
-	if (err)
-		return err;
-
-	*num_vf = MLX5_GET(query_host_params_out, out,
-			   host_params_context.host_num_of_vfs);
-	mlx5_core_dbg(dev, "host_num_of_vfs %d\n", *num_vf);
-
-	return 0;
-}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.h b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.h
index 346372df218f..d3d7a00a02ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.h
@@ -16,7 +16,6 @@ enum {
 bool mlx5_read_embedded_cpu(struct mlx5_core_dev *dev);
 int mlx5_ec_init(struct mlx5_core_dev *dev);
 void mlx5_ec_cleanup(struct mlx5_core_dev *dev);
-int mlx5_query_host_params_num_vfs(struct mlx5_core_dev *dev, int *num_vf);
 
 #else  /* CONFIG_MLX5_ESWITCH */
 
@@ -24,9 +23,6 @@ static inline bool
 mlx5_read_embedded_cpu(struct mlx5_core_dev *dev) { return false; }
 static inline int mlx5_ec_init(struct mlx5_core_dev *dev) { return 0; }
 static inline void mlx5_ec_cleanup(struct mlx5_core_dev *dev) {}
-static inline int
-mlx5_query_host_params_num_vfs(struct mlx5_core_dev *dev, int *num_vf)
-{ return -EOPNOTSUPP; }
 
 #endif /* CONFIG_MLX5_ESWITCH */
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 23883d1fa22f..052bd70e4aa6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -534,7 +534,8 @@ static u64 gather_async_events_mask(struct mlx5_core_dev *dev)
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_MONITOR_COUNTER);
 
 	if (mlx5_core_is_ecpf_esw_manager(dev))
-		async_event_mask |= (1ull << MLX5_EVENT_TYPE_HOST_PARAMS_CHANGE);
+		async_event_mask |=
+			(1ull << MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED);
 
 	return async_event_mask;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 9ea0ccfe5ef5..d8935232964a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1686,13 +1686,41 @@ static int eswitch_vport_event(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+static int query_esw_functions(struct mlx5_core_dev *dev,
+			       u32 *out, int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_esw_functions_in)] = {0};
+
+	MLX5_SET(query_esw_functions_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_ESW_FUNCTIONS);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
+}
+
+int mlx5_esw_query_functions(struct mlx5_core_dev *dev, u16 *num_vfs)
+{
+	u32 out[MLX5_ST_SZ_DW(query_esw_functions_out)] = {0};
+	int err;
+
+	err = query_esw_functions(dev, out, sizeof(out));
+	if (err)
+		return err;
+
+	*num_vfs = MLX5_GET(query_esw_functions_out, out,
+			    host_params_context.host_num_of_vfs);
+	esw_debug(dev, "host_num_of_vfs=%d\n", *num_vfs);
+
+	return 0;
+}
+
 /* Public E-Switch API */
 #define ESW_ALLOWED(esw) ((esw) && MLX5_ESWITCH_MANAGER((esw)->dev))
 
 int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 {
-	int vf_nvports = 0, total_nvports = 0;
 	struct mlx5_vport *vport;
+	int total_nvports = 0;
+	u16 vf_nvports = 0;
 	int err;
 	int i, enabled_events;
 
@@ -1712,7 +1740,7 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 
 	if (mode == SRIOV_OFFLOADS) {
 		if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
-			err = mlx5_query_host_params_num_vfs(esw->dev, &vf_nvports);
+			err = mlx5_esw_query_functions(esw->dev, &vf_nvports);
 			if (err)
 				return err;
 			total_nvports = esw->total_vports;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index ed3fad689ec9..320dd83dd301 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -190,7 +190,7 @@ struct mlx5_host_work {
 	struct mlx5_eswitch	*esw;
 };
 
-struct mlx5_host_info {
+struct mlx5_esw_functions {
 	struct mlx5_nb		nb;
 	u16			num_vfs;
 };
@@ -219,7 +219,7 @@ struct mlx5_eswitch {
 	int                     mode;
 	int                     nvports;
 	u16                     manager_vport;
-	struct mlx5_host_info	host_info;
+	struct mlx5_esw_functions esw_funcs;
 };
 
 void esw_offloads_cleanup(struct mlx5_eswitch *esw);
@@ -386,6 +386,8 @@ bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0,
 bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
 			       struct mlx5_core_dev *dev1);
 
+int mlx5_esw_query_functions(struct mlx5_core_dev *dev, u16 *num_vfs);
+
 #define MLX5_DEBUG_ESWITCH_MASK BIT(3)
 
 #define esw_info(__dev, format, ...)			\
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index e09ae27485ee..83689678b400 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -41,7 +41,6 @@
 #include "en.h"
 #include "fs_core.h"
 #include "lib/devcom.h"
-#include "ecpf.h"
 #include "lib/eq.h"
 
 /* There are two match-all miss flows, one for unicast dst mac and
@@ -1782,57 +1781,79 @@ static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw)
 		esw_prio_tag_acls_cleanup(esw);
 }
 
-static void esw_host_params_event_handler(struct work_struct *work)
+static void esw_functions_changed_event_handler(struct work_struct *work)
 {
 	struct mlx5_host_work *host_work;
 	struct mlx5_eswitch *esw;
-	int err, num_vf = 0;
+	u16 num_vfs = 0;
+	int err;
 
 	host_work = container_of(work, struct mlx5_host_work, work);
 	esw = host_work->esw;
 
-	err = mlx5_query_host_params_num_vfs(esw->dev, &num_vf);
-	if (err || num_vf == esw->host_info.num_vfs)
+	err = mlx5_esw_query_functions(esw->dev, &num_vfs);
+	if (err || num_vfs == esw->esw_funcs.num_vfs)
 		goto out;
 
 	/* Number of VFs can only change from "0 to x" or "x to 0". */
-	if (esw->host_info.num_vfs > 0) {
-		esw_offloads_unload_vf_reps(esw, esw->host_info.num_vfs);
+	if (esw->esw_funcs.num_vfs > 0) {
+		esw_offloads_unload_vf_reps(esw, esw->esw_funcs.num_vfs);
 	} else {
-		err = esw_offloads_load_vf_reps(esw, num_vf);
+		err = esw_offloads_load_vf_reps(esw, num_vfs);
 
 		if (err)
 			goto out;
 	}
 
-	esw->host_info.num_vfs = num_vf;
+	esw->esw_funcs.num_vfs = num_vfs;
 
 out:
 	kfree(host_work);
 }
 
-static int esw_host_params_event(struct notifier_block *nb,
-				 unsigned long type, void *data)
+static int esw_functions_changed_event(struct notifier_block *nb,
+				       unsigned long type, void *data)
 {
+	struct mlx5_esw_functions *esw_funcs;
 	struct mlx5_host_work *host_work;
-	struct mlx5_host_info *host_info;
 	struct mlx5_eswitch *esw;
 
 	host_work = kzalloc(sizeof(*host_work), GFP_ATOMIC);
 	if (!host_work)
 		return NOTIFY_DONE;
 
-	host_info = mlx5_nb_cof(nb, struct mlx5_host_info, nb);
-	esw = container_of(host_info, struct mlx5_eswitch, host_info);
+	esw_funcs = mlx5_nb_cof(nb, struct mlx5_esw_functions, nb);
+	esw = container_of(esw_funcs, struct mlx5_eswitch, esw_funcs);
 
 	host_work->esw = esw;
 
-	INIT_WORK(&host_work->work, esw_host_params_event_handler);
+	INIT_WORK(&host_work->work, esw_functions_changed_event_handler);
 	queue_work(esw->work_queue, &host_work->work);
 
 	return NOTIFY_OK;
 }
 
+static void esw_functions_changed_event_init(struct mlx5_eswitch *esw,
+					     u16 vf_nvports)
+{
+	if (!mlx5_core_is_ecpf_esw_manager(esw->dev))
+		return;
+
+	MLX5_NB_INIT(&esw->esw_funcs.nb, esw_functions_changed_event,
+		     ESW_FUNCTIONS_CHANGED);
+	mlx5_eq_notifier_register(esw->dev, &esw->esw_funcs.nb);
+	esw->esw_funcs.num_vfs = vf_nvports;
+}
+
+static void esw_functions_changed_event_cleanup(struct mlx5_eswitch *esw)
+{
+	if (!mlx5_core_is_ecpf_esw_manager(esw->dev))
+		return;
+
+	mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb);
+	flush_workqueue(esw->work_queue);
+}
+
 int esw_offloads_init(struct mlx5_eswitch *esw, int vf_nvports,
 		      int total_nvports)
 {
@@ -1848,12 +1869,7 @@ int esw_offloads_init(struct mlx5_eswitch *esw, int vf_nvports,
 
 	esw_offloads_devcom_init(esw);
 
-	if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
-		MLX5_NB_INIT(&esw->host_info.nb, esw_host_params_event,
-			     HOST_PARAMS_CHANGE);
-		mlx5_eq_notifier_register(esw->dev, &esw->host_info.nb);
-		esw->host_info.num_vfs = vf_nvports;
-	}
+	esw_functions_changed_event_init(esw, vf_nvports);
 
 	mlx5_rdma_enable_roce(esw->dev);
 
@@ -1887,13 +1903,12 @@ void esw_offloads_cleanup(struct mlx5_eswitch *esw)
 {
 	u16 num_vfs;
 
-	if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
-		mlx5_eq_notifier_unregister(esw->dev, &esw->host_info.nb);
-		flush_workqueue(esw->work_queue);
-		num_vfs = esw->host_info.num_vfs;
-	} else {
+	esw_functions_changed_event_cleanup(esw);
+
+	if (mlx5_core_is_ecpf_esw_manager(esw->dev))
+		num_vfs = esw->esw_funcs.num_vfs;
+	else
 		num_vfs = esw->dev->priv.sriov.num_vfs;
-	}
 
 	mlx5_rdma_disable_roce(esw->dev);
 	esw_offloads_devcom_cleanup(esw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index a81e8d2168d8..8bcf3426b9c6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -108,8 +108,8 @@ static const char *eqe_type_str(u8 type)
 		return "MLX5_EVENT_TYPE_STALL_EVENT";
 	case MLX5_EVENT_TYPE_CMD:
 		return "MLX5_EVENT_TYPE_CMD";
-	case MLX5_EVENT_TYPE_HOST_PARAMS_CHANGE:
-		return "MLX5_EVENT_TYPE_HOST_PARAMS_CHANGE";
+	case MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED:
+		return "MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED";
 	case MLX5_EVENT_TYPE_PAGE_REQUEST:
 		return "MLX5_EVENT_TYPE_PAGE_REQUEST";
 	case MLX5_EVENT_TYPE_PAGE_FAULT:
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index fc2b6e807f06..5e760067ac41 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -342,7 +342,7 @@ enum mlx5_event {
 	MLX5_EVENT_TYPE_PAGE_FAULT	   = 0xc,
 	MLX5_EVENT_TYPE_NIC_VPORT_CHANGE   = 0xd,
 
-	MLX5_EVENT_TYPE_HOST_PARAMS_CHANGE = 0xe,
+	MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED = 0xe,
 
 	MLX5_EVENT_TYPE_DCT_DRAINED        = 0x1c,
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index feaa909bf14f..0780242a757a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -155,7 +155,7 @@ enum {
 	MLX5_CMD_OP_QUERY_XRQ_DC_PARAMS_ENTRY     = 0x725,
 	MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY       = 0x726,
 	MLX5_CMD_OP_QUERY_XRQ_ERROR_PARAMS        = 0x727,
-	MLX5_CMD_OP_QUERY_HOST_PARAMS             = 0x740,
+	MLX5_CMD_OP_QUERY_ESW_FUNCTIONS           = 0x740,
 	MLX5_CMD_OP_QUERY_VPORT_STATE             = 0x750,
 	MLX5_CMD_OP_MODIFY_VPORT_STATE            = 0x751,
 	MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT       = 0x752,
@@ -9721,7 +9721,7 @@ struct mlx5_ifc_host_params_context_bits {
 	u8         reserved_at_80[0x180];
 };
 
-struct mlx5_ifc_query_host_params_in_bits {
+struct mlx5_ifc_query_esw_functions_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_at_10[0x10];
 
@@ -9731,7 +9731,7 @@ struct mlx5_ifc_query_host_params_in_bits {
 	u8         reserved_at_40[0x40];
 };
 
-struct mlx5_ifc_query_host_params_out_bits {
+struct mlx5_ifc_query_esw_functions_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
 
-- 
cgit v1.2.3


From 6706a3b94f890145ca09797f748d2b30e1414fd3 Mon Sep 17 00:00:00 2001
From: Vu Pham <vuhuong@mellanox.com>
Date: Wed, 29 May 2019 22:50:37 +0000
Subject: net/mlx5: E-Switch, Honor eswitch functions changed event cap

Whenever a device supports the eswitch functions changed event, honor
that device setting. Do not limit it to ECPF.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Vu Pham <vuhuong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c               |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          | 13 +++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c |  6 +++---
 include/linux/mlx5/mlx5_ifc.h                              |  4 +++-
 4 files changed, 20 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 052bd70e4aa6..5e9319d3d90c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -533,7 +533,7 @@ static u64 gather_async_events_mask(struct mlx5_core_dev *dev)
 	if (MLX5_CAP_GEN(dev, max_num_of_monitor_counters))
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_MONITOR_COUNTER);
 
-	if (mlx5_core_is_ecpf_esw_manager(dev))
+	if (mlx5_eswitch_is_funcs_handler(dev))
 		async_event_mask |=
 			(1ull << MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 320dd83dd301..b524813cccac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -406,6 +406,18 @@ static inline u16 mlx5_eswitch_manager_vport(struct mlx5_core_dev *dev)
 		MLX5_VPORT_ECPF : MLX5_VPORT_PF;
 }
 
+static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev)
+{
+	/* Ideally device should have the functions changed supported
+	 * capability regardless of it being ECPF or PF wherever such
+	 * event should be processed such as on eswitch manager device.
+	 * However, some ECPF based device might not have this capability
+	 * set. Hence OR for ECPF check to cover such device.
+	 */
+	return MLX5_CAP_ESW(dev, esw_functions_changed) ||
+	       mlx5_core_is_ecpf_esw_manager(dev);
+}
+
 static inline int mlx5_eswitch_uplink_idx(struct mlx5_eswitch *esw)
 {
 	/* Uplink always locate at the last element of the array.*/
@@ -500,6 +512,7 @@ static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {}
 static inline int  mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) { return 0; }
 static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) {}
 static inline bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1) { return true; }
+static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev) { return false; }
 
 #define FDB_MAX_CHAIN 1
 #define FDB_SLOW_PATH_CHAIN (FDB_MAX_CHAIN + 1)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 83689678b400..05cb2fffd887 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1836,7 +1836,7 @@ static int esw_functions_changed_event(struct notifier_block *nb,
 static void esw_functions_changed_event_init(struct mlx5_eswitch *esw,
 					     u16 vf_nvports)
 {
-	if (!mlx5_core_is_ecpf_esw_manager(esw->dev))
+	if (!mlx5_eswitch_is_funcs_handler(esw->dev))
 		return;
 
 	MLX5_NB_INIT(&esw->esw_funcs.nb, esw_functions_changed_event,
@@ -1847,7 +1847,7 @@ static void esw_functions_changed_event_init(struct mlx5_eswitch *esw,
 
 static void esw_functions_changed_event_cleanup(struct mlx5_eswitch *esw)
 {
-	if (!mlx5_core_is_ecpf_esw_manager(esw->dev))
+	if (!mlx5_eswitch_is_funcs_handler(esw->dev))
 		return;
 
 	mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb);
@@ -1905,7 +1905,7 @@ void esw_offloads_cleanup(struct mlx5_eswitch *esw)
 
 	esw_functions_changed_event_cleanup(esw);
 
-	if (mlx5_core_is_ecpf_esw_manager(esw->dev))
+	if (mlx5_eswitch_is_funcs_handler(esw->dev))
 		num_vfs = esw->esw_funcs.num_vfs;
 	else
 		num_vfs = esw->dev->priv.sriov.num_vfs;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 0780242a757a..6513b985c5e9 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -665,7 +665,9 @@ struct mlx5_ifc_e_switch_cap_bits {
 	u8         vport_svlan_insert[0x1];
 	u8         vport_cvlan_insert_if_not_exist[0x1];
 	u8         vport_cvlan_insert_overwrite[0x1];
-	u8         reserved_at_5[0x16];
+	u8         reserved_at_5[0x14];
+	u8         esw_functions_changed[0x1];
+	u8         reserved_at_1a[0x1];
 	u8         ecpf_vport_exists[0x1];
 	u8         counter_eswitch_affinity[0x1];
 	u8         merged_eswitch[0x1];
-- 
cgit v1.2.3


From 8693115af4c24d92b971ad895c5f329761ed5d38 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Wed, 29 May 2019 22:50:41 +0000
Subject: {IB,net}/mlx5: Constify rep ops functions pointers

Currently, for every representor type and for every single vport, a
copy of the representor function pointers is stored even though they
don't change from one vport to another.

Additionally, the priv data entry for the rep is not passed during
registration, but it is copied. It is used (set and cleared) by the user
of the reps.

As we want to scale vports, to simplify and also to split constants
from data,

1. Rename mlx5_eswitch_rep_if to mlx5_eswitch_rep_ops so as to match the
_ops suffix used by other standard netdev and ibdev ops.
2. Constify the IB and Ethernet rep ops structures.
3. Instead of storing a copy of all rep function pointers for every
vport, store them once per eswitch rep type.
4. Split data and function pointers into mlx5_eswitch_rep_data and
mlx5_eswitch_rep_ops.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/ib_rep.c                | 19 ++++++-----
 drivers/infiniband/hw/mlx5/ib_rep.h                |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   | 15 +++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  1 +
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 38 ++++++++++------------
 include/linux/mlx5/eswitch.h                       | 20 +++++++-----
 7 files changed, 49 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index cbcc40d776b9..22e651cb5534 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -60,7 +60,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	if (!__mlx5_ib_add(ibdev, profile))
 		return -EINVAL;
 
-	rep->rep_if[REP_IB].priv = ibdev;
+	rep->rep_data[REP_IB].priv = ibdev;
 
 	return 0;
 }
@@ -70,13 +70,13 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
 {
 	struct mlx5_ib_dev *dev;
 
-	if (!rep->rep_if[REP_IB].priv ||
+	if (!rep->rep_data[REP_IB].priv ||
 	    rep->vport != MLX5_VPORT_UPLINK)
 		return;
 
 	dev = mlx5_ib_rep_to_dev(rep);
 	__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
-	rep->rep_if[REP_IB].priv = NULL;
+	rep->rep_data[REP_IB].priv = NULL;
 }
 
 static void *mlx5_ib_vport_get_proto_dev(struct mlx5_eswitch_rep *rep)
@@ -84,16 +84,17 @@ static void *mlx5_ib_vport_get_proto_dev(struct mlx5_eswitch_rep *rep)
 	return mlx5_ib_rep_to_dev(rep);
 }
 
+static const struct mlx5_eswitch_rep_ops rep_ops = {
+	.load = mlx5_ib_vport_rep_load,
+	.unload = mlx5_ib_vport_rep_unload,
+	.get_proto_dev = mlx5_ib_vport_get_proto_dev,
+};
+
 void mlx5_ib_register_vport_reps(struct mlx5_core_dev *mdev)
 {
 	struct mlx5_eswitch *esw = mdev->priv.eswitch;
-	struct mlx5_eswitch_rep_if rep_if = {};
-
-	rep_if.load = mlx5_ib_vport_rep_load;
-	rep_if.unload = mlx5_ib_vport_rep_unload;
-	rep_if.get_proto_dev = mlx5_ib_vport_get_proto_dev;
 
-	mlx5_eswitch_register_vport_reps(esw, &rep_if, REP_IB);
+	mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB);
 }
 
 void mlx5_ib_unregister_vport_reps(struct mlx5_core_dev *mdev)
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h
index c995102b0276..22adce2d6795 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.h
+++ b/drivers/infiniband/hw/mlx5/ib_rep.h
@@ -72,6 +72,6 @@ struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
 static inline
 struct mlx5_ib_dev *mlx5_ib_rep_to_dev(struct mlx5_eswitch_rep *rep)
 {
-	return rep->rep_if[REP_IB].priv;
+	return rep->rep_data[REP_IB].priv;
 }
 #endif /* __MLX5_IB_REP_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 91e24f1cead8..33f8f99681a5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1752,7 +1752,7 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	}
 
 	rpriv->netdev = netdev;
-	rep->rep_if[REP_ETH].priv = rpriv;
+	rep->rep_data[REP_ETH].priv = rpriv;
 	INIT_LIST_HEAD(&rpriv->vport_sqs_list);
 
 	if (rep->vport == MLX5_VPORT_UPLINK) {
@@ -1826,16 +1826,17 @@ static void *mlx5e_vport_rep_get_proto_dev(struct mlx5_eswitch_rep *rep)
 	return rpriv->netdev;
 }
 
+static const struct mlx5_eswitch_rep_ops rep_ops = {
+	.load = mlx5e_vport_rep_load,
+	.unload = mlx5e_vport_rep_unload,
+	.get_proto_dev = mlx5e_vport_rep_get_proto_dev
+};
+
 void mlx5e_rep_register_vport_reps(struct mlx5_core_dev *mdev)
 {
 	struct mlx5_eswitch *esw = mdev->priv.eswitch;
-	struct mlx5_eswitch_rep_if rep_if = {};
-
-	rep_if.load = mlx5e_vport_rep_load;
-	rep_if.unload = mlx5e_vport_rep_unload;
-	rep_if.get_proto_dev = mlx5e_vport_rep_get_proto_dev;
 
-	mlx5_eswitch_register_vport_reps(esw, &rep_if, REP_ETH);
+	mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_ETH);
 }
 
 void mlx5e_rep_unregister_vport_reps(struct mlx5_core_dev *mdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
index c40c025afd99..e34573fd88c1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -91,7 +91,7 @@ struct mlx5e_rep_priv {
 static inline
 struct mlx5e_rep_priv *mlx5e_rep_to_rep_priv(struct mlx5_eswitch_rep *rep)
 {
-	return rep->rep_if[REP_ETH].priv;
+	return rep->rep_data[REP_ETH].priv;
 }
 
 struct mlx5e_neigh {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index b524813cccac..135d9a29bbdf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -173,6 +173,7 @@ struct mlx5_esw_offload {
 	struct mutex peer_mutex;
 	DECLARE_HASHTABLE(encap_tbl, 8);
 	DECLARE_HASHTABLE(mod_hdr_tbl, 8);
+	const struct mlx5_eswitch_rep_ops *rep_ops[NUM_REP_TYPES];
 	u8 inline_mode;
 	u64 num_flows;
 	u8 encap;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 05cb2fffd887..d6246ee042fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -332,7 +332,7 @@ static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val)
 	esw_debug(esw->dev, "%s applying global %s policy\n", __func__, val ? "pop" : "none");
 	for (vf_vport = 1; vf_vport < esw->enabled_vports; vf_vport++) {
 		rep = &esw->offloads.vport_reps[vf_vport];
-		if (atomic_read(&rep->rep_if[REP_ETH].state) != REP_LOADED)
+		if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED)
 			continue;
 
 		err = __mlx5_eswitch_set_vport_vlan(esw, rep->vport, 0, 0, val);
@@ -1276,7 +1276,7 @@ int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 		ether_addr_copy(rep->hw_id, hw_id);
 
 		for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++)
-			atomic_set(&rep->rep_if[rep_type].state,
+			atomic_set(&rep->rep_data[rep_type].state,
 				   REP_UNREGISTERED);
 	}
 
@@ -1286,9 +1286,9 @@ int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw,
 				      struct mlx5_eswitch_rep *rep, u8 rep_type)
 {
-	if (atomic_cmpxchg(&rep->rep_if[rep_type].state,
+	if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
 			   REP_LOADED, REP_REGISTERED) == REP_LOADED)
-		rep->rep_if[rep_type].unload(rep);
+		esw->offloads.rep_ops[rep_type]->unload(rep);
 }
 
 static void __unload_reps_special_vport(struct mlx5_eswitch *esw, u8 rep_type)
@@ -1349,11 +1349,11 @@ static int __esw_offloads_load_rep(struct mlx5_eswitch *esw,
 {
 	int err = 0;
 
-	if (atomic_cmpxchg(&rep->rep_if[rep_type].state,
+	if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
 			   REP_REGISTERED, REP_LOADED) == REP_REGISTERED) {
-		err = rep->rep_if[rep_type].load(esw->dev, rep);
+		err = esw->offloads.rep_ops[rep_type]->load(esw->dev, rep);
 		if (err)
-			atomic_set(&rep->rep_if[rep_type].state,
+			atomic_set(&rep->rep_data[rep_type].state,
 				   REP_REGISTERED);
 	}
 
@@ -2216,21 +2216,17 @@ int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap)
 }
 
 void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
-				      struct mlx5_eswitch_rep_if *__rep_if,
+				      const struct mlx5_eswitch_rep_ops *ops,
 				      u8 rep_type)
 {
-	struct mlx5_eswitch_rep_if *rep_if;
+	struct mlx5_eswitch_rep_data *rep_data;
 	struct mlx5_eswitch_rep *rep;
 	int i;
 
+	esw->offloads.rep_ops[rep_type] = ops;
 	mlx5_esw_for_all_reps(esw, i, rep) {
-		rep_if = &rep->rep_if[rep_type];
-		rep_if->load   = __rep_if->load;
-		rep_if->unload = __rep_if->unload;
-		rep_if->get_proto_dev = __rep_if->get_proto_dev;
-		rep_if->priv = __rep_if->priv;
-
-		atomic_set(&rep_if->state, REP_REGISTERED);
+		rep_data = &rep->rep_data[rep_type];
+		atomic_set(&rep_data->state, REP_REGISTERED);
 	}
 }
 EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps);
@@ -2245,7 +2241,7 @@ void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
 		__unload_reps_all_vport(esw, max_vf, rep_type);
 
 	mlx5_esw_for_all_reps(esw, i, rep)
-		atomic_set(&rep->rep_if[rep_type].state, REP_UNREGISTERED);
+		atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED);
 }
 EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps);
 
@@ -2254,7 +2250,7 @@ void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
 	struct mlx5_eswitch_rep *rep;
 
 	rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK);
-	return rep->rep_if[rep_type].priv;
+	return rep->rep_data[rep_type].priv;
 }
 
 void *mlx5_eswitch_get_proto_dev(struct mlx5_eswitch *esw,
@@ -2265,9 +2261,9 @@ void *mlx5_eswitch_get_proto_dev(struct mlx5_eswitch *esw,
 
 	rep = mlx5_eswitch_get_rep(esw, vport);
 
-	if (atomic_read(&rep->rep_if[rep_type].state) == REP_LOADED &&
-	    rep->rep_if[rep_type].get_proto_dev)
-		return rep->rep_if[rep_type].get_proto_dev(rep);
+	if (atomic_read(&rep->rep_data[rep_type].state) == REP_LOADED &&
+	    esw->offloads.rep_ops[rep_type]->get_proto_dev)
+		return esw->offloads.rep_ops[rep_type]->get_proto_dev(rep);
 	return NULL;
 }
 EXPORT_SYMBOL(mlx5_eswitch_get_proto_dev);
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 0ca77dd1429c..d81ee4df181c 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -29,17 +29,19 @@ enum {
 };
 
 struct mlx5_eswitch_rep;
-struct mlx5_eswitch_rep_if {
-	int		       (*load)(struct mlx5_core_dev *dev,
-				       struct mlx5_eswitch_rep *rep);
-	void		       (*unload)(struct mlx5_eswitch_rep *rep);
-	void		       *(*get_proto_dev)(struct mlx5_eswitch_rep *rep);
-	void			*priv;
-	atomic_t		state;
+struct mlx5_eswitch_rep_ops {
+	int (*load)(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep);
+	void (*unload)(struct mlx5_eswitch_rep *rep);
+	void *(*get_proto_dev)(struct mlx5_eswitch_rep *rep);
+};
+
+struct mlx5_eswitch_rep_data {
+	void *priv;
+	atomic_t state;
 };
 
 struct mlx5_eswitch_rep {
-	struct mlx5_eswitch_rep_if rep_if[NUM_REP_TYPES];
+	struct mlx5_eswitch_rep_data rep_data[NUM_REP_TYPES];
 	u16		       vport;
 	u8		       hw_id[ETH_ALEN];
 	u16		       vlan;
@@ -47,7 +49,7 @@ struct mlx5_eswitch_rep {
 };
 
 void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
-				      struct mlx5_eswitch_rep_if *rep_if,
+				      const struct mlx5_eswitch_rep_ops *ops,
 				      u8 rep_type);
 void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type);
 void *mlx5_eswitch_get_proto_dev(struct mlx5_eswitch *esw,
-- 
cgit v1.2.3


From 320587e6eac960591077b90271f40bfad24d6155 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 28 May 2019 10:57:34 +0100
Subject: net: sfp: add mandatory attach/detach methods for sfp buses

Add attach and detach methods for SFP buses, which will allow us to get
rid of the netdev storage in sfp-bus.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 16 ++++++++++++++++
 drivers/net/phy/sfp-bus.c |  4 ++--
 include/linux/sfp.h       |  6 ++++++
 3 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index eb07c3d8f09e..503f4b221696 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1650,6 +1650,20 @@ int phylink_mii_ioctl(struct phylink *pl, struct ifreq *ifr, int cmd)
 }
 EXPORT_SYMBOL_GPL(phylink_mii_ioctl);
 
+static void phylink_sfp_attach(void *upstream, struct sfp_bus *bus)
+{
+	struct phylink *pl = upstream;
+
+	pl->netdev->sfp_bus = bus;
+}
+
+static void phylink_sfp_detach(void *upstream, struct sfp_bus *bus)
+{
+	struct phylink *pl = upstream;
+
+	pl->netdev->sfp_bus = NULL;
+}
+
 static int phylink_sfp_module_insert(void *upstream,
 				     const struct sfp_eeprom_id *id)
 {
@@ -1768,6 +1782,8 @@ static void phylink_sfp_disconnect_phy(void *upstream)
 }
 
 static const struct sfp_upstream_ops sfp_phylink_ops = {
+	.attach = phylink_sfp_attach,
+	.detach = phylink_sfp_detach,
 	.module_insert = phylink_sfp_module_insert,
 	.link_up = phylink_sfp_link_up,
 	.link_down = phylink_sfp_link_down,
diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index e9c187946cca..0608203cc752 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -351,7 +351,7 @@ static int sfp_register_bus(struct sfp_bus *bus)
 	bus->socket_ops->attach(bus->sfp);
 	if (bus->started)
 		bus->socket_ops->start(bus->sfp);
-	bus->netdev->sfp_bus = bus;
+	bus->upstream_ops->attach(bus->upstream, bus);
 	bus->registered = true;
 	return 0;
 }
@@ -360,8 +360,8 @@ static void sfp_unregister_bus(struct sfp_bus *bus)
 {
 	const struct sfp_upstream_ops *ops = bus->upstream_ops;
 
-	bus->netdev->sfp_bus = NULL;
 	if (bus->registered) {
+		bus->upstream_ops->detach(bus->upstream, bus);
 		if (bus->started)
 			bus->socket_ops->stop(bus->sfp);
 		bus->socket_ops->detach(bus->sfp);
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index d9d9de3fcf8e..a3f0336dd703 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -469,6 +469,10 @@ struct sfp_bus;
 
 /**
  * struct sfp_upstream_ops - upstream operations structure
+ * @attach: called when the sfp socket driver is bound to the upstream
+ *   (mandatory).
+ * @detach: called when the sfp socket driver is unbound from the upstream
+ *   (mandatory).
  * @module_insert: called after a module has been detected to determine
  *   whether the module is supported for the upstream device.
  * @module_remove: called after the module has been removed.
@@ -481,6 +485,8 @@ struct sfp_bus;
  *   been removed.
  */
 struct sfp_upstream_ops {
+	void (*attach)(void *priv, struct sfp_bus *bus);
+	void (*detach)(void *priv, struct sfp_bus *bus);
 	int (*module_insert)(void *priv, const struct sfp_eeprom_id *id);
 	void (*module_remove)(void *priv);
 	void (*link_down)(void *priv);
-- 
cgit v1.2.3


From 54f70b3ba364f19291dc8b9cb096b02a00fb4461 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 28 May 2019 10:57:39 +0100
Subject: net: sfp: remove sfp-bus use of netdevs

The sfp-bus code now no longer has any use for the network device
structure, so remove its use.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c |  3 +--
 drivers/net/phy/sfp-bus.c | 10 +++-------
 include/linux/sfp.h       |  6 ++----
 3 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 503f4b221696..f5b97dab3017 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -565,8 +565,7 @@ static int phylink_register_sfp(struct phylink *pl,
 		return ret;
 	}
 
-	pl->sfp_bus = sfp_register_upstream(ref.fwnode, pl->netdev, pl,
-					    &sfp_phylink_ops);
+	pl->sfp_bus = sfp_register_upstream(ref.fwnode, pl, &sfp_phylink_ops);
 	if (!pl->sfp_bus)
 		return -ENOMEM;
 
diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index 0608203cc752..b23fc41896ef 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -24,7 +24,6 @@ struct sfp_bus {
 
 	const struct sfp_upstream_ops *upstream_ops;
 	void *upstream;
-	struct net_device *netdev;
 	struct phy_device *phydev;
 
 	bool registered;
@@ -443,13 +442,11 @@ static void sfp_upstream_clear(struct sfp_bus *bus)
 {
 	bus->upstream_ops = NULL;
 	bus->upstream = NULL;
-	bus->netdev = NULL;
 }
 
 /**
  * sfp_register_upstream() - Register the neighbouring device
  * @fwnode: firmware node for the SFP bus
- * @ndev: network device associated with the interface
  * @upstream: the upstream private data
  * @ops: the upstream's &struct sfp_upstream_ops
  *
@@ -460,7 +457,7 @@ static void sfp_upstream_clear(struct sfp_bus *bus)
  * On error, returns %NULL.
  */
 struct sfp_bus *sfp_register_upstream(struct fwnode_handle *fwnode,
-				      struct net_device *ndev, void *upstream,
+				      void *upstream,
 				      const struct sfp_upstream_ops *ops)
 {
 	struct sfp_bus *bus = sfp_bus_get(fwnode);
@@ -470,7 +467,6 @@ struct sfp_bus *sfp_register_upstream(struct fwnode_handle *fwnode,
 		rtnl_lock();
 		bus->upstream_ops = ops;
 		bus->upstream = upstream;
-		bus->netdev = ndev;
 
 		if (bus->sfp) {
 			ret = sfp_register_bus(bus);
@@ -592,7 +588,7 @@ struct sfp_bus *sfp_register_socket(struct device *dev, struct sfp *sfp,
 		bus->sfp = sfp;
 		bus->socket_ops = ops;
 
-		if (bus->netdev) {
+		if (bus->upstream_ops) {
 			ret = sfp_register_bus(bus);
 			if (ret)
 				sfp_socket_clear(bus);
@@ -612,7 +608,7 @@ EXPORT_SYMBOL_GPL(sfp_register_socket);
 void sfp_unregister_socket(struct sfp_bus *bus)
 {
 	rtnl_lock();
-	if (bus->netdev)
+	if (bus->upstream_ops)
 		sfp_unregister_bus(bus);
 	sfp_socket_clear(bus);
 	rtnl_unlock();
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index a3f0336dd703..1c35428e98bc 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -464,7 +464,6 @@ enum {
 struct fwnode_handle;
 struct ethtool_eeprom;
 struct ethtool_modinfo;
-struct net_device;
 struct sfp_bus;
 
 /**
@@ -510,7 +509,7 @@ int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee,
 void sfp_upstream_start(struct sfp_bus *bus);
 void sfp_upstream_stop(struct sfp_bus *bus);
 struct sfp_bus *sfp_register_upstream(struct fwnode_handle *fwnode,
-				      struct net_device *ndev, void *upstream,
+				      void *upstream,
 				      const struct sfp_upstream_ops *ops);
 void sfp_unregister_upstream(struct sfp_bus *bus);
 #else
@@ -555,8 +554,7 @@ static inline void sfp_upstream_stop(struct sfp_bus *bus)
 }
 
 static inline struct sfp_bus *sfp_register_upstream(
-	struct fwnode_handle *fwnode,
-	struct net_device *ndev, void *upstream,
+	struct fwnode_handle *fwnode, void *upstream,
 	const struct sfp_upstream_ops *ops)
 {
 	return (struct sfp_bus *)-1;
-- 
cgit v1.2.3


From 0ccc171ea6a2fa34a6b898329c0a447c84e27057 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@mellanox.com>
Date: Wed, 30 Jan 2019 17:21:55 +0200
Subject: net/mlx5: Geneve, Manage Geneve TLV options

Use Geneve TLV Options object to manage the flex parser matching
on the 32-bit options data.

When the first flow with a certain class/type values is requested to
be offloaded, create a FW object with FW command (Geneve TLV Options
general object) and start counting the number of flows using this object.

During this time, any request with a different class/type values will
fail to be offloaded.
Once the refcount reaches 0, the TLV options general object is destroyed,
and a flow with any class/type parameters can then be offloaded.

Geneve TLV Options object is added to core device.
It is currently used to manage Geneve TLV options general
object allocation in FW and its reference counting only.
In the future it will also be used for managing geneve ports
by registering callbacks for ndo_udp_tunnel_add/del.

Reviewed-by: Oz Shlomo <ozsh@mellanox.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   3 +-
 .../net/ethernet/mellanox/mlx5/core/lib/geneve.c   | 157 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/lib/geneve.h   |  33 +++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   4 +
 include/linux/mlx5/driver.h                        |   2 +
 5 files changed, 198 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 243368dc23db..e31027277a6e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -31,7 +31,8 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
 mlx5_core-$(CONFIG_MLX5_EN_ARFS)     += en_arfs.o
 mlx5_core-$(CONFIG_MLX5_EN_RXNFC)    += en_fs_ethtool.o
 mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o
-mlx5_core-$(CONFIG_MLX5_ESWITCH)     += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o lag_mp.o
+mlx5_core-$(CONFIG_MLX5_ESWITCH)     += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o lag_mp.o \
+					lib/geneve.o
 
 #
 # Core extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c
new file mode 100644
index 000000000000..23361a9ae4fa
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#include <linux/kernel.h>
+#include "mlx5_core.h"
+#include "geneve.h"
+
+struct mlx5_geneve {
+	struct mlx5_core_dev *mdev;
+	__be16 opt_class;
+	u8 opt_type;
+	u32 obj_id;
+	struct mutex sync_lock; /* protect GENEVE obj operations */
+	u32 refcount;
+};
+
+static int mlx5_geneve_tlv_option_create(struct mlx5_core_dev *mdev,
+					 __be16 class,
+					 u8 type,
+					 u8 len)
+{
+	u32 in[MLX5_ST_SZ_DW(create_geneve_tlv_option_in)] = {};
+	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+	u64 general_obj_types;
+	void *hdr, *opt;
+	u16 obj_id;
+	int err;
+
+	general_obj_types = MLX5_CAP_GEN_64(mdev, general_obj_types);
+	if (!(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT))
+		return -EINVAL;
+
+	hdr = MLX5_ADDR_OF(create_geneve_tlv_option_in, in, hdr);
+	opt = MLX5_ADDR_OF(create_geneve_tlv_option_in, in, geneve_tlv_opt);
+
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_GENEVE_TLV_OPT);
+
+	MLX5_SET(geneve_tlv_option, opt, option_class, be16_to_cpu(class));
+	MLX5_SET(geneve_tlv_option, opt, option_type, type);
+	MLX5_SET(geneve_tlv_option, opt, option_data_length, len);
+
+	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+	return obj_id;
+}
+
+static void mlx5_geneve_tlv_option_destroy(struct mlx5_core_dev *mdev, u16 obj_id)
+{
+	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+
+	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_GENEVE_TLV_OPT);
+	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, obj_id);
+
+	mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_geneve_tlv_option_add(struct mlx5_geneve *geneve, struct geneve_opt *opt)
+{
+	int res = 0;
+
+	if (IS_ERR_OR_NULL(geneve))
+		return -EOPNOTSUPP;
+
+	mutex_lock(&geneve->sync_lock);
+
+	if (geneve->refcount) {
+		if (geneve->opt_class == opt->opt_class &&
+		    geneve->opt_type == opt->type) {
+			/* We already have TLV options obj allocated */
+			geneve->refcount++;
+		} else {
+			/* TLV options obj allocated, but its params
+			 * do not match the new request.
+			 * We support only one such object.
+			 */
+			mlx5_core_warn(geneve->mdev,
+				       "Won't create Geneve TLV opt object with class:type:len = 0x%x:0x%x:%d (another class:type already exists)\n",
+				       be16_to_cpu(opt->opt_class),
+				       opt->type,
+				       opt->length);
+			res = -EOPNOTSUPP;
+			goto unlock;
+		}
+	} else {
+		/* We don't have any TLV options obj allocated */
+
+		res = mlx5_geneve_tlv_option_create(geneve->mdev,
+						    opt->opt_class,
+						    opt->type,
+						    opt->length);
+		if (res < 0) {
+			mlx5_core_warn(geneve->mdev,
+				       "Failed creating Geneve TLV opt object class:type:len = 0x%x:0x%x:%d (err=%d)\n",
+				       be16_to_cpu(opt->opt_class),
+				       opt->type, opt->length, res);
+			goto unlock;
+		}
+		geneve->opt_class = opt->opt_class;
+		geneve->opt_type = opt->type;
+		geneve->obj_id = res;
+		geneve->refcount++;
+	}
+
+unlock:
+	mutex_unlock(&geneve->sync_lock);
+	return res;
+}
+
+void mlx5_geneve_tlv_option_del(struct mlx5_geneve *geneve)
+{
+	if (IS_ERR_OR_NULL(geneve))
+		return;
+
+	mutex_lock(&geneve->sync_lock);
+	if (--geneve->refcount == 0) {
+		/* We've just removed the last user of Geneve option.
+		 * Now delete the object in FW.
+		 */
+		mlx5_geneve_tlv_option_destroy(geneve->mdev, geneve->obj_id);
+
+		geneve->opt_class = 0;
+		geneve->opt_type = 0;
+		geneve->obj_id = 0;
+	}
+	mutex_unlock(&geneve->sync_lock);
+}
+
+struct mlx5_geneve *mlx5_geneve_create(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_geneve *geneve =
+		kzalloc(sizeof(*geneve), GFP_KERNEL);
+
+	if (!geneve)
+		return ERR_PTR(-ENOMEM);
+	geneve->mdev = mdev;
+	mutex_init(&geneve->sync_lock);
+
+	return geneve;
+}
+
+void mlx5_geneve_destroy(struct mlx5_geneve *geneve)
+{
+	if (IS_ERR_OR_NULL(geneve))
+		return;
+
+	/* Lockless since we are unloading */
+	if (geneve->refcount)
+		mlx5_geneve_tlv_option_destroy(geneve->mdev, geneve->obj_id);
+
+	kfree(geneve);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h
new file mode 100644
index 000000000000..adee0cbba19c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_GENEVE_H__
+#define __MLX5_GENEVE_H__
+
+#include <net/geneve.h>
+#include <linux/mlx5/driver.h>
+
+struct mlx5_geneve;
+
+#ifdef CONFIG_MLX5_ESWITCH
+
+struct mlx5_geneve *mlx5_geneve_create(struct mlx5_core_dev *mdev);
+void mlx5_geneve_destroy(struct mlx5_geneve *geneve);
+
+int mlx5_geneve_tlv_option_add(struct mlx5_geneve *geneve, struct geneve_opt *opt);
+void mlx5_geneve_tlv_option_del(struct mlx5_geneve *geneve);
+
+#else /* CONFIG_MLX5_ESWITCH */
+
+static inline struct mlx5_geneve
+*mlx5_geneve_create(struct mlx5_core_dev *mdev) { return NULL; }
+static inline void
+mlx5_geneve_destroy(struct mlx5_geneve *geneve) {}
+static inline int
+mlx5_geneve_tlv_option_add(struct mlx5_geneve *geneve, struct geneve_opt *opt) { return 0; }
+static inline void
+mlx5_geneve_tlv_option_del(struct mlx5_geneve *geneve) {}
+
+#endif /* CONFIG_MLX5_ESWITCH */
+
+#endif /* __MLX5_GENEVE_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 23d53163ce15..b27f9537256c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -63,6 +63,7 @@
 #include "accel/tls.h"
 #include "lib/clock.h"
 #include "lib/vxlan.h"
+#include "lib/geneve.h"
 #include "lib/devcom.h"
 #include "diag/fw_tracer.h"
 #include "ecpf.h"
@@ -821,6 +822,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 	mlx5_init_clock(dev);
 
 	dev->vxlan = mlx5_vxlan_create(dev);
+	dev->geneve = mlx5_geneve_create(dev);
 
 	err = mlx5_init_rl_table(dev);
 	if (err) {
@@ -865,6 +867,7 @@ err_mpfs_cleanup:
 err_rl_cleanup:
 	mlx5_cleanup_rl_table(dev);
 err_tables_cleanup:
+	mlx5_geneve_destroy(dev->geneve);
 	mlx5_vxlan_destroy(dev->vxlan);
 	mlx5_cleanup_mkey_table(dev);
 	mlx5_cleanup_qp_table(dev);
@@ -887,6 +890,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_eswitch_cleanup(dev->priv.eswitch);
 	mlx5_mpfs_cleanup(dev);
 	mlx5_cleanup_rl_table(dev);
+	mlx5_geneve_destroy(dev->geneve);
 	mlx5_vxlan_destroy(dev->vxlan);
 	mlx5_cleanup_clock(dev);
 	mlx5_cleanup_reserved_gids(dev);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b5431f7d97cb..3a810bf043fe 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -647,6 +647,7 @@ struct mlx5_clock {
 
 struct mlx5_fw_tracer;
 struct mlx5_vxlan;
+struct mlx5_geneve;
 
 struct mlx5_core_dev {
 	struct device *device;
@@ -681,6 +682,7 @@ struct mlx5_core_dev {
 	u32			issi;
 	struct mlx5e_resources  mlx5e_res;
 	struct mlx5_vxlan       *vxlan;
+	struct mlx5_geneve      *geneve;
 	struct {
 		struct mlx5_rsvd_gids	reserved_gids;
 		u32			roce_en;
-- 
cgit v1.2.3


From 1f52f6c0b0e846908e9c1082dab1b3f7088b82ac Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Tue, 28 May 2019 16:59:35 -0700
Subject: bpf: Create BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY

Create new macro BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY() to be used by
__cgroup_bpf_run_filter_skb for EGRESS BPF progs so BPF programs can
request cwr for TCP packets.

Current cgroup skb programs can only return 0 or 1 (0 to drop the
packet). This macro changes the behavior so the low order bit
indicates whether the packet should be dropped (0) or not (1)
and the next bit is used for congestion notification (cn).

Hence, new allowed return values of CGROUP EGRESS BPF programs are:
  0: drop packet
  1: keep packet
  2: drop packet and call cwr
  3: keep packet and call cwr

This macro then converts it to one of NET_XMIT values or -EPERM
that has the effect of dropping the packet with no cn.
  0: NET_XMIT_SUCCESS  skb should be transmitted (no cn)
  1: NET_XMIT_DROP     skb should be dropped and cwr called
  2: NET_XMIT_CN       skb should be transmitted and cwr called
  3: -EPERM            skb should be dropped (no cn)

Note that when more than one BPF program is called, the packet is
dropped if at least one of programs requests it be dropped, and
there is cn if at least one program returns cn.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ff3e00ff84d2..2cc58fc0f413 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -552,6 +552,56 @@ _out:							\
 		_ret;					\
 	 })
 
+/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs
+ * so BPF programs can request cwr for TCP packets.
+ *
+ * Current cgroup skb programs can only return 0 or 1 (0 to drop the
+ * packet. This macro changes the behavior so the low order bit
+ * indicates whether the packet should be dropped (0) or not (1)
+ * and the next bit is a congestion notification bit. This could be
+ * used by TCP to call tcp_enter_cwr()
+ *
+ * Hence, new allowed return values of CGROUP EGRESS BPF programs are:
+ *   0: drop packet
+ *   1: keep packet
+ *   2: drop packet and cn
+ *   3: keep packet and cn
+ *
+ * This macro then converts it to one of the NET_XMIT or an error
+ * code that is then interpreted as drop packet (and no cn):
+ *   0: NET_XMIT_SUCCESS  skb should be transmitted
+ *   1: NET_XMIT_DROP     skb should be dropped and cn
+ *   2: NET_XMIT_CN       skb should be transmitted and cn
+ *   3: -EPERM            skb should be dropped
+ */
+#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func)		\
+	({						\
+		struct bpf_prog_array_item *_item;	\
+		struct bpf_prog *_prog;			\
+		struct bpf_prog_array *_array;		\
+		u32 ret;				\
+		u32 _ret = 1;				\
+		u32 _cn = 0;				\
+		preempt_disable();			\
+		rcu_read_lock();			\
+		_array = rcu_dereference(array);	\
+		_item = &_array->items[0];		\
+		while ((_prog = READ_ONCE(_item->prog))) {		\
+			bpf_cgroup_storage_set(_item->cgroup_storage);	\
+			ret = func(_prog, ctx);		\
+			_ret &= (ret & 1);		\
+			_cn |= (ret & 2);		\
+			_item++;			\
+		}					\
+		rcu_read_unlock();			\
+		preempt_enable();			\
+		if (_ret)				\
+			_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);	\
+		else					\
+			_ret = (_cn ? NET_XMIT_DROP : -EPERM);		\
+		_ret;					\
+	})
+
 #define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
 	__BPF_PROG_RUN_ARRAY(array, ctx, func, false)
 
-- 
cgit v1.2.3


From 5cf1e91456301f8c4f6bbc63ff76cff12f92f31b Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Tue, 28 May 2019 16:59:36 -0700
Subject: bpf: cgroup inet skb programs can return 0 to 3

Allows cgroup inet skb programs to return values in the range [0, 3].
The second bit is used to determine if congestion occurred and higher
level protocol should decrease rate. E.g. TCP would call tcp_enter_cwr()

The bpf_prog must set expected_attach_type to BPF_CGROUP_INET_EGRESS
at load time if it uses the new return values (i.e. 2 or 3).

The expected_attach_type is currently not enforced for
BPF_PROG_TYPE_CGROUP_SKB.  For example, an existing bpf_prog with
expected_attach_type set to BPF_CGROUP_INET_EGRESS can attach to
BPF_CGROUP_INET_INGRESS.  Blindly enforcing expected_attach_type will
break backward compatibility.

This patch adds an enforce_expected_attach_type bit to only
enforce the expected_attach_type when it uses the new
return value.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  3 ++-
 kernel/bpf/syscall.c   | 12 ++++++++++++
 kernel/bpf/verifier.c  | 16 +++++++++++++---
 3 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ba8b65270e0d..43b45d6db36d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -526,7 +526,8 @@ struct bpf_prog {
 				blinded:1,	/* Was blinded */
 				is_func:1,	/* program is a bpf function */
 				kprobe_override:1, /* Do we override a kprobe? */
-				has_callchain_buf:1; /* callchain buffer allocated? */
+				has_callchain_buf:1, /* callchain buffer allocated? */
+				enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3d546b6f4646..1539774d78c7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1585,6 +1585,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		default:
 			return -EINVAL;
 		}
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		switch (expected_attach_type) {
+		case BPF_CGROUP_INET_INGRESS:
+		case BPF_CGROUP_INET_EGRESS:
+			return 0;
+		default:
+			return -EINVAL;
+		}
 	default:
 		return 0;
 	}
@@ -1836,6 +1844,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		return prog->enforce_expected_attach_type &&
+			prog->expected_attach_type != attach_type ?
+			-EINVAL : 0;
 	default:
 		return 0;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2778417e6e0c..5c2cb5bd84ce 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5508,11 +5508,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 static int check_return_code(struct bpf_verifier_env *env)
 {
+	struct tnum enforce_attach_type_range = tnum_unknown;
 	struct bpf_reg_state *reg;
 	struct tnum range = tnum_range(0, 1);
 
 	switch (env->prog->type) {
 	case BPF_PROG_TYPE_CGROUP_SKB:
+		if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
+			range = tnum_range(0, 3);
+			enforce_attach_type_range = tnum_range(2, 3);
+		}
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_SOCK_OPS:
@@ -5531,18 +5536,23 @@ static int check_return_code(struct bpf_verifier_env *env)
 	}
 
 	if (!tnum_in(range, reg->var_off)) {
+		char tn_buf[48];
+
 		verbose(env, "At program exit the register R0 ");
 		if (!tnum_is_unknown(reg->var_off)) {
-			char tn_buf[48];
-
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 			verbose(env, "has value %s", tn_buf);
 		} else {
 			verbose(env, "has unknown scalar value");
 		}
-		verbose(env, " should have been 0 or 1\n");
+		tnum_strn(tn_buf, sizeof(tn_buf), range);
+		verbose(env, " should have been %s\n", tn_buf);
 		return -EINVAL;
 	}
+
+	if (!tnum_is_unknown(enforce_attach_type_range) &&
+	    tnum_in(enforce_attach_type_range, reg->var_off))
+		env->prog->enforce_expected_attach_type = 1;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 3539b96e041c06e4317082816d90ec09160aeb11 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 29 May 2019 18:03:57 -0700
Subject: bpf: group memory related fields in struct bpf_map_memory

Group "user" and "pages" fields of bpf_map into the bpf_map_memory
structure. Later it can be extended with "memcg" and other related
information.

The main reason for such a change (besides cosmetics) is to pass
bpf_map_memory structure to charging functions before the actual
allocation of bpf_map.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h           | 10 +++++++---
 kernel/bpf/arraymap.c         |  2 +-
 kernel/bpf/cpumap.c           |  4 ++--
 kernel/bpf/devmap.c           |  4 ++--
 kernel/bpf/hashtab.c          |  4 ++--
 kernel/bpf/local_storage.c    |  2 +-
 kernel/bpf/lpm_trie.c         |  4 ++--
 kernel/bpf/queue_stack_maps.c |  2 +-
 kernel/bpf/reuseport_array.c  |  2 +-
 kernel/bpf/stackmap.c         |  4 ++--
 kernel/bpf/syscall.c          | 19 ++++++++++---------
 kernel/bpf/xskmap.c           |  4 ++--
 net/core/bpf_sk_storage.c     |  2 +-
 net/core/sock_map.c           |  4 ++--
 14 files changed, 36 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2cc58fc0f413..2e7c1c40d949 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -66,6 +66,11 @@ struct bpf_map_ops {
 				     u64 imm, u32 *off);
 };
 
+struct bpf_map_memory {
+	u32 pages;
+	struct user_struct *user;
+};
+
 struct bpf_map {
 	/* The first two cachelines with read-mostly members of which some
 	 * are also accessed in fast-path (e.g. ops, max_entries).
@@ -86,7 +91,7 @@ struct bpf_map {
 	u32 btf_key_type_id;
 	u32 btf_value_type_id;
 	struct btf *btf;
-	u32 pages;
+	struct bpf_map_memory memory;
 	bool unpriv_array;
 	bool frozen; /* write-once */
 	/* 48 bytes hole */
@@ -94,8 +99,7 @@ struct bpf_map {
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
 	 */
-	struct user_struct *user ____cacheline_aligned;
-	atomic_t refcnt;
+	atomic_t refcnt ____cacheline_aligned;
 	atomic_t usercnt;
 	struct work_struct work;
 	char name[BPF_OBJ_NAME_LEN];
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 584636c9e2eb..8fda24e78193 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -138,7 +138,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&array->map, attr);
-	array->map.pages = cost;
+	array->map.memory.pages = cost;
 	array->elem_size = elem_size;
 
 	if (percpu && bpf_array_alloc_percpu(array)) {
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index cf727d77c6c6..035268add724 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -108,10 +108,10 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_cmap;
-	cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	cmap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	ret = bpf_map_precharge_memlock(cmap->map.pages);
+	ret = bpf_map_precharge_memlock(cmap->map.memory.pages);
 	if (ret) {
 		err = ret;
 		goto free_cmap;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 1e525d70f833..f6c57efb1d0d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -111,10 +111,10 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_dtab;
 
-	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	dtab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(dtab->map.pages);
+	err = bpf_map_precharge_memlock(dtab->map.memory.pages);
 	if (err)
 		goto free_dtab;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 0f2708fde5f7..15bf228d2e98 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -364,10 +364,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		/* make sure page count doesn't overflow */
 		goto free_htab;
 
-	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	htab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(htab->map.pages);
+	err = bpf_map_precharge_memlock(htab->map.memory.pages);
 	if (err)
 		goto free_htab;
 
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index e48302ecb389..574325276650 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -303,7 +303,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 	if (!map)
 		return ERR_PTR(-ENOMEM);
 
-	map->map.pages = pages;
+	map->map.memory.pages = pages;
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&map->map, attr);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index e61630c2e50b..8e423a582760 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -578,9 +578,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 		goto out_err;
 	}
 
-	trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	trie->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	ret = bpf_map_precharge_memlock(trie->map.pages);
+	ret = bpf_map_precharge_memlock(trie->map.memory.pages);
 	if (ret)
 		goto out_err;
 
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 0b140d236889..8a510e71d486 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -89,7 +89,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 
 	bpf_map_init_from_attr(&qs->map, attr);
 
-	qs->map.pages = cost;
+	qs->map.memory.pages = cost;
 	qs->size = size;
 
 	raw_spin_lock_init(&qs->lock);
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 18e225de80ff..819515242739 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -176,7 +176,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&array->map, attr);
-	array->map.pages = cost;
+	array->map.memory.pages = cost;
 
 	return &array->map;
 }
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 950ab2f28922..08d4efff73ac 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -131,9 +131,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	bpf_map_init_from_attr(&smap->map, attr);
 	smap->map.value_size = value_size;
 	smap->n_buckets = n_buckets;
-	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	smap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	err = bpf_map_precharge_memlock(smap->map.pages);
+	err = bpf_map_precharge_memlock(smap->map.memory.pages);
 	if (err)
 		goto free_smap;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1539774d78c7..8289a2ce14fc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -222,19 +222,20 @@ static int bpf_map_init_memlock(struct bpf_map *map)
 	struct user_struct *user = get_current_user();
 	int ret;
 
-	ret = bpf_charge_memlock(user, map->pages);
+	ret = bpf_charge_memlock(user, map->memory.pages);
 	if (ret) {
 		free_uid(user);
 		return ret;
 	}
-	map->user = user;
+	map->memory.user = user;
 	return ret;
 }
 
 static void bpf_map_release_memlock(struct bpf_map *map)
 {
-	struct user_struct *user = map->user;
-	bpf_uncharge_memlock(user, map->pages);
+	struct user_struct *user = map->memory.user;
+
+	bpf_uncharge_memlock(user, map->memory.pages);
 	free_uid(user);
 }
 
@@ -242,17 +243,17 @@ int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
 {
 	int ret;
 
-	ret = bpf_charge_memlock(map->user, pages);
+	ret = bpf_charge_memlock(map->memory.user, pages);
 	if (ret)
 		return ret;
-	map->pages += pages;
+	map->memory.pages += pages;
 	return ret;
 }
 
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
 {
-	bpf_uncharge_memlock(map->user, pages);
-	map->pages -= pages;
+	bpf_uncharge_memlock(map->memory.user, pages);
+	map->memory.pages -= pages;
 }
 
 static int bpf_map_alloc_id(struct bpf_map *map)
@@ -395,7 +396,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   map->value_size,
 		   map->max_entries,
 		   map->map_flags,
-		   map->pages * 1ULL << PAGE_SHIFT,
+		   map->memory.pages * 1ULL << PAGE_SHIFT,
 		   map->id,
 		   READ_ONCE(map->frozen));
 
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 686d244e798d..f816ee1a0fa0 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -40,10 +40,10 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_m;
 
-	m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	m->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	err = bpf_map_precharge_memlock(m->map.pages);
+	err = bpf_map_precharge_memlock(m->map.memory.pages);
 	if (err)
 		goto free_m;
 
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 9a8aaf8e235d..92581c3ff220 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -659,7 +659,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
 	smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) %
 		BPF_SK_STORAGE_CACHE_SIZE;
-	smap->map.pages = pages;
+	smap->map.memory.pages = pages;
 
 	return &smap->map;
 }
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index be6092ac69f8..4eb5b6a1b29f 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -49,8 +49,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 		goto free_stab;
 	}
 
-	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-	err = bpf_map_precharge_memlock(stab->map.pages);
+	stab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	err = bpf_map_precharge_memlock(stab->map.memory.pages);
 	if (err)
 		goto free_stab;
 
-- 
cgit v1.2.3


From b936ca643ade11f265fa10e5fb71c20d9c5243f1 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 29 May 2019 18:03:58 -0700
Subject: bpf: rework memlock-based memory accounting for maps

In order to unify the existing memlock charging code with the
memcg-based memory accounting, which will be added later, let's
rework the current scheme.

Currently the following design is used:
  1) .alloc() callback optionally checks if the allocation will likely
     succeed using bpf_map_precharge_memlock()
  2) .alloc() performs actual allocations
  3) .alloc() callback calculates map cost and sets map.memory.pages
  4) map_create() calls bpf_map_init_memlock() which sets map.memory.user
     and performs actual charging; in case of failure the map is
     destroyed
  <map is in use>
  1) bpf_map_free_deferred() calls bpf_map_release_memlock(), which
     performs uncharge and releases the user
  2) .map_free() callback releases the memory

The scheme can be simplified and made more robust:
  1) .alloc() calculates map cost and calls bpf_map_charge_init()
  2) bpf_map_charge_init() sets map.memory.user and performs actual
    charge
  3) .alloc() performs actual allocations
  <map is in use>
  1) .map_free() callback releases the memory
  2) bpf_map_charge_finish() performs uncharge and releases the user

The new scheme also allows reusing bpf_map_charge_init()/finish()
functions for memcg-based accounting. Because charges are performed
before actual allocations and uncharges after freeing the memory,
no bogus memory pressure can be created.

In cases when the map structure is not available (e.g. it's not
created yet, or is already destroyed), on-stack bpf_map_memory
structure is used. The charge can be transferred with the
bpf_map_charge_move() function.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h           |  5 +++-
 kernel/bpf/arraymap.c         | 10 +++++--
 kernel/bpf/cpumap.c           |  8 +++--
 kernel/bpf/devmap.c           | 13 ++++----
 kernel/bpf/hashtab.c          | 11 +++----
 kernel/bpf/local_storage.c    |  9 ++++--
 kernel/bpf/lpm_trie.c         |  5 ++--
 kernel/bpf/queue_stack_maps.c |  9 ++++--
 kernel/bpf/reuseport_array.c  |  9 ++++--
 kernel/bpf/stackmap.c         | 30 +++++++++++--------
 kernel/bpf/syscall.c          | 69 +++++++++++++++++++++----------------------
 kernel/bpf/xskmap.c           |  9 +++---
 net/core/bpf_sk_storage.c     |  8 +++--
 net/core/sock_map.c           |  5 ++--
 14 files changed, 112 insertions(+), 88 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2e7c1c40d949..3c8f24f402bf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -650,9 +650,12 @@ struct bpf_map *__bpf_map_get(struct fd f);
 struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
-int bpf_map_precharge_memlock(u32 pages);
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
+int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages);
+void bpf_map_charge_finish(struct bpf_map_memory *mem);
+void bpf_map_charge_move(struct bpf_map_memory *dst,
+			 struct bpf_map_memory *src);
 void *bpf_map_area_alloc(size_t size, int numa_node);
 void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8fda24e78193..3552da4407d9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -83,6 +83,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	u32 elem_size, index_mask, max_entries;
 	bool unpriv = !capable(CAP_SYS_ADMIN);
 	u64 cost, array_size, mask64;
+	struct bpf_map_memory mem;
 	struct bpf_array *array;
 
 	elem_size = round_up(attr->value_size, 8);
@@ -125,23 +126,26 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	}
 	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	ret = bpf_map_precharge_memlock(cost);
+	ret = bpf_map_charge_init(&mem, cost);
 	if (ret < 0)
 		return ERR_PTR(ret);
 
 	/* allocate all map elements and zero-initialize them */
 	array = bpf_map_area_alloc(array_size, numa_node);
-	if (!array)
+	if (!array) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 	array->index_mask = index_mask;
 	array->map.unpriv_array = unpriv;
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&array->map, attr);
-	array->map.memory.pages = cost;
+	bpf_map_charge_move(&array->map.memory, &mem);
 	array->elem_size = elem_size;
 
 	if (percpu && bpf_array_alloc_percpu(array)) {
+		bpf_map_charge_finish(&array->map.memory);
 		bpf_map_area_free(array);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 035268add724..c633c8d68023 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -108,10 +108,10 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_cmap;
-	cmap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	ret = bpf_map_precharge_memlock(cmap->map.memory.pages);
+	ret = bpf_map_charge_init(&cmap->map.memory,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
 	if (ret) {
 		err = ret;
 		goto free_cmap;
@@ -121,7 +121,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
 					    __alignof__(unsigned long));
 	if (!cmap->flush_needed)
-		goto free_cmap;
+		goto free_charge;
 
 	/* Alloc array for possible remote "destination" CPUs */
 	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
@@ -133,6 +133,8 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	return &cmap->map;
 free_percpu:
 	free_percpu(cmap->flush_needed);
+free_charge:
+	bpf_map_charge_finish(&cmap->map.memory);
 free_cmap:
 	kfree(cmap);
 	return ERR_PTR(err);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index f6c57efb1d0d..371bd880ed58 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -111,10 +111,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_dtab;
 
-	dtab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(dtab->map.memory.pages);
+	/* if map size is larger than memlock limit, reject it */
+	err = bpf_map_charge_init(&dtab->map.memory,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
 	if (err)
 		goto free_dtab;
 
@@ -125,19 +124,21 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 						__alignof__(unsigned long),
 						GFP_KERNEL | __GFP_NOWARN);
 	if (!dtab->flush_needed)
-		goto free_dtab;
+		goto free_charge;
 
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 					      sizeof(struct bpf_dtab_netdev *),
 					      dtab->map.numa_node);
 	if (!dtab->netdev_map)
-		goto free_dtab;
+		goto free_charge;
 
 	spin_lock(&dev_map_lock);
 	list_add_tail_rcu(&dtab->list, &dev_map_list);
 	spin_unlock(&dev_map_lock);
 
 	return &dtab->map;
+free_charge:
+	bpf_map_charge_finish(&dtab->map.memory);
 free_dtab:
 	free_percpu(dtab->flush_needed);
 	kfree(dtab);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 15bf228d2e98..b0bdc7b040ad 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -364,10 +364,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		/* make sure page count doesn't overflow */
 		goto free_htab;
 
-	htab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(htab->map.memory.pages);
+	/* if map size is larger than memlock limit, reject it */
+	err = bpf_map_charge_init(&htab->map.memory,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
 	if (err)
 		goto free_htab;
 
@@ -376,7 +375,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 					   sizeof(struct bucket),
 					   htab->map.numa_node);
 	if (!htab->buckets)
-		goto free_htab;
+		goto free_charge;
 
 	if (htab->map.map_flags & BPF_F_ZERO_SEED)
 		htab->hashrnd = 0;
@@ -409,6 +408,8 @@ free_prealloc:
 	prealloc_destroy(htab);
 free_buckets:
 	bpf_map_area_free(htab->buckets);
+free_charge:
+	bpf_map_charge_finish(&htab->map.memory);
 free_htab:
 	kfree(htab);
 	return ERR_PTR(err);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 574325276650..e49bfd4f4f6d 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -272,6 +272,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 {
 	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_cgroup_storage_map *map;
+	struct bpf_map_memory mem;
 	u32 pages;
 	int ret;
 
@@ -294,16 +295,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 
 	pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >>
 		PAGE_SHIFT;
-	ret = bpf_map_precharge_memlock(pages);
+	ret = bpf_map_charge_init(&mem, pages);
 	if (ret < 0)
 		return ERR_PTR(ret);
 
 	map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
 			   __GFP_ZERO | GFP_USER, numa_node);
-	if (!map)
+	if (!map) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 
-	map->map.memory.pages = pages;
+	bpf_map_charge_move(&map->map.memory, &mem);
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&map->map, attr);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 8e423a582760..6345a8d2dcd0 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -578,9 +578,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 		goto out_err;
 	}
 
-	trie->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	ret = bpf_map_precharge_memlock(trie->map.memory.pages);
+	ret = bpf_map_charge_init(&trie->map.memory,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
 	if (ret)
 		goto out_err;
 
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 8a510e71d486..224cb0fd8f03 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -67,6 +67,7 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)
 static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 {
 	int ret, numa_node = bpf_map_attr_numa_node(attr);
+	struct bpf_map_memory mem = {0};
 	struct bpf_queue_stack *qs;
 	u64 size, queue_size, cost;
 
@@ -77,19 +78,21 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 
 	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	ret = bpf_map_precharge_memlock(cost);
+	ret = bpf_map_charge_init(&mem, cost);
 	if (ret < 0)
 		return ERR_PTR(ret);
 
 	qs = bpf_map_area_alloc(queue_size, numa_node);
-	if (!qs)
+	if (!qs) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	memset(qs, 0, sizeof(*qs));
 
 	bpf_map_init_from_attr(&qs->map, attr);
 
-	qs->map.memory.pages = cost;
+	bpf_map_charge_move(&qs->map.memory, &mem);
 	qs->size = size;
 
 	raw_spin_lock_init(&qs->lock);
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 819515242739..5c6e25b1b9b1 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -151,6 +151,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 {
 	int err, numa_node = bpf_map_attr_numa_node(attr);
 	struct reuseport_array *array;
+	struct bpf_map_memory mem;
 	u64 cost, array_size;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -165,18 +166,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 		return ERR_PTR(-ENOMEM);
 	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	err = bpf_map_precharge_memlock(cost);
+	err = bpf_map_charge_init(&mem, cost);
 	if (err)
 		return ERR_PTR(err);
 
 	/* allocate all map elements and zero-initialize them */
 	array = bpf_map_area_alloc(array_size, numa_node);
-	if (!array)
+	if (!array) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&array->map, attr);
-	array->map.memory.pages = cost;
+	bpf_map_charge_move(&array->map.memory, &mem);
 
 	return &array->map;
 }
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 08d4efff73ac..8da24ca65d97 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -89,6 +89,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 {
 	u32 value_size = attr->value_size;
 	struct bpf_stack_map *smap;
+	struct bpf_map_memory mem;
 	u64 cost, n_buckets;
 	int err;
 
@@ -116,40 +117,43 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	n_buckets = roundup_pow_of_two(attr->max_entries);
 
 	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
+	if (cost >= U32_MAX - PAGE_SIZE)
+		return ERR_PTR(-E2BIG);
+	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
 	if (cost >= U32_MAX - PAGE_SIZE)
 		return ERR_PTR(-E2BIG);
 
+	err = bpf_map_charge_init(&mem,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	if (err)
+		return ERR_PTR(err);
+
 	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
-	if (!smap)
+	if (!smap) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
-
-	err = -E2BIG;
-	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_smap;
+	}
 
 	bpf_map_init_from_attr(&smap->map, attr);
 	smap->map.value_size = value_size;
 	smap->n_buckets = n_buckets;
-	smap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	err = bpf_map_precharge_memlock(smap->map.memory.pages);
-	if (err)
-		goto free_smap;
 
 	err = get_callchain_buffers(sysctl_perf_event_max_stack);
 	if (err)
-		goto free_smap;
+		goto free_charge;
 
 	err = prealloc_elems_and_freelist(smap);
 	if (err)
 		goto put_buffers;
 
+	bpf_map_charge_move(&smap->map.memory, &mem);
+
 	return &smap->map;
 
 put_buffers:
 	put_callchain_buffers();
-free_smap:
+free_charge:
+	bpf_map_charge_finish(&mem);
 	bpf_map_area_free(smap);
 	return ERR_PTR(err);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8289a2ce14fc..4a5ebad99154 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -188,19 +188,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
 	map->numa_node = bpf_map_attr_numa_node(attr);
 }
 
-int bpf_map_precharge_memlock(u32 pages)
-{
-	struct user_struct *user = get_current_user();
-	unsigned long memlock_limit, cur;
-
-	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	cur = atomic_long_read(&user->locked_vm);
-	free_uid(user);
-	if (cur + pages > memlock_limit)
-		return -EPERM;
-	return 0;
-}
-
 static int bpf_charge_memlock(struct user_struct *user, u32 pages)
 {
 	unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -214,29 +201,40 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages)
 
 static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
 {
-	atomic_long_sub(pages, &user->locked_vm);
+	if (user)
+		atomic_long_sub(pages, &user->locked_vm);
 }
 
-static int bpf_map_init_memlock(struct bpf_map *map)
+int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages)
 {
 	struct user_struct *user = get_current_user();
 	int ret;
 
-	ret = bpf_charge_memlock(user, map->memory.pages);
+	ret = bpf_charge_memlock(user, pages);
 	if (ret) {
 		free_uid(user);
 		return ret;
 	}
-	map->memory.user = user;
-	return ret;
+
+	mem->pages = pages;
+	mem->user = user;
+
+	return 0;
 }
 
-static void bpf_map_release_memlock(struct bpf_map *map)
+void bpf_map_charge_finish(struct bpf_map_memory *mem)
 {
-	struct user_struct *user = map->memory.user;
+	bpf_uncharge_memlock(mem->user, mem->pages);
+	free_uid(mem->user);
+}
 
-	bpf_uncharge_memlock(user, map->memory.pages);
-	free_uid(user);
+void bpf_map_charge_move(struct bpf_map_memory *dst,
+			 struct bpf_map_memory *src)
+{
+	*dst = *src;
+
+	/* Make sure src will not be used for the redundant uncharging. */
+	memset(src, 0, sizeof(struct bpf_map_memory));
 }
 
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
@@ -304,11 +302,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 static void bpf_map_free_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
+	struct bpf_map_memory mem;
 
-	bpf_map_release_memlock(map);
+	bpf_map_charge_move(&mem, &map->memory);
 	security_bpf_map_free(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
+	bpf_map_charge_finish(&mem);
 }
 
 static void bpf_map_put_uref(struct bpf_map *map)
@@ -550,6 +550,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 static int map_create(union bpf_attr *attr)
 {
 	int numa_node = bpf_map_attr_numa_node(attr);
+	struct bpf_map_memory mem;
 	struct bpf_map *map;
 	int f_flags;
 	int err;
@@ -574,7 +575,7 @@ static int map_create(union bpf_attr *attr)
 
 	err = bpf_obj_name_cpy(map->name, attr->map_name);
 	if (err)
-		goto free_map_nouncharge;
+		goto free_map;
 
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
@@ -584,20 +585,20 @@ static int map_create(union bpf_attr *attr)
 
 		if (!attr->btf_value_type_id) {
 			err = -EINVAL;
-			goto free_map_nouncharge;
+			goto free_map;
 		}
 
 		btf = btf_get_by_fd(attr->btf_fd);
 		if (IS_ERR(btf)) {
 			err = PTR_ERR(btf);
-			goto free_map_nouncharge;
+			goto free_map;
 		}
 
 		err = map_check_btf(map, btf, attr->btf_key_type_id,
 				    attr->btf_value_type_id);
 		if (err) {
 			btf_put(btf);
-			goto free_map_nouncharge;
+			goto free_map;
 		}
 
 		map->btf = btf;
@@ -609,15 +610,11 @@ static int map_create(union bpf_attr *attr)
 
 	err = security_bpf_map_alloc(map);
 	if (err)
-		goto free_map_nouncharge;
-
-	err = bpf_map_init_memlock(map);
-	if (err)
-		goto free_map_sec;
+		goto free_map;
 
 	err = bpf_map_alloc_id(map);
 	if (err)
-		goto free_map;
+		goto free_map_sec;
 
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
@@ -633,13 +630,13 @@ static int map_create(union bpf_attr *attr)
 
 	return err;
 
-free_map:
-	bpf_map_release_memlock(map);
 free_map_sec:
 	security_bpf_map_free(map);
-free_map_nouncharge:
+free_map:
 	btf_put(map->btf);
+	bpf_map_charge_move(&mem, &map->memory);
 	map->ops->map_free(map);
+	bpf_map_charge_finish(&mem);
 	return err;
 }
 
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index f816ee1a0fa0..a329dab7c7a4 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -40,10 +40,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_m;
 
-	m->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	err = bpf_map_precharge_memlock(m->map.memory.pages);
+	err = bpf_map_charge_init(&m->map.memory,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
 	if (err)
 		goto free_m;
 
@@ -51,7 +50,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 
 	m->flush_list = alloc_percpu(struct list_head);
 	if (!m->flush_list)
-		goto free_m;
+		goto free_charge;
 
 	for_each_possible_cpu(cpu)
 		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
@@ -65,6 +64,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 
 free_percpu:
 	free_percpu(m->flush_list);
+free_charge:
+	bpf_map_charge_finish(&m->map.memory);
 free_m:
 	kfree(m);
 	return ERR_PTR(err);
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 92581c3ff220..621a0b07ff11 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -640,13 +640,16 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
 	pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	ret = bpf_map_precharge_memlock(pages);
-	if (ret < 0)
+	ret = bpf_map_charge_init(&smap->map.memory, pages);
+	if (ret < 0) {
+		kfree(smap);
 		return ERR_PTR(ret);
+	}
 
 	smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
 				 GFP_USER | __GFP_NOWARN);
 	if (!smap->buckets) {
+		bpf_map_charge_finish(&smap->map.memory);
 		kfree(smap);
 		return ERR_PTR(-ENOMEM);
 	}
@@ -659,7 +662,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
 	smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) %
 		BPF_SK_STORAGE_CACHE_SIZE;
-	smap->map.memory.pages = pages;
 
 	return &smap->map;
 }
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 4eb5b6a1b29f..1028c922a149 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -49,8 +49,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 		goto free_stab;
 	}
 
-	stab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-	err = bpf_map_precharge_memlock(stab->map.memory.pages);
+	err = bpf_map_charge_init(&stab->map.memory,
+				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
 	if (err)
 		goto free_stab;
 
@@ -60,6 +60,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	if (stab->sks)
 		return &stab->map;
 	err = -ENOMEM;
+	bpf_map_charge_finish(&stab->map.memory);
 free_stab:
 	kfree(stab);
 	return ERR_PTR(err);
-- 
cgit v1.2.3


From c85d69135a9175c50a823d04d62d932312d037b3 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 29 May 2019 18:03:59 -0700
Subject: bpf: move memory size checks to bpf_map_charge_init()

Most bpf map types do similar checks and bytes-to-pages
conversion during memory allocation and charging.

Let's unify these checks by moving them into bpf_map_charge_init().

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h           |  2 +-
 kernel/bpf/arraymap.c         |  8 +-------
 kernel/bpf/cpumap.c           |  5 +----
 kernel/bpf/devmap.c           |  5 +----
 kernel/bpf/hashtab.c          |  7 +------
 kernel/bpf/local_storage.c    |  5 +----
 kernel/bpf/lpm_trie.c         |  7 +------
 kernel/bpf/queue_stack_maps.c |  4 ----
 kernel/bpf/reuseport_array.c  | 10 ++--------
 kernel/bpf/stackmap.c         |  8 +-------
 kernel/bpf/syscall.c          |  9 +++++++--
 kernel/bpf/xskmap.c           |  5 +----
 net/core/bpf_sk_storage.c     |  4 +---
 net/core/sock_map.c           |  8 +-------
 14 files changed, 20 insertions(+), 67 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3c8f24f402bf..e5a309e6a400 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -652,7 +652,7 @@ void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
-int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages);
+int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size);
 void bpf_map_charge_finish(struct bpf_map_memory *mem);
 void bpf_map_charge_move(struct bpf_map_memory *dst,
 			 struct bpf_map_memory *src);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3552da4407d9..0349cbf23cdb 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -117,14 +117,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	/* make sure there is no u32 overflow later in round_up() */
 	cost = array_size;
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-ENOMEM);
-	if (percpu) {
+	if (percpu)
 		cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
-		if (cost >= U32_MAX - PAGE_SIZE)
-			return ERR_PTR(-ENOMEM);
-	}
-	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	ret = bpf_map_charge_init(&mem, cost);
 	if (ret < 0)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index c633c8d68023..b31a71909307 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -106,12 +106,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	/* make sure page count doesn't overflow */
 	cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
 	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_cmap;
 
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	ret = bpf_map_charge_init(&cmap->map.memory,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	ret = bpf_map_charge_init(&cmap->map.memory, cost);
 	if (ret) {
 		err = ret;
 		goto free_cmap;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 371bd880ed58..5ae7cce5ef16 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -108,12 +108,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	/* make sure page count doesn't overflow */
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
 	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_dtab;
 
 	/* if map size is larger than memlock limit, reject it */
-	err = bpf_map_charge_init(&dtab->map.memory,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	err = bpf_map_charge_init(&dtab->map.memory, cost);
 	if (err)
 		goto free_dtab;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b0bdc7b040ad..d92e05d9979b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -360,13 +360,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	else
 	       cost += (u64) htab->elem_size * num_possible_cpus();
 
-	if (cost >= U32_MAX - PAGE_SIZE)
-		/* make sure page count doesn't overflow */
-		goto free_htab;
-
 	/* if map size is larger than memlock limit, reject it */
-	err = bpf_map_charge_init(&htab->map.memory,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	err = bpf_map_charge_init(&htab->map.memory, cost);
 	if (err)
 		goto free_htab;
 
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index e49bfd4f4f6d..addd6fdceec8 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -273,7 +273,6 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_cgroup_storage_map *map;
 	struct bpf_map_memory mem;
-	u32 pages;
 	int ret;
 
 	if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
@@ -293,9 +292,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 		/* max_entries is not used and enforced to be 0 */
 		return ERR_PTR(-EINVAL);
 
-	pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >>
-		PAGE_SHIFT;
-	ret = bpf_map_charge_init(&mem, pages);
+	ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map));
 	if (ret < 0)
 		return ERR_PTR(ret);
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 6345a8d2dcd0..09334f13a8a0 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -573,13 +573,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	cost_per_node = sizeof(struct lpm_trie_node) +
 			attr->value_size + trie->data_size;
 	cost += (u64) attr->max_entries * cost_per_node;
-	if (cost >= U32_MAX - PAGE_SIZE) {
-		ret = -E2BIG;
-		goto out_err;
-	}
 
-	ret = bpf_map_charge_init(&trie->map.memory,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	ret = bpf_map_charge_init(&trie->map.memory, cost);
 	if (ret)
 		goto out_err;
 
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 224cb0fd8f03..f697647ceb54 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -73,10 +73,6 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 
 	size = (u64) attr->max_entries + 1;
 	cost = queue_size = sizeof(*qs) + size * attr->value_size;
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-E2BIG);
-
-	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	ret = bpf_map_charge_init(&mem, cost);
 	if (ret < 0)
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 5c6e25b1b9b1..50c083ba978c 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -152,7 +152,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 	int err, numa_node = bpf_map_attr_numa_node(attr);
 	struct reuseport_array *array;
 	struct bpf_map_memory mem;
-	u64 cost, array_size;
+	u64 array_size;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -160,13 +160,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 	array_size = sizeof(*array);
 	array_size += (u64)attr->max_entries * sizeof(struct sock *);
 
-	/* make sure there is no u32 overflow later in round_up() */
-	cost = array_size;
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-ENOMEM);
-	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	err = bpf_map_charge_init(&mem, cost);
+	err = bpf_map_charge_init(&mem, array_size);
 	if (err)
 		return ERR_PTR(err);
 
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 8da24ca65d97..3d86072d8e32 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -117,14 +117,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	n_buckets = roundup_pow_of_two(attr->max_entries);
 
 	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-E2BIG);
 	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-E2BIG);
-
-	err = bpf_map_charge_init(&mem,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	err = bpf_map_charge_init(&mem, cost);
 	if (err)
 		return ERR_PTR(err);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4a5ebad99154..4c53cbd3329d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -205,11 +205,16 @@ static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
 		atomic_long_sub(pages, &user->locked_vm);
 }
 
-int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages)
+int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size)
 {
-	struct user_struct *user = get_current_user();
+	u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
+	struct user_struct *user;
 	int ret;
 
+	if (size >= U32_MAX - PAGE_SIZE)
+		return -E2BIG;
+
+	user = get_current_user();
 	ret = bpf_charge_memlock(user, pages);
 	if (ret) {
 		free_uid(user);
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index a329dab7c7a4..22066c28ba61 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -37,12 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 
 	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
 	cost += sizeof(struct list_head) * num_possible_cpus();
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_m;
 
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	err = bpf_map_charge_init(&m->map.memory,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	err = bpf_map_charge_init(&m->map.memory, cost);
 	if (err)
 		goto free_m;
 
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 621a0b07ff11..f40e3d35fd9c 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -626,7 +626,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	struct bpf_sk_storage_map *smap;
 	unsigned int i;
 	u32 nbuckets;
-	u32 pages;
 	u64 cost;
 	int ret;
 
@@ -638,9 +637,8 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 	smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus()));
 	nbuckets = 1U << smap->bucket_log;
 	cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
-	pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-	ret = bpf_map_charge_init(&smap->map.memory, pages);
+	ret = bpf_map_charge_init(&smap->map.memory, cost);
 	if (ret < 0) {
 		kfree(smap);
 		return ERR_PTR(ret);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 1028c922a149..52d4faeee18b 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -44,13 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 
 	/* Make sure page count doesn't overflow. */
 	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
-	if (cost >= U32_MAX - PAGE_SIZE) {
-		err = -EINVAL;
-		goto free_stab;
-	}
-
-	err = bpf_map_charge_init(&stab->map.memory,
-				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
+	err = bpf_map_charge_init(&stab->map.memory, cost);
 	if (err)
 		goto free_stab;
 
-- 
cgit v1.2.3


From ef11db3310e272d3d8dbe8739e0770820dd20e52 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 31 May 2019 18:27:04 +0200
Subject: net: inetdevice: provide replacement iterators for in_ifaddr walk

The ifa_list is protected either by rcu or rtnl lock, but the
current iterators do not account for this.

This adds two iterators as replacements; a later patch in
the series will update them with the needed rcu/rtnl_dereference calls.

It's not done in this patch yet to avoid sparse warnings -- the fields
lack the proper __rcu annotation.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h | 10 +++++++++-
 net/ipv4/devinet.c         | 31 ++++++++++++++++---------------
 2 files changed, 25 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 367dc2a0f84a..d5d05503a04b 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -186,7 +186,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst,
 struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
 				    __be32 mask);
 struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr);
-static __inline__ bool inet_ifa_match(__be32 addr, struct in_ifaddr *ifa)
+static inline bool inet_ifa_match(__be32 addr, const struct in_ifaddr *ifa)
 {
 	return !((addr^ifa->ifa_address)&ifa->ifa_mask);
 }
@@ -215,6 +215,14 @@ static __inline__ bool bad_mask(__be32 mask, __be32 addr)
 
 #define endfor_ifa(in_dev) }
 
+#define in_dev_for_each_ifa_rtnl(ifa, in_dev)			\
+	for (ifa = (in_dev)->ifa_list; ifa;			\
+	     ifa = ifa->ifa_next)
+
+#define in_dev_for_each_ifa_rcu(ifa, in_dev)			\
+	for (ifa = (in_dev)->ifa_list; ifa;			\
+	     ifa = ifa->ifa_next)
+
 static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev)
 {
 	return rcu_dereference(dev->ip_ptr);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 701c5d113a34..7803a4d2951c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -873,13 +873,12 @@ errout:
 static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
 {
 	struct in_device *in_dev = ifa->ifa_dev;
-	struct in_ifaddr *ifa1, **ifap;
+	struct in_ifaddr *ifa1;
 
 	if (!ifa->ifa_local)
 		return NULL;
 
-	for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
-	     ifap = &ifa1->ifa_next) {
+	in_dev_for_each_ifa_rtnl(ifa1, in_dev) {
 		if (ifa1->ifa_mask == ifa->ifa_mask &&
 		    inet_ifa_match(ifa1->ifa_address, ifa) &&
 		    ifa1->ifa_local == ifa->ifa_local)
@@ -1208,7 +1207,7 @@ out:
 static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
 {
 	struct in_device *in_dev = __in_dev_get_rtnl(dev);
-	struct in_ifaddr *ifa;
+	const struct in_ifaddr *ifa;
 	struct ifreq ifr;
 	int done = 0;
 
@@ -1218,7 +1217,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int s
 	if (!in_dev)
 		goto out;
 
-	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+	in_dev_for_each_ifa_rtnl(ifa, in_dev) {
 		if (!buf) {
 			done += size;
 			continue;
@@ -1321,10 +1320,11 @@ EXPORT_SYMBOL(inet_select_addr);
 static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
 			      __be32 local, int scope)
 {
-	int same = 0;
+	const struct in_ifaddr *ifa;
 	__be32 addr = 0;
+	int same = 0;
 
-	for_ifa(in_dev) {
+	in_dev_for_each_ifa_rcu(ifa, in_dev) {
 		if (!addr &&
 		    (local == ifa->ifa_local || !local) &&
 		    ifa->ifa_scope <= scope) {
@@ -1350,7 +1350,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
 				same = 0;
 			}
 		}
-	} endfor_ifa(in_dev);
+	}
 
 	return same ? addr : 0;
 }
@@ -1424,7 +1424,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
 	struct in_ifaddr *ifa;
 	int named = 0;
 
-	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+	in_dev_for_each_ifa_rtnl(ifa, in_dev) {
 		char old[IFNAMSIZ], *dot;
 
 		memcpy(old, ifa->ifa_label, IFNAMSIZ);
@@ -1454,10 +1454,9 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev,
 					struct in_device *in_dev)
 
 {
-	struct in_ifaddr *ifa;
+	const struct in_ifaddr *ifa;
 
-	for (ifa = in_dev->ifa_list; ifa;
-	     ifa = ifa->ifa_next) {
+	in_dev_for_each_ifa_rtnl(ifa, in_dev) {
 		arp_send(ARPOP_REQUEST, ETH_P_ARP,
 			 ifa->ifa_local, dev,
 			 ifa->ifa_local, NULL,
@@ -1727,15 +1726,17 @@ static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
 	int ip_idx = 0;
 	int err;
 
-	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) {
-		if (ip_idx < s_ip_idx)
+	in_dev_for_each_ifa_rcu(ifa, in_dev) {
+		if (ip_idx < s_ip_idx) {
+			ip_idx++;
 			continue;
-
+		}
 		err = inet_fill_ifaddr(skb, ifa, fillargs);
 		if (err < 0)
 			goto done;
 
 		nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+		ip_idx++;
 	}
 	err = 0;
 
-- 
cgit v1.2.3


From 2638eb8b50cfc16240e0bb080b9afbf541a9b39d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 31 May 2019 18:27:09 +0200
Subject: net: ipv4: provide __rcu annotation for ifa_list

ifa_list is protected by rcu, yet the code doesn't reflect this.

Add the __rcu annotations and fix up all places that are now reported by
sparse.

I've done this in the same commit to not add intermediate patches that
result in new warnings.

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/i40iw/i40iw_utils.c       | 12 ++--
 drivers/infiniband/hw/nes/nes.c                 |  8 ++-
 drivers/infiniband/hw/usnic/usnic_ib_main.c     | 15 +++--
 drivers/net/ethernet/via/via-velocity.h         |  2 +-
 drivers/net/plip/plip.c                         |  4 +-
 drivers/net/vmxnet3/vmxnet3_drv.c               | 19 ++++--
 drivers/net/wireless/ath/ath6kl/cfg80211.c      |  4 +-
 drivers/net/wireless/marvell/mwifiex/cfg80211.c |  2 +-
 drivers/staging/isdn/hysdn/hysdn_net.c          |  6 +-
 include/linux/inetdevice.h                      | 21 ++----
 net/core/netpoll.c                              | 10 ++-
 net/core/pktgen.c                               |  8 ++-
 net/ipv4/devinet.c                              | 88 ++++++++++++++++---------
 net/mac80211/main.c                             |  4 +-
 net/netfilter/nf_nat_redirect.c                 | 12 ++--
 15 files changed, 134 insertions(+), 81 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c
index 337410f40860..016524683e17 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c
@@ -174,10 +174,14 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
 		rcu_read_lock();
 		in = __in_dev_get_rcu(upper_dev);
 
-		if (!in->ifa_list)
-			local_ipaddr = 0;
-		else
-			local_ipaddr = ntohl(in->ifa_list->ifa_address);
+		local_ipaddr = 0;
+		if (in) {
+			struct in_ifaddr *ifa;
+
+			ifa = rcu_dereference(in->ifa_list);
+			if (ifa)
+				local_ipaddr = ntohl(ifa->ifa_address);
+		}
 
 		rcu_read_unlock();
 	} else {
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
index e00add6d78ec..29b324726ea6 100644
--- a/drivers/infiniband/hw/nes/nes.c
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -183,7 +183,13 @@ static int nes_inetaddr_event(struct notifier_block *notifier,
 
 						rcu_read_lock();
 						in = __in_dev_get_rcu(upper_dev);
-						nesvnic->local_ipaddr = in->ifa_list->ifa_address;
+						if (in) {
+							struct in_ifaddr *ifa;
+
+							ifa = rcu_dereference(in->ifa_list);
+							if (ifa)
+								nesvnic->local_ipaddr = ifa->ifa_address;
+						}
 						rcu_read_unlock();
 					} else {
 						nesvnic->local_ipaddr = ifa->ifa_address;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index d88d9f8a7f9a..34c1f9d6c915 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -427,11 +427,16 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
 	if (netif_carrier_ok(us_ibdev->netdev))
 		usnic_fwd_carrier_up(us_ibdev->ufdev);
 
-	ind = in_dev_get(netdev);
-	if (ind->ifa_list)
-		usnic_fwd_add_ipaddr(us_ibdev->ufdev,
-				     ind->ifa_list->ifa_address);
-	in_dev_put(ind);
+	rcu_read_lock();
+	ind = __in_dev_get_rcu(netdev);
+	if (ind) {
+		const struct in_ifaddr *ifa;
+
+		ifa = rcu_dereference(ind->ifa_list);
+		if (ifa)
+			usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address);
+	}
+	rcu_read_unlock();
 
 	usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr,
 				us_ibdev->ufdev->inaddr, &gid.raw[0]);
diff --git a/drivers/net/ethernet/via/via-velocity.h b/drivers/net/ethernet/via/via-velocity.h
index c0ecc6c7b5e0..cdfe7809e3c1 100644
--- a/drivers/net/ethernet/via/via-velocity.h
+++ b/drivers/net/ethernet/via/via-velocity.h
@@ -1509,7 +1509,7 @@ static inline int velocity_get_ip(struct velocity_info *vptr)
 	rcu_read_lock();
 	in_dev = __in_dev_get_rcu(vptr->netdev);
 	if (in_dev != NULL) {
-		ifa = (struct in_ifaddr *) in_dev->ifa_list;
+		ifa = rcu_dereference(in_dev->ifa_list);
 		if (ifa != NULL) {
 			memcpy(vptr->ip_addr, &ifa->ifa_address, 4);
 			res = 0;
diff --git a/drivers/net/plip/plip.c b/drivers/net/plip/plip.c
index feb92ecd1880..3e3ac2e496a1 100644
--- a/drivers/net/plip/plip.c
+++ b/drivers/net/plip/plip.c
@@ -1012,7 +1012,7 @@ plip_rewrite_address(const struct net_device *dev, struct ethhdr *eth)
 	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev) {
 		/* Any address will do - we take the first */
-		const struct in_ifaddr *ifa = in_dev->ifa_list;
+		const struct in_ifaddr *ifa = rcu_dereference(in_dev->ifa_list);
 		if (ifa) {
 			memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
 			memset(eth->h_dest, 0xfc, 2);
@@ -1107,7 +1107,7 @@ plip_open(struct net_device *dev)
 		/* Any address will do - we take the first. We already
 		   have the first two bytes filled with 0xfc, from
 		   plip_init_dev(). */
-		struct in_ifaddr *ifa=in_dev->ifa_list;
+		const struct in_ifaddr *ifa = rcu_dereference(in_dev->ifa_list);
 		if (ifa != NULL) {
 			memcpy(dev->dev_addr+2, &ifa->ifa_local, 4);
 		}
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 89984fcab01e..1b2a18ea855c 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -3651,13 +3651,19 @@ vmxnet3_suspend(struct device *device)
 	}
 
 	if (adapter->wol & WAKE_ARP) {
-		in_dev = in_dev_get(netdev);
-		if (!in_dev)
+		rcu_read_lock();
+
+		in_dev = __in_dev_get_rcu(netdev);
+		if (!in_dev) {
+			rcu_read_unlock();
 			goto skip_arp;
+		}
 
-		ifa = (struct in_ifaddr *)in_dev->ifa_list;
-		if (!ifa)
+		ifa = rcu_dereference(in_dev->ifa_list);
+		if (!ifa) {
+			rcu_read_unlock();
 			goto skip_arp;
+		}
 
 		pmConf->filters[i].patternSize = ETH_HLEN + /* Ethernet header*/
 			sizeof(struct arphdr) +		/* ARP header */
@@ -3677,7 +3683,9 @@ vmxnet3_suspend(struct device *device)
 
 		/* The Unicast IPv4 address in 'tip' field. */
 		arpreq += 2 * ETH_ALEN + sizeof(u32);
-		*(u32 *)arpreq = ifa->ifa_address;
+		*(__be32 *)arpreq = ifa->ifa_address;
+
+		rcu_read_unlock();
 
 		/* The mask for the relevant bits. */
 		pmConf->filters[i].mask[0] = 0x00;
@@ -3686,7 +3694,6 @@ vmxnet3_suspend(struct device *device)
 		pmConf->filters[i].mask[3] = 0x00;
 		pmConf->filters[i].mask[4] = 0xC0; /* IPv4 TIP */
 		pmConf->filters[i].mask[5] = 0x03; /* IPv4 TIP */
-		in_dev_put(in_dev);
 
 		pmConf->wakeUpEvents |= VMXNET3_PM_WAKEUP_FILTER;
 		i++;
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 5477a014e1fb..37cf602d8adf 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -2194,13 +2194,13 @@ static int ath6kl_wow_suspend_vif(struct ath6kl_vif *vif,
 	if (!in_dev)
 		return 0;
 
-	ifa = in_dev->ifa_list;
+	ifa = rtnl_dereference(in_dev->ifa_list);
 	memset(&ips, 0, sizeof(ips));
 
 	/* Configure IP addr only if IP address count < MAX_IP_ADDRS */
 	while (index < MAX_IP_ADDRS && ifa) {
 		ips[index] = ifa->ifa_local;
-		ifa = ifa->ifa_next;
+		ifa = rtnl_dereference(ifa->ifa_next);
 		index++;
 	}
 
diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index e11a4bb67172..5a7cdb981789 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -3268,7 +3268,7 @@ static void mwifiex_set_auto_arp_mef_entry(struct mwifiex_private *priv,
 			in_dev = __in_dev_get_rtnl(adapter->priv[i]->netdev);
 			if (!in_dev)
 				continue;
-			ifa = in_dev->ifa_list;
+			ifa = rtnl_dereference(in_dev->ifa_list);
 			if (!ifa || !ifa->ifa_local)
 				continue;
 			ips[i] = ifa->ifa_local;
diff --git a/drivers/staging/isdn/hysdn/hysdn_net.c b/drivers/staging/isdn/hysdn/hysdn_net.c
index 8e9c34f33d86..bea37ae30ebb 100644
--- a/drivers/staging/isdn/hysdn/hysdn_net.c
+++ b/drivers/staging/isdn/hysdn/hysdn_net.c
@@ -70,9 +70,13 @@ net_open(struct net_device *dev)
 		for (i = 0; i < ETH_ALEN; i++)
 			dev->dev_addr[i] = 0xfc;
 		if ((in_dev = dev->ip_ptr) != NULL) {
-			struct in_ifaddr *ifa = in_dev->ifa_list;
+			const struct in_ifaddr *ifa;
+
+			rcu_read_lock();
+			ifa = rcu_dereference(in_dev->ifa_list);
 			if (ifa != NULL)
 				memcpy(dev->dev_addr + (ETH_ALEN - sizeof(ifa->ifa_local)), &ifa->ifa_local, sizeof(ifa->ifa_local));
+			rcu_read_unlock();
 		}
 	} else
 		memcpy(dev->dev_addr, card->mac_addr, ETH_ALEN);
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index d5d05503a04b..3515ca64e638 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -26,7 +26,7 @@ struct in_device {
 	struct net_device	*dev;
 	refcount_t		refcnt;
 	int			dead;
-	struct in_ifaddr	*ifa_list;	/* IP ifaddr chain		*/
+	struct in_ifaddr	__rcu *ifa_list;/* IP ifaddr chain		*/
 
 	struct ip_mc_list __rcu	*mc_list;	/* IP multicast filter chain    */
 	struct ip_mc_list __rcu	* __rcu *mc_hash;
@@ -136,7 +136,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 
 struct in_ifaddr {
 	struct hlist_node	hash;
-	struct in_ifaddr	*ifa_next;
+	struct in_ifaddr	__rcu *ifa_next;
 	struct in_device	*ifa_dev;
 	struct rcu_head		rcu_head;
 	__be32			ifa_local;
@@ -206,22 +206,13 @@ static __inline__ bool bad_mask(__be32 mask, __be32 addr)
 	return false;
 }
 
-#define for_primary_ifa(in_dev)	{ struct in_ifaddr *ifa; \
-  for (ifa = (in_dev)->ifa_list; ifa && !(ifa->ifa_flags&IFA_F_SECONDARY); ifa = ifa->ifa_next)
-
-#define for_ifa(in_dev)	{ struct in_ifaddr *ifa; \
-  for (ifa = (in_dev)->ifa_list; ifa; ifa = ifa->ifa_next)
-
-
-#define endfor_ifa(in_dev) }
-
 #define in_dev_for_each_ifa_rtnl(ifa, in_dev)			\
-	for (ifa = (in_dev)->ifa_list; ifa;			\
-	     ifa = ifa->ifa_next)
+	for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa;	\
+	     ifa = rtnl_dereference(ifa->ifa_next))
 
 #define in_dev_for_each_ifa_rcu(ifa, in_dev)			\
-	for (ifa = (in_dev)->ifa_list; ifa;			\
-	     ifa = ifa->ifa_next)
+	for (ifa = rcu_dereference((in_dev)->ifa_list); ifa;	\
+	     ifa = rcu_dereference(ifa->ifa_next))
 
 static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev)
 {
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index dd8b1a460d64..2cf27da1baeb 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -696,16 +696,22 @@ int netpoll_setup(struct netpoll *np)
 
 	if (!np->local_ip.ip) {
 		if (!np->ipv6) {
+			const struct in_ifaddr *ifa;
+
 			in_dev = __in_dev_get_rtnl(ndev);
+			if (!in_dev)
+				goto put_noaddr;
 
-			if (!in_dev || !in_dev->ifa_list) {
+			ifa = rtnl_dereference(in_dev->ifa_list);
+			if (!ifa) {
+put_noaddr:
 				np_err(np, "no IP address for %s, aborting\n",
 				       np->dev_name);
 				err = -EDESTADDRREQ;
 				goto put;
 			}
 
-			np->local_ip.ip = in_dev->ifa_list->ifa_local;
+			np->local_ip.ip = ifa->ifa_local;
 			np_info(np, "local IP %pI4\n", &np->local_ip.ip);
 		} else {
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 319ad5490fb3..4cd120dc30ad 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2125,9 +2125,11 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 			rcu_read_lock();
 			in_dev = __in_dev_get_rcu(pkt_dev->odev);
 			if (in_dev) {
-				if (in_dev->ifa_list) {
-					pkt_dev->saddr_min =
-					    in_dev->ifa_list->ifa_address;
+				const struct in_ifaddr *ifa;
+
+				ifa = rcu_dereference(in_dev->ifa_list);
+				if (ifa) {
+					pkt_dev->saddr_min = ifa->ifa_address;
 					pkt_dev->saddr_max = pkt_dev->saddr_min;
 				}
 			}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index b45421b2b734..ebaea05b4033 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -194,7 +194,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
 
 static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
 static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain);
-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+static void inet_del_ifa(struct in_device *in_dev,
+			 struct in_ifaddr __rcu **ifap,
 			 int destroy);
 #ifdef CONFIG_SYSCTL
 static int devinet_sysctl_register(struct in_device *idev);
@@ -300,8 +301,8 @@ static void in_dev_rcu_put(struct rcu_head *head)
 
 static void inetdev_destroy(struct in_device *in_dev)
 {
-	struct in_ifaddr *ifa;
 	struct net_device *dev;
+	struct in_ifaddr *ifa;
 
 	ASSERT_RTNL();
 
@@ -311,7 +312,7 @@ static void inetdev_destroy(struct in_device *in_dev)
 
 	ip_mc_destroy_dev(in_dev);
 
-	while ((ifa = in_dev->ifa_list) != NULL) {
+	while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) {
 		inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
 		inet_free_ifa(ifa);
 	}
@@ -342,17 +343,20 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
 	return 0;
 }
 
-static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
-			 int destroy, struct nlmsghdr *nlh, u32 portid)
+static void __inet_del_ifa(struct in_device *in_dev,
+			   struct in_ifaddr __rcu **ifap,
+			   int destroy, struct nlmsghdr *nlh, u32 portid)
 {
 	struct in_ifaddr *promote = NULL;
-	struct in_ifaddr *ifa, *ifa1 = *ifap;
-	struct in_ifaddr *last_prim = in_dev->ifa_list;
+	struct in_ifaddr *ifa, *ifa1;
+	struct in_ifaddr *last_prim;
 	struct in_ifaddr *prev_prom = NULL;
 	int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
 
 	ASSERT_RTNL();
 
+	ifa1 = rtnl_dereference(*ifap);
+	last_prim = rtnl_dereference(in_dev->ifa_list);
 	if (in_dev->dead)
 		goto no_promotions;
 
@@ -361,9 +365,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	 **/
 
 	if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
-		struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+		struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next;
 
-		while ((ifa = *ifap1) != NULL) {
+		while ((ifa = rtnl_dereference(*ifap1)) != NULL) {
 			if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
 			    ifa1->ifa_scope <= ifa->ifa_scope)
 				last_prim = ifa;
@@ -396,7 +400,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	 * and later to add them back with new prefsrc. Do this
 	 * while all addresses are on the device list.
 	 */
-	for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+	for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) {
 		if (ifa1->ifa_mask == ifa->ifa_mask &&
 		    inet_ifa_match(ifa1->ifa_address, ifa))
 			fib_del_ifaddr(ifa, ifa1);
@@ -422,19 +426,24 @@ no_promotions:
 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 
 	if (promote) {
-		struct in_ifaddr *next_sec = promote->ifa_next;
+		struct in_ifaddr *next_sec;
 
+		next_sec = rtnl_dereference(promote->ifa_next);
 		if (prev_prom) {
-			prev_prom->ifa_next = promote->ifa_next;
-			promote->ifa_next = last_prim->ifa_next;
-			last_prim->ifa_next = promote;
+			struct in_ifaddr *last_sec;
+
+			last_sec = rtnl_dereference(last_prim->ifa_next);
+			rcu_assign_pointer(prev_prom->ifa_next, next_sec);
+			rcu_assign_pointer(promote->ifa_next, last_sec);
+			rcu_assign_pointer(last_prim->ifa_next, promote);
 		}
 
 		promote->ifa_flags &= ~IFA_F_SECONDARY;
 		rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
 		blocking_notifier_call_chain(&inetaddr_chain,
 				NETDEV_UP, promote);
-		for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
+		for (ifa = next_sec; ifa;
+		     ifa = rtnl_dereference(ifa->ifa_next)) {
 			if (ifa1->ifa_mask != ifa->ifa_mask ||
 			    !inet_ifa_match(ifa1->ifa_address, ifa))
 					continue;
@@ -446,7 +455,8 @@ no_promotions:
 		inet_free_ifa(ifa1);
 }
 
-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+static void inet_del_ifa(struct in_device *in_dev,
+			 struct in_ifaddr __rcu **ifap,
 			 int destroy)
 {
 	__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
@@ -459,9 +469,10 @@ static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
 static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 			     u32 portid, struct netlink_ext_ack *extack)
 {
+	struct in_ifaddr __rcu **last_primary, **ifap;
 	struct in_device *in_dev = ifa->ifa_dev;
-	struct in_ifaddr *ifa1, **ifap, **last_primary;
 	struct in_validator_info ivi;
+	struct in_ifaddr *ifa1;
 	int ret;
 
 	ASSERT_RTNL();
@@ -474,8 +485,10 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	ifa->ifa_flags &= ~IFA_F_SECONDARY;
 	last_primary = &in_dev->ifa_list;
 
-	for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
-	     ifap = &ifa1->ifa_next) {
+	ifap = &in_dev->ifa_list;
+	ifa1 = rtnl_dereference(*ifap);
+
+	while (ifa1) {
 		if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
 		    ifa->ifa_scope <= ifa1->ifa_scope)
 			last_primary = &ifa1->ifa_next;
@@ -491,6 +504,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 			}
 			ifa->ifa_flags |= IFA_F_SECONDARY;
 		}
+
+		ifap = &ifa1->ifa_next;
+		ifa1 = rtnl_dereference(*ifap);
 	}
 
 	/* Allow any devices that wish to register ifaddr validtors to weigh
@@ -516,8 +532,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 		ifap = last_primary;
 	}
 
-	ifa->ifa_next = *ifap;
-	*ifap = ifa;
+	rcu_assign_pointer(ifa->ifa_next, *ifap);
+	rcu_assign_pointer(*ifap, ifa);
 
 	inet_hash_insert(dev_net(in_dev->dev), ifa);
 
@@ -617,10 +633,12 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
 			    struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
+	struct in_ifaddr __rcu **ifap;
 	struct nlattr *tb[IFA_MAX+1];
 	struct in_device *in_dev;
 	struct ifaddrmsg *ifm;
-	struct in_ifaddr *ifa, **ifap;
+	struct in_ifaddr *ifa;
+
 	int err = -EINVAL;
 
 	ASSERT_RTNL();
@@ -637,7 +655,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto errout;
 	}
 
-	for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+	for (ifap = &in_dev->ifa_list; (ifa = rtnl_dereference(*ifap)) != NULL;
 	     ifap = &ifa->ifa_next) {
 		if (tb[IFA_LOCAL] &&
 		    ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL]))
@@ -725,15 +743,20 @@ static void check_lifetime(struct work_struct *work)
 
 			if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
 			    age >= ifa->ifa_valid_lft) {
-				struct in_ifaddr **ifap;
-
-				for (ifap = &ifa->ifa_dev->ifa_list;
-				     *ifap != NULL; ifap = &(*ifap)->ifa_next) {
-					if (*ifap == ifa) {
+				struct in_ifaddr __rcu **ifap;
+				struct in_ifaddr *tmp;
+
+				ifap = &ifa->ifa_dev->ifa_list;
+				tmp = rtnl_dereference(*ifap);
+				while (tmp) {
+					tmp = rtnl_dereference(tmp->ifa_next);
+					if (rtnl_dereference(*ifap) == ifa) {
 						inet_del_ifa(ifa->ifa_dev,
 							     ifap, 1);
 						break;
 					}
+					ifap = &tmp->ifa_next;
+					tmp = rtnl_dereference(*ifap);
 				}
 			} else if (ifa->ifa_preferred_lft !=
 				   INFINITY_LIFE_TIME &&
@@ -977,8 +1000,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
 {
 	struct sockaddr_in sin_orig;
 	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr;
+	struct in_ifaddr __rcu **ifap = NULL;
 	struct in_device *in_dev;
-	struct in_ifaddr **ifap = NULL;
 	struct in_ifaddr *ifa = NULL;
 	struct net_device *dev;
 	char *colon;
@@ -1049,7 +1072,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
 			/* note: we only do this for a limited set of ioctls
 			   and only if the original address family was AF_INET.
 			   This is checked above. */
-			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+
+			for (ifap = &in_dev->ifa_list;
+			     (ifa = rtnl_dereference(*ifap)) != NULL;
 			     ifap = &ifa->ifa_next) {
 				if (!strcmp(ifr->ifr_name, ifa->ifa_label) &&
 				    sin_orig.sin_addr.s_addr ==
@@ -1062,7 +1087,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
 		   4.3BSD-style and passed in junk so we fall back to
 		   comparing just the label */
 		if (!ifa) {
-			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+			for (ifap = &in_dev->ifa_list;
+			     (ifa = rtnl_dereference(*ifap)) != NULL;
 			     ifap = &ifa->ifa_next)
 				if (!strcmp(ifr->ifr_name, ifa->ifa_label))
 					break;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 2b608044ae23..1f11907dc528 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -354,11 +354,11 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
 	sdata_lock(sdata);
 
 	/* Copy the addresses to the bss_conf list */
-	ifa = idev->ifa_list;
+	ifa = rtnl_dereference(idev->ifa_list);
 	while (ifa) {
 		if (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN)
 			bss_conf->arp_addr_list[c] = ifa->ifa_address;
-		ifa = ifa->ifa_next;
+		ifa = rtnl_dereference(ifa->ifa_next);
 		c++;
 	}
 
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 78a9e6454ff3..8598e80968e0 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -47,15 +47,17 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
 	if (hooknum == NF_INET_LOCAL_OUT) {
 		newdst = htonl(0x7F000001);
 	} else {
-		struct in_device *indev;
-		struct in_ifaddr *ifa;
+		const struct in_device *indev;
 
 		newdst = 0;
 
 		indev = __in_dev_get_rcu(skb->dev);
-		if (indev && indev->ifa_list) {
-			ifa = indev->ifa_list;
-			newdst = ifa->ifa_local;
+		if (indev) {
+			const struct in_ifaddr *ifa;
+
+			ifa = rcu_dereference(indev->ifa_list);
+			if (ifa)
+				newdst = ifa->ifa_local;
 		}
 
 		if (!newdst)
-- 
cgit v1.2.3


From 1cc26450a855aa35a6d515be14c539944d5f9648 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Fri, 31 May 2019 14:05:06 -0700
Subject: flow_dissector: remove unused FLOW_DISSECTOR_F_STOP_AT_L3 flag

This flag is not used by any caller, remove it.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h |  5 ++---
 net/core/flow_dissector.c    | 10 +---------
 2 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 7c5a8d9a8d2a..797e19c2fc40 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -229,9 +229,8 @@ enum flow_dissector_key_id {
 };
 
 #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG		BIT(0)
-#define FLOW_DISSECTOR_F_STOP_AT_L3		BIT(1)
-#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL	BIT(2)
-#define FLOW_DISSECTOR_F_STOP_AT_ENCAP		BIT(3)
+#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL	BIT(1)
+#define FLOW_DISSECTOR_F_STOP_AT_ENCAP		BIT(2)
 
 struct flow_dissector_key {
 	enum flow_dissector_key_id key_id;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index edd622956083..c0559af9e5e5 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -757,7 +757,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
  * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
  * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
  * @flags: flags that control the dissection process, e.g.
- *         FLOW_DISSECTOR_F_STOP_AT_L3.
+ *         FLOW_DISSECTOR_F_STOP_AT_ENCAP.
  *
  * The function will try to retrieve individual keys into target specified
  * by flow_dissector from either the skbuff or a raw buffer specified by the
@@ -922,11 +922,6 @@ proto_again:
 		__skb_flow_dissect_ipv4(skb, flow_dissector,
 					target_container, data, iph);
 
-		if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) {
-			fdret = FLOW_DISSECT_RET_OUT_GOOD;
-			break;
-		}
-
 		break;
 	}
 	case htons(ETH_P_IPV6): {
@@ -975,9 +970,6 @@ proto_again:
 		__skb_flow_dissect_ipv6(skb, flow_dissector,
 					target_container, data, iph);
 
-		if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
-			fdret = FLOW_DISSECT_RET_OUT_GOOD;
-
 		break;
 	}
 	case htons(ETH_P_8021AD):
-- 
cgit v1.2.3


From fa85999f492e227b373c20db22acfa993d770e4b Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Fri, 31 May 2019 22:47:21 +0100
Subject: flow_offload: include linux/kernel.h from flow_offload.h

flow_stats_update() uses max_t, so ensure we have that defined.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index a2df99f9b196..36fdb85c974d 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -1,6 +1,7 @@
 #ifndef _NET_FLOW_OFFLOAD_H
 #define _NET_FLOW_OFFLOAD_H
 
+#include <linux/kernel.h>
 #include <net/flow_dissector.h>
 
 struct flow_match {
-- 
cgit v1.2.3


From b7034146756b9e91cc059b19df7fe4defd4d7de7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 2 Jun 2019 11:24:18 -0700
Subject: net: fix use-after-free in kfree_skb_list

syzbot reported a nasty use-after-free [1]

Let's remove the frag_list field from structs ip_fraglist_iter
and ip6_fraglist_iter. It does not seem to be needed anyway.

[1] :
BUG: KASAN: use-after-free in kfree_skb_list+0x5d/0x60 net/core/skbuff.c:706
Read of size 8 at addr ffff888085a3cbc0 by task syz-executor303/8947

CPU: 0 PID: 8947 Comm: syz-executor303 Not tainted 5.2.0-rc2+ #12
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188
 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317
 kasan_report+0x12/0x20 mm/kasan/common.c:614
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
 kfree_skb_list+0x5d/0x60 net/core/skbuff.c:706
 ip6_fragment+0x1ef4/0x2680 net/ipv6/ip6_output.c:882
 __ip6_finish_output+0x577/0xaa0 net/ipv6/ip6_output.c:144
 ip6_finish_output+0x38/0x1f0 net/ipv6/ip6_output.c:156
 NF_HOOK_COND include/linux/netfilter.h:294 [inline]
 ip6_output+0x235/0x7f0 net/ipv6/ip6_output.c:179
 dst_output include/net/dst.h:433 [inline]
 ip6_local_out+0xbb/0x1b0 net/ipv6/output_core.c:179
 ip6_send_skb+0xbb/0x350 net/ipv6/ip6_output.c:1796
 ip6_push_pending_frames+0xc8/0xf0 net/ipv6/ip6_output.c:1816
 rawv6_push_pending_frames net/ipv6/raw.c:617 [inline]
 rawv6_sendmsg+0x2993/0x35e0 net/ipv6/raw.c:947
 inet_sendmsg+0x141/0x5d0 net/ipv4/af_inet.c:802
 sock_sendmsg_nosec net/socket.c:652 [inline]
 sock_sendmsg+0xd7/0x130 net/socket.c:671
 ___sys_sendmsg+0x803/0x920 net/socket.c:2292
 __sys_sendmsg+0x105/0x1d0 net/socket.c:2330
 __do_sys_sendmsg net/socket.c:2339 [inline]
 __se_sys_sendmsg net/socket.c:2337 [inline]
 __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2337
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x44add9
Code: e8 7c e6 ff ff 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 1b 05 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f826f33bce8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00000000006e7a18 RCX: 000000000044add9
RDX: 0000000000000000 RSI: 0000000020000240 RDI: 0000000000000005
RBP: 00000000006e7a10 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006e7a1c
R13: 00007ffcec4f7ebf R14: 00007f826f33c9c0 R15: 20c49ba5e353f7cf

Allocated by task 8947:
 save_stack+0x23/0x90 mm/kasan/common.c:71
 set_track mm/kasan/common.c:79 [inline]
 __kasan_kmalloc mm/kasan/common.c:489 [inline]
 __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:462
 kasan_slab_alloc+0xf/0x20 mm/kasan/common.c:497
 slab_post_alloc_hook mm/slab.h:437 [inline]
 slab_alloc_node mm/slab.c:3269 [inline]
 kmem_cache_alloc_node+0x131/0x710 mm/slab.c:3579
 __alloc_skb+0xd5/0x5e0 net/core/skbuff.c:199
 alloc_skb include/linux/skbuff.h:1058 [inline]
 __ip6_append_data.isra.0+0x2a24/0x3640 net/ipv6/ip6_output.c:1519
 ip6_append_data+0x1e5/0x320 net/ipv6/ip6_output.c:1688
 rawv6_sendmsg+0x1467/0x35e0 net/ipv6/raw.c:940
 inet_sendmsg+0x141/0x5d0 net/ipv4/af_inet.c:802
 sock_sendmsg_nosec net/socket.c:652 [inline]
 sock_sendmsg+0xd7/0x130 net/socket.c:671
 ___sys_sendmsg+0x803/0x920 net/socket.c:2292
 __sys_sendmsg+0x105/0x1d0 net/socket.c:2330
 __do_sys_sendmsg net/socket.c:2339 [inline]
 __se_sys_sendmsg net/socket.c:2337 [inline]
 __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2337
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 8947:
 save_stack+0x23/0x90 mm/kasan/common.c:71
 set_track mm/kasan/common.c:79 [inline]
 __kasan_slab_free+0x102/0x150 mm/kasan/common.c:451
 kasan_slab_free+0xe/0x10 mm/kasan/common.c:459
 __cache_free mm/slab.c:3432 [inline]
 kmem_cache_free+0x86/0x260 mm/slab.c:3698
 kfree_skbmem net/core/skbuff.c:625 [inline]
 kfree_skbmem+0xc5/0x150 net/core/skbuff.c:619
 __kfree_skb net/core/skbuff.c:682 [inline]
 kfree_skb net/core/skbuff.c:699 [inline]
 kfree_skb+0xf0/0x390 net/core/skbuff.c:693
 kfree_skb_list+0x44/0x60 net/core/skbuff.c:708
 __dev_xmit_skb net/core/dev.c:3551 [inline]
 __dev_queue_xmit+0x3034/0x36b0 net/core/dev.c:3850
 dev_queue_xmit+0x18/0x20 net/core/dev.c:3914
 neigh_direct_output+0x16/0x20 net/core/neighbour.c:1532
 neigh_output include/net/neighbour.h:511 [inline]
 ip6_finish_output2+0x1034/0x2550 net/ipv6/ip6_output.c:120
 ip6_fragment+0x1ebb/0x2680 net/ipv6/ip6_output.c:863
 __ip6_finish_output+0x577/0xaa0 net/ipv6/ip6_output.c:144
 ip6_finish_output+0x38/0x1f0 net/ipv6/ip6_output.c:156
 NF_HOOK_COND include/linux/netfilter.h:294 [inline]
 ip6_output+0x235/0x7f0 net/ipv6/ip6_output.c:179
 dst_output include/net/dst.h:433 [inline]
 ip6_local_out+0xbb/0x1b0 net/ipv6/output_core.c:179
 ip6_send_skb+0xbb/0x350 net/ipv6/ip6_output.c:1796
 ip6_push_pending_frames+0xc8/0xf0 net/ipv6/ip6_output.c:1816
 rawv6_push_pending_frames net/ipv6/raw.c:617 [inline]
 rawv6_sendmsg+0x2993/0x35e0 net/ipv6/raw.c:947
 inet_sendmsg+0x141/0x5d0 net/ipv4/af_inet.c:802
 sock_sendmsg_nosec net/socket.c:652 [inline]
 sock_sendmsg+0xd7/0x130 net/socket.c:671
 ___sys_sendmsg+0x803/0x920 net/socket.c:2292
 __sys_sendmsg+0x105/0x1d0 net/socket.c:2330
 __do_sys_sendmsg net/socket.c:2339 [inline]
 __se_sys_sendmsg net/socket.c:2337 [inline]
 __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2337
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at ffff888085a3cbc0
 which belongs to the cache skbuff_head_cache of size 224
The buggy address is located 0 bytes inside of
 224-byte region [ffff888085a3cbc0, ffff888085a3cca0)
The buggy address belongs to the page:
page:ffffea0002168f00 refcount:1 mapcount:0 mapping:ffff88821b6f63c0 index:0x0
flags: 0x1fffc0000000200(slab)
raw: 01fffc0000000200 ffffea00027bbf88 ffffea0002105b88 ffff88821b6f63c0
raw: 0000000000000000 ffff888085a3c080 000000010000000c 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff888085a3ca80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffff888085a3cb00: 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc
>ffff888085a3cb80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
                                           ^
 ffff888085a3cc00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff888085a3cc80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc

Fixes: 0feca6190f88 ("net: ipv6: add skbuff fraglist splitter")
Fixes: c8b17be0b7a4 ("net: ipv4: add skbuff fraglist splitter")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h      | 1 -
 include/net/ipv6.h    | 1 -
 net/ipv4/ip_output.c  | 5 ++---
 net/ipv6/ip6_output.c | 5 ++---
 net/ipv6/netfilter.c  | 2 +-
 5 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 029cc3fd26bd..cd5cde5532d5 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -167,7 +167,6 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		   int (*output)(struct net *, struct sock *, struct sk_buff *));
 
 struct ip_fraglist_iter {
-	struct sk_buff	*frag_list;
 	struct sk_buff	*frag;
 	struct iphdr	*iph;
 	int		offset;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 21bb830e9679..0d34f6ed9681 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -156,7 +156,6 @@ struct frag_hdr {
 
 struct ip6_fraglist_iter {
 	struct ipv6hdr	*tmp_hdr;
-	struct sk_buff	*frag_list;
 	struct sk_buff	*frag;
 	int		offset;
 	unsigned int	hlen;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ceca5285d9b4..f5636ab0b9c3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -575,8 +575,7 @@ void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
 {
 	unsigned int first_len = skb_pagelen(skb);
 
-	iter->frag_list = skb_shinfo(skb)->frag_list;
-	iter->frag = iter->frag_list;
+	iter->frag = skb_shinfo(skb)->frag_list;
 	skb_frag_list_init(skb);
 
 	iter->offset = 0;
@@ -845,7 +844,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 			return 0;
 		}
 
-		kfree_skb_list(iter.frag_list);
+		kfree_skb_list(iter.frag);
 
 		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
 		return err;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8fa83b78f81a..1f430cd49d8a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -613,8 +613,7 @@ int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 	if (!iter->tmp_hdr)
 		return -ENOMEM;
 
-	iter->frag_list = skb_shinfo(skb)->frag_list;
-	iter->frag = iter->frag_list;
+	iter->frag = skb_shinfo(skb)->frag_list;
 	skb_frag_list_init(skb);
 
 	iter->offset = 0;
@@ -879,7 +878,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 			return 0;
 		}
 
-		kfree_skb_list(iter.frag_list);
+		kfree_skb_list(iter.frag);
 
 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 			      IPSTATS_MIB_FRAGFAILS);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 9530cc280953..d9673e10c60c 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -194,7 +194,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		if (!err)
 			return 0;
 
-		kfree_skb_list(iter.frag_list);
+		kfree_skb_list(iter.frag);
 		return err;
 	}
 slow_path:
-- 
cgit v1.2.3


From 191ed2024de9fcfaab24106f9dbf7e544b07d633 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Tue, 4 Jun 2019 15:40:40 +0200
Subject: devlink: allow driver to update progress of flash update

Introduce functions to be called from drivers during a flash update. They
send notifications to userspace about flash update progress.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |   8 ++++
 include/uapi/linux/devlink.h |   5 +++
 net/core/devlink.c           | 102 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 115 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 151eb930d329..8f65356132be 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -741,6 +741,14 @@ void
 devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
 				     enum devlink_health_reporter_state state);
 
+void devlink_flash_update_begin_notify(struct devlink *devlink);
+void devlink_flash_update_end_notify(struct devlink *devlink);
+void devlink_flash_update_status_notify(struct devlink *devlink,
+					const char *status_msg,
+					const char *component,
+					unsigned long done,
+					unsigned long total);
+
 #if IS_ENABLED(CONFIG_NET_DEVLINK)
 
 void devlink_compat_running_version(struct net_device *dev,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 5bb4ea67d84f..5287b42c181f 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -104,6 +104,8 @@ enum devlink_command {
 	DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
 
 	DEVLINK_CMD_FLASH_UPDATE,
+	DEVLINK_CMD_FLASH_UPDATE_END,		/* notification only */
+	DEVLINK_CMD_FLASH_UPDATE_STATUS,	/* notification only */
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
@@ -331,6 +333,9 @@ enum devlink_attr {
 
 	DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME,	/* string */
 	DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,	/* string */
+	DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,	/* string */
+	DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,	/* u64 */
+	DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,	/* u64 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 9716a7f382cb..963178d32dda 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2673,6 +2673,108 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
 	return devlink->ops->reload(devlink, info->extack);
 }
 
+static int devlink_nl_flash_update_fill(struct sk_buff *msg,
+					struct devlink *devlink,
+					enum devlink_command cmd,
+					const char *status_msg,
+					const char *component,
+					unsigned long done, unsigned long total)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+
+	if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS)
+		goto out;
+
+	if (status_msg &&
+	    nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
+			   status_msg))
+		goto nla_put_failure;
+	if (component &&
+	    nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
+			   component))
+		goto nla_put_failure;
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
+			      done, DEVLINK_ATTR_PAD))
+		goto nla_put_failure;
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
+			      total, DEVLINK_ATTR_PAD))
+		goto nla_put_failure;
+
+out:
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static void __devlink_flash_update_notify(struct devlink *devlink,
+					  enum devlink_command cmd,
+					  const char *status_msg,
+					  const char *component,
+					  unsigned long done,
+					  unsigned long total)
+{
+	struct sk_buff *msg;
+	int err;
+
+	WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE &&
+		cmd != DEVLINK_CMD_FLASH_UPDATE_END &&
+		cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg,
+					   component, done, total);
+	if (err)
+		goto out_free_msg;
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+	return;
+
+out_free_msg:
+	nlmsg_free(msg);
+}
+
+void devlink_flash_update_begin_notify(struct devlink *devlink)
+{
+	__devlink_flash_update_notify(devlink,
+				      DEVLINK_CMD_FLASH_UPDATE,
+				      NULL, NULL, 0, 0);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify);
+
+void devlink_flash_update_end_notify(struct devlink *devlink)
+{
+	__devlink_flash_update_notify(devlink,
+				      DEVLINK_CMD_FLASH_UPDATE_END,
+				      NULL, NULL, 0, 0);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify);
+
+void devlink_flash_update_status_notify(struct devlink *devlink,
+					const char *status_msg,
+					const char *component,
+					unsigned long done,
+					unsigned long total)
+{
+	__devlink_flash_update_notify(devlink,
+				      DEVLINK_CMD_FLASH_UPDATE_STATUS,
+				      status_msg, component, done, total);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
+
 static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
 				       struct genl_info *info)
 {
-- 
cgit v1.2.3


From da29e4b466e6916a52e0e2f60054f855c324a9c2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 3 Jun 2019 15:16:58 -0700
Subject: net/tls: fully initialize the msg wrapper skb

If strparser gets cornered into starting a new message from
an sk_buff which already has frags, it will allocate a new
skb to become the "wrapper" around the fragments of the
message.

This new skb does not inherit any metadata fields.  In case
of TLS offload this may lead to unnecessarily re-encrypting
the message, as skb->decrypted is not set for the wrapper skb.

Try to be conservative and copy all fields of the old skb that
strparser's user may reasonably need.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  1 +
 net/core/skbuff.c         | 25 +++++++++++++++++++++++++
 net/strparser/strparser.c |  8 ++------
 3 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2ee5e63195c0..98ff5ac98caa 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1063,6 +1063,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
 				     int max_page_order,
 				     int *errcode,
 				     gfp_t gfp_mask);
+struct sk_buff *alloc_skb_for_msg(struct sk_buff *first);
 
 /* Layout of fast clones : [skb1][skb2][fclone_ref] */
 struct sk_buff_fclones {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4a712a00243a..b50a5e3ac4e4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -913,6 +913,31 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 #undef C
 }
 
+/**
+ * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
+ * @first: first sk_buff of the msg
+ */
+struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
+{
+	struct sk_buff *n;
+
+	n = alloc_skb(0, GFP_ATOMIC);
+	if (!n)
+		return NULL;
+
+	n->len = first->len;
+	n->data_len = first->len;
+	n->truesize = first->truesize;
+
+	skb_shinfo(n)->frag_list = first;
+
+	__copy_skb_header(n, first);
+	n->destructor = NULL;
+
+	return n;
+}
+EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
+
 /**
  *	skb_morph	-	morph one skb into another
  *	@dst: the skb to receive the contents
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index e137698e8aef..3fe541b746b0 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -160,18 +160,14 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 					return 0;
 				}
 
-				skb = alloc_skb(0, GFP_ATOMIC);
+				skb = alloc_skb_for_msg(head);
 				if (!skb) {
 					STRP_STATS_INCR(strp->stats.mem_fail);
 					desc->error = -ENOMEM;
 					return 0;
 				}
-				skb->len = head->len;
-				skb->data_len = head->len;
-				skb->truesize = head->truesize;
-				*_strp_msg(skb) = *_strp_msg(head);
+
 				strp->skb_nextp = &head->next;
-				skb_shinfo(skb)->frag_list = head;
 				strp->skb_head = skb;
 				head = skb;
 			} else {
-- 
cgit v1.2.3


From f0aaa2c975617da78b80feebc87e74dba9ec1f53 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 3 Jun 2019 15:17:04 -0700
Subject: net/tls: reorganize struct tls_context

struct tls_context is laid out somewhat poorly.  If we reorder things
right we can not only save 16 bytes (320 -> 304) but also make all fast
path data fit into two cache lines (one read only and one read/write,
down from four cache lines).

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 39ea62f0c1f6..a463a6074e5d 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -236,34 +236,32 @@ struct tls_prot_info {
 };
 
 struct tls_context {
+	/* read-only cache line */
 	struct tls_prot_info prot_info;
 
-	union tls_crypto_context crypto_send;
-	union tls_crypto_context crypto_recv;
+	u8 tx_conf:3;
+	u8 rx_conf:3;
 
-	struct list_head list;
-	struct net_device *netdev;
-	refcount_t refcount;
+	int (*push_pending_record)(struct sock *sk, int flags);
+	void (*sk_write_space)(struct sock *sk);
 
 	void *priv_ctx_tx;
 	void *priv_ctx_rx;
 
-	u8 tx_conf:3;
-	u8 rx_conf:3;
+	struct net_device *netdev;
 
+	/* rw cache line */
 	struct cipher_context tx;
 	struct cipher_context rx;
 
 	struct scatterlist *partially_sent_record;
 	u16 partially_sent_offset;
 
-	unsigned long flags;
 	bool in_tcp_sendpages;
 	bool pending_open_record_frags;
+	unsigned long flags;
 
-	int (*push_pending_record)(struct sock *sk, int flags);
-
-	void (*sk_write_space)(struct sock *sk);
+	/* cache cold stuff */
 	void (*sk_destruct)(struct sock *sk);
 	void (*sk_proto_close)(struct sock *sk, long timeout);
 
@@ -275,6 +273,12 @@ struct tls_context {
 			   int __user *optlen);
 	int  (*hash)(struct sock *sk);
 	void (*unhash)(struct sock *sk);
+
+	union tls_crypto_context crypto_send;
+	union tls_crypto_context crypto_recv;
+
+	struct list_head list;
+	refcount_t refcount;
 };
 
 enum tls_offload_ctx_dir {
-- 
cgit v1.2.3


From fb0f886fa265f265ad126fc7cd7e8ec51e2f770f Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 3 Jun 2019 15:17:05 -0700
Subject: net/tls: don't pass version to tls_advance_record_sn()

All callers pass prot->version as the last parameter
of tls_advance_record_sn(), yet tls_advance_record_sn()
itself needs a pointer to prot.  Pass prot from callers.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h    | 10 +++-------
 net/tls/tls_device.c |  2 +-
 net/tls/tls_sw.c     |  9 ++++-----
 3 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index a463a6074e5d..0a0072636009 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -446,19 +446,15 @@ static inline struct tls_context *tls_get_ctx(const struct sock *sk)
 }
 
 static inline void tls_advance_record_sn(struct sock *sk,
-					 struct cipher_context *ctx,
-					 int version)
+					 struct tls_prot_info *prot,
+					 struct cipher_context *ctx)
 {
-	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_prot_info *prot = &tls_ctx->prot_info;
-
 	if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size))
 		tls_err_abort(sk, EBADMSG);
 
-	if (version != TLS_1_3_VERSION) {
+	if (prot->version != TLS_1_3_VERSION)
 		tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
 				     prot->iv_size);
-	}
 }
 
 static inline void tls_fill_prepend(struct tls_context *ctx,
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 8ffc8f95f55f..51e556e79371 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -264,7 +264,7 @@ static int tls_push_record(struct sock *sk,
 	list_add_tail(&record->list, &offload_ctx->records_list);
 	spin_unlock_irq(&offload_ctx->lock);
 	offload_ctx->open_record = NULL;
-	tls_advance_record_sn(sk, &ctx->tx, prot->version);
+	tls_advance_record_sn(sk, prot, &ctx->tx);
 
 	for (i = 0; i < record->num_frags; i++) {
 		frag = &record->frags[i];
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index f833407c789f..bef71e54fad0 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -534,7 +534,7 @@ static int tls_do_encryption(struct sock *sk,
 
 	/* Unhook the record from context if encryption is not failure */
 	ctx->open_rec = NULL;
-	tls_advance_record_sn(sk, &tls_ctx->tx, prot->version);
+	tls_advance_record_sn(sk, prot, &tls_ctx->tx);
 	return rc;
 }
 
@@ -1486,7 +1486,6 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 	struct tls_prot_info *prot = &tls_ctx->prot_info;
-	int version = prot->version;
 	struct strp_msg *rxm = strp_msg(skb);
 	int pad, err = 0;
 
@@ -1504,8 +1503,8 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
 					       async);
 			if (err < 0) {
 				if (err == -EINPROGRESS)
-					tls_advance_record_sn(sk, &tls_ctx->rx,
-							      version);
+					tls_advance_record_sn(sk, prot,
+							      &tls_ctx->rx);
 
 				return err;
 			}
@@ -1520,7 +1519,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
 		rxm->full_len -= pad;
 		rxm->offset += prot->prepend_size;
 		rxm->full_len -= prot->overhead_size;
-		tls_advance_record_sn(sk, &tls_ctx->rx, version);
+		tls_advance_record_sn(sk, prot, &tls_ctx->rx);
 		ctx->decrypted = true;
 		ctx->saved_data_ready(sk);
 	} else {
-- 
cgit v1.2.3


From 5481d73f81549e2a05cbbb49867a9a560c5292df Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 3 Jun 2019 20:19:49 -0700
Subject: ipv4: Use accessors for fib_info nexthop data

Use helpers to access fib_nh and fib_nhs fields of a fib_info. Drop the
fib_dev macro which is an alias for the first nexthop. Replacements:

  fi->fib_dev    --> fib_info_nh(fi, 0)->fib_nh_dev
  fi->fib_nh     --> fib_info_nh(fi, 0)
  fi->fib_nh[i]  --> fib_info_nh(fi, i)
  fi->fib_nhs    --> fib_info_num_path(fi)

where fib_info_nh(fi, i) returns &fi->fib_nh[i] and fib_info_num_path(fi)
returns fi->fib_nhs.

Move the existing fib_info_nhc to nexthop.h and define the new ones
there. A later patch adds a check for whether a fib_info uses a nexthop
object, and defining the helpers in nexthop.h avoids circular header
dependencies.

After this, all remaining open-coded references to fi->fib_nhs and
fi->fib_nh are in:
- fib_create_info and helpers used to look up an existing fib_info
  entry, and
- the netdev event functions fib_sync_down_dev and fib_sync_up.

The latter two will not be reused for nexthops, and the fib_create_info
will be updated to handle a nexthop in a fib_info.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c   | 29 ++++++----
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 19 ++++---
 drivers/net/ethernet/rocker/rocker_ofdpa.c         | 25 +++++---
 include/net/ip_fib.h                               |  6 --
 include/net/nexthop.h                              | 15 +++++
 net/core/filter.c                                  |  3 +-
 net/ipv4/fib_frontend.c                            | 11 ++--
 net/ipv4/fib_lookup.h                              |  1 +
 net/ipv4/fib_rules.c                               |  8 ++-
 net/ipv4/fib_semantics.c                           | 66 ++++++++++++----------
 net/ipv4/fib_trie.c                                | 26 +++++----
 net/ipv4/route.c                                   |  3 +-
 12 files changed, 132 insertions(+), 80 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
index 8212bfd05733..2cbfaa8da7fc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2019 Mellanox Technologies. */
 
 #include <linux/netdevice.h>
+#include <net/nexthop.h>
 #include "lag.h"
 #include "lag_mp.h"
 #include "mlx5_core.h"
@@ -110,6 +111,8 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev,
 				     struct fib_info *fi)
 {
 	struct lag_mp *mp = &ldev->lag_mp;
+	struct fib_nh *fib_nh0, *fib_nh1;
+	unsigned int nhs;
 
 	/* Handle delete event */
 	if (event == FIB_EVENT_ENTRY_DEL) {
@@ -120,9 +123,11 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev,
 	}
 
 	/* Handle add/replace event */
-	if (fi->fib_nhs == 1) {
+	nhs = fib_info_num_path(fi);
+	if (nhs == 1) {
 		if (__mlx5_lag_is_active(ldev)) {
-			struct net_device *nh_dev = fi->fib_nh[0].fib_nh_dev;
+			struct fib_nh *nh = fib_info_nh(fi, 0);
+			struct net_device *nh_dev = nh->fib_nh_dev;
 			int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev);
 
 			mlx5_lag_set_port_affinity(ldev, ++i);
@@ -130,14 +135,16 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev,
 		return;
 	}
 
-	if (fi->fib_nhs != 2)
+	if (nhs != 2)
 		return;
 
 	/* Verify next hops are ports of the same hca */
-	if (!(fi->fib_nh[0].fib_nh_dev == ldev->pf[0].netdev &&
-	      fi->fib_nh[1].fib_nh_dev == ldev->pf[1].netdev) &&
-	    !(fi->fib_nh[0].fib_nh_dev == ldev->pf[1].netdev &&
-	      fi->fib_nh[1].fib_nh_dev == ldev->pf[0].netdev)) {
+	fib_nh0 = fib_info_nh(fi, 0);
+	fib_nh1 = fib_info_nh(fi, 1);
+	if (!(fib_nh0->fib_nh_dev == ldev->pf[0].netdev &&
+	      fib_nh1->fib_nh_dev == ldev->pf[1].netdev) &&
+	    !(fib_nh0->fib_nh_dev == ldev->pf[1].netdev &&
+	      fib_nh1->fib_nh_dev == ldev->pf[0].netdev)) {
 		mlx5_core_warn(ldev->pf[0].dev, "Multipath offload require two ports of the same HCA\n");
 		return;
 	}
@@ -174,7 +181,7 @@ static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev,
 			mlx5_lag_set_port_affinity(ldev, i);
 		}
 	} else if (event == FIB_EVENT_NH_ADD &&
-		   fi->fib_nhs == 2) {
+		   fib_info_num_path(fi) == 2) {
 		mlx5_lag_set_port_affinity(ldev, 0);
 	}
 }
@@ -238,6 +245,7 @@ static int mlx5_lag_fib_event(struct notifier_block *nb,
 	struct mlx5_fib_event_work *fib_work;
 	struct fib_entry_notifier_info *fen_info;
 	struct fib_nh_notifier_info *fnh_info;
+	struct net_device *fib_dev;
 	struct fib_info *fi;
 
 	if (info->family != AF_INET)
@@ -254,8 +262,9 @@ static int mlx5_lag_fib_event(struct notifier_block *nb,
 		fen_info = container_of(info, struct fib_entry_notifier_info,
 					info);
 		fi = fen_info->fi;
-		if (fi->fib_dev != ldev->pf[0].netdev &&
-		    fi->fib_dev != ldev->pf[1].netdev) {
+		fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev;
+		if (fib_dev != ldev->pf[0].netdev &&
+		    fib_dev != ldev->pf[1].netdev) {
 			return NOTIFY_DONE;
 		}
 		fib_work = mlx5_lag_init_fib_work(ldev, event);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 0ec52be7cc33..4f781358aef1 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -21,6 +21,7 @@
 #include <net/arp.h>
 #include <net/ip_fib.h>
 #include <net/ip6_fib.h>
+#include <net/nexthop.h>
 #include <net/fib_rules.h>
 #include <net/ip_tunnels.h>
 #include <net/l3mdev.h>
@@ -3816,23 +3817,25 @@ static void mlxsw_sp_nexthop_rif_gone_sync(struct mlxsw_sp *mlxsw_sp,
 }
 
 static bool mlxsw_sp_fi_is_gateway(const struct mlxsw_sp *mlxsw_sp,
-				   const struct fib_info *fi)
+				   struct fib_info *fi)
 {
-	return fi->fib_nh->fib_nh_scope == RT_SCOPE_LINK ||
-	       mlxsw_sp_nexthop4_ipip_type(mlxsw_sp, fi->fib_nh, NULL);
+	const struct fib_nh *nh = fib_info_nh(fi, 0);
+
+	return nh->fib_nh_scope == RT_SCOPE_LINK ||
+	       mlxsw_sp_nexthop4_ipip_type(mlxsw_sp, nh, NULL);
 }
 
 static struct mlxsw_sp_nexthop_group *
 mlxsw_sp_nexthop4_group_create(struct mlxsw_sp *mlxsw_sp, struct fib_info *fi)
 {
+	unsigned int nhs = fib_info_num_path(fi);
 	struct mlxsw_sp_nexthop_group *nh_grp;
 	struct mlxsw_sp_nexthop *nh;
 	struct fib_nh *fib_nh;
 	int i;
 	int err;
 
-	nh_grp = kzalloc(struct_size(nh_grp, nexthops, fi->fib_nhs),
-			 GFP_KERNEL);
+	nh_grp = kzalloc(struct_size(nh_grp, nexthops, nhs), GFP_KERNEL);
 	if (!nh_grp)
 		return ERR_PTR(-ENOMEM);
 	nh_grp->priv = fi;
@@ -3840,11 +3843,11 @@ mlxsw_sp_nexthop4_group_create(struct mlxsw_sp *mlxsw_sp, struct fib_info *fi)
 	nh_grp->neigh_tbl = &arp_tbl;
 
 	nh_grp->gateway = mlxsw_sp_fi_is_gateway(mlxsw_sp, fi);
-	nh_grp->count = fi->fib_nhs;
+	nh_grp->count = nhs;
 	fib_info_hold(fi);
 	for (i = 0; i < nh_grp->count; i++) {
 		nh = &nh_grp->nexthops[i];
-		fib_nh = &fi->fib_nh[i];
+		fib_nh = fib_info_nh(fi, i);
 		err = mlxsw_sp_nexthop4_init(mlxsw_sp, nh_grp, nh, fib_nh);
 		if (err)
 			goto err_nexthop4_init;
@@ -4282,9 +4285,9 @@ mlxsw_sp_fib4_entry_type_set(struct mlxsw_sp *mlxsw_sp,
 			     const struct fib_entry_notifier_info *fen_info,
 			     struct mlxsw_sp_fib_entry *fib_entry)
 {
+	struct net_device *dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev;
 	union mlxsw_sp_l3addr dip = { .addr4 = htonl(fen_info->dst) };
 	u32 tb_id = mlxsw_sp_fix_tb_id(fen_info->tb_id);
-	struct net_device *dev = fen_info->fi->fib_dev;
 	struct mlxsw_sp_ipip_entry *ipip_entry;
 	struct fib_info *fi = fen_info->fi;
 
diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 30a49802fb51..47ed9d41047f 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -22,6 +22,7 @@
 #include <net/neighbour.h>
 #include <net/switchdev.h>
 #include <net/ip_fib.h>
+#include <net/nexthop.h>
 #include <net/arp.h>
 
 #include "rocker.h"
@@ -2286,8 +2287,8 @@ static int ofdpa_port_fib_ipv4(struct ofdpa_port *ofdpa_port,  __be32 dst,
 
 	/* XXX support ECMP */
 
-	nh = fi->fib_nh;
-	nh_on_port = (fi->fib_dev == ofdpa_port->dev);
+	nh = fib_info_nh(fi, 0);
+	nh_on_port = (nh->fib_nh_dev == ofdpa_port->dev);
 	has_gw = !!nh->fib_nh_gw4;
 
 	if (has_gw && nh_on_port) {
@@ -2737,11 +2738,13 @@ static int ofdpa_fib4_add(struct rocker *rocker,
 {
 	struct ofdpa *ofdpa = rocker->wpriv;
 	struct ofdpa_port *ofdpa_port;
+	struct fib_nh *nh;
 	int err;
 
 	if (ofdpa->fib_aborted)
 		return 0;
-	ofdpa_port = ofdpa_port_dev_lower_find(fen_info->fi->fib_dev, rocker);
+	nh = fib_info_nh(fen_info->fi, 0);
+	ofdpa_port = ofdpa_port_dev_lower_find(nh->fib_nh_dev, rocker);
 	if (!ofdpa_port)
 		return 0;
 	err = ofdpa_port_fib_ipv4(ofdpa_port, htonl(fen_info->dst),
@@ -2749,7 +2752,7 @@ static int ofdpa_fib4_add(struct rocker *rocker,
 				  fen_info->tb_id, 0);
 	if (err)
 		return err;
-	fen_info->fi->fib_nh->fib_nh_flags |= RTNH_F_OFFLOAD;
+	nh->fib_nh_flags |= RTNH_F_OFFLOAD;
 	return 0;
 }
 
@@ -2758,13 +2761,15 @@ static int ofdpa_fib4_del(struct rocker *rocker,
 {
 	struct ofdpa *ofdpa = rocker->wpriv;
 	struct ofdpa_port *ofdpa_port;
+	struct fib_nh *nh;
 
 	if (ofdpa->fib_aborted)
 		return 0;
-	ofdpa_port = ofdpa_port_dev_lower_find(fen_info->fi->fib_dev, rocker);
+	nh = fib_info_nh(fen_info->fi, 0);
+	ofdpa_port = ofdpa_port_dev_lower_find(nh->fib_nh_dev, rocker);
 	if (!ofdpa_port)
 		return 0;
-	fen_info->fi->fib_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD;
+	nh->fib_nh_flags &= ~RTNH_F_OFFLOAD;
 	return ofdpa_port_fib_ipv4(ofdpa_port, htonl(fen_info->dst),
 				   fen_info->dst_len, fen_info->fi,
 				   fen_info->tb_id, OFDPA_OP_FLAG_REMOVE);
@@ -2784,14 +2789,16 @@ static void ofdpa_fib4_abort(struct rocker *rocker)
 
 	spin_lock_irqsave(&ofdpa->flow_tbl_lock, flags);
 	hash_for_each_safe(ofdpa->flow_tbl, bkt, tmp, flow_entry, entry) {
+		struct fib_nh *nh;
+
 		if (flow_entry->key.tbl_id !=
 		    ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING)
 			continue;
-		ofdpa_port = ofdpa_port_dev_lower_find(flow_entry->fi->fib_dev,
-						       rocker);
+		nh = fib_info_nh(flow_entry->fi, 0);
+		ofdpa_port = ofdpa_port_dev_lower_find(nh->fib_nh_dev, rocker);
 		if (!ofdpa_port)
 			continue;
-		flow_entry->fi->fib_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD;
+		nh->fib_nh_flags &= ~RTNH_F_OFFLOAD;
 		ofdpa_flow_tbl_del(ofdpa_port, OFDPA_OP_FLAG_REMOVE,
 				   flow_entry);
 	}
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 70ba0302c8c9..42b1a806f6f5 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -153,7 +153,6 @@ struct fib_info {
 	bool			nh_updated;
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
-#define fib_dev		fib_nh[0].fib_nh_dev
 };
 
 
@@ -190,11 +189,6 @@ struct fib_result_nl {
 	int             err;
 };
 
-static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel)
-{
-	return &fi->fib_nh[nhsel].nh_common;
-}
-
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 #define FIB_TABLE_HASHSZ 256
 #else
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 6e1b8f53624c..e501d77b82c8 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -192,4 +192,19 @@ static inline bool nexthop_is_blackhole(const struct nexthop *nh)
 	nhi = rcu_dereference_rtnl(nh->nh_info);
 	return nhi->reject_nh;
 }
+
+static inline unsigned int fib_info_num_path(const struct fib_info *fi)
+{
+	return fi->fib_nhs;
+}
+
+static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel)
+{
+	return &fi->fib_nh[nhsel].nh_common;
+}
+
+static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel)
+{
+	return &fi->fib_nh[nhsel];
+}
 #endif
diff --git a/net/core/filter.c b/net/core/filter.c
index 55bfc941d17a..2ae72bbfa6d2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -66,6 +66,7 @@
 #include <net/inet_hashtables.h>
 #include <net/inet6_hashtables.h>
 #include <net/ip_fib.h>
+#include <net/nexthop.h>
 #include <net/flow.h>
 #include <net/arp.h>
 #include <net/ipv6.h>
@@ -4674,7 +4675,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	if (res.type != RTN_UNICAST)
 		return BPF_FIB_LKUP_RET_NOT_FWDED;
 
-	if (res.fi->fib_nhs > 1)
+	if (fib_info_num_path(res.fi) > 1)
 		fib_select_path(net, &res, &fl4, NULL);
 
 	if (check_mtu) {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c7cdb8d0d164..a4691360b395 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -43,6 +43,7 @@
 #include <net/sock.h>
 #include <net/arp.h>
 #include <net/ip_fib.h>
+#include <net/nexthop.h>
 #include <net/rtnetlink.h>
 #include <net/xfrm.h>
 #include <net/l3mdev.h>
@@ -234,7 +235,9 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
 	if (table) {
 		ret = RTN_UNICAST;
 		if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
-			if (!dev || dev == res.fi->fib_dev)
+			struct fib_nh *nh = fib_info_nh(res.fi, 0);
+
+			if (!dev || dev == nh->fib_nh_dev)
 				ret = res.type;
 		}
 	}
@@ -321,8 +324,8 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int ret;
 
-	for (ret = 0; ret < fi->fib_nhs; ret++) {
-		struct fib_nh *nh = &fi->fib_nh[ret];
+	for (ret = 0; ret < fib_info_num_path(fi); ret++) {
+		const struct fib_nh *nh = fib_info_nh(fi, ret);
 
 		if (nh->fib_nh_dev == dev) {
 			dev_match = true;
@@ -333,7 +336,7 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
 		}
 	}
 #else
-	if (fi->fib_nh[0].fib_nh_dev == dev)
+	if (fib_info_nh(fi, 0)->fib_nh_dev == dev)
 		dev_match = true;
 #endif
 
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 7945f0534db7..a68b5e21ec51 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -5,6 +5,7 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include <net/ip_fib.h>
+#include <net/nexthop.h>
 
 struct fib_alias {
 	struct hlist_node	fa_list;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index cfec3af54c8d..ab06fd73b343 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -31,6 +31,7 @@
 #include <net/route.h>
 #include <net/tcp.h>
 #include <net/ip_fib.h>
+#include <net/nexthop.h>
 #include <net/fib_rules.h>
 
 struct fib4_rule {
@@ -145,8 +146,11 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg
 	struct fib_result *result = (struct fib_result *) arg->result;
 	struct net_device *dev = NULL;
 
-	if (result->fi)
-		dev = result->fi->fib_dev;
+	if (result->fi) {
+		struct fib_nh *nh = fib_info_nh(result->fi, 0);
+
+		dev = nh->fib_nh_dev;
+	}
 
 	/* do not accept result if the route does
 	 * not meet the required prefix length
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 78648072783e..a37ff07718a8 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -42,6 +42,7 @@
 #include <net/sock.h>
 #include <net/ip_fib.h>
 #include <net/ip6_fib.h>
+#include <net/nexthop.h>
 #include <net/netlink.h>
 #include <net/rtnh.h>
 #include <net/lwtunnel.h>
@@ -65,13 +66,13 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 #define for_nexthops(fi) {						\
 	int nhsel; const struct fib_nh *nh;				\
 	for (nhsel = 0, nh = (fi)->fib_nh;				\
-	     nhsel < (fi)->fib_nhs;					\
+	     nhsel < fib_info_num_path((fi));				\
 	     nh++, nhsel++)
 
 #define change_nexthops(fi) {						\
 	int nhsel; struct fib_nh *nexthop_nh;				\
 	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
-	     nhsel < (fi)->fib_nhs;					\
+	     nhsel < fib_info_num_path((fi));				\
 	     nexthop_nh++, nhsel++)
 
 #else /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -271,11 +272,13 @@ void fib_release_info(struct fib_info *fi)
 	spin_unlock_bh(&fib_info_lock);
 }
 
-static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi)
 {
-	const struct fib_nh *onh = ofi->fib_nh;
+	const struct fib_nh *onh;
 
 	for_nexthops(fi) {
+		onh = fib_info_nh(ofi, nhsel);
+
 		if (nh->fib_nh_oif != onh->fib_nh_oif ||
 		    nh->fib_nh_gw_family != onh->fib_nh_gw_family ||
 		    nh->fib_nh_scope != onh->fib_nh_scope ||
@@ -296,8 +299,6 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 		if (nh->fib_nh_gw_family == AF_INET6 &&
 		    ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6))
 			return -1;
-
-		onh++;
 	} endfor_nexthops(fi);
 	return 0;
 }
@@ -326,7 +327,7 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
 	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
 }
 
-static struct fib_info *fib_find_info(const struct fib_info *nfi)
+static struct fib_info *fib_find_info(struct fib_info *nfi)
 {
 	struct hlist_head *head;
 	struct fib_info *fi;
@@ -390,13 +391,14 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
 			 + nla_total_size(4) /* RTA_PRIORITY */
 			 + nla_total_size(4) /* RTA_PREFSRC */
 			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
+	unsigned int nhs = fib_info_num_path(fi);
 
 	/* space for nested metrics */
 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
 
-	if (fi->fib_nhs) {
+	if (nhs) {
 		size_t nh_encapsize = 0;
-		/* Also handles the special case fib_nhs == 1 */
+		/* Also handles the special case nhs == 1 */
 
 		/* each nexthop is packed in an attribute */
 		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
@@ -416,8 +418,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
 		} endfor_nexthops(fi);
 
 		/* all nexthops are packed in a nested attribute */
-		payload += nla_total_size((fi->fib_nhs * nhsize) +
-					  nh_encapsize);
+		payload += nla_total_size((nhs * nhsize) + nh_encapsize);
 
 	}
 
@@ -584,6 +585,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 {
 	struct net *net = fi->fib_net;
 	struct fib_config fib_cfg;
+	struct fib_nh *nh;
 	int ret;
 
 	change_nexthops(fi) {
@@ -646,24 +648,25 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 	} endfor_nexthops(fi);
 
 	ret = -EINVAL;
-	if (cfg->fc_oif && fi->fib_nh->fib_nh_oif != cfg->fc_oif) {
+	nh = fib_info_nh(fi, 0);
+	if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) {
 		NL_SET_ERR_MSG(extack,
 			       "Nexthop device index does not match RTA_OIF");
 		goto errout;
 	}
 	if (cfg->fc_gw_family) {
-		if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family ||
+		if (cfg->fc_gw_family != nh->fib_nh_gw_family ||
 		    (cfg->fc_gw_family == AF_INET &&
-		     fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) ||
+		     nh->fib_nh_gw4 != cfg->fc_gw4) ||
 		    (cfg->fc_gw_family == AF_INET6 &&
-		     ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) {
+		     ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) {
 			NL_SET_ERR_MSG(extack,
 				       "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA");
 			goto errout;
 		}
 	}
 #ifdef CONFIG_IP_ROUTE_CLASSID
-	if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) {
+	if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) {
 		NL_SET_ERR_MSG(extack,
 			       "Nexthop class id does not match RTA_FLOW");
 		goto errout;
@@ -679,7 +682,7 @@ static void fib_rebalance(struct fib_info *fi)
 	int total;
 	int w;
 
-	if (fi->fib_nhs < 2)
+	if (fib_info_num_path(fi) < 2)
 		return;
 
 	total = 0;
@@ -761,27 +764,29 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 		return 1;
 
 	if (cfg->fc_oif || cfg->fc_gw_family) {
+		struct fib_nh *nh = fib_info_nh(fi, 0);
+
 		if (cfg->fc_encap) {
 			if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
-					    fi->fib_nh, cfg, extack))
+					    nh, cfg, extack))
 				return 1;
 		}
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		if (cfg->fc_flow &&
-		    cfg->fc_flow != fi->fib_nh->nh_tclassid)
+		    cfg->fc_flow != nh->nh_tclassid)
 			return 1;
 #endif
-		if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) ||
+		if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) ||
 		    (cfg->fc_gw_family &&
-		     cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family))
+		     cfg->fc_gw_family != nh->fib_nh_gw_family))
 			return 1;
 
 		if (cfg->fc_gw_family == AF_INET &&
-		    cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4)
+		    cfg->fc_gw4 != nh->fib_nh_gw4)
 			return 1;
 
 		if (cfg->fc_gw_family == AF_INET6 &&
-		    ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6))
+		    ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6))
 			return 1;
 
 		return 0;
@@ -1366,7 +1371,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 			goto err_inval;
 		}
 		nh->fib_nh_scope = RT_SCOPE_NOWHERE;
-		nh->fib_nh_dev = dev_get_by_index(net, fi->fib_nh->fib_nh_oif);
+		nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif);
 		err = -ENODEV;
 		if (!nh->fib_nh_dev)
 			goto failure;
@@ -1583,6 +1588,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
 		  struct fib_info *fi, unsigned int flags)
 {
+	unsigned int nhs = fib_info_num_path(fi);
 	struct nlmsghdr *nlh;
 	struct rtmsg *rtm;
 
@@ -1618,8 +1624,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	if (fi->fib_prefsrc &&
 	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
 		goto nla_put_failure;
-	if (fi->fib_nhs == 1) {
-		struct fib_nh *nh = &fi->fib_nh[0];
+	if (nhs == 1) {
+		const struct fib_nh *nh = fib_info_nh(fi, 0);
 		unsigned char flags = 0;
 
 		if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0)
@@ -1838,6 +1844,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 
 	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
 		struct fib_info *next_fi = fa->fa_info;
+		struct fib_nh *nh;
 
 		if (fa->fa_slen != slen)
 			continue;
@@ -1859,8 +1866,9 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 		if (next_fi->fib_scope != res->scope ||
 		    fa->fa_type != RTN_UNICAST)
 			continue;
-		if (!next_fi->fib_nh[0].fib_nh_gw4 ||
-		    next_fi->fib_nh[0].fib_nh_scope != RT_SCOPE_LINK)
+
+		nh = fib_info_nh(next_fi, 0);
+		if (!nh->fib_nh_gw4 || nh->fib_nh_scope != RT_SCOPE_LINK)
 			continue;
 
 		fib_alias_accessed(fa);
@@ -2024,7 +2032,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
 		goto check_saddr;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res->fi->fib_nhs > 1) {
+	if (fib_info_num_path(res->fi) > 1) {
 		int h = fib_multipath_hash(net, fl4, skb, NULL);
 
 		fib_select_multipath(res, h);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index b53ecef89d59..5c8a4d21b8e0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1469,7 +1469,7 @@ found:
 		}
 		if (fi->fib_flags & RTNH_F_DEAD)
 			continue;
-		for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+		for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
 			struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
 
 			if (nhc->nhc_flags & RTNH_F_DEAD)
@@ -2717,14 +2717,18 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
 	rcu_read_unlock();
 }
 
-static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
 {
 	unsigned int flags = 0;
 
 	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
 		flags = RTF_REJECT;
-	if (fi && fi->fib_nh->fib_nh_gw4)
-		flags |= RTF_GATEWAY;
+	if (fi) {
+		const struct fib_nh *nh = fib_info_nh(fi, 0);
+
+		if (nh->fib_nh_gw4)
+			flags |= RTF_GATEWAY;
+	}
 	if (mask == htonl(0xFFFFFFFF))
 		flags |= RTF_HOST;
 	flags |= RTF_UP;
@@ -2755,7 +2759,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 	prefix = htonl(l->key);
 
 	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
-		const struct fib_info *fi = fa->fa_info;
+		struct fib_info *fi = fa->fa_info;
 		__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
 		unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
 
@@ -2768,26 +2772,28 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 
 		seq_setwidth(seq, 127);
 
-		if (fi)
+		if (fi) {
+			struct fib_nh *nh = fib_info_nh(fi, 0);
+
 			seq_printf(seq,
 				   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
 				   "%d\t%08X\t%d\t%u\t%u",
-				   fi->fib_dev ? fi->fib_dev->name : "*",
+				   nh->fib_nh_dev ? nh->fib_nh_dev->name : "*",
 				   prefix,
-				   fi->fib_nh->fib_nh_gw4, flags, 0, 0,
+				   nh->fib_nh_gw4, flags, 0, 0,
 				   fi->fib_priority,
 				   mask,
 				   (fi->fib_advmss ?
 				    fi->fib_advmss + 40 : 0),
 				   fi->fib_window,
 				   fi->fib_rtt >> 3);
-		else
+		} else {
 			seq_printf(seq,
 				   "*\t%08X\t%08X\t%04X\t%d\t%u\t"
 				   "%d\t%08X\t%d\t%u\t%u",
 				   prefix, 0, flags, 0, 0, 0,
 				   mask, 0, 0, 0);
-
+		}
 		seq_pad(seq, '\n');
 	}
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 11ddc276776e..05a6a8ecb574 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -99,6 +99,7 @@
 #include <net/inetpeer.h>
 #include <net/sock.h>
 #include <net/ip_fib.h>
+#include <net/nexthop.h>
 #include <net/arp.h>
 #include <net/tcp.h>
 #include <net/icmp.h>
@@ -1950,7 +1951,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 			    struct flow_keys *hkeys)
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res->fi && res->fi->fib_nhs > 1) {
+	if (res->fi && fib_info_num_path(res->fi) > 1) {
 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
 
 		fib_select_multipath(res, h);
-- 
cgit v1.2.3


From dcb1ecb50edf8219c3bd851de35897fb024c423b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 3 Jun 2019 20:19:50 -0700
Subject: ipv4: Prepare for fib6_nh from a nexthop object

Convert more IPv4 code to use fib_nh_common over fib_nh to enable routes
to use a fib6_nh based nexthop. In the end, only code not using a
nexthop object in a fib_info should directly access fib_nh in a fib_info
without checking the family and going through fib_nh_common. Those
functions will be marked when it is not directly evident.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     | 15 +++++++++----
 net/ipv4/fib_frontend.c  | 12 +++++------
 net/ipv4/fib_rules.c     |  4 ++--
 net/ipv4/fib_semantics.c | 55 +++++++++++++++++++++++++++++++++---------------
 net/ipv4/fib_trie.c      | 15 +++++++------
 net/ipv4/nexthop.c       |  3 ++-
 net/ipv4/route.c         |  2 +-
 7 files changed, 69 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 42b1a806f6f5..7da8ea784029 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -195,8 +195,8 @@ struct fib_result_nl {
 #define FIB_TABLE_HASHSZ 2
 #endif
 
-__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh,
-				unsigned char scope);
+__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
+				 unsigned char scope);
 __be32 fib_result_prefsrc(struct net *net, struct fib_result *res);
 
 #define FIB_RES_NHC(res)		((res).nhc)
@@ -455,11 +455,18 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
 {
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	struct fib_nh_common *nhc = res->nhc;
-	struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	u32 rtag;
 #endif
-	*itag = nh->nh_tclassid << 16;
+	if (nhc->nhc_family == AF_INET) {
+		struct fib_nh *nh;
+
+		nh = container_of(nhc, struct fib_nh, nh_common);
+		*itag = nh->nh_tclassid << 16;
+	} else {
+		*itag = 0;
+	}
+
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	rtag = res->tclassid;
 	if (*itag == 0)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index a4691360b395..5ea2750982f2 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -235,9 +235,9 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
 	if (table) {
 		ret = RTN_UNICAST;
 		if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
-			struct fib_nh *nh = fib_info_nh(res.fi, 0);
+			struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0);
 
-			if (!dev || dev == nh->fib_nh_dev)
+			if (!dev || dev == nhc->nhc_dev)
 				ret = res.type;
 		}
 	}
@@ -325,18 +325,18 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
 	int ret;
 
 	for (ret = 0; ret < fib_info_num_path(fi); ret++) {
-		const struct fib_nh *nh = fib_info_nh(fi, ret);
+		const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
 
-		if (nh->fib_nh_dev == dev) {
+		if (nhc->nhc_dev == dev) {
 			dev_match = true;
 			break;
-		} else if (l3mdev_master_ifindex_rcu(nh->fib_nh_dev) == dev->ifindex) {
+		} else if (l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) {
 			dev_match = true;
 			break;
 		}
 	}
 #else
-	if (fib_info_nh(fi, 0)->fib_nh_dev == dev)
+	if (fib_info_nhc(fi, 0)->nhc_dev == dev)
 		dev_match = true;
 #endif
 
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index ab06fd73b343..88807c138df4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -147,9 +147,9 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg
 	struct net_device *dev = NULL;
 
 	if (result->fi) {
-		struct fib_nh *nh = fib_info_nh(result->fi, 0);
+		struct fib_nh_common *nhc = fib_info_nhc(result->fi, 0);
 
-		dev = nh->fib_nh_dev;
+		dev = nhc->nhc_dev;
 	}
 
 	/* do not accept result if the route does
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index a37ff07718a8..4a12c69f7fa1 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -61,6 +61,9 @@ static unsigned int fib_info_cnt;
 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 
+/* for_nexthops and change_nexthops only used when nexthop object
+ * is not set in a fib_info. The logic within can reference fib_nh.
+ */
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
 #define for_nexthops(fi) {						\
@@ -402,20 +405,23 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
 
 		/* each nexthop is packed in an attribute */
 		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
+		unsigned int i;
 
 		/* may contain flow and gateway attribute */
 		nhsize += 2 * nla_total_size(4);
 
 		/* grab encap info */
-		for_nexthops(fi) {
-			if (nh->fib_nh_lws) {
+		for (i = 0; i < fib_info_num_path(fi); i++) {
+			struct fib_nh_common *nhc = fib_info_nhc(fi, i);
+
+			if (nhc->nhc_lwtstate) {
 				/* RTA_ENCAP_TYPE */
 				nh_encapsize += lwtunnel_get_encap_size(
-						nh->fib_nh_lws);
+						nhc->nhc_lwtstate);
 				/* RTA_ENCAP */
 				nh_encapsize +=  nla_total_size(2);
 			}
-		} endfor_nexthops(fi);
+		}
 
 		/* all nexthops are packed in a nested attribute */
 		payload += nla_total_size((nhs * nhsize) + nh_encapsize);
@@ -1194,9 +1200,15 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
 	fib_info_hash_free(old_laddrhash, bytes);
 }
 
-__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh,
-				unsigned char scope)
+__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
+				 unsigned char scope)
 {
+	struct fib_nh *nh;
+
+	if (nhc->nhc_family != AF_INET)
+		return inet_select_addr(nhc->nhc_dev, 0, scope);
+
+	nh = container_of(nhc, struct fib_nh, nh_common);
 	nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope);
 	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
 
@@ -1206,16 +1218,19 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh,
 __be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
 {
 	struct fib_nh_common *nhc = res->nhc;
-	struct fib_nh *nh;
 
 	if (res->fi->fib_prefsrc)
 		return res->fi->fib_prefsrc;
 
-	nh = container_of(nhc, struct fib_nh, nh_common);
-	if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid))
-		return nh->nh_saddr;
+	if (nhc->nhc_family == AF_INET) {
+		struct fib_nh *nh;
+
+		nh = container_of(nhc, struct fib_nh, nh_common);
+		if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid))
+			return nh->nh_saddr;
+	}
 
-	return fib_info_update_nh_saddr(net, nh, res->fi->fib_scope);
+	return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope);
 }
 
 static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
@@ -1397,7 +1412,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 	}
 
 	change_nexthops(fi) {
-		fib_info_update_nh_saddr(net, nexthop_nh, fi->fib_scope);
+		fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common,
+					  fi->fib_scope);
 		if (nexthop_nh->fib_nh_gw_family == AF_INET6)
 			fi->fib_nh_is_v6 = true;
 	} endfor_nexthops(fi)
@@ -1625,17 +1641,22 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
 		goto nla_put_failure;
 	if (nhs == 1) {
-		const struct fib_nh *nh = fib_info_nh(fi, 0);
+		const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
 		unsigned char flags = 0;
 
-		if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0)
+		if (fib_nexthop_info(skb, nhc, &flags, false) < 0)
 			goto nla_put_failure;
 
 		rtm->rtm_flags = flags;
 #ifdef CONFIG_IP_ROUTE_CLASSID
-		if (nh->nh_tclassid &&
-		    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
-			goto nla_put_failure;
+		if (nhc->nhc_family == AF_INET) {
+			struct fib_nh *nh;
+
+			nh = container_of(nhc, struct fib_nh, nh_common);
+			if (nh->nh_tclassid &&
+			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
+				goto nla_put_failure;
+		}
 #endif
 	} else {
 		if (fib_add_multipath(skb, fi) < 0)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5c8a4d21b8e0..d704d1606b8f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2724,9 +2724,9 @@ static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
 	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
 		flags = RTF_REJECT;
 	if (fi) {
-		const struct fib_nh *nh = fib_info_nh(fi, 0);
+		const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
 
-		if (nh->fib_nh_gw4)
+		if (nhc->nhc_gw.ipv4)
 			flags |= RTF_GATEWAY;
 	}
 	if (mask == htonl(0xFFFFFFFF))
@@ -2773,14 +2773,17 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 		seq_setwidth(seq, 127);
 
 		if (fi) {
-			struct fib_nh *nh = fib_info_nh(fi, 0);
+			struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+			__be32 gw = 0;
+
+			if (nhc->nhc_gw_family == AF_INET)
+				gw = nhc->nhc_gw.ipv4;
 
 			seq_printf(seq,
 				   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
 				   "%d\t%08X\t%d\t%u\t%u",
-				   nh->fib_nh_dev ? nh->fib_nh_dev->name : "*",
-				   prefix,
-				   nh->fib_nh_gw4, flags, 0, 0,
+				   nhc->nhc_dev ? nhc->nhc_dev->name : "*",
+				   prefix, gw, flags, 0, 0,
 				   fi->fib_priority,
 				   mask,
 				   (fi->fib_advmss ?
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 7a5a3d08fec3..aec4ecb145a0 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -815,7 +815,8 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
 	err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
 	if (!err) {
 		nh->nh_flags = fib_nh->fib_nh_flags;
-		fib_info_update_nh_saddr(net, fib_nh, fib_nh->fib_nh_scope);
+		fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
+					  fib_nh->fib_nh_scope);
 	} else {
 		fib_nh_release(net, fib_nh);
 	}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 05a6a8ecb574..4a1168451f3a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1585,7 +1585,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
-		{
+		if (nhc->nhc_family == AF_INET) {
 			struct fib_nh *nh;
 
 			nh = container_of(nhc, struct fib_nh, nh_common);
-- 
cgit v1.2.3


From 4c7e8084fd467ddb2b0e6c6011f9c1064afb7e56 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 3 Jun 2019 20:19:51 -0700
Subject: ipv4: Plumb support for nexthop object in a fib_info

Add 'struct nexthop' and nh_list list_head to fib_info. nh_list is the
fib_info side of the nexthop <-> fib_info relationship.

Add fi_list list_head to 'struct nexthop' to track fib_info entries
using a nexthop instance. Add __remove_nexthop_fib and add it to
__remove_nexthop to walk the new list_head and mark those fib entries
as dead when the nexthop is deleted.

Add a few nexthop helpers for use when a nexthop is added to fib_info:
- nexthop_cmp to determine if 2 nexthops are the same
- nexthop_path_fib_result to select a path for a multipath
  'struct nexthop'
- nexthop_fib_nhc to select a specific fib_nh_common within a
  multipath 'struct nexthop'

Update existing fib_info_nhc to use nexthop_fib_nhc if a fib_info uses
a 'struct nexthop', and mark fib_info_nh as only used for the non-nexthop
case.

Update the fib_info functions to check for fi->nh and take a different
path as needed:
- free_fib_info_rcu - put the nexthop object reference
- fib_release_info - remove the fib_info from the nexthop's fi_list
- nh_comp - use nexthop_cmp when either fib_info references a nexthop
  object
- fib_info_hashfn - use the nexthop id for the hashing vs the oif of
  each fib_nh in a fib_info
- fib_nlmsg_size - add space for the RTA_NH_ID attribute
- fib_create_info - verify nexthop reference can be taken, verify
  nexthop spec is valid for fib entry, and add fib_info to fi_list for
  a nexthop
- fib_select_multipath - use the new nexthop_path_fib_result to select a
  path when nexthop objects are used
- fib_table_lookup - if the 'struct nexthop' is a blackhole nexthop, treat
  it the same as a fib entry using 'blackhole'

The bulk of the changes are in fib_semantics.c and most of that is
moving the existing change_nexthops into an else branch.

Update the nexthop code to walk fi_list when a nexthop is deleted and
remove the fib entries referencing it.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |   4 ++
 include/net/nexthop.h    |  48 ++++++++++++++++
 net/ipv4/fib_semantics.c | 142 +++++++++++++++++++++++++++++++++++------------
 net/ipv4/fib_trie.c      |   7 +++
 net/ipv4/nexthop.c       |  64 +++++++++++++++++++++
 5 files changed, 229 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 7da8ea784029..071d280de389 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -129,9 +129,12 @@ struct fib_nh {
  * This structure contains data shared by many of routes.
  */
 
+struct nexthop;
+
 struct fib_info {
 	struct hlist_node	fib_hash;
 	struct hlist_node	fib_lhash;
+	struct list_head	nh_list;
 	struct net		*fib_net;
 	int			fib_treeref;
 	refcount_t		fib_clntref;
@@ -151,6 +154,7 @@ struct fib_info {
 	int			fib_nhs;
 	bool			fib_nh_is_v6;
 	bool			nh_updated;
+	struct nexthop		*nh;
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
 };
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index e501d77b82c8..2912a2d7a515 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -77,6 +77,7 @@ struct nh_group {
 
 struct nexthop {
 	struct rb_node		rb_node;    /* entry on netns rbtree */
+	struct list_head	fi_list;    /* v4 entries using nh */
 	struct list_head	grp_list;   /* nh group entries using this nh */
 	struct net		*net;
 
@@ -110,6 +111,12 @@ static inline void nexthop_put(struct nexthop *nh)
 		call_rcu(&nh->rcu, nexthop_free_rcu);
 }
 
+static inline bool nexthop_cmp(const struct nexthop *nh1,
+			       const struct nexthop *nh2)
+{
+	return nh1 == nh2;
+}
+
 static inline bool nexthop_is_multipath(const struct nexthop *nh)
 {
 	if (nh->is_group) {
@@ -193,18 +200,59 @@ static inline bool nexthop_is_blackhole(const struct nexthop *nh)
 	return nhi->reject_nh;
 }
 
+static inline void nexthop_path_fib_result(struct fib_result *res, int hash)
+{
+	struct nh_info *nhi;
+	struct nexthop *nh;
+
+	nh = nexthop_select_path(res->fi->nh, hash);
+	nhi = rcu_dereference(nh->nh_info);
+	res->nhc = &nhi->fib_nhc;
+}
+
+/* called with rcu read lock or rtnl held */
+static inline
+struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel)
+{
+	struct nh_info *nhi;
+
+	BUILD_BUG_ON(offsetof(struct fib_nh, nh_common) != 0);
+	BUILD_BUG_ON(offsetof(struct fib6_nh, nh_common) != 0);
+
+	if (nexthop_is_multipath(nh)) {
+		nh = nexthop_mpath_select(nh, nhsel);
+		if (!nh)
+			return NULL;
+	}
+
+	nhi = rcu_dereference_rtnl(nh->nh_info);
+	return &nhi->fib_nhc;
+}
+
 static inline unsigned int fib_info_num_path(const struct fib_info *fi)
 {
+	if (unlikely(fi->nh))
+		return nexthop_num_path(fi->nh);
+
 	return fi->fib_nhs;
 }
 
+int fib_check_nexthop(struct nexthop *nh, u8 scope,
+		      struct netlink_ext_ack *extack);
+
 static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel)
 {
+	if (unlikely(fi->nh))
+		return nexthop_fib_nhc(fi->nh, nhsel);
+
 	return &fi->fib_nh[nhsel].nh_common;
 }
 
+/* only used when fib_nh is built into fib_info */
 static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel)
 {
+	WARN_ON(fi->nh);
+
 	return &fi->fib_nh[nhsel];
 }
 #endif
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4a12c69f7fa1..01e587a5dcb1 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -236,9 +236,13 @@ static void free_fib_info_rcu(struct rcu_head *head)
 {
 	struct fib_info *fi = container_of(head, struct fib_info, rcu);
 
-	change_nexthops(fi) {
-		fib_nh_release(fi->fib_net, nexthop_nh);
-	} endfor_nexthops(fi);
+	if (fi->nh) {
+		nexthop_put(fi->nh);
+	} else {
+		change_nexthops(fi) {
+			fib_nh_release(fi->fib_net, nexthop_nh);
+		} endfor_nexthops(fi);
+	}
 
 	ip_fib_metrics_put(fi->fib_metrics);
 
@@ -264,11 +268,15 @@ void fib_release_info(struct fib_info *fi)
 		hlist_del(&fi->fib_hash);
 		if (fi->fib_prefsrc)
 			hlist_del(&fi->fib_lhash);
-		change_nexthops(fi) {
-			if (!nexthop_nh->fib_nh_dev)
-				continue;
-			hlist_del(&nexthop_nh->nh_hash);
-		} endfor_nexthops(fi)
+		if (fi->nh) {
+			list_del(&fi->nh_list);
+		} else {
+			change_nexthops(fi) {
+				if (!nexthop_nh->fib_nh_dev)
+					continue;
+				hlist_del(&nexthop_nh->nh_hash);
+			} endfor_nexthops(fi)
+		}
 		fi->fib_dead = 1;
 		fib_info_put(fi);
 	}
@@ -279,6 +287,12 @@ static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi)
 {
 	const struct fib_nh *onh;
 
+	if (fi->nh || ofi->nh)
+		return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1;
+
+	if (ofi->fib_nhs == 0)
+		return 0;
+
 	for_nexthops(fi) {
 		onh = fib_info_nh(ofi, nhsel);
 
@@ -323,9 +337,14 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
 	val ^= (fi->fib_protocol << 8) | fi->fib_scope;
 	val ^= (__force u32)fi->fib_prefsrc;
 	val ^= fi->fib_priority;
-	for_nexthops(fi) {
-		val ^= fib_devindex_hashfn(nh->fib_nh_oif);
-	} endfor_nexthops(fi)
+
+	if (fi->nh) {
+		val ^= fib_devindex_hashfn(fi->nh->id);
+	} else {
+		for_nexthops(fi) {
+			val ^= fib_devindex_hashfn(nh->fib_nh_oif);
+		} endfor_nexthops(fi)
+	}
 
 	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
 }
@@ -352,7 +371,7 @@ static struct fib_info *fib_find_info(struct fib_info *nfi)
 		    memcmp(nfi->fib_metrics, fi->fib_metrics,
 			   sizeof(u32) * RTAX_MAX) == 0 &&
 		    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
-		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
+		    nh_comp(fi, nfi) == 0)
 			return fi;
 	}
 
@@ -399,6 +418,9 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
 	/* space for nested metrics */
 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
 
+	if (fi->nh)
+		payload += nla_total_size(4); /* RTA_NH_ID */
+
 	if (nhs) {
 		size_t nh_encapsize = 0;
 		/* Also handles the special case nhs == 1 */
@@ -585,6 +607,7 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
 	return nhs;
 }
 
+/* only called when fib_nh is integrated into fib_info */
 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 		       int remaining, struct fib_config *cfg,
 		       struct netlink_ext_ack *extack)
@@ -683,6 +706,7 @@ errout:
 	return ret;
 }
 
+/* only called when fib_nh is integrated into fib_info */
 static void fib_rebalance(struct fib_info *fi)
 {
 	int total;
@@ -1262,6 +1286,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 {
 	int err;
 	struct fib_info *fi = NULL;
+	struct nexthop *nh = NULL;
 	struct fib_info *ofi;
 	int nhs = 1;
 	struct net *net = cfg->fc_nlinfo.nl_net;
@@ -1333,14 +1358,25 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 	fi->fib_tb_id = cfg->fc_table;
 
 	fi->fib_nhs = nhs;
-	change_nexthops(fi) {
-		nexthop_nh->nh_parent = fi;
-	} endfor_nexthops(fi)
+	if (nh) {
+		if (!nexthop_get(nh)) {
+			NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+			err = -EINVAL;
+		} else {
+			err = 0;
+			fi->nh = nh;
+		}
+	} else {
+		change_nexthops(fi) {
+			nexthop_nh->nh_parent = fi;
+		} endfor_nexthops(fi)
 
-	if (cfg->fc_mp)
-		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
-	else
-		err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
+		if (cfg->fc_mp)
+			err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg,
+					  extack);
+		else
+			err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
+	}
 
 	if (err != 0)
 		goto failure;
@@ -1371,7 +1407,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 		goto err_inval;
 	}
 
-	if (cfg->fc_scope == RT_SCOPE_HOST) {
+	if (fi->nh) {
+		err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack);
+		if (err)
+			goto failure;
+	} else if (cfg->fc_scope == RT_SCOPE_HOST) {
 		struct fib_nh *nh = fi->fib_nh;
 
 		/* Local address is added. */
@@ -1411,14 +1451,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 		goto err_inval;
 	}
 
-	change_nexthops(fi) {
-		fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common,
-					  fi->fib_scope);
-		if (nexthop_nh->fib_nh_gw_family == AF_INET6)
-			fi->fib_nh_is_v6 = true;
-	} endfor_nexthops(fi)
+	if (!fi->nh) {
+		change_nexthops(fi) {
+			fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common,
+						  fi->fib_scope);
+			if (nexthop_nh->fib_nh_gw_family == AF_INET6)
+				fi->fib_nh_is_v6 = true;
+		} endfor_nexthops(fi)
 
-	fib_rebalance(fi);
+		fib_rebalance(fi);
+	}
 
 link_it:
 	ofi = fib_find_info(fi);
@@ -1440,16 +1482,20 @@ link_it:
 		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
 		hlist_add_head(&fi->fib_lhash, head);
 	}
-	change_nexthops(fi) {
-		struct hlist_head *head;
-		unsigned int hash;
+	if (fi->nh) {
+		list_add(&fi->nh_list, &nh->fi_list);
+	} else {
+		change_nexthops(fi) {
+			struct hlist_head *head;
+			unsigned int hash;
 
-		if (!nexthop_nh->fib_nh_dev)
-			continue;
-		hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex);
-		head = &fib_info_devhash[hash];
-		hlist_add_head(&nexthop_nh->nh_hash, head);
-	} endfor_nexthops(fi)
+			if (!nexthop_nh->fib_nh_dev)
+				continue;
+			hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex);
+			head = &fib_info_devhash[hash];
+			hlist_add_head(&nexthop_nh->nh_hash, head);
+		} endfor_nexthops(fi)
+	}
 	spin_unlock_bh(&fib_info_lock);
 	return fi;
 
@@ -1576,6 +1622,12 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
 	if (!mp)
 		goto nla_put_failure;
 
+	if (unlikely(fi->nh)) {
+		if (nexthop_mpath_fill_node(skb, fi->nh) < 0)
+			goto nla_put_failure;
+		goto mp_end;
+	}
+
 	for_nexthops(fi) {
 		if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0)
 			goto nla_put_failure;
@@ -1586,6 +1638,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
 #endif
 	} endfor_nexthops(fi);
 
+mp_end:
 	nla_nest_end(skb, mp);
 
 	return 0;
@@ -1640,6 +1693,14 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	if (fi->fib_prefsrc &&
 	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
 		goto nla_put_failure;
+
+	if (fi->nh) {
+		if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id))
+			goto nla_put_failure;
+		if (nexthop_is_blackhole(fi->nh))
+			rtm->rtm_type = RTN_BLACKHOLE;
+	}
+
 	if (nhs == 1) {
 		const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
 		unsigned char flags = 0;
@@ -1784,6 +1845,8 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
  * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
  * NETDEV_DOWN        1     LINKDOWN|DEAD   Last address removed
  * NETDEV_UNREGISTER  1     LINKDOWN|DEAD   Device removed
+ *
+ * only used when fib_nh is built into fib_info
  */
 int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
 {
@@ -1931,6 +1994,8 @@ out:
 /*
  * Dead device goes up. We wake up dead nexthops.
  * It takes sense only on multipath routes.
+ *
+ * only used when fib_nh is built into fib_info
  */
 int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
 {
@@ -2025,6 +2090,11 @@ void fib_select_multipath(struct fib_result *res, int hash)
 	struct net *net = fi->fib_net;
 	bool first = false;
 
+	if (unlikely(res->fi->nh)) {
+		nexthop_path_fib_result(res, hash);
+		return;
+	}
+
 	change_nexthops(fi) {
 		if (net->ipv4.sysctl_fib_multipath_use_neigh) {
 			if (!fib_good_nh(nexthop_nh))
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d704d1606b8f..716f2d66cb3f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1461,6 +1461,7 @@ found:
 		fib_alias_accessed(fa);
 		err = fib_props[fa->fa_type].error;
 		if (unlikely(err < 0)) {
+out_reject:
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(stats->semantic_match_passed);
 #endif
@@ -1469,6 +1470,12 @@ found:
 		}
 		if (fi->fib_flags & RTNH_F_DEAD)
 			continue;
+
+		if (unlikely(fi->nh && nexthop_is_blackhole(fi->nh))) {
+			err = fib_props[RTN_BLACKHOLE].error;
+			goto out_reject;
+		}
+
 		for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
 			struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
 
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index aec4ecb145a0..63cbb04f697f 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -105,6 +105,7 @@ static struct nexthop *nexthop_alloc(void)
 
 	nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
 	if (nh) {
+		INIT_LIST_HEAD(&nh->fi_list);
 		INIT_LIST_HEAD(&nh->grp_list);
 	}
 	return nh;
@@ -515,6 +516,54 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
 }
 EXPORT_SYMBOL_GPL(nexthop_select_path);
 
+static int nexthop_check_scope(struct nexthop *nh, u8 scope,
+			       struct netlink_ext_ack *extack)
+{
+	struct nh_info *nhi;
+
+	nhi = rtnl_dereference(nh->nh_info);
+	if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
+		NL_SET_ERR_MSG(extack,
+			       "Route with host scope can not have a gateway");
+		return -EINVAL;
+	}
+
+	if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
+		NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Invoked by fib add code to verify nexthop by id is ok with
+ * config for prefix; parts of fib_check_nh not done when nexthop
+ * object is used.
+ */
+int fib_check_nexthop(struct nexthop *nh, u8 scope,
+		      struct netlink_ext_ack *extack)
+{
+	int err = 0;
+
+	if (nh->is_group) {
+		struct nh_group *nhg;
+
+		if (scope == RT_SCOPE_HOST) {
+			NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
+			err = -EINVAL;
+			goto out;
+		}
+
+		nhg = rtnl_dereference(nh->nh_grp);
+		/* all nexthops in a group have the same scope */
+		err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack);
+	} else {
+		err = nexthop_check_scope(nh, scope, extack);
+	}
+out:
+	return err;
+}
+
 static void nh_group_rebalance(struct nh_group *nhg)
 {
 	int total = 0;
@@ -607,9 +656,24 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
 	}
 }
 
+static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
+{
+	bool do_flush = false;
+	struct fib_info *fi;
+
+	list_for_each_entry(fi, &nh->fi_list, nh_list) {
+		fi->fib_flags |= RTNH_F_DEAD;
+		do_flush = true;
+	}
+	if (do_flush)
+		fib_flush(net);
+}
+
 static void __remove_nexthop(struct net *net, struct nexthop *nh,
 			     struct nl_info *nlinfo)
 {
+	__remove_nexthop_fib(net, nh);
+
 	if (nh->is_group) {
 		remove_nexthop_group(nh, nlinfo);
 	} else {
-- 
cgit v1.2.3


From f88d8ea67fbdbac7a64bfa6ed9a2ba27bb822f74 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 3 Jun 2019 20:19:52 -0700
Subject: ipv6: Plumb support for nexthop object in a fib6_info

Add struct nexthop and nh_list list_head to fib6_info. nh_list is the
fib6_info side of the nexthop <-> fib_info relationship. Since a fib6_info
referencing a nexthop object can not have 'sibling' entries (the old way
of doing multipath routes), the nh_list is a union with fib6_siblings.

Add f6i_list list_head to 'struct nexthop' to track fib6_info entries
using a nexthop instance. Update __remove_nexthop_fib to walk f6i_list
and delete fib entries using the nexthop.

Add a few nexthop helpers for use when a nexthop is added to fib6_info:
- nexthop_fib6_nh - return first fib6_nh in a nexthop object
- fib6_info_nh_dev moved to nexthop.h and updated to use nexthop_fib6_nh
  if the fib6_info references a nexthop object
- nexthop_path_fib6_result - similar to ipv4, select a path within a
  multipath nexthop object. If the nexthop is a blackhole, set
  fib6_result type to RTN_BLACKHOLE, and set the REJECT flag

Update the fib6_info references to check for nh and take a different path
as needed:
- rt6_qualify_for_ecmp - if a fib entry uses a nexthop object it can NOT
  be coalesced with other fib entries into a multipath route
- rt6_duplicate_nexthop - use nexthop_cmp if either fib6_info references
  a nexthop
- addrconf (host routes), RAs and info entries (anything configured via
  ndisc) do not use nexthop objects
- fib6_info_destroy_rcu - put reference to nexthop object
- fib6_purge_rt - drop fib6_info from f6i_list
- fib6_select_path - update to use the new nexthop_path_fib6_result when
  fib entry uses a nexthop object
- rt6_device_match - update to catch use of nexthop object as a blackhole
  and set fib6_type and flags.
- ip6_route_info_create - don't add space for fib6_nh if fib entry is
  going to reference a nexthop object, take a reference to nexthop object,
  disallow use of source routing
- rt6_nlmsg_size - add space for RTA_NH_ID
- add rt6_fill_node_nexthop to add nexthop data on a dump

As with ipv4, most of the changes push existing code into the else branch
of whether the fib entry uses a nexthop object.

Update the nexthop code to walk f6i_list when a nexthop is deleted and
remove the fib entries referencing it.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h   |  11 ++--
 include/net/ip6_route.h |  13 ++++-
 include/net/nexthop.h   |  50 ++++++++++++++++
 net/ipv4/nexthop.c      |  44 ++++++++++++++
 net/ipv6/addrconf.c     |   5 ++
 net/ipv6/ip6_fib.c      |  22 +++++--
 net/ipv6/ndisc.c        |   3 +-
 net/ipv6/route.c        | 148 +++++++++++++++++++++++++++++++++++++++++-------
 8 files changed, 260 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index ebe5d65f97e0..1a8acd51b277 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -146,7 +146,10 @@ struct fib6_info {
 	 * destination, but not the same gateway. nsiblings is just a cache
 	 * to speed up lookup.
 	 */
-	struct list_head		fib6_siblings;
+	union {
+		struct list_head	fib6_siblings;
+		struct list_head	nh_list;
+	};
 	unsigned int			fib6_nsiblings;
 
 	refcount_t			fib6_ref;
@@ -170,6 +173,7 @@ struct fib6_info {
 					unused:3;
 
 	struct rcu_head			rcu;
+	struct nexthop			*nh;
 	struct fib6_nh			fib6_nh[0];
 };
 
@@ -441,11 +445,6 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
 	rcu_read_unlock();
 }
 
-static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
-{
-	return f6i->fib6_nh->fib_nh_dev;
-}
-
 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 		 struct fib6_config *cfg, gfp_t gfp_flags,
 		 struct netlink_ext_ack *extack);
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index a6ce6ea856b9..7375a165fd98 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -27,6 +27,7 @@ struct route_info {
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/route.h>
+#include <net/nexthop.h>
 
 #define RT6_LOOKUP_F_IFACE		0x00000001
 #define RT6_LOOKUP_F_REACHABLE		0x00000002
@@ -66,10 +67,13 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
+/* fib entries using a nexthop object can not be coalesced into
+ * a multipath route
+ */
 static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
 {
 	/* the RTF_ADDRCONF flag filters out RA's */
-	return !(f6i->fib6_flags & RTF_ADDRCONF) &&
+	return !(f6i->fib6_flags & RTF_ADDRCONF) && !f6i->nh &&
 		f6i->fib6_nh->fib_nh_gw_family;
 }
 
@@ -275,8 +279,13 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
 
 static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
 {
-	struct fib6_nh *nha = a->fib6_nh, *nhb = b->fib6_nh;
+	struct fib6_nh *nha, *nhb;
+
+	if (a->nh || b->nh)
+		return nexthop_cmp(a->nh, b->nh);
 
+	nha = a->fib6_nh;
+	nhb = b->fib6_nh;
 	return nha->fib_nh_dev == nhb->fib_nh_dev &&
 	       ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) &&
 	       !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws);
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 2912a2d7a515..aff7b2410057 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -10,6 +10,7 @@
 #define __LINUX_NEXTHOP_H
 
 #include <linux/netdevice.h>
+#include <linux/route.h>
 #include <linux/types.h>
 #include <net/ip_fib.h>
 #include <net/ip6_fib.h>
@@ -78,6 +79,7 @@ struct nh_group {
 struct nexthop {
 	struct rb_node		rb_node;    /* entry on netns rbtree */
 	struct list_head	fi_list;    /* v4 entries using nh */
+	struct list_head	f6i_list;   /* v6 entries using nh */
 	struct list_head	grp_list;   /* nh group entries using this nh */
 	struct net		*net;
 
@@ -255,4 +257,52 @@ static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel)
 
 	return &fi->fib_nh[nhsel];
 }
+
+/*
+ * IPv6 variants
+ */
+int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
+		       struct netlink_ext_ack *extack);
+
+static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
+{
+	struct nh_info *nhi;
+
+	if (nexthop_is_multipath(nh)) {
+		nh = nexthop_mpath_select(nh, 0);
+		if (!nh)
+			return NULL;
+	}
+
+	nhi = rcu_dereference_rtnl(nh->nh_info);
+	if (nhi->family == AF_INET6)
+		return &nhi->fib6_nh;
+
+	return NULL;
+}
+
+static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i)
+{
+	struct fib6_nh *fib6_nh;
+
+	fib6_nh = f6i->nh ? nexthop_fib6_nh(f6i->nh) : f6i->fib6_nh;
+	return fib6_nh->fib_nh_dev;
+}
+
+static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
+{
+	struct nexthop *nh = res->f6i->nh;
+	struct nh_info *nhi;
+
+	nh = nexthop_select_path(nh, hash);
+
+	nhi = rcu_dereference_rtnl(nh->nh_info);
+	if (nhi->reject_nh) {
+		res->fib6_type = RTN_BLACKHOLE;
+		res->fib6_flags |= RTF_REJECT;
+		res->nh = nexthop_fib6_nh(nh);
+	} else {
+		res->nh = &nhi->fib6_nh;
+	}
+}
 #endif
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 63cbb04f697f..5e48762b6b5f 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -106,6 +106,7 @@ static struct nexthop *nexthop_alloc(void)
 	nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
 	if (nh) {
 		INIT_LIST_HEAD(&nh->fi_list);
+		INIT_LIST_HEAD(&nh->f6i_list);
 		INIT_LIST_HEAD(&nh->grp_list);
 	}
 	return nh;
@@ -516,6 +517,41 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
 }
 EXPORT_SYMBOL_GPL(nexthop_select_path);
 
+int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
+		       struct netlink_ext_ack *extack)
+{
+	struct nh_info *nhi;
+
+	/* fib6_src is unique to a fib6_info and limits the ability to cache
+	 * routes in fib6_nh within a nexthop that is potentially shared
+	 * across multiple fib entries. If the config wants to use source
+	 * routing it can not use nexthop objects. mlxsw also does not allow
+	 * fib6_src on routes.
+	 */
+	if (!ipv6_addr_any(&cfg->fc_src)) {
+		NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
+		return -EINVAL;
+	}
+
+	if (nh->is_group) {
+		struct nh_group *nhg;
+
+		nhg = rtnl_dereference(nh->nh_grp);
+		if (nhg->has_v4)
+			goto no_v4_nh;
+	} else {
+		nhi = rtnl_dereference(nh->nh_info);
+		if (nhi->family == AF_INET)
+			goto no_v4_nh;
+	}
+
+	return 0;
+no_v4_nh:
+	NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(fib6_check_nexthop);
+
 static int nexthop_check_scope(struct nexthop *nh, u8 scope,
 			       struct netlink_ext_ack *extack)
 {
@@ -658,6 +694,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
 
 static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 {
+	struct fib6_info *f6i, *tmp;
 	bool do_flush = false;
 	struct fib_info *fi;
 
@@ -667,6 +704,13 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 	}
 	if (do_flush)
 		fib_flush(net);
+
+	/* ip6_del_rt removes the entry from this list hence the _safe */
+	list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
+		/* __ip6_del_rt does a release, so do a hold here */
+		fib6_info_hold(f6i);
+		ipv6_stub->ip6_del_rt(net, f6i);
+	}
 }
 
 static void __remove_nexthop(struct net *net, struct nexthop *nh,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4c30726fa7c7..d7fff86c2ef0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2421,6 +2421,10 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 		goto out;
 
 	for_each_fib6_node_rt_rcu(fn) {
+		/* prefix routes only use builtin fib6_nh */
+		if (rt->nh)
+			continue;
+
 		if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex)
 			continue;
 		if (no_gw && rt->fib6_nh->fib_nh_gw_family)
@@ -6352,6 +6356,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
 	list_for_each_entry(ifa, &idev->addr_list, if_list) {
 		spin_lock(&ifa->lock);
 		if (ifa->rt) {
+			/* host routes only use builtin fib6_nh */
 			struct fib6_nh *nh = ifa->rt->fib6_nh;
 			int cpu;
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index cdfb8500ccae..02feda73a98e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -159,6 +159,7 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
 	if (!f6i)
 		return NULL;
 
+	/* fib6_siblings is a union with nh_list, so this initializes both */
 	INIT_LIST_HEAD(&f6i->fib6_siblings);
 	refcount_set(&f6i->fib6_ref, 1);
 
@@ -171,7 +172,11 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
 
 	WARN_ON(f6i->fib6_node);
 
-	fib6_nh_release(f6i->fib6_nh);
+	if (f6i->nh)
+		nexthop_put(f6i->nh);
+	else
+		fib6_nh_release(f6i->fib6_nh);
+
 	ip_fib_metrics_put(f6i->fib6_metrics);
 	kfree(f6i);
 }
@@ -927,6 +932,9 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
 
 	fib6_drop_pcpu_from(rt, table);
 
+	if (rt->nh && !list_empty(&rt->nh_list))
+		list_del_init(&rt->nh_list);
+
 	if (refcount_read(&rt->fib6_ref) != 1) {
 		/* This route is used as dummy address holder in some split
 		 * nodes. It is not leaked, but it still holds other resources,
@@ -1334,6 +1342,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 
 	err = fib6_add_rt2node(fn, rt, info, extack);
 	if (!err) {
+		if (rt->nh)
+			list_add(&rt->nh_list, &rt->nh->f6i_list);
 		__fib6_update_sernum_upto_root(rt, sernum);
 		fib6_start_gc(info->nl_net, rt);
 	}
@@ -2295,9 +2305,13 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 {
 	struct fib6_info *rt = v;
 	struct ipv6_route_iter *iter = seq->private;
+	struct fib6_nh *fib6_nh = rt->fib6_nh;
 	unsigned int flags = rt->fib6_flags;
 	const struct net_device *dev;
 
+	if (rt->nh)
+		fib6_nh = nexthop_fib6_nh(rt->nh);
+
 	seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
 
 #ifdef CONFIG_IPV6_SUBTREES
@@ -2305,14 +2319,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 #else
 	seq_puts(seq, "00000000000000000000000000000000 00 ");
 #endif
-	if (rt->fib6_nh->fib_nh_gw_family) {
+	if (fib6_nh->fib_nh_gw_family) {
 		flags |= RTF_GATEWAY;
-		seq_printf(seq, "%pi6", &rt->fib6_nh->fib_nh_gw6);
+		seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
 	} else {
 		seq_puts(seq, "00000000000000000000000000000000");
 	}
 
-	dev = rt->fib6_nh->fib_nh_dev;
+	dev = fib6_nh->fib_nh_dev;
 	seq_printf(seq, " %08x %08x %08x %08x %8s\n",
 		   rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
 		   flags, dev ? dev->name : "");
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index f874dde1ee85..6e3c51109c83 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1289,9 +1289,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	    !in6_dev->cnf.accept_ra_rtr_pref)
 		pref = ICMPV6_ROUTER_PREF_MEDIUM;
 #endif
-
+	/* routes added from RAs do not use nexthop objects */
 	rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
-
 	if (rt) {
 		neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
 					 rt->fib6_nh->fib_nh_dev, NULL,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9b9a0159f7fd..df5be3d5d3e5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -432,15 +432,21 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
 	struct fib6_info *sibling, *next_sibling;
 	struct fib6_info *match = res->f6i;
 
-	if (!match->fib6_nsiblings || have_oif_match)
+	if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
 		goto out;
 
 	/* We might have already computed the hash for ICMPv6 errors. In such
 	 * case it will always be non-zero. Otherwise now is the time to do it.
 	 */
-	if (!fl6->mp_hash)
+	if (!fl6->mp_hash &&
+	    (!match->nh || nexthop_is_multipath(match->nh)))
 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
 
+	if (unlikely(match->nh)) {
+		nexthop_path_fib6_result(res, fl6->mp_hash);
+		return;
+	}
+
 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
 		goto out;
 
@@ -496,7 +502,13 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
 	struct fib6_nh *nh;
 
 	if (!oif && ipv6_addr_any(saddr)) {
-		nh = f6i->fib6_nh;
+		if (unlikely(f6i->nh)) {
+			nh = nexthop_fib6_nh(f6i->nh);
+			if (nexthop_is_blackhole(f6i->nh))
+				goto out_blackhole;
+		} else {
+			nh = f6i->fib6_nh;
+		}
 		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
 			goto out;
 	}
@@ -515,7 +527,14 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
 		goto out;
 	}
 
-	nh = f6i->fib6_nh;
+	if (unlikely(f6i->nh)) {
+		nh = nexthop_fib6_nh(f6i->nh);
+		if (nexthop_is_blackhole(f6i->nh))
+			goto out_blackhole;
+	} else {
+		nh = f6i->fib6_nh;
+	}
+
 	if (nh->fib_nh_flags & RTNH_F_DEAD) {
 		res->f6i = net->ipv6.fib6_null_entry;
 		nh = res->f6i->fib6_nh;
@@ -524,6 +543,12 @@ out:
 	res->nh = nh;
 	res->fib6_type = res->f6i->fib6_type;
 	res->fib6_flags = res->f6i->fib6_flags;
+	return;
+
+out_blackhole:
+	res->fib6_flags |= RTF_REJECT;
+	res->fib6_type = RTN_BLACKHOLE;
+	res->nh = nh;
 }
 
 #ifdef CONFIG_IPV6_ROUTER_PREF
@@ -1117,6 +1142,8 @@ restart:
 		rt = net->ipv6.ip6_null_entry;
 		dst_hold(&rt->dst);
 		goto out;
+	} else if (res.fib6_flags & RTF_REJECT) {
+		goto do_create;
 	}
 
 	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
@@ -1128,6 +1155,7 @@ restart:
 		if (ip6_hold_safe(net, &rt))
 			dst_use_noref(&rt->dst, jiffies);
 	} else {
+do_create:
 		rt = ip6_create_rt_rcu(&res);
 	}
 
@@ -3217,7 +3245,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 {
 	struct net *net = cfg->fc_nlinfo.nl_net;
 	struct fib6_info *rt = NULL;
+	struct nexthop *nh = NULL;
 	struct fib6_table *table;
+	struct fib6_nh *fib6_nh;
 	int err = -EINVAL;
 	int addr_type;
 
@@ -3270,7 +3300,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 
 	err = -ENOMEM;
-	rt = fib6_info_alloc(gfp_flags, true);
+	rt = fib6_info_alloc(gfp_flags, !nh);
 	if (!rt)
 		goto out;
 
@@ -3310,19 +3340,35 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
 	rt->fib6_src.plen = cfg->fc_src_len;
 #endif
-	err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
-	if (err)
-		goto out;
+	if (nh) {
+		if (!nexthop_get(nh)) {
+			NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+			goto out;
+		}
+		if (rt->fib6_src.plen) {
+			NL_SET_ERR_MSG(extack, "Nexthops can not be used wtih source routing");
+			goto out;
+		}
+		rt->nh = nh;
+		fib6_nh = nexthop_fib6_nh(rt->nh);
+	} else {
+		err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
+		if (err)
+			goto out;
 
-	/* We cannot add true routes via loopback here,
-	 * they would result in kernel looping; promote them to reject routes
-	 */
-	addr_type = ipv6_addr_type(&cfg->fc_dst);
-	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type))
-		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
+		fib6_nh = rt->fib6_nh;
+
+		/* We cannot add true routes via loopback here, they would
+		 * result in kernel looping; promote them to reject routes
+		 */
+		addr_type = ipv6_addr_type(&cfg->fc_dst);
+		if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
+				   addr_type))
+			rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
+	}
 
 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
-		struct net_device *dev = fib6_info_nh_dev(rt);
+		struct net_device *dev = fib6_nh->fib_nh_dev;
 
 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
 			NL_SET_ERR_MSG(extack, "Invalid source address");
@@ -3678,6 +3724,9 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
 		goto out;
 
 	for_each_fib6_node_rt_rcu(fn) {
+		/* these routes do not use nexthops */
+		if (rt->nh)
+			continue;
 		if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
 			continue;
 		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
@@ -3741,8 +3790,13 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
 
 	rcu_read_lock();
 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
-		struct fib6_nh *nh = rt->fib6_nh;
+		struct fib6_nh *nh;
+
+		/* RA routes do not use nexthops */
+		if (rt->nh)
+			continue;
 
+		nh = rt->fib6_nh;
 		if (dev == nh->fib_nh_dev &&
 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
 		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
@@ -3993,7 +4047,8 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
 
-	if (((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
+	if (!rt->nh &&
+	    ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
 	    rt != net->ipv6.fib6_null_entry &&
 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
 		spin_lock_bh(&rt6_exception_lock);
@@ -4021,8 +4076,13 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
 {
 	struct in6_addr *gateway = (struct in6_addr *)arg;
-	struct fib6_nh *nh = rt->fib6_nh;
+	struct fib6_nh *nh;
 
+	/* RA routes do not use nexthops */
+	if (rt->nh)
+		return 0;
+
+	nh = rt->fib6_nh;
 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
 	    nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
 		return -1;
@@ -4069,6 +4129,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
 	return NULL;
 }
 
+/* only called for fib entries with builtin fib6_nh */
 static bool rt6_is_dead(const struct fib6_info *rt)
 {
 	if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
@@ -4147,7 +4208,7 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
 	const struct arg_netdev_event *arg = p_arg;
 	struct net *net = dev_net(arg->dev);
 
-	if (rt != net->ipv6.fib6_null_entry &&
+	if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
 	    rt->fib6_nh->fib_nh_dev == arg->dev) {
 		rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
 		fib6_update_sernum_upto_root(net, rt);
@@ -4172,6 +4233,7 @@ void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
 }
 
+/* only called for fib entries with inline fib6_nh */
 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
 				   const struct net_device *dev)
 {
@@ -4232,7 +4294,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
 	const struct net_device *dev = arg->dev;
 	struct net *net = dev_net(dev);
 
-	if (rt == net->ipv6.fib6_null_entry)
+	if (rt == net->ipv6.fib6_null_entry || rt->nh)
 		return 0;
 
 	switch (arg->event) {
@@ -4786,6 +4848,9 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
 {
 	int nexthop_len = 0;
 
+	if (rt->nh)
+		nexthop_len += nla_total_size(4); /* RTA_NH_ID */
+
 	if (rt->fib6_nsiblings) {
 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
 			    + NLA_ALIGN(sizeof(struct rtnexthop))
@@ -4812,6 +4877,35 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
 	       + nexthop_len;
 }
 
+static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
+				 unsigned char *flags)
+{
+	if (nexthop_is_multipath(nh)) {
+		struct nlattr *mp;
+
+		mp = nla_nest_start(skb, RTA_MULTIPATH);
+		if (!mp)
+			goto nla_put_failure;
+
+		if (nexthop_mpath_fill_node(skb, nh))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, mp);
+	} else {
+		struct fib6_nh *fib6_nh;
+
+		fib6_nh = nexthop_fib6_nh(nh);
+		if (fib_nexthop_info(skb, &fib6_nh->nh_common,
+				     flags, false) < 0)
+			goto nla_put_failure;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			 struct fib6_info *rt, struct dst_entry *dst,
 			 struct in6_addr *dest, struct in6_addr *src,
@@ -4821,6 +4915,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	struct rt6_info *rt6 = (struct rt6_info *)dst;
 	struct rt6key *rt6_dst, *rt6_src;
 	u32 *pmetrics, table, rt6_flags;
+	unsigned char nh_flags = 0;
 	struct nlmsghdr *nlh;
 	struct rtmsg *rtm;
 	long expires = 0;
@@ -4940,9 +5035,18 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 		}
 
 		nla_nest_end(skb, mp);
-	} else {
-		unsigned char nh_flags = 0;
+	} else if (rt->nh) {
+		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
+			goto nla_put_failure;
+
+		if (nexthop_is_blackhole(rt->nh))
+			rtm->rtm_type = RTN_BLACKHOLE;
 
+		if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
+			goto nla_put_failure;
+
+		rtm->rtm_flags |= nh_flags;
+	} else {
 		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
 				     &nh_flags, false) < 0)
 			goto nla_put_failure;
-- 
cgit v1.2.3


From bac9593515c653e9ec05df9e303cdf1b969854d4 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 3 May 2019 17:46:14 +0200
Subject: xfrm: remove init_tempsel indirection from xfrm_state_afinfo

Simple initialization, handle it in the caller.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  2 --
 net/ipv4/xfrm4_state.c | 19 -----------------
 net/ipv6/xfrm6_state.c | 21 -------------------
 net/xfrm/xfrm_state.c  | 56 +++++++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 49 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index a2907873ed56..ba65434b5293 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -354,8 +354,6 @@ struct xfrm_state_afinfo {
 	const struct xfrm_type_offload	*type_offload_map[IPPROTO_MAX];
 
 	int			(*init_flags)(struct xfrm_state *x);
-	void			(*init_tempsel)(struct xfrm_selector *sel,
-						const struct flowi *fl);
 	void			(*init_temprop)(struct xfrm_state *x,
 						const struct xfrm_tmpl *tmpl,
 						const xfrm_address_t *daddr,
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 80c40b4981bb..da0fd9556d57 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -22,24 +22,6 @@ static int xfrm4_init_flags(struct xfrm_state *x)
 	return 0;
 }
 
-static void
-__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
-{
-	const struct flowi4 *fl4 = &fl->u.ip4;
-
-	sel->daddr.a4 = fl4->daddr;
-	sel->saddr.a4 = fl4->saddr;
-	sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
-	sel->dport_mask = htons(0xffff);
-	sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
-	sel->sport_mask = htons(0xffff);
-	sel->family = AF_INET;
-	sel->prefixlen_d = 32;
-	sel->prefixlen_s = 32;
-	sel->proto = fl4->flowi4_proto;
-	sel->ifindex = fl4->flowi4_oif;
-}
-
 static void
 xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
 		   const xfrm_address_t *daddr, const xfrm_address_t *saddr)
@@ -77,7 +59,6 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.eth_proto		= htons(ETH_P_IP),
 	.owner			= THIS_MODULE,
 	.init_flags		= xfrm4_init_flags,
-	.init_tempsel		= __xfrm4_init_tempsel,
 	.init_temprop		= xfrm4_init_temprop,
 	.output			= xfrm4_output,
 	.output_finish		= xfrm4_output_finish,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 5bdca3d5d6b7..0e19ded3e33b 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -21,26 +21,6 @@
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 
-static void
-__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
-{
-	const struct flowi6 *fl6 = &fl->u.ip6;
-
-	/* Initialize temporary selector matching only
-	 * to current session. */
-	*(struct in6_addr *)&sel->daddr = fl6->daddr;
-	*(struct in6_addr *)&sel->saddr = fl6->saddr;
-	sel->dport = xfrm_flowi_dport(fl, &fl6->uli);
-	sel->dport_mask = htons(0xffff);
-	sel->sport = xfrm_flowi_sport(fl, &fl6->uli);
-	sel->sport_mask = htons(0xffff);
-	sel->family = AF_INET6;
-	sel->prefixlen_d = 128;
-	sel->prefixlen_s = 128;
-	sel->proto = fl6->flowi6_proto;
-	sel->ifindex = fl6->flowi6_oif;
-}
-
 static void
 xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
 		   const xfrm_address_t *daddr, const xfrm_address_t *saddr)
@@ -173,7 +153,6 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.proto			= IPPROTO_IPV6,
 	.eth_proto		= htons(ETH_P_IPV6),
 	.owner			= THIS_MODULE,
-	.init_tempsel		= __xfrm6_init_tempsel,
 	.init_temprop		= xfrm6_init_temprop,
 	.tmpl_sort		= __xfrm6_tmpl_sort,
 	.state_sort		= __xfrm6_state_sort,
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 50621d982970..66d9009fe9b5 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -769,6 +769,43 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
 }
 EXPORT_SYMBOL(xfrm_sad_getinfo);
 
+static void
+__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
+{
+	const struct flowi4 *fl4 = &fl->u.ip4;
+
+	sel->daddr.a4 = fl4->daddr;
+	sel->saddr.a4 = fl4->saddr;
+	sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
+	sel->dport_mask = htons(0xffff);
+	sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
+	sel->sport_mask = htons(0xffff);
+	sel->family = AF_INET;
+	sel->prefixlen_d = 32;
+	sel->prefixlen_s = 32;
+	sel->proto = fl4->flowi4_proto;
+	sel->ifindex = fl4->flowi4_oif;
+}
+
+static void
+__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
+{
+	const struct flowi6 *fl6 = &fl->u.ip6;
+
+	/* Initialize temporary selector matching only to current session. */
+	*(struct in6_addr *)&sel->daddr = fl6->daddr;
+	*(struct in6_addr *)&sel->saddr = fl6->saddr;
+	sel->dport = xfrm_flowi_dport(fl, &fl6->uli);
+	sel->dport_mask = htons(0xffff);
+	sel->sport = xfrm_flowi_sport(fl, &fl6->uli);
+	sel->sport_mask = htons(0xffff);
+	sel->family = AF_INET6;
+	sel->prefixlen_d = 128;
+	sel->prefixlen_s = 128;
+	sel->proto = fl6->flowi6_proto;
+	sel->ifindex = fl6->flowi6_oif;
+}
+
 static void
 xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
 		    const struct xfrm_tmpl *tmpl,
@@ -777,16 +814,21 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
 {
 	struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family);
 
+	switch (family) {
+	case AF_INET:
+		__xfrm4_init_tempsel(&x->sel, fl);
+		break;
+	case AF_INET6:
+		__xfrm6_init_tempsel(&x->sel, fl);
+		break;
+	}
+
+	if (family != tmpl->encap_family)
+		afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family);
+
 	if (!afinfo)
 		return;
 
-	afinfo->init_tempsel(&x->sel, fl);
-
-	if (family != tmpl->encap_family) {
-		afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family);
-		if (!afinfo)
-			return;
-	}
 	afinfo->init_temprop(x, tmpl, daddr, saddr);
 }
 
-- 
cgit v1.2.3


From 5c1b9ab3ec81992bef9a8605b8b281b41577b475 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 3 May 2019 17:46:15 +0200
Subject: xfrm: remove init_temprop indirection from xfrm_state_afinfo

same as previous patch: just place this in the caller, no need to
have an indirection for a structure initialization.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  4 ----
 net/ipv4/xfrm4_state.c | 16 ----------------
 net/ipv6/xfrm6_state.c | 16 ----------------
 net/xfrm/xfrm_state.c  | 27 ++++++++++++++++++++-------
 4 files changed, 20 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index ba65434b5293..e8f676ce27be 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -354,10 +354,6 @@ struct xfrm_state_afinfo {
 	const struct xfrm_type_offload	*type_offload_map[IPPROTO_MAX];
 
 	int			(*init_flags)(struct xfrm_state *x);
-	void			(*init_temprop)(struct xfrm_state *x,
-						const struct xfrm_tmpl *tmpl,
-						const xfrm_address_t *daddr,
-						const xfrm_address_t *saddr);
 	int			(*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n);
 	int			(*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n);
 	int			(*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index da0fd9556d57..018448e222af 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -22,21 +22,6 @@ static int xfrm4_init_flags(struct xfrm_state *x)
 	return 0;
 }
 
-static void
-xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
-		   const xfrm_address_t *daddr, const xfrm_address_t *saddr)
-{
-	x->id = tmpl->id;
-	if (x->id.daddr.a4 == 0)
-		x->id.daddr.a4 = daddr->a4;
-	x->props.saddr = tmpl->saddr;
-	if (x->props.saddr.a4 == 0)
-		x->props.saddr.a4 = saddr->a4;
-	x->props.mode = tmpl->mode;
-	x->props.reqid = tmpl->reqid;
-	x->props.family = AF_INET;
-}
-
 int xfrm4_extract_header(struct sk_buff *skb)
 {
 	const struct iphdr *iph = ip_hdr(skb);
@@ -59,7 +44,6 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.eth_proto		= htons(ETH_P_IP),
 	.owner			= THIS_MODULE,
 	.init_flags		= xfrm4_init_flags,
-	.init_temprop		= xfrm4_init_temprop,
 	.output			= xfrm4_output,
 	.output_finish		= xfrm4_output_finish,
 	.extract_input		= xfrm4_extract_input,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 0e19ded3e33b..aa5d2c52cc31 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -21,21 +21,6 @@
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 
-static void
-xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
-		   const xfrm_address_t *daddr, const xfrm_address_t *saddr)
-{
-	x->id = tmpl->id;
-	if (ipv6_addr_any((struct in6_addr *)&x->id.daddr))
-		memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
-	memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
-	if (ipv6_addr_any((struct in6_addr *)&x->props.saddr))
-		memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
-	x->props.mode = tmpl->mode;
-	x->props.reqid = tmpl->reqid;
-	x->props.family = AF_INET6;
-}
-
 /* distribution counting sort function for xfrm_state and xfrm_tmpl */
 static int
 __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
@@ -153,7 +138,6 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.proto			= IPPROTO_IPV6,
 	.eth_proto		= htons(ETH_P_IPV6),
 	.owner			= THIS_MODULE,
-	.init_temprop		= xfrm6_init_temprop,
 	.tmpl_sort		= __xfrm6_tmpl_sort,
 	.state_sort		= __xfrm6_state_sort,
 	.output			= xfrm6_output,
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 66d9009fe9b5..336d3f6a1a51 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -812,8 +812,6 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
 		    const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 		    unsigned short family)
 {
-	struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family);
-
 	switch (family) {
 	case AF_INET:
 		__xfrm4_init_tempsel(&x->sel, fl);
@@ -823,13 +821,28 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
 		break;
 	}
 
-	if (family != tmpl->encap_family)
-		afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family);
+	x->id = tmpl->id;
 
-	if (!afinfo)
-		return;
+	switch (tmpl->encap_family) {
+	case AF_INET:
+		if (x->id.daddr.a4 == 0)
+			x->id.daddr.a4 = daddr->a4;
+		x->props.saddr = tmpl->saddr;
+		if (x->props.saddr.a4 == 0)
+			x->props.saddr.a4 = saddr->a4;
+		break;
+	case AF_INET6:
+		if (ipv6_addr_any((struct in6_addr *)&x->id.daddr))
+			memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
+		memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
+		if (ipv6_addr_any((struct in6_addr *)&x->props.saddr))
+			memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
+		break;
+	}
 
-	afinfo->init_temprop(x, tmpl, daddr, saddr);
+	x->props.mode = tmpl->mode;
+	x->props.reqid = tmpl->reqid;
+	x->props.family = tmpl->encap_family;
 }
 
 static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
-- 
cgit v1.2.3


From e46817472a1d7da32e8f265f9469a4e2fa39c60f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 3 May 2019 17:46:16 +0200
Subject: xfrm: remove init_flags indirection from xfrm_state_afinfo

There is only one implementation of this function; just call it directly.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  1 -
 net/ipv4/xfrm4_state.c |  8 --------
 net/xfrm/xfrm_state.c  | 17 +++--------------
 3 files changed, 3 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index e8f676ce27be..61214f5c3205 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -353,7 +353,6 @@ struct xfrm_state_afinfo {
 	const struct xfrm_type		*type_map[IPPROTO_MAX];
 	const struct xfrm_type_offload	*type_offload_map[IPPROTO_MAX];
 
-	int			(*init_flags)(struct xfrm_state *x);
 	int			(*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n);
 	int			(*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n);
 	int			(*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 018448e222af..62c96da38b4e 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -15,13 +15,6 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/export.h>
 
-static int xfrm4_init_flags(struct xfrm_state *x)
-{
-	if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)
-		x->props.flags |= XFRM_STATE_NOPMTUDISC;
-	return 0;
-}
-
 int xfrm4_extract_header(struct sk_buff *skb)
 {
 	const struct iphdr *iph = ip_hdr(skb);
@@ -43,7 +36,6 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.proto			= IPPROTO_IPIP,
 	.eth_proto		= htons(ETH_P_IP),
 	.owner			= THIS_MODULE,
-	.init_flags		= xfrm4_init_flags,
 	.output			= xfrm4_output,
 	.output_finish		= xfrm4_output_finish,
 	.extract_input		= xfrm4_extract_input,
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 336d3f6a1a51..5c13a8021d4c 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2263,25 +2263,14 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu)
 
 int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
 {
-	const struct xfrm_state_afinfo *afinfo;
 	const struct xfrm_mode *inner_mode;
 	const struct xfrm_mode *outer_mode;
 	int family = x->props.family;
 	int err;
 
-	err = -EAFNOSUPPORT;
-	afinfo = xfrm_state_get_afinfo(family);
-	if (!afinfo)
-		goto error;
-
-	err = 0;
-	if (afinfo->init_flags)
-		err = afinfo->init_flags(x);
-
-	rcu_read_unlock();
-
-	if (err)
-		goto error;
+	if (family == AF_INET &&
+	    xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)
+		x->props.flags |= XFRM_STATE_NOPMTUDISC;
 
 	err = -EPROTONOSUPPORT;
 
-- 
cgit v1.2.3


From c54c2c72b2b90a3ba61b8cad032a578ce2bf5b35 Mon Sep 17 00:00:00 2001
From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Date: Thu, 11 Apr 2019 09:11:33 -0700
Subject: net: Add a define for LLDP ethertype

Add a new define ETH_P_LLDP for Link Layer Discovery Protocol (LLDP)
ethertype.

Suggested-by: Bruce Allan <bruce.w.allan@intel.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/uapi/linux/if_ether.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 3158ba672b72..f6ceb2e63d1e 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -91,6 +91,7 @@
 #define ETH_P_802_EX1	0x88B5		/* 802.1 Local Experimental 1.  */
 #define ETH_P_PREAUTH	0x88C7		/* 802.11 Preauthentication */
 #define ETH_P_TIPC	0x88CA		/* TIPC 			*/
+#define ETH_P_LLDP	0x88CC		/* Link Layer Discovery Protocol */
 #define ETH_P_MACSEC	0x88E5		/* 802.1ae MACsec */
 #define ETH_P_8021AH	0x88E7          /* 802.1ah Backbone Service Tag */
 #define ETH_P_MVRP	0x88F5          /* 802.1Q MVRP                  */
-- 
cgit v1.2.3


From fe3475af3bdf38fac78787ec2fe9eedaf2518188 Mon Sep 17 00:00:00 2001
From: Zhu Yanjun <yanjun.zhu@oracle.com>
Date: Mon, 3 Jun 2019 00:28:01 -0400
Subject: net: rds: add per rds connection cache statistics

The variable cache_allocs is to indicate how many frags (KiB) are in one
rds connection frag cache.
The command "rds-info -Iv" will output the rds connection cache
statistics as below:
"
RDS IB Connections:
      LocalAddr RemoteAddr Tos SL  LocalDev            RemoteDev
      1.1.1.14 1.1.1.14   58 255  fe80::2:c903:a:7a31 fe80::2:c903:a:7a31
      send_wr=256, recv_wr=1024, send_sge=8, rdma_mr_max=4096,
      rdma_mr_size=257, cache_allocs=12
"
This means that there are about 12 KiB of frags in this rds connection frag
cache.
Since rds.h in rds-tools is not related to the kernel rds.h, the change
in kernel rds.h does not affect rds-tools.
rds-info in rds-tools 2.0.5 and 2.0.6 is tested with this commit. It works
well.

Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rds.h | 2 ++
 net/rds/ib.c             | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 5d0f76c780e5..fd6b5f66e2c5 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -250,6 +250,7 @@ struct rds_info_rdma_connection {
 	__u32		rdma_mr_max;
 	__u32		rdma_mr_size;
 	__u8		tos;
+	__u32		cache_allocs;
 };
 
 struct rds6_info_rdma_connection {
@@ -264,6 +265,7 @@ struct rds6_info_rdma_connection {
 	__u32		rdma_mr_max;
 	__u32		rdma_mr_size;
 	__u8		tos;
+	__u32		cache_allocs;
 };
 
 /* RDS message Receive Path Latency points */
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 2da9b75bad16..f9baf2d5a82a 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -318,6 +318,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
 		iinfo->max_send_sge = rds_ibdev->max_sge;
 		rds_ib_get_mr_info(rds_ibdev, iinfo);
+		iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs);
 	}
 	return 1;
 }
@@ -351,6 +352,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
 		iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
 		iinfo6->max_send_sge = rds_ibdev->max_sge;
 		rds6_ib_get_mr_info(rds_ibdev, iinfo6);
+		iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs);
 	}
 	return 1;
 }
-- 
cgit v1.2.3


From 2bd229df5e2ecbc13909f71dbd196fced1d533ca Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 4 Jun 2019 23:02:34 +0200
Subject: net: phy: remove state PHY_FORCING

In the early days of phylib we had a functionality that changed to the
next lower speed in fixed mode if no link was established after a
certain period of time. This functionality was removed years ago,
and state PHY_FORCING isn't needed any longer. Instead we can go from
UP to RUNNING or NOLINK directly (same as in autoneg mode).

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 26 ++------------------------
 include/linux/phy.h   | 11 -----------
 2 files changed, 2 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 0084220d10dc..d9150765009e 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -43,7 +43,6 @@ static const char *phy_state_to_str(enum phy_state st)
 	PHY_STATE_STR(UP)
 	PHY_STATE_STR(RUNNING)
 	PHY_STATE_STR(NOLINK)
-	PHY_STATE_STR(FORCING)
 	PHY_STATE_STR(HALTED)
 	}
 
@@ -577,15 +576,8 @@ int phy_start_aneg(struct phy_device *phydev)
 	if (err < 0)
 		goto out_unlock;
 
-	if (phy_is_started(phydev)) {
-		if (phydev->autoneg == AUTONEG_ENABLE) {
-			err = phy_check_link_status(phydev);
-		} else {
-			phydev->state = PHY_FORCING;
-			phydev->link_timeout = PHY_FORCE_TIMEOUT;
-		}
-	}
-
+	if (phy_is_started(phydev))
+		err = phy_check_link_status(phydev);
 out_unlock:
 	mutex_unlock(&phydev->lock);
 
@@ -951,20 +943,6 @@ void phy_state_machine(struct work_struct *work)
 	case PHY_RUNNING:
 		err = phy_check_link_status(phydev);
 		break;
-	case PHY_FORCING:
-		err = genphy_update_link(phydev);
-		if (err)
-			break;
-
-		if (phydev->link) {
-			phydev->state = PHY_RUNNING;
-			phy_link_up(phydev);
-		} else {
-			if (0 == phydev->link_timeout--)
-				needs_aneg = true;
-			phy_link_down(phydev, false);
-		}
-		break;
 	case PHY_HALTED:
 		if (phydev->link) {
 			phydev->link = 0;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index dc4b51060ebc..8caaf76685cd 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -297,12 +297,6 @@ struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr);
  * - irq or timer will set RUNNING if link comes back
  * - phy_stop moves to HALTED
  *
- * FORCING: PHY is being configured with forced settings
- * - if link is up, move to RUNNING
- * - If link is down, we drop to the next highest setting, and
- *   retry (FORCING) after a timeout
- * - phy_stop moves to HALTED
- *
  * RUNNING: PHY is currently up, running, and possibly sending
  * and/or receiving packets
  * - irq or timer will set NOLINK if link goes down
@@ -319,7 +313,6 @@ enum phy_state {
 	PHY_UP,
 	PHY_RUNNING,
 	PHY_NOLINK,
-	PHY_FORCING,
 };
 
 /**
@@ -347,8 +340,6 @@ struct phy_c45_device_ids {
  * loopback_enabled: Set true if this phy has been loopbacked successfully.
  * state: state of the PHY for management purposes
  * dev_flags: Device-specific flags used by the PHY driver.
- * link_timeout: The number of timer firings to wait before the
- * giving up on the current attempt at acquiring a link
  * irq: IRQ number of the PHY's interrupt (-1 if none)
  * phy_timer: The timer for handling the state machine
  * attached_dev: The attached enet driver's device instance ptr
@@ -416,8 +407,6 @@ struct phy_device {
 	/* Energy efficient ethernet modes which should be prohibited */
 	u32 eee_broken_modes;
 
-	int link_timeout;
-
 #ifdef CONFIG_LED_TRIGGER_PHY
 	struct phy_led_trigger *phy_led_triggers;
 	unsigned int phy_num_led_triggers;
-- 
cgit v1.2.3


From 3aaf3915a31aac83523d2de0191a480d3ad1e747 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 3 May 2019 17:46:17 +0200
Subject: xfrm: remove state and template sort indirections from
 xfrm_state_afinfo

No module dependency, placing this in xfrm_state.c avoids need for
an indirection.

This also removes the state spinlock -- I don't see why we would need
to hold it during sorting.

This in turn allows removing the 'net' argument passed to
xfrm_tmpl_sort.  Last, remove the EXPORT_SYMBOL, there are no modular
callers.

For the CONFIG_IPV6=m case, vmlinux size increase is about 300 bytes.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  18 +++----
 net/ipv6/xfrm6_state.c |  98 ------------------------------------
 net/xfrm/xfrm_policy.c |   2 +-
 net/xfrm/xfrm_state.c  | 132 +++++++++++++++++++++++++++++++++++++++----------
 4 files changed, 113 insertions(+), 137 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 61214f5c3205..4325cb708ed4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -353,8 +353,6 @@ struct xfrm_state_afinfo {
 	const struct xfrm_type		*type_map[IPPROTO_MAX];
 	const struct xfrm_type_offload	*type_offload_map[IPPROTO_MAX];
 
-	int			(*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n);
-	int			(*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n);
 	int			(*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
 	int			(*output_finish)(struct sock *sk, struct sk_buff *skb);
 	int			(*extract_input)(struct xfrm_state *x,
@@ -1501,21 +1499,19 @@ struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark,
 					    u8 proto,
 					    unsigned short family);
 #ifdef CONFIG_XFRM_SUB_POLICY
-int xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
-		   unsigned short family, struct net *net);
-int xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
+void xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
 		    unsigned short family);
+void xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
+		     unsigned short family);
 #else
-static inline int xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src,
-				 int n, unsigned short family, struct net *net)
+static inline void xfrm_tmpl_sort(struct xfrm_tmpl **d, struct xfrm_tmpl **s,
+				  int n, unsigned short family)
 {
-	return -ENOSYS;
 }
 
-static inline int xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src,
-				  int n, unsigned short family)
+static inline void xfrm_state_sort(struct xfrm_state **d, struct xfrm_state **s,
+				   int n, unsigned short family)
 {
-	return -ENOSYS;
 }
 #endif
 
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index aa5d2c52cc31..1782ebb22dd3 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -21,102 +21,6 @@
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 
-/* distribution counting sort function for xfrm_state and xfrm_tmpl */
-static int
-__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
-{
-	int count[XFRM_MAX_DEPTH] = { };
-	int class[XFRM_MAX_DEPTH];
-	int i;
-
-	for (i = 0; i < n; i++) {
-		int c;
-		class[i] = c = cmp(src[i]);
-		count[c]++;
-	}
-
-	for (i = 2; i < maxclass; i++)
-		count[i] += count[i - 1];
-
-	for (i = 0; i < n; i++) {
-		dst[count[class[i] - 1]++] = src[i];
-		src[i] = NULL;
-	}
-
-	return 0;
-}
-
-/*
- * Rule for xfrm_state:
- *
- * rule 1: select IPsec transport except AH
- * rule 2: select MIPv6 RO or inbound trigger
- * rule 3: select IPsec transport AH
- * rule 4: select IPsec tunnel
- * rule 5: others
- */
-static int __xfrm6_state_sort_cmp(void *p)
-{
-	struct xfrm_state *v = p;
-
-	switch (v->props.mode) {
-	case XFRM_MODE_TRANSPORT:
-		if (v->id.proto != IPPROTO_AH)
-			return 1;
-		else
-			return 3;
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
-	case XFRM_MODE_ROUTEOPTIMIZATION:
-	case XFRM_MODE_IN_TRIGGER:
-		return 2;
-#endif
-	case XFRM_MODE_TUNNEL:
-	case XFRM_MODE_BEET:
-		return 4;
-	}
-	return 5;
-}
-
-static int
-__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n)
-{
-	return __xfrm6_sort((void **)dst, (void **)src, n,
-			    __xfrm6_state_sort_cmp, 6);
-}
-
-/*
- * Rule for xfrm_tmpl:
- *
- * rule 1: select IPsec transport
- * rule 2: select MIPv6 RO or inbound trigger
- * rule 3: select IPsec tunnel
- * rule 4: others
- */
-static int __xfrm6_tmpl_sort_cmp(void *p)
-{
-	struct xfrm_tmpl *v = p;
-	switch (v->mode) {
-	case XFRM_MODE_TRANSPORT:
-		return 1;
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
-	case XFRM_MODE_ROUTEOPTIMIZATION:
-	case XFRM_MODE_IN_TRIGGER:
-		return 2;
-#endif
-	case XFRM_MODE_TUNNEL:
-	case XFRM_MODE_BEET:
-		return 3;
-	}
-	return 4;
-}
-
-static int
-__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n)
-{
-	return __xfrm6_sort((void **)dst, (void **)src, n,
-			    __xfrm6_tmpl_sort_cmp, 5);
-}
-
 int xfrm6_extract_header(struct sk_buff *skb)
 {
 	struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -138,8 +42,6 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.proto			= IPPROTO_IPV6,
 	.eth_proto		= htons(ETH_P_IPV6),
 	.owner			= THIS_MODULE,
-	.tmpl_sort		= __xfrm6_tmpl_sort,
-	.state_sort		= __xfrm6_state_sort,
 	.output			= xfrm6_output,
 	.output_finish		= xfrm6_output_finish,
 	.extract_input		= xfrm6_extract_input,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b1694d5d15d3..1070dfece76b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -3628,7 +3628,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		}
 		xfrm_nr = ti;
 		if (npols > 1) {
-			xfrm_tmpl_sort(stp, tpp, xfrm_nr, family, net);
+			xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
 			tpp = stp;
 		}
 
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 5c13a8021d4c..3f0950db060a 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1688,51 +1688,129 @@ xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid,
 EXPORT_SYMBOL(xfrm_find_acq);
 
 #ifdef CONFIG_XFRM_SUB_POLICY
-int
+#if IS_ENABLED(CONFIG_IPV6)
+/* distribution counting sort function for xfrm_state and xfrm_tmpl */
+static void
+__xfrm6_sort(void **dst, void **src, int n,
+	     int (*cmp)(const void *p), int maxclass)
+{
+	int count[XFRM_MAX_DEPTH] = { };
+	int class[XFRM_MAX_DEPTH];
+	int i;
+
+	for (i = 0; i < n; i++) {
+		int c = cmp(src[i]);
+
+		class[i] = c;
+		count[c]++;
+	}
+
+	for (i = 2; i < maxclass; i++)
+		count[i] += count[i - 1];
+
+	for (i = 0; i < n; i++) {
+		dst[count[class[i] - 1]++] = src[i];
+		src[i] = NULL;
+	}
+}
+
+/* Rule for xfrm_state:
+ *
+ * rule 1: select IPsec transport except AH
+ * rule 2: select MIPv6 RO or inbound trigger
+ * rule 3: select IPsec transport AH
+ * rule 4: select IPsec tunnel
+ * rule 5: others
+ */
+static int __xfrm6_state_sort_cmp(const void *p)
+{
+	const struct xfrm_state *v = p;
+
+	switch (v->props.mode) {
+	case XFRM_MODE_TRANSPORT:
+		if (v->id.proto != IPPROTO_AH)
+			return 1;
+		else
+			return 3;
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+	case XFRM_MODE_ROUTEOPTIMIZATION:
+	case XFRM_MODE_IN_TRIGGER:
+		return 2;
+#endif
+	case XFRM_MODE_TUNNEL:
+	case XFRM_MODE_BEET:
+		return 4;
+	}
+	return 5;
+}
+
+/* Rule for xfrm_tmpl:
+ *
+ * rule 1: select IPsec transport
+ * rule 2: select MIPv6 RO or inbound trigger
+ * rule 3: select IPsec tunnel
+ * rule 4: others
+ */
+static int __xfrm6_tmpl_sort_cmp(const void *p)
+{
+	const struct xfrm_tmpl *v = p;
+
+	switch (v->mode) {
+	case XFRM_MODE_TRANSPORT:
+		return 1;
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+	case XFRM_MODE_ROUTEOPTIMIZATION:
+	case XFRM_MODE_IN_TRIGGER:
+		return 2;
+#endif
+	case XFRM_MODE_TUNNEL:
+	case XFRM_MODE_BEET:
+		return 3;
+	}
+	return 4;
+}
+#else
+static inline int __xfrm6_state_sort_cmp(const void *p) { return 5; }
+static inline int __xfrm6_tmpl_sort_cmp(const void *p) { return 4; }
+
+static inline void
+__xfrm6_sort(void **dst, void **src, int n,
+	     int (*cmp)(const void *p), int maxclass)
+{
+	int i;
+
+	for (i = 0; i < n; i++)
+		dst[i] = src[i];
+}
+#endif /* CONFIG_IPV6 */
+
+void
 xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
-	       unsigned short family, struct net *net)
+	       unsigned short family)
 {
 	int i;
-	int err = 0;
-	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
-	if (!afinfo)
-		return -EAFNOSUPPORT;
 
-	spin_lock_bh(&net->xfrm.xfrm_state_lock); /*FIXME*/
-	if (afinfo->tmpl_sort)
-		err = afinfo->tmpl_sort(dst, src, n);
+	if (family == AF_INET6)
+		__xfrm6_sort((void **)dst, (void **)src, n,
+			     __xfrm6_tmpl_sort_cmp, 5);
 	else
 		for (i = 0; i < n; i++)
 			dst[i] = src[i];
-	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
-	rcu_read_unlock();
-	return err;
 }
-EXPORT_SYMBOL(xfrm_tmpl_sort);
 
-int
+void
 xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
 		unsigned short family)
 {
 	int i;
-	int err = 0;
-	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
-	struct net *net = xs_net(*src);
 
-	if (!afinfo)
-		return -EAFNOSUPPORT;
-
-	spin_lock_bh(&net->xfrm.xfrm_state_lock);
-	if (afinfo->state_sort)
-		err = afinfo->state_sort(dst, src, n);
+	if (family == AF_INET6)
+		__xfrm6_sort((void **)dst, (void **)src, n,
+			     __xfrm6_state_sort_cmp, 6);
 	else
 		for (i = 0; i < n; i++)
 			dst[i] = src[i];
-	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
-	rcu_read_unlock();
-	return err;
 }
-EXPORT_SYMBOL(xfrm_state_sort);
 #endif
 
 /* Silly enough, but I'm lazy to build resolution list */
-- 
cgit v1.2.3


From 4c203b0454b5b6bfafe2c4ab1b5472d4a7a8a0f2 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 3 May 2019 17:46:18 +0200
Subject: xfrm: remove eth_proto value from xfrm_state_afinfo

xfrm_prepare_input needs to look up the state afinfo backend again to fetch
the address family ethernet protocol value.

There are only two address families, so a switch statement is simpler.
While at it, use u8 for family and proto and remove the owner member --
it's not used anywhere.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  6 ++----
 net/ipv4/xfrm4_state.c |  2 --
 net/ipv6/xfrm6_state.c |  2 --
 net/xfrm/xfrm_input.c  | 24 ++++++++++++------------
 4 files changed, 14 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4325cb708ed4..812994ad49ac 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -346,10 +346,8 @@ void km_state_expired(struct xfrm_state *x, int hard, u32 portid);
 int __xfrm_state_delete(struct xfrm_state *x);
 
 struct xfrm_state_afinfo {
-	unsigned int			family;
-	unsigned int			proto;
-	__be16				eth_proto;
-	struct module			*owner;
+	u8				family;
+	u8				proto;
 	const struct xfrm_type		*type_map[IPPROTO_MAX];
 	const struct xfrm_type_offload	*type_offload_map[IPPROTO_MAX];
 
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 62c96da38b4e..f8ed3c3bb928 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -34,8 +34,6 @@ int xfrm4_extract_header(struct sk_buff *skb)
 static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.family			= AF_INET,
 	.proto			= IPPROTO_IPIP,
-	.eth_proto		= htons(ETH_P_IP),
-	.owner			= THIS_MODULE,
 	.output			= xfrm4_output,
 	.output_finish		= xfrm4_output_finish,
 	.extract_input		= xfrm4_extract_input,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 1782ebb22dd3..78daadecbdef 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -40,8 +40,6 @@ int xfrm6_extract_header(struct sk_buff *skb)
 static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.family			= AF_INET6,
 	.proto			= IPPROTO_IPV6,
-	.eth_proto		= htons(ETH_P_IPV6),
-	.owner			= THIS_MODULE,
 	.output			= xfrm6_output,
 	.output_finish		= xfrm6_output_finish,
 	.extract_input		= xfrm6_extract_input,
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 314973aaa414..8a00cc94c32c 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -359,28 +359,29 @@ static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
 	afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family);
 	if (likely(afinfo))
 		err = afinfo->extract_input(x, skb);
+	rcu_read_unlock();
 
-	if (err) {
-		rcu_read_unlock();
+	if (err)
 		return err;
-	}
 
 	if (x->sel.family == AF_UNSPEC) {
 		inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
-		if (!inner_mode) {
-			rcu_read_unlock();
+		if (!inner_mode)
 			return -EAFNOSUPPORT;
-		}
 	}
 
-	afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
-	if (unlikely(!afinfo)) {
-		rcu_read_unlock();
-		return -EAFNOSUPPORT;
+	switch (inner_mode->family) {
+	case AF_INET:
+		skb->protocol = htons(ETH_P_IP);
+		break;
+	case AF_INET6:
+		skb->protocol = htons(ETH_P_IPV6);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
 	}
 
-	skb->protocol = afinfo->eth_proto;
-	rcu_read_unlock();
 	return xfrm_inner_mode_encap_remove(x, inner_mode, skb);
 }
 
-- 
cgit v1.2.3


From 4f518e802ccad30c9dccc895f2294398757b87c0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 3 May 2019 17:46:19 +0200
Subject: xfrm: remove type and offload_type map from xfrm_state_afinfo

Only a handful of xfrm_types exist, no need to have 512 pointers for them.

Reduces size of afinfo struct from 4k to 120 bytes on 64bit platforms.

Also, the unregister function doesn't need to return an error, no single
caller does anything useful with it.

Just place a WARN_ON() where needed instead.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h      |  16 +++--
 net/ipv4/ah4.c          |   3 +-
 net/ipv4/esp4.c         |   3 +-
 net/ipv4/esp4_offload.c |   4 +-
 net/ipv4/ipcomp.c       |   3 +-
 net/ipv4/xfrm4_tunnel.c |   3 +-
 net/ipv6/ah6.c          |   4 +-
 net/ipv6/esp6.c         |   3 +-
 net/ipv6/esp6_offload.c |   4 +-
 net/ipv6/ipcomp6.c      |   3 +-
 net/ipv6/mip6.c         |   6 +-
 net/xfrm/xfrm_state.c   | 179 ++++++++++++++++++++++++++++++++++--------------
 12 files changed, 150 insertions(+), 81 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 812994ad49ac..56b31676e330 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -348,8 +348,16 @@ int __xfrm_state_delete(struct xfrm_state *x);
 struct xfrm_state_afinfo {
 	u8				family;
 	u8				proto;
-	const struct xfrm_type		*type_map[IPPROTO_MAX];
-	const struct xfrm_type_offload	*type_offload_map[IPPROTO_MAX];
+
+	const struct xfrm_type_offload *type_offload_esp;
+
+	const struct xfrm_type		*type_esp;
+	const struct xfrm_type		*type_ipip;
+	const struct xfrm_type		*type_ipip6;
+	const struct xfrm_type		*type_comp;
+	const struct xfrm_type		*type_ah;
+	const struct xfrm_type		*type_routing;
+	const struct xfrm_type		*type_dstopts;
 
 	int			(*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
 	int			(*output_finish)(struct sock *sk, struct sk_buff *skb);
@@ -401,7 +409,7 @@ struct xfrm_type {
 };
 
 int xfrm_register_type(const struct xfrm_type *type, unsigned short family);
-int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family);
+void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family);
 
 struct xfrm_type_offload {
 	char		*description;
@@ -413,7 +421,7 @@ struct xfrm_type_offload {
 };
 
 int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family);
-int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
+void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 
 static inline int xfrm_af2proto(unsigned int family)
 {
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 9c3afd550612..974179b3b314 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -590,8 +590,7 @@ static void __exit ah4_fini(void)
 {
 	if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
-	if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
-		pr_info("%s: can't remove xfrm type\n", __func__);
+	xfrm_unregister_type(&ah_type, AF_INET);
 }
 
 module_init(ah4_init);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b9ae95576084..c06562aded11 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1066,8 +1066,7 @@ static void __exit esp4_fini(void)
 {
 	if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
-	if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
-		pr_info("%s: can't remove xfrm type\n", __func__);
+	xfrm_unregister_type(&esp_type, AF_INET);
 }
 
 module_init(esp4_init);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 8edcfa66d1e5..6e5288aef71e 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -315,9 +315,7 @@ static int __init esp4_offload_init(void)
 
 static void __exit esp4_offload_exit(void)
 {
-	if (xfrm_unregister_type_offload(&esp_type_offload, AF_INET) < 0)
-		pr_info("%s: can't remove xfrm type offload\n", __func__);
-
+	xfrm_unregister_type_offload(&esp_type_offload, AF_INET);
 	inet_del_offload(&esp4_offload, IPPROTO_ESP);
 }
 
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 9119d012ba46..ee03f0a55152 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -190,8 +190,7 @@ static void __exit ipcomp4_fini(void)
 {
 	if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
-	if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
-		pr_info("%s: can't remove xfrm type\n", __func__);
+	xfrm_unregister_type(&ipcomp_type, AF_INET);
 }
 
 module_init(ipcomp4_init);
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 5d00e54cd319..dc19aff7c2e0 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -108,8 +108,7 @@ static void __exit ipip_fini(void)
 	if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET))
 		pr_info("%s: can't remove xfrm handler for AF_INET\n",
 			__func__);
-	if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
-		pr_info("%s: can't remove xfrm type\n", __func__);
+	xfrm_unregister_type(&ipip_type, AF_INET);
 }
 
 module_init(ipip_init);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 68b9e92e469e..25e1172fd1c3 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -793,9 +793,7 @@ static void __exit ah6_fini(void)
 	if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
 
-	if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0)
-		pr_info("%s: can't remove xfrm type\n", __func__);
-
+	xfrm_unregister_type(&ah6_type, AF_INET6);
 }
 
 module_init(ah6_init);
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index ae6a739c5f52..b6c6b3e08836 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -951,8 +951,7 @@ static void __exit esp6_fini(void)
 {
 	if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
-	if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0)
-		pr_info("%s: can't remove xfrm type\n", __func__);
+	xfrm_unregister_type(&esp6_type, AF_INET6);
 }
 
 module_init(esp6_init);
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index d453cf417b03..f2c8f7103332 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -339,9 +339,7 @@ static int __init esp6_offload_init(void)
 
 static void __exit esp6_offload_exit(void)
 {
-	if (xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6) < 0)
-		pr_info("%s: can't remove xfrm type offload\n", __func__);
-
+	xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6);
 	inet6_del_offload(&esp6_offload, IPPROTO_ESP);
 }
 
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 51fd33294c7c..3752bd3e92ce 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -206,8 +206,7 @@ static void __exit ipcomp6_fini(void)
 {
 	if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
-	if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0)
-		pr_info("%s: can't remove xfrm type\n", __func__);
+	xfrm_unregister_type(&ipcomp6_type, AF_INET6);
 }
 
 module_init(ipcomp6_init);
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index 91801432878c..878fcec14949 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -499,10 +499,8 @@ static void __exit mip6_fini(void)
 {
 	if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0)
 		pr_info("%s: can't remove rawv6 mh filter\n", __func__);
-	if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0)
-		pr_info("%s: can't remove xfrm type(rthdr)\n", __func__);
-	if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0)
-		pr_info("%s: can't remove xfrm type(destopt)\n", __func__);
+	xfrm_unregister_type(&mip6_rthdr_type, AF_INET6);
+	xfrm_unregister_type(&mip6_destopt_type, AF_INET6);
 }
 
 module_init(mip6_init);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 3f0950db060a..fd51737f9f17 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -177,63 +177,132 @@ int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol)
 static bool km_is_alive(const struct km_event *c);
 void km_state_expired(struct xfrm_state *x, int hard, u32 portid);
 
-static DEFINE_SPINLOCK(xfrm_type_lock);
 int xfrm_register_type(const struct xfrm_type *type, unsigned short family)
 {
 	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
-	const struct xfrm_type **typemap;
 	int err = 0;
 
-	if (unlikely(afinfo == NULL))
+	if (!afinfo)
 		return -EAFNOSUPPORT;
-	typemap = afinfo->type_map;
-	spin_lock_bh(&xfrm_type_lock);
 
-	if (likely(typemap[type->proto] == NULL))
-		typemap[type->proto] = type;
-	else
-		err = -EEXIST;
-	spin_unlock_bh(&xfrm_type_lock);
+#define X(afi, T, name) do {			\
+		WARN_ON((afi)->type_ ## name);	\
+		(afi)->type_ ## name = (T);	\
+	} while (0)
+
+	switch (type->proto) {
+	case IPPROTO_COMP:
+		X(afinfo, type, comp);
+		break;
+	case IPPROTO_AH:
+		X(afinfo, type, ah);
+		break;
+	case IPPROTO_ESP:
+		X(afinfo, type, esp);
+		break;
+	case IPPROTO_IPIP:
+		X(afinfo, type, ipip);
+		break;
+	case IPPROTO_DSTOPTS:
+		X(afinfo, type, dstopts);
+		break;
+	case IPPROTO_ROUTING:
+		X(afinfo, type, routing);
+		break;
+	case IPPROTO_IPV6:
+		X(afinfo, type, ipip6);
+		break;
+	default:
+		WARN_ON(1);
+		err = -EPROTONOSUPPORT;
+		break;
+	}
+#undef X
 	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_register_type);
 
-int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family)
+void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family)
 {
 	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
-	const struct xfrm_type **typemap;
-	int err = 0;
 
 	if (unlikely(afinfo == NULL))
-		return -EAFNOSUPPORT;
-	typemap = afinfo->type_map;
-	spin_lock_bh(&xfrm_type_lock);
+		return;
 
-	if (unlikely(typemap[type->proto] != type))
-		err = -ENOENT;
-	else
-		typemap[type->proto] = NULL;
-	spin_unlock_bh(&xfrm_type_lock);
+#define X(afi, T, name) do {				\
+		WARN_ON((afi)->type_ ## name != (T));	\
+		(afi)->type_ ## name = NULL;		\
+	} while (0)
+
+	switch (type->proto) {
+	case IPPROTO_COMP:
+		X(afinfo, type, comp);
+		break;
+	case IPPROTO_AH:
+		X(afinfo, type, ah);
+		break;
+	case IPPROTO_ESP:
+		X(afinfo, type, esp);
+		break;
+	case IPPROTO_IPIP:
+		X(afinfo, type, ipip);
+		break;
+	case IPPROTO_DSTOPTS:
+		X(afinfo, type, dstopts);
+		break;
+	case IPPROTO_ROUTING:
+		X(afinfo, type, routing);
+		break;
+	case IPPROTO_IPV6:
+		X(afinfo, type, ipip6);
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+#undef X
 	rcu_read_unlock();
-	return err;
 }
 EXPORT_SYMBOL(xfrm_unregister_type);
 
 static const struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
 {
+	const struct xfrm_type *type = NULL;
 	struct xfrm_state_afinfo *afinfo;
-	const struct xfrm_type **typemap;
-	const struct xfrm_type *type;
 	int modload_attempted = 0;
 
 retry:
 	afinfo = xfrm_state_get_afinfo(family);
 	if (unlikely(afinfo == NULL))
 		return NULL;
-	typemap = afinfo->type_map;
 
-	type = READ_ONCE(typemap[proto]);
+	switch (proto) {
+	case IPPROTO_COMP:
+		type = afinfo->type_comp;
+		break;
+	case IPPROTO_AH:
+		type = afinfo->type_ah;
+		break;
+	case IPPROTO_ESP:
+		type = afinfo->type_esp;
+		break;
+	case IPPROTO_IPIP:
+		type = afinfo->type_ipip;
+		break;
+	case IPPROTO_DSTOPTS:
+		type = afinfo->type_dstopts;
+		break;
+	case IPPROTO_ROUTING:
+		type = afinfo->type_routing;
+		break;
+	case IPPROTO_IPV6:
+		type = afinfo->type_ipip6;
+		break;
+	default:
+		break;
+	}
+
 	if (unlikely(type && !try_module_get(type->owner)))
 		type = NULL;
 
@@ -253,65 +322,71 @@ static void xfrm_put_type(const struct xfrm_type *type)
 	module_put(type->owner);
 }
 
-static DEFINE_SPINLOCK(xfrm_type_offload_lock);
 int xfrm_register_type_offload(const struct xfrm_type_offload *type,
 			       unsigned short family)
 {
 	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
-	const struct xfrm_type_offload **typemap;
 	int err = 0;
 
 	if (unlikely(afinfo == NULL))
 		return -EAFNOSUPPORT;
-	typemap = afinfo->type_offload_map;
-	spin_lock_bh(&xfrm_type_offload_lock);
 
-	if (likely(typemap[type->proto] == NULL))
-		typemap[type->proto] = type;
-	else
-		err = -EEXIST;
-	spin_unlock_bh(&xfrm_type_offload_lock);
+	switch (type->proto) {
+	case IPPROTO_ESP:
+		WARN_ON(afinfo->type_offload_esp);
+		afinfo->type_offload_esp = type;
+		break;
+	default:
+		WARN_ON(1);
+		err = -EPROTONOSUPPORT;
+		break;
+	}
+
 	rcu_read_unlock();
 	return err;
 }
 EXPORT_SYMBOL(xfrm_register_type_offload);
 
-int xfrm_unregister_type_offload(const struct xfrm_type_offload *type,
-				 unsigned short family)
+void xfrm_unregister_type_offload(const struct xfrm_type_offload *type,
+				  unsigned short family)
 {
 	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
-	const struct xfrm_type_offload **typemap;
-	int err = 0;
 
 	if (unlikely(afinfo == NULL))
-		return -EAFNOSUPPORT;
-	typemap = afinfo->type_offload_map;
-	spin_lock_bh(&xfrm_type_offload_lock);
+		return;
 
-	if (unlikely(typemap[type->proto] != type))
-		err = -ENOENT;
-	else
-		typemap[type->proto] = NULL;
-	spin_unlock_bh(&xfrm_type_offload_lock);
+	switch (type->proto) {
+	case IPPROTO_ESP:
+		WARN_ON(afinfo->type_offload_esp != type);
+		afinfo->type_offload_esp = NULL;
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
 	rcu_read_unlock();
-	return err;
 }
 EXPORT_SYMBOL(xfrm_unregister_type_offload);
 
 static const struct xfrm_type_offload *
 xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load)
 {
+	const struct xfrm_type_offload *type = NULL;
 	struct xfrm_state_afinfo *afinfo;
-	const struct xfrm_type_offload **typemap;
-	const struct xfrm_type_offload *type;
 
 retry:
 	afinfo = xfrm_state_get_afinfo(family);
 	if (unlikely(afinfo == NULL))
 		return NULL;
-	typemap = afinfo->type_offload_map;
 
-	type = typemap[proto];
+	switch (proto) {
+	case IPPROTO_ESP:
+		type = afinfo->type_offload_esp;
+		break;
+	default:
+		break;
+	}
+
 	if ((type && !try_module_get(type->owner)))
 		type = NULL;
 
-- 
cgit v1.2.3


From fe1e0713bbe84c6796b01d50a55ce4e324141161 Mon Sep 17 00:00:00 2001
From: Litao jiao <jiaolitao@raisecom.com>
Date: Thu, 6 Jun 2019 17:57:58 +0800
Subject: vxlan: Use FDB_HASH_SIZE hash_locks to reduce contention

The monolithic hash_lock could cause huge contention when
inserting/deleting vxlan_fdbs into the fdb_head.

Use FDB_HASH_SIZE hash_locks to protect insertions/deletions
of vxlan_fdbs into the fdb_head hash table.

Suggested-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Litao jiao <jiaolitao@raisecom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 92 ++++++++++++++++++++++++++++++++++-------------------
 include/net/vxlan.h |  2 +-
 2 files changed, 60 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index f31d226b5f2a..75056b95b31f 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -471,14 +471,19 @@ static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
 	return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1);
 }
 
+static u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni)
+{
+	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)
+		return eth_vni_hash(mac, vni);
+	else
+		return eth_hash(mac);
+}
+
 /* Hash chain to use given mac address */
 static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
 						const u8 *mac, __be32 vni)
 {
-	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)
-		return &vxlan->fdb_head[eth_vni_hash(mac, vni)];
-	else
-		return &vxlan->fdb_head[eth_hash(mac)];
+	return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)];
 }
 
 /* Look up Ethernet address in forwarding table */
@@ -593,8 +598,8 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni,
 		return -EINVAL;
 	vxlan = netdev_priv(dev);
 
-	spin_lock_bh(&vxlan->hash_lock);
 	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		spin_lock_bh(&vxlan->hash_lock[h]);
 		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
 			if (f->vni == vni) {
 				list_for_each_entry(rdst, &f->remotes, list) {
@@ -602,14 +607,16 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni,
 								  f, rdst,
 								  extack);
 					if (rc)
-						goto out;
+						goto unlock;
 				}
 			}
 		}
+		spin_unlock_bh(&vxlan->hash_lock[h]);
 	}
+	return 0;
 
-out:
-	spin_unlock_bh(&vxlan->hash_lock);
+unlock:
+	spin_unlock_bh(&vxlan->hash_lock[h]);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(vxlan_fdb_replay);
@@ -625,14 +632,15 @@ void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni)
 		return;
 	vxlan = netdev_priv(dev);
 
-	spin_lock_bh(&vxlan->hash_lock);
 	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		spin_lock_bh(&vxlan->hash_lock[h]);
 		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist)
 			if (f->vni == vni)
 				list_for_each_entry(rdst, &f->remotes, list)
 					rdst->offloaded = false;
+		spin_unlock_bh(&vxlan->hash_lock[h]);
 	}
-	spin_unlock_bh(&vxlan->hash_lock);
+
 }
 EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload);
 
@@ -1108,6 +1116,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	__be16 port;
 	__be32 src_vni, vni;
 	u32 ifindex;
+	u32 hash_index;
 	int err;
 
 	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
@@ -1126,12 +1135,13 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family)
 		return -EAFNOSUPPORT;
 
-	spin_lock_bh(&vxlan->hash_lock);
+	hash_index = fdb_head_index(vxlan, addr, src_vni);
+	spin_lock_bh(&vxlan->hash_lock[hash_index]);
 	err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
 			       port, src_vni, vni, ifindex,
 			       ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
 			       true, extack);
-	spin_unlock_bh(&vxlan->hash_lock);
+	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 
 	return err;
 }
@@ -1179,16 +1189,18 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 	__be32 src_vni, vni;
 	__be16 port;
 	u32 ifindex;
+	u32 hash_index;
 	int err;
 
 	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
 	if (err)
 		return err;
 
-	spin_lock_bh(&vxlan->hash_lock);
+	hash_index = fdb_head_index(vxlan, addr, src_vni);
+	spin_lock_bh(&vxlan->hash_lock[hash_index]);
 	err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex,
 				 true);
-	spin_unlock_bh(&vxlan->hash_lock);
+	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 
 	return err;
 }
@@ -1300,8 +1312,10 @@ static bool vxlan_snoop(struct net_device *dev,
 		f->updated = jiffies;
 		vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL);
 	} else {
+		u32 hash_index = fdb_head_index(vxlan, src_mac, vni);
+
 		/* learned new entry */
-		spin_lock(&vxlan->hash_lock);
+		spin_lock(&vxlan->hash_lock[hash_index]);
 
 		/* close off race between vxlan_flush and incoming packets */
 		if (netif_running(dev))
@@ -1312,7 +1326,7 @@ static bool vxlan_snoop(struct net_device *dev,
 					 vni,
 					 vxlan->default_dst.remote_vni,
 					 ifindex, NTF_SELF, true, NULL);
-		spin_unlock(&vxlan->hash_lock);
+		spin_unlock(&vxlan->hash_lock[hash_index]);
 	}
 
 	return false;
@@ -2702,7 +2716,7 @@ static void vxlan_cleanup(struct timer_list *t)
 	for (h = 0; h < FDB_HASH_SIZE; ++h) {
 		struct hlist_node *p, *n;
 
-		spin_lock(&vxlan->hash_lock);
+		spin_lock(&vxlan->hash_lock[h]);
 		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
 			struct vxlan_fdb *f
 				= container_of(p, struct vxlan_fdb, hlist);
@@ -2724,7 +2738,7 @@ static void vxlan_cleanup(struct timer_list *t)
 			} else if (time_before(timeout, next_timer))
 				next_timer = timeout;
 		}
-		spin_unlock(&vxlan->hash_lock);
+		spin_unlock(&vxlan->hash_lock[h]);
 	}
 
 	mod_timer(&vxlan->age_timer, next_timer);
@@ -2767,12 +2781,13 @@ static int vxlan_init(struct net_device *dev)
 static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni)
 {
 	struct vxlan_fdb *f;
+	u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni);
 
-	spin_lock_bh(&vxlan->hash_lock);
+	spin_lock_bh(&vxlan->hash_lock[hash_index]);
 	f = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
 	if (f)
 		vxlan_fdb_destroy(vxlan, f, true, true);
-	spin_unlock_bh(&vxlan->hash_lock);
+	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 }
 
 static void vxlan_uninit(struct net_device *dev)
@@ -2817,9 +2832,10 @@ static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all)
 {
 	unsigned int h;
 
-	spin_lock_bh(&vxlan->hash_lock);
 	for (h = 0; h < FDB_HASH_SIZE; ++h) {
 		struct hlist_node *p, *n;
+
+		spin_lock_bh(&vxlan->hash_lock[h]);
 		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
 			struct vxlan_fdb *f
 				= container_of(p, struct vxlan_fdb, hlist);
@@ -2829,8 +2845,8 @@ static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all)
 			if (!is_zero_ether_addr(f->eth_addr))
 				vxlan_fdb_destroy(vxlan, f, true, true);
 		}
+		spin_unlock_bh(&vxlan->hash_lock[h]);
 	}
-	spin_unlock_bh(&vxlan->hash_lock);
 }
 
 /* Cleanup timer and forwarding table on shutdown */
@@ -3014,7 +3030,6 @@ static void vxlan_setup(struct net_device *dev)
 	dev->max_mtu = ETH_MAX_MTU;
 
 	INIT_LIST_HEAD(&vxlan->next);
-	spin_lock_init(&vxlan->hash_lock);
 
 	timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);
 
@@ -3022,8 +3037,10 @@ static void vxlan_setup(struct net_device *dev)
 
 	gro_cells_init(&vxlan->gro_cells, dev);
 
-	for (h = 0; h < FDB_HASH_SIZE; ++h)
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		spin_lock_init(&vxlan->hash_lock[h]);
 		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
+	}
 }
 
 static void vxlan_ether_setup(struct net_device *dev)
@@ -3917,7 +3934,9 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 
 	/* handle default dst entry */
 	if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) {
-		spin_lock_bh(&vxlan->hash_lock);
+		u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni);
+
+		spin_lock_bh(&vxlan->hash_lock[hash_index]);
 		if (!vxlan_addr_any(&conf.remote_ip)) {
 			err = vxlan_fdb_update(vxlan, all_zeros_mac,
 					       &conf.remote_ip,
@@ -3928,7 +3947,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 					       conf.remote_ifindex,
 					       NTF_SELF, true, extack);
 			if (err) {
-				spin_unlock_bh(&vxlan->hash_lock);
+				spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 				return err;
 			}
 		}
@@ -3940,7 +3959,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 					   dst->remote_vni,
 					   dst->remote_ifindex,
 					   true);
-		spin_unlock_bh(&vxlan->hash_lock);
+		spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 	}
 
 	if (conf.age_interval != vxlan->cfg.age_interval)
@@ -4195,8 +4214,11 @@ vxlan_fdb_offloaded_set(struct net_device *dev,
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_rdst *rdst;
 	struct vxlan_fdb *f;
+	u32 hash_index;
+
+	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
 
-	spin_lock_bh(&vxlan->hash_lock);
+	spin_lock_bh(&vxlan->hash_lock[hash_index]);
 
 	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
 	if (!f)
@@ -4212,7 +4234,7 @@ vxlan_fdb_offloaded_set(struct net_device *dev,
 	rdst->offloaded = fdb_info->offloaded;
 
 out:
-	spin_unlock_bh(&vxlan->hash_lock);
+	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 }
 
 static int
@@ -4221,11 +4243,13 @@ vxlan_fdb_external_learn_add(struct net_device *dev,
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct netlink_ext_ack *extack;
+	u32 hash_index;
 	int err;
 
+	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
 	extack = switchdev_notifier_info_to_extack(&fdb_info->info);
 
-	spin_lock_bh(&vxlan->hash_lock);
+	spin_lock_bh(&vxlan->hash_lock[hash_index]);
 	err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip,
 			       NUD_REACHABLE,
 			       NLM_F_CREATE | NLM_F_REPLACE,
@@ -4235,7 +4259,7 @@ vxlan_fdb_external_learn_add(struct net_device *dev,
 			       fdb_info->remote_ifindex,
 			       NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
 			       false, extack);
-	spin_unlock_bh(&vxlan->hash_lock);
+	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 
 	return err;
 }
@@ -4246,9 +4270,11 @@ vxlan_fdb_external_learn_del(struct net_device *dev,
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f;
+	u32 hash_index;
 	int err = 0;
 
-	spin_lock_bh(&vxlan->hash_lock);
+	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
+	spin_lock_bh(&vxlan->hash_lock[hash_index]);
 
 	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
 	if (!f)
@@ -4262,7 +4288,7 @@ vxlan_fdb_external_learn_del(struct net_device *dev,
 					 fdb_info->remote_ifindex,
 					 false);
 
-	spin_unlock_bh(&vxlan->hash_lock);
+	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 
 	return err;
 }
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 83b5999a2587..dc1583a1fb8a 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -242,7 +242,7 @@ struct vxlan_dev {
 	struct vxlan_rdst default_dst;	/* default destination */
 
 	struct timer_list age_timer;
-	spinlock_t	  hash_lock;
+	spinlock_t	  hash_lock[FDB_HASH_SIZE];
 	unsigned int	  addrcnt;
 	struct gro_cells  gro_cells;
 
-- 
cgit v1.2.3


From ca72efb6bdc733006f335ca12ca615395077a873 Mon Sep 17 00:00:00 2001
From: Robert Hancock <hancock@sedsystems.ca>
Date: Tue, 4 Jun 2019 16:15:01 -0600
Subject: net: phy: Add detection of 1000BaseX link mode support

Add 1000BaseX to the link modes which are detected based on the
MII_ESTATUS register as per 802.3 Clause 22. This allows PHYs which
support 1000BaseX to work properly with drivers using phylink.

Previously 1000BaseX support was not detected, and if that was the only
mode the PHY indicated support for, phylink would refuse to attach it
due to the list of supported modes being empty.

Signed-off-by: Robert Hancock <hancock@sedsystems.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 3 +++
 include/uapi/linux/mii.h     | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 2c879ba01f35..03c885ec1f98 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1930,6 +1930,9 @@ int genphy_config_init(struct phy_device *phydev)
 		if (val & ESTATUS_1000_THALF)
 			linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT,
 					 features);
+		if (val & ESTATUS_1000_XFULL)
+			linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT,
+					 features);
 	}
 
 	linkmode_and(phydev->supported, phydev->supported, features);
diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h
index a506216591d6..51b48e4be1f2 100644
--- a/include/uapi/linux/mii.h
+++ b/include/uapi/linux/mii.h
@@ -121,6 +121,8 @@
 #define EXPANSION_MFAULTS	0x0010	/* Multiple faults detected    */
 #define EXPANSION_RESV		0xffe0	/* Unused...                   */
 
+#define ESTATUS_1000_XFULL	0x8000	/* Can do 1000BaseX Full       */
+#define ESTATUS_1000_XHALF	0x4000	/* Can do 1000BaseX Half       */
 #define ESTATUS_1000_TFULL	0x2000	/* Can do 1000BT Full          */
 #define ESTATUS_1000_THALF	0x1000	/* Can do 1000BT Half          */
 
-- 
cgit v1.2.3


From 2d6b51c6924c1bba8e4948dc4a2dbc96bf685b97 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 5 Jun 2019 14:11:38 -0700
Subject: net/tls: split the TLS_DRIVER_STATE_SIZE and bump TX to 16 bytes

8 bytes of driver state has been enough so far, but for drivers
which have to store an 8 byte handle it's no longer practical to
store the state directly in the context.

Drivers generally don't need much extra state on RX side, while
TX side has to be tracking TCP sequence numbers.  Split the
lengths of max driver state size on RX and TX.

The struct tls_offload_context_tx currently stands at 616 bytes and
struct tls_offload_context_rx stands at 368 bytes.  Upcoming work
will consume an extra 8 bytes in both for kernel-driven resync.
This means that we can bump TX side to 16 bytes and still fit
into the same number of cache lines but on RX side we would be 8
bytes over.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 0a0072636009..3094db5398a9 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -202,12 +202,12 @@ struct tls_offload_context_tx {
 	 * Currently the belief is that there is not enough
 	 * driver specific state to justify another layer of indirection
 	 */
-#define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *)))
+#define TLS_DRIVER_STATE_SIZE_TX	16
 };
 
 #define TLS_OFFLOAD_CONTEXT_SIZE_TX                                            \
 	(ALIGN(sizeof(struct tls_offload_context_tx), sizeof(void *)) +        \
-	 TLS_DRIVER_STATE_SIZE)
+	 TLS_DRIVER_STATE_SIZE_TX)
 
 struct cipher_context {
 	char *iv;
@@ -307,11 +307,12 @@ struct tls_offload_context_rx {
 	 * Currently the belief is that there is not enough
 	 * driver specific state to justify another layer of indirection
 	 */
+#define TLS_DRIVER_STATE_SIZE_RX	8
 };
 
 #define TLS_OFFLOAD_CONTEXT_SIZE_RX					\
 	(ALIGN(sizeof(struct tls_offload_context_rx), sizeof(void *)) + \
-	 TLS_DRIVER_STATE_SIZE)
+	 TLS_DRIVER_STATE_SIZE_RX)
 
 int wait_on_pending_writer(struct sock *sk, long *timeo);
 int tls_sk_query(struct sock *sk, int optname, char __user *optval,
-- 
cgit v1.2.3


From 2e361176ea01c4b2f90b17c2264a2a794050c3f3 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 5 Jun 2019 14:11:39 -0700
Subject: net/tls: simplify driver context retrieval

Currently drivers have to ensure the alignment of their tls state
structure, which leads to unnecessary layers of getters and
encapsulated structures in each driver.

Simplify all this by marking the driver state as aligned (driver_state
members are currently aligned, so no hole is added; besides, the ALIGN in
TLS_OFFLOAD_CONTEXT_SIZE_RX/TX would reserve this extra space anyway).
With that we can add a common accessor to the core.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 3094db5398a9..3da0d941e729 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -40,6 +40,7 @@
 #include <linux/socket.h>
 #include <linux/tcp.h>
 #include <linux/skmsg.h>
+#include <linux/netdevice.h>
 
 #include <net/tcp.h>
 #include <net/strparser.h>
@@ -197,7 +198,7 @@ struct tls_offload_context_tx {
 
 	struct scatterlist sg_tx_data[MAX_SKB_FRAGS];
 	void (*sk_destruct)(struct sock *sk);
-	u8 driver_state[];
+	u8 driver_state[] __aligned(8);
 	/* The TLS layer reserves room for driver specific state
 	 * Currently the belief is that there is not enough
 	 * driver specific state to justify another layer of indirection
@@ -206,8 +207,7 @@ struct tls_offload_context_tx {
 };
 
 #define TLS_OFFLOAD_CONTEXT_SIZE_TX                                            \
-	(ALIGN(sizeof(struct tls_offload_context_tx), sizeof(void *)) +        \
-	 TLS_DRIVER_STATE_SIZE_TX)
+	(sizeof(struct tls_offload_context_tx) + TLS_DRIVER_STATE_SIZE_TX)
 
 struct cipher_context {
 	char *iv;
@@ -302,7 +302,7 @@ struct tls_offload_context_rx {
 	/* sw must be the first member of tls_offload_context_rx */
 	struct tls_sw_context_rx sw;
 	atomic64_t resync_req;
-	u8 driver_state[];
+	u8 driver_state[] __aligned(8);
 	/* The TLS layer reserves room for driver specific state
 	 * Currently the belief is that there is not enough
 	 * driver specific state to justify another layer of indirection
@@ -311,8 +311,7 @@ struct tls_offload_context_rx {
 };
 
 #define TLS_OFFLOAD_CONTEXT_SIZE_RX					\
-	(ALIGN(sizeof(struct tls_offload_context_rx), sizeof(void *)) + \
-	 TLS_DRIVER_STATE_SIZE_RX)
+	(sizeof(struct tls_offload_context_rx) + TLS_DRIVER_STATE_SIZE_RX)
 
 int wait_on_pending_writer(struct sock *sk, long *timeo);
 int tls_sk_query(struct sock *sk, int optname, char __user *optval,
@@ -557,6 +556,23 @@ tls_offload_ctx_rx(const struct tls_context *tls_ctx)
 	return (struct tls_offload_context_rx *)tls_ctx->priv_ctx_rx;
 }
 
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+static inline void *__tls_driver_ctx(struct tls_context *tls_ctx,
+				     enum tls_offload_ctx_dir direction)
+{
+	if (direction == TLS_OFFLOAD_CTX_DIR_TX)
+		return tls_offload_ctx_tx(tls_ctx)->driver_state;
+	else
+		return tls_offload_ctx_rx(tls_ctx)->driver_state;
+}
+
+static inline void *
+tls_driver_ctx(const struct sock *sk, enum tls_offload_ctx_dir direction)
+{
+	return __tls_driver_ctx(tls_get_ctx(sk), direction);
+}
+#endif
+
 /* The TLS context is valid until sk_destruct is called */
 static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq)
 {
-- 
cgit v1.2.3


From b9727d7f957d68febb3b5f68d4be270ee6fb57e7 Mon Sep 17 00:00:00 2001
From: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Date: Wed, 5 Jun 2019 14:11:40 -0700
Subject: net/tls: export TLS per skb encryption

While offloading TLS connections, drivers need to handle the case where
out-of-order packets need to be transmitted.

Other drivers obtain the entire TLS record for the specific skb to
provide as context to hardware for encryption. However, other designs
may also want to keep the hardware state intact and perform the
out-of-order encryption entirely on the host.

To achieve this, export the already existing software encryption
fallback path so that drivers can access it.

Signed-off-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h             | 1 +
 net/tls/tls_device_fallback.c | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 3da0d941e729..d1a4f365d6be 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -590,6 +590,7 @@ void tls_unregister_device(struct tls_device *device);
 int tls_device_decrypted(struct sock *sk, struct sk_buff *skb);
 int decrypt_skb(struct sock *sk, struct sk_buff *skb,
 		struct scatterlist *sgout);
+struct sk_buff *tls_encrypt_skb(struct sk_buff *skb);
 
 struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
 				      struct net_device *dev,
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 5a087e1981c3..1d2d804ac633 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -426,6 +426,12 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(tls_validate_xmit_skb);
 
+struct sk_buff *tls_encrypt_skb(struct sk_buff *skb)
+{
+	return tls_sw_fallback(skb->sk, skb);
+}
+EXPORT_SYMBOL_GPL(tls_encrypt_skb);
+
 int tls_sw_fallback_init(struct sock *sk,
 			 struct tls_offload_context_tx *offload_ctx,
 			 struct tls_crypto_info *crypto_info)
-- 
cgit v1.2.3


From 5e3f847a02aabfecea519d7b2fd48f4d6f551be6 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sat, 8 Jun 2019 15:04:28 +0300
Subject: net: dsa: Add teardown callback for drivers

This is helpful for e.g. draining per-driver (not per-port) tagger
queues.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 +
 net/dsa/dsa2.c    | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 1131d9fac20b..82a2baa2dc48 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -357,6 +357,7 @@ struct dsa_switch_ops {
 						  int port);
 
 	int	(*setup)(struct dsa_switch *ds);
+	void	(*teardown)(struct dsa_switch *ds);
 	u32	(*get_phy_flags)(struct dsa_switch *ds, int port);
 
 	/*
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 38d11c863b57..3abd173ebacb 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -408,6 +408,9 @@ static void dsa_switch_teardown(struct dsa_switch *ds)
 
 	dsa_switch_unregister_notifier(ds);
 
+	if (ds->ops->teardown)
+		ds->ops->teardown(ds);
+
 	if (ds->devlink) {
 		devlink_unregister(ds->devlink);
 		devlink_free(ds->devlink);
-- 
cgit v1.2.3


From d461933638ae9fa49ad22f60a40de5b3ed414912 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sat, 8 Jun 2019 15:04:29 +0300
Subject: net: dsa: tag_8021q: Create helper function for removing VLAN header

This removes the existing implementation from tag_sja1105, which was
partially incorrect (it was not changing the MAC header offset, thereby
leaving it to point 4 bytes earlier than it should have).

This overwrites the VLAN tag by moving the Ethernet source and
destination MACs 4 bytes to the right. Then skb->data (assumed to be
pointing immediately after the EtherType) is temporarily pushed to the
beginning of the new Ethernet header, the new Ethernet header offset and
length are recorded, then skb->data is moved back to where it was.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/8021q.h | 16 ++++++-------
 net/dsa/tag_8021q.c       | 57 ++++++++++++++++++++++++++++++-----------------
 net/dsa/tag_sja1105.c     | 19 ++++++++--------
 3 files changed, 53 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index 3911e0586478..0aa803c451a3 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -20,9 +20,6 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index,
 struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 			       u16 tpid, u16 tci);
 
-struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev,
-			      struct packet_type *pt, u16 *tpid, u16 *tci);
-
 u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port);
 
 u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port);
@@ -31,6 +28,8 @@ int dsa_8021q_rx_switch_id(u16 vid);
 
 int dsa_8021q_rx_source_port(u16 vid);
 
+struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb);
+
 #else
 
 int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index,
@@ -45,12 +44,6 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 	return NULL;
 }
 
-struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev,
-			      struct packet_type *pt, u16 *tpid, u16 *tci)
-{
-	return NULL;
-}
-
 u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port)
 {
 	return 0;
@@ -71,6 +64,11 @@ int dsa_8021q_rx_source_port(u16 vid)
 	return 0;
 }
 
+struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb)
+{
+	return NULL;
+}
+
 #endif /* IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) */
 
 #endif /* _NET_DSA_8021Q_H */
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 65a35e976d7b..6ebbd799c4eb 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -235,31 +235,48 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_xmit);
 
-struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev,
-			      struct packet_type *pt, u16 *tpid, u16 *tci)
+/* In the DSA packet_type handler, skb->data points in the middle of the VLAN
+ * tag, after tpid and before tci. This is because so far, ETH_HLEN
+ * (DMAC, SMAC, EtherType) bytes were pulled.
+ * There are 2 bytes of VLAN tag left in skb->data, and upper
+ * layers expect the 'real' EtherType to be consumed as well.
+ * Coincidentally, a VLAN header is also of the same size as
+ * the number of bytes that need to be pulled.
+ *
+ * skb_mac_header                                      skb->data
+ * |                                                       |
+ * v                                                       v
+ * |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |
+ * +-----------------------+-----------------------+-------+-------+-------+
+ * |    Destination MAC    |      Source MAC       |  TPID |  TCI  | EType |
+ * +-----------------------+-----------------------+-------+-------+-------+
+ * ^                                               |               |
+ * |<--VLAN_HLEN-->to                              <---VLAN_HLEN--->
+ * from            |
+ *       >>>>>>>   v
+ *       >>>>>>>   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |
+ *       >>>>>>>   +-----------------------+-----------------------+-------+
+ *       >>>>>>>   |    Destination MAC    |      Source MAC       | EType |
+ *                 +-----------------------+-----------------------+-------+
+ *                 ^                                                       ^
+ * (now part of    |                                                       |
+ *  skb->head)     skb_mac_header                                  skb->data
+ */
+struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb)
 {
-	struct vlan_ethhdr *tag;
-
-	if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
-		return NULL;
+	u8 *from = skb_mac_header(skb);
+	u8 *dest = from + VLAN_HLEN;
 
-	tag = vlan_eth_hdr(skb);
-	*tpid = ntohs(tag->h_vlan_proto);
-	*tci = ntohs(tag->h_vlan_TCI);
-
-	/* skb->data points in the middle of the VLAN tag,
-	 * after tpid and before tci. This is because so far,
-	 * ETH_HLEN (DMAC, SMAC, EtherType) bytes were pulled.
-	 * There are 2 bytes of VLAN tag left in skb->data, and upper
-	 * layers expect the 'real' EtherType to be consumed as well.
-	 * Coincidentally, a VLAN header is also of the same size as
-	 * the number of bytes that need to be pulled.
-	 */
-	skb_pull_rcsum(skb, VLAN_HLEN);
+	memmove(dest, from, ETH_HLEN - VLAN_HLEN);
+	skb_pull(skb, VLAN_HLEN);
+	skb_push(skb, ETH_HLEN);
+	skb_reset_mac_header(skb);
+	skb_reset_mac_len(skb);
+	skb_pull_rcsum(skb, ETH_HLEN);
 
 	return skb;
 }
-EXPORT_SYMBOL_GPL(dsa_8021q_rcv);
+EXPORT_SYMBOL_GPL(dsa_8021q_remove_header);
 
 static const struct dsa_device_ops dsa_8021q_netdev_ops = {
 	.name		= "8021q",
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index d43737e6c3fb..77eeea004e92 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -66,17 +66,14 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 				   struct net_device *netdev,
 				   struct packet_type *pt)
 {
-	struct ethhdr *hdr = eth_hdr(skb);
-	u64 source_port, switch_id;
-	struct sk_buff *nskb;
+	int source_port, switch_id;
+	struct vlan_ethhdr *hdr;
 	u16 tpid, vid, tci;
 	bool is_tagged;
 
-	nskb = dsa_8021q_rcv(skb, netdev, pt, &tpid, &tci);
-	is_tagged = (nskb && tpid == ETH_P_SJA1105);
-
-	skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
-	vid = tci & VLAN_VID_MASK;
+	hdr = vlan_eth_hdr(skb);
+	tpid = ntohs(hdr->h_vlan_proto);
+	is_tagged = (tpid == ETH_P_SJA1105);
 
 	skb->offload_fwd_mark = 1;
 
@@ -92,8 +89,11 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 		hdr->h_dest[4] = 0;
 	} else {
 		/* Normal traffic path. */
+		tci = ntohs(hdr->h_vlan_TCI);
+		vid = tci & VLAN_VID_MASK;
 		source_port = dsa_8021q_rx_source_port(vid);
 		switch_id = dsa_8021q_rx_switch_id(vid);
+		skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
 	}
 
 	skb->dev = dsa_master_find_slave(netdev, switch_id, source_port);
@@ -106,8 +106,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 	 * it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN).
 	 */
 	if (is_tagged)
-		memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - VLAN_HLEN,
-			ETH_HLEN - VLAN_HLEN);
+		skb = dsa_8021q_remove_header(skb);
 
 	return skb;
 }
-- 
cgit v1.2.3


From 47ed985e97f513b7746270e8c5d1f3a3f959b2da Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sat, 8 Jun 2019 15:04:35 +0300
Subject: net: dsa: sja1105: Add logic for TX timestamping

On TX, timestamping is performed synchronously from the
port_deferred_xmit worker thread.
In management routes, the switch is requested to take egress timestamps
(again partial), which are reconstructed and appended to a clone of the
skb that was just sent.  The cloning is done by DSA and we retrieve the
pointer from the structure that DSA keeps in skb->cb.
Then these clones are enqueued to the socket's error queue for
application-level processing.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105.h      |  10 ++++
 drivers/net/dsa/sja1105/sja1105_main.c |  55 ++++++++++++++++-
 drivers/net/dsa/sja1105/sja1105_ptp.c  | 106 +++++++++++++++++++++++++++++++++
 drivers/net/dsa/sja1105/sja1105_ptp.h  |  17 ++++++
 drivers/net/dsa/sja1105/sja1105_spi.c  |  14 +++++
 include/linux/dsa/sja1105.h            |   1 +
 6 files changed, 201 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index 3c6296203c21..5a4f83a3417b 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -33,6 +33,7 @@ struct sja1105_regs {
 	u64 ptpclk;
 	u64 ptpclkrate;
 	u64 ptptsclk;
+	u64 ptpegr_ts[SJA1105_NUM_PORTS];
 	u64 pad_mii_tx[SJA1105_NUM_PORTS];
 	u64 cgu_idiv[SJA1105_NUM_PORTS];
 	u64 rgmii_pad_mii_tx[SJA1105_NUM_PORTS];
@@ -56,6 +57,15 @@ struct sja1105_info {
 	 * switch core and device_id)
 	 */
 	u64 part_no;
+	/* E/T and P/Q/R/S have partial timestamps of different sizes.
+	 * They must be reconstructed on both families anyway to get the full
+	 * 64-bit values back.
+	 */
+	int ptp_ts_bits;
+	/* The SPI commands that retrieve the egress timestamps are
+	 * also of different sizes.
+	 */
+	int ptpegr_ts_bytes;
 	const struct sja1105_dynamic_table_ops *dyn_ops;
 	const struct sja1105_table_ops *static_ops;
 	const struct sja1105_regs *regs;
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index f897fdb12930..121ceccd8107 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1565,7 +1565,7 @@ static int sja1105_setup(struct dsa_switch *ds)
 }
 
 static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot,
-			     struct sk_buff *skb)
+			     struct sk_buff *skb, bool takets)
 {
 	struct sja1105_mgmt_entry mgmt_route = {0};
 	struct sja1105_private *priv = ds->priv;
@@ -1578,6 +1578,8 @@ static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot,
 	mgmt_route.macaddr = ether_addr_to_u64(hdr->h_dest);
 	mgmt_route.destports = BIT(port);
 	mgmt_route.enfport = 1;
+	mgmt_route.tsreg = 0;
+	mgmt_route.takets = takets;
 
 	rc = sja1105_dynamic_config_write(priv, BLK_IDX_MGMT_ROUTE,
 					  slot, &mgmt_route, true);
@@ -1629,7 +1631,11 @@ static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port,
 {
 	struct sja1105_private *priv = ds->priv;
 	struct sja1105_port *sp = &priv->ports[port];
+	struct skb_shared_hwtstamps shwt = {0};
 	int slot = sp->mgmt_slot;
+	struct sk_buff *clone;
+	u64 now, ts;
+	int rc;
 
 	/* The tragic fact about the switch having 4x2 slots for installing
 	 * management routes is that all of them except one are actually
@@ -1647,8 +1653,36 @@ static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port,
 	 */
 	mutex_lock(&priv->mgmt_lock);
 
-	sja1105_mgmt_xmit(ds, port, slot, skb);
+	/* The clone, if there, was made by dsa_skb_tx_timestamp */
+	clone = DSA_SKB_CB(skb)->clone;
+
+	sja1105_mgmt_xmit(ds, port, slot, skb, !!clone);
+
+	if (!clone)
+		goto out;
+
+	skb_shinfo(clone)->tx_flags |= SKBTX_IN_PROGRESS;
+
+	mutex_lock(&priv->ptp_lock);
+
+	now = priv->tstamp_cc.read(&priv->tstamp_cc);
+
+	rc = sja1105_ptpegr_ts_poll(priv, slot, &ts);
+	if (rc < 0) {
+		dev_err(ds->dev, "xmit: timed out polling for tstamp\n");
+		kfree_skb(clone);
+		goto out_unlock_ptp;
+	}
+
+	ts = sja1105_tstamp_reconstruct(priv, now, ts);
+	ts = timecounter_cyc2time(&priv->tstamp_tc, ts);
 
+	shwt.hwtstamp = ns_to_ktime(ts);
+	skb_complete_tx_timestamp(clone, &shwt);
+
+out_unlock_ptp:
+	mutex_unlock(&priv->ptp_lock);
+out:
 	mutex_unlock(&priv->mgmt_lock);
 	return NETDEV_TX_OK;
 }
@@ -1677,6 +1711,22 @@ static int sja1105_set_ageing_time(struct dsa_switch *ds,
 	return sja1105_static_config_reload(priv);
 }
 
+/* Called from dsa_skb_tx_timestamp. This callback is just to make DSA clone
+ * the skb and have it available in DSA_SKB_CB in the .port_deferred_xmit
+ * callback, where we will timestamp it synchronously.
+ */
+bool sja1105_port_txtstamp(struct dsa_switch *ds, int port,
+			   struct sk_buff *skb, unsigned int type)
+{
+	struct sja1105_private *priv = ds->priv;
+	struct sja1105_port *sp = &priv->ports[port];
+
+	if (!sp->hwts_tx_en)
+		return false;
+
+	return true;
+}
+
 static const struct dsa_switch_ops sja1105_switch_ops = {
 	.get_tag_protocol	= sja1105_get_tag_protocol,
 	.setup			= sja1105_setup,
@@ -1701,6 +1751,7 @@ static const struct dsa_switch_ops sja1105_switch_ops = {
 	.port_mdb_add		= sja1105_mdb_add,
 	.port_mdb_del		= sja1105_mdb_del,
 	.port_deferred_xmit	= sja1105_port_deferred_xmit,
+	.port_txtstamp		= sja1105_port_txtstamp,
 };
 
 static int sja1105_check_device_id(struct sja1105_private *priv)
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c
index 47313a6ec932..01ecc8fb1b30 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.c
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.c
@@ -113,6 +113,112 @@ int sja1105pqrs_ptp_cmd(const void *ctx, const void *data)
 }
 EXPORT_SYMBOL_GPL(sja1105pqrs_ptp_cmd);
 
+/* The switch returns partial timestamps (24 bits for SJA1105 E/T, which wrap
+ * around in 0.135 seconds, and 32 bits for P/Q/R/S, wrapping around in 34.35
+ * seconds).
+ *
+ * This receives the RX or TX MAC timestamps, provided by hardware as
+ * the lower bits of the cycle counter, sampled at the time the timestamp was
+ * collected.
+ *
+ * To reconstruct into a full 64-bit-wide timestamp, the cycle counter is
+ * read and the high-order bits are filled in.
+ *
+ * Must be called within one wraparound period of the partial timestamp since
+ * it was generated by the MAC.
+ */
+u64 sja1105_tstamp_reconstruct(struct sja1105_private *priv, u64 now,
+			       u64 ts_partial)
+{
+	u64 partial_tstamp_mask = CYCLECOUNTER_MASK(priv->info->ptp_ts_bits);
+	u64 ts_reconstructed;
+
+	ts_reconstructed = (now & ~partial_tstamp_mask) | ts_partial;
+
+	/* Check lower bits of current cycle counter against the timestamp.
+	 * If they are lower than or equal to the partial timestamp, then
+	 * wraparound surely occurred and must be accounted for.
+	 */
+	if ((now & partial_tstamp_mask) <= ts_partial)
+		ts_reconstructed -= (partial_tstamp_mask + 1);
+
+	return ts_reconstructed;
+}
+EXPORT_SYMBOL_GPL(sja1105_tstamp_reconstruct);
+
+/* Reads the SPI interface for an egress timestamp generated by the switch
+ * for frames sent using management routes.
+ *
+ * SJA1105 E/T layout of the 4-byte SPI payload:
+ *
+ * 31    23    15    7     0
+ * |     |     |     |     |
+ * +-----+-----+-----+     ^
+ *          ^              |
+ *          |              |
+ *  24-bit timestamp   Update bit
+ *
+ *
+ * SJA1105 P/Q/R/S layout of the 8-byte SPI payload:
+ *
+ * 31    23    15    7     0     63    55    47    39    32
+ * |     |     |     |     |     |     |     |     |     |
+ *                         ^     +-----+-----+-----+-----+
+ *                         |                 ^
+ *                         |                 |
+ *                    Update bit    32-bit timestamp
+ *
+ * Notice that the update bit is in the same place.
+ * To have common code for E/T and P/Q/R/S for reading the timestamp,
+ * we need to juggle with the offset and the bit indices.
+ */
+int sja1105_ptpegr_ts_poll(struct sja1105_private *priv, int port, u64 *ts)
+{
+	const struct sja1105_regs *regs = priv->info->regs;
+	int tstamp_bit_start, tstamp_bit_end;
+	int timeout = 10;
+	u8 packed_buf[8];
+	u64 update;
+	int rc;
+
+	do {
+		rc = sja1105_spi_send_packed_buf(priv, SPI_READ,
+						 regs->ptpegr_ts[port],
+						 packed_buf,
+						 priv->info->ptpegr_ts_bytes);
+		if (rc < 0)
+			return rc;
+
+		sja1105_unpack(packed_buf, &update, 0, 0,
+			       priv->info->ptpegr_ts_bytes);
+		if (update)
+			break;
+
+		usleep_range(10, 50);
+	} while (--timeout);
+
+	if (!timeout)
+		return -ETIMEDOUT;
+
+	/* Point the end bit to the second 32-bit word on P/Q/R/S,
+	 * no-op on E/T.
+	 */
+	tstamp_bit_end = (priv->info->ptpegr_ts_bytes - 4) * 8;
+	/* Shift the 24-bit timestamp on E/T to be collected from 31:8.
+	 * No-op on P/Q/R/S.
+	 */
+	tstamp_bit_end += 32 - priv->info->ptp_ts_bits;
+	tstamp_bit_start = tstamp_bit_end + priv->info->ptp_ts_bits - 1;
+
+	*ts = 0;
+
+	sja1105_unpack(packed_buf, ts, tstamp_bit_start, tstamp_bit_end,
+		       priv->info->ptpegr_ts_bytes);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sja1105_ptpegr_ts_poll);
+
 int sja1105_ptp_reset(struct sja1105_private *priv)
 {
 	struct dsa_switch *ds = priv->ds;
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h
index 137ffbb0a233..af456b0a4d27 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.h
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.h
@@ -10,6 +10,8 @@ int sja1105_ptp_clock_register(struct sja1105_private *priv);
 
 void sja1105_ptp_clock_unregister(struct sja1105_private *priv);
 
+int sja1105_ptpegr_ts_poll(struct sja1105_private *priv, int port, u64 *ts);
+
 int sja1105et_ptp_cmd(const void *ctx, const void *data);
 
 int sja1105pqrs_ptp_cmd(const void *ctx, const void *data);
@@ -17,6 +19,9 @@ int sja1105pqrs_ptp_cmd(const void *ctx, const void *data);
 int sja1105_get_ts_info(struct dsa_switch *ds, int port,
 			struct ethtool_ts_info *ts);
 
+u64 sja1105_tstamp_reconstruct(struct sja1105_private *priv, u64 now,
+			       u64 ts_partial);
+
 int sja1105_ptp_reset(struct sja1105_private *priv);
 
 #else
@@ -31,6 +36,18 @@ static inline void sja1105_ptp_clock_unregister(struct sja1105_private *priv)
 	return;
 }
 
+static inline int
+sja1105_ptpegr_ts_poll(struct sja1105_private *priv, int port, u64 *ts)
+{
+	return 0;
+}
+
+static inline u64 sja1105_tstamp_reconstruct(struct sja1105_private *priv,
+					     u64 now, u64 ts_partial)
+{
+	return 0;
+}
+
 static inline int sja1105_ptp_reset(struct sja1105_private *priv)
 {
 	return 0;
diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c
index a0d08e6c22ff..d729a0f0b28e 100644
--- a/drivers/net/dsa/sja1105/sja1105_spi.c
+++ b/drivers/net/dsa/sja1105/sja1105_spi.c
@@ -514,6 +514,7 @@ static struct sja1105_regs sja1105et_regs = {
 	.rgmii_tx_clk = {0x100016, 0x10001D, 0x100024, 0x10002B, 0x100032},
 	.rmii_ref_clk = {0x100015, 0x10001C, 0x100023, 0x10002A, 0x100031},
 	.rmii_ext_tx_clk = {0x100018, 0x10001F, 0x100026, 0x10002D, 0x100034},
+	.ptpegr_ts = {0xC0, 0xC2, 0xC4, 0xC6, 0xC8},
 	.ptp_control = 0x17,
 	.ptpclk = 0x18, /* Spans 0x18 to 0x19 */
 	.ptpclkrate = 0x1A,
@@ -544,6 +545,7 @@ static struct sja1105_regs sja1105pqrs_regs = {
 	.rmii_ref_clk = {0x100015, 0x10001B, 0x100021, 0x100027, 0x10002D},
 	.rmii_ext_tx_clk = {0x100017, 0x10001D, 0x100023, 0x100029, 0x10002F},
 	.qlevel = {0x604, 0x614, 0x624, 0x634, 0x644},
+	.ptpegr_ts = {0xC0, 0xC4, 0xC8, 0xCC, 0xD0},
 	.ptp_control = 0x18,
 	.ptpclk = 0x19,
 	.ptpclkrate = 0x1B,
@@ -555,6 +557,8 @@ struct sja1105_info sja1105e_info = {
 	.part_no		= SJA1105ET_PART_NO,
 	.static_ops		= sja1105e_table_ops,
 	.dyn_ops		= sja1105et_dyn_ops,
+	.ptp_ts_bits		= 24,
+	.ptpegr_ts_bytes	= 4,
 	.reset_cmd		= sja1105et_reset_cmd,
 	.fdb_add_cmd		= sja1105et_fdb_add,
 	.fdb_del_cmd		= sja1105et_fdb_del,
@@ -567,6 +571,8 @@ struct sja1105_info sja1105t_info = {
 	.part_no		= SJA1105ET_PART_NO,
 	.static_ops		= sja1105t_table_ops,
 	.dyn_ops		= sja1105et_dyn_ops,
+	.ptp_ts_bits		= 24,
+	.ptpegr_ts_bytes	= 4,
 	.reset_cmd		= sja1105et_reset_cmd,
 	.fdb_add_cmd		= sja1105et_fdb_add,
 	.fdb_del_cmd		= sja1105et_fdb_del,
@@ -579,6 +585,8 @@ struct sja1105_info sja1105p_info = {
 	.part_no		= SJA1105P_PART_NO,
 	.static_ops		= sja1105p_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
+	.ptp_ts_bits		= 32,
+	.ptpegr_ts_bytes	= 8,
 	.reset_cmd		= sja1105pqrs_reset_cmd,
 	.fdb_add_cmd		= sja1105pqrs_fdb_add,
 	.fdb_del_cmd		= sja1105pqrs_fdb_del,
@@ -591,6 +599,8 @@ struct sja1105_info sja1105q_info = {
 	.part_no		= SJA1105Q_PART_NO,
 	.static_ops		= sja1105q_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
+	.ptp_ts_bits		= 32,
+	.ptpegr_ts_bytes	= 8,
 	.reset_cmd		= sja1105pqrs_reset_cmd,
 	.fdb_add_cmd		= sja1105pqrs_fdb_add,
 	.fdb_del_cmd		= sja1105pqrs_fdb_del,
@@ -603,6 +613,8 @@ struct sja1105_info sja1105r_info = {
 	.part_no		= SJA1105R_PART_NO,
 	.static_ops		= sja1105r_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
+	.ptp_ts_bits		= 32,
+	.ptpegr_ts_bytes	= 8,
 	.reset_cmd		= sja1105pqrs_reset_cmd,
 	.fdb_add_cmd		= sja1105pqrs_fdb_add,
 	.fdb_del_cmd		= sja1105pqrs_fdb_del,
@@ -616,6 +628,8 @@ struct sja1105_info sja1105s_info = {
 	.static_ops		= sja1105s_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
 	.regs			= &sja1105pqrs_regs,
+	.ptp_ts_bits		= 32,
+	.ptpegr_ts_bytes	= 8,
 	.reset_cmd		= sja1105pqrs_reset_cmd,
 	.fdb_add_cmd		= sja1105pqrs_fdb_add,
 	.fdb_del_cmd		= sja1105pqrs_fdb_del,
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index e46e18c47d41..5a956f335022 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -22,6 +22,7 @@
 
 struct sja1105_port {
 	struct dsa_port *dp;
+	bool hwts_tx_en;
 	int mgmt_slot;
 };
 
-- 
cgit v1.2.3


From d3f9b90bf19fad05889e4bead7dc1b336da56118 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sat, 8 Jun 2019 15:04:36 +0300
Subject: net: dsa: sja1105: Build a minimal understanding of meta frames

Meta frames are sent on the CPU port by the switch if RX timestamping is
enabled. They contain a partial timestamp of the previous frame.

They are Ethernet frames with the Ethernet header constructed out of:

- SJA1105_META_DMAC
- SJA1105_META_SMAC
- ETH_P_SJA1105_META

The Ethernet payload will be decoded in a follow-up patch.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 11 +++++++++++
 net/dsa/tag_sja1105.c       | 15 +++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 5a956f335022..cc4a909d1007 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -12,6 +12,7 @@
 #include <net/dsa.h>
 
 #define ETH_P_SJA1105				ETH_P_DSA_8021Q
+#define ETH_P_SJA1105_META			0x0008
 
 /* IEEE 802.3 Annex 57A: Slow Protocols PDUs (01:80:C2:xx:xx:xx) */
 #define SJA1105_LINKLOCAL_FILTER_A		0x0180C2000000ull
@@ -20,6 +21,16 @@
 #define SJA1105_LINKLOCAL_FILTER_B		0x011B19000000ull
 #define SJA1105_LINKLOCAL_FILTER_B_MASK		0xFFFFFF000000ull
 
+/* Source and Destination MAC of follow-up meta frames.
+ * Whereas the choice of SMAC only affects the unique identification of the
+ * switch as sender of meta frames, the DMAC must be an address that is present
+ * in the DSA master port's multicast MAC filter.
+ * 01-80-C2-00-00-0E is a good choice for this, as all profiles of IEEE 1588
+ * over L2 use this address for some purpose already.
+ */
+#define SJA1105_META_SMAC			0x222222222222ull
+#define SJA1105_META_DMAC			0x0180C200000Eull
+
 struct sja1105_port {
 	struct dsa_port *dp;
 	bool hwts_tx_en;
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index cd8e0bfb5e75..0beb52518d56 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -22,6 +22,21 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb)
 	return false;
 }
 
+static inline bool sja1105_is_meta_frame(const struct sk_buff *skb)
+{
+	const struct ethhdr *hdr = eth_hdr(skb);
+	u64 smac = ether_addr_to_u64(hdr->h_source);
+	u64 dmac = ether_addr_to_u64(hdr->h_dest);
+
+	if (smac != SJA1105_META_SMAC)
+		return false;
+	if (dmac != SJA1105_META_DMAC)
+		return false;
+	if (ntohs(hdr->h_proto) != ETH_P_SJA1105_META)
+		return false;
+	return true;
+}
+
 /* This is the first time the tagger sees the frame on RX.
  * Figure out if we can decode it, and if we can, annotate skb->cb with how we
  * plan to do that, so we don't need to check again in the rcv function.
-- 
cgit v1.2.3


From 844d7edc6a34ae3a8236f1306e4f2615c8db1eac Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sat, 8 Jun 2019 15:04:40 +0300
Subject: net: dsa: sja1105: Add a global sja1105_tagger_data structure

This will be used to keep state for RX timestamping. It is global
because the switch serializes timestampable and meta frames when
trapping them towards the CPU port (lower port indices have higher
priority) and therefore having one state machine per port would create
unnecessary complications.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105.h      |  1 +
 drivers/net/dsa/sja1105/sja1105_main.c |  5 +++++
 include/linux/dsa/sja1105.h            | 15 +++++++++++++++
 3 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index 5a4f83a3417b..0fc6fe9ada87 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -103,6 +103,7 @@ struct sja1105_private {
 	 * the switch doesn't confuse them with one another.
 	 */
 	struct mutex mgmt_lock;
+	struct sja1105_tagger_data tagger_data;
 };
 
 #include "sja1105_dynamic_config.h"
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index d129997174bb..3c11142f1c67 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1828,6 +1828,7 @@ static int sja1105_check_device_id(struct sja1105_private *priv)
 
 static int sja1105_probe(struct spi_device *spi)
 {
+	struct sja1105_tagger_data *tagger_data;
 	struct device *dev = &spi->dev;
 	struct sja1105_private *priv;
 	struct dsa_switch *ds;
@@ -1882,12 +1883,16 @@ static int sja1105_probe(struct spi_device *spi)
 	ds->priv = priv;
 	priv->ds = ds;
 
+	tagger_data = &priv->tagger_data;
+	skb_queue_head_init(&tagger_data->skb_rxtstamp_queue);
+
 	/* Connections between dsa_port and sja1105_port */
 	for (i = 0; i < SJA1105_NUM_PORTS; i++) {
 		struct sja1105_port *sp = &priv->ports[i];
 
 		ds->ports[i].priv = sp;
 		sp->dp = &ds->ports[i];
+		sp->data = tagger_data;
 	}
 	mutex_init(&priv->mgmt_lock);
 
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index cc4a909d1007..2c4fce4eaf0d 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -31,7 +31,22 @@
 #define SJA1105_META_SMAC			0x222222222222ull
 #define SJA1105_META_DMAC			0x0180C200000Eull
 
+/* Global tagger data: each struct sja1105_port has a reference to
+ * the structure defined in struct sja1105_private.
+ */
+struct sja1105_tagger_data {
+	struct sk_buff_head skb_rxtstamp_queue;
+	struct work_struct rxtstamp_work;
+	struct sk_buff *stampable_skb;
+	/* Protects concurrent access to the meta state machine
+	 * from taggers running on multiple ports on SMP systems
+	 */
+	spinlock_t meta_lock;
+	bool hwts_rx_en;
+};
+
 struct sja1105_port {
+	struct sja1105_tagger_data *data;
 	struct dsa_port *dp;
 	bool hwts_tx_en;
 	int mgmt_slot;
-- 
cgit v1.2.3


From f3097be21bf17ae8785eea009cbc424f16611d9a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sat, 8 Jun 2019 15:04:42 +0300
Subject: net: dsa: sja1105: Add a state machine for RX timestamping

Meta frame reception relies on the hardware keeping its promise that it
will send no other traffic towards the CPU port between a link-local
frame and a meta frame.  Otherwise there is no other way to associate
the meta frame with the link-local frame it's holding a timestamp of.
The receive function is made stateful, and buffers a timestampable frame
until its meta frame arrives, then merges the two, drops the meta and
releases the link-local frame up the stack.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105_main.c |  62 +++++++++++++++++
 include/linux/dsa/sja1105.h            |   7 ++
 net/dsa/tag_sja1105.c                  | 121 ++++++++++++++++++++++++++++++++-
 3 files changed, 189 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 2b804eeca390..8963b21b3061 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1600,6 +1600,14 @@ static int sja1105_setup(struct dsa_switch *ds)
 	return sja1105_setup_8021q_tagging(ds, true);
 }
 
+static void sja1105_teardown(struct dsa_switch *ds)
+{
+	struct sja1105_private *priv = ds->priv;
+
+	cancel_work_sync(&priv->tagger_data.rxtstamp_work);
+	skb_queue_purge(&priv->tagger_data.skb_rxtstamp_queue);
+}
+
 static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot,
 			     struct sk_buff *skb, bool takets)
 {
@@ -1747,6 +1755,57 @@ static int sja1105_set_ageing_time(struct dsa_switch *ds,
 	return sja1105_static_config_reload(priv);
 }
 
+#define to_tagger(d) \
+	container_of((d), struct sja1105_tagger_data, rxtstamp_work)
+#define to_sja1105(d) \
+	container_of((d), struct sja1105_private, tagger_data)
+
+static void sja1105_rxtstamp_work(struct work_struct *work)
+{
+	struct sja1105_tagger_data *data = to_tagger(work);
+	struct sja1105_private *priv = to_sja1105(data);
+	struct sk_buff *skb;
+	u64 now;
+
+	mutex_lock(&priv->ptp_lock);
+
+	now = priv->tstamp_cc.read(&priv->tstamp_cc);
+
+	while ((skb = skb_dequeue(&data->skb_rxtstamp_queue)) != NULL) {
+		struct skb_shared_hwtstamps *shwt = skb_hwtstamps(skb);
+		u64 ts;
+
+		*shwt = (struct skb_shared_hwtstamps) {0};
+
+		ts = SJA1105_SKB_CB(skb)->meta_tstamp;
+		ts = sja1105_tstamp_reconstruct(priv, now, ts);
+		ts = timecounter_cyc2time(&priv->tstamp_tc, ts);
+
+		shwt->hwtstamp = ns_to_ktime(ts);
+		netif_rx_ni(skb);
+	}
+
+	mutex_unlock(&priv->ptp_lock);
+}
+
+/* Called from dsa_skb_defer_rx_timestamp */
+bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port,
+			   struct sk_buff *skb, unsigned int type)
+{
+	struct sja1105_private *priv = ds->priv;
+	struct sja1105_tagger_data *data = &priv->tagger_data;
+
+	if (!data->hwts_rx_en)
+		return false;
+
+	/* We need to read the full PTP clock to reconstruct the Rx
+	 * timestamp. For that we need a sleepable context.
+	 */
+	skb_queue_tail(&data->skb_rxtstamp_queue, skb);
+	schedule_work(&data->rxtstamp_work);
+	return true;
+}
+
 /* Called from dsa_skb_tx_timestamp. This callback is just to make DSA clone
  * the skb and have it available in DSA_SKB_CB in the .port_deferred_xmit
  * callback, where we will timestamp it synchronously.
@@ -1766,6 +1825,7 @@ bool sja1105_port_txtstamp(struct dsa_switch *ds, int port,
 static const struct dsa_switch_ops sja1105_switch_ops = {
 	.get_tag_protocol	= sja1105_get_tag_protocol,
 	.setup			= sja1105_setup,
+	.teardown		= sja1105_teardown,
 	.set_ageing_time	= sja1105_set_ageing_time,
 	.phylink_validate	= sja1105_phylink_validate,
 	.phylink_mac_config	= sja1105_mac_config,
@@ -1787,6 +1847,7 @@ static const struct dsa_switch_ops sja1105_switch_ops = {
 	.port_mdb_add		= sja1105_mdb_add,
 	.port_mdb_del		= sja1105_mdb_del,
 	.port_deferred_xmit	= sja1105_port_deferred_xmit,
+	.port_rxtstamp		= sja1105_port_rxtstamp,
 	.port_txtstamp		= sja1105_port_txtstamp,
 };
 
@@ -1885,6 +1946,7 @@ static int sja1105_probe(struct spi_device *spi)
 
 	tagger_data = &priv->tagger_data;
 	skb_queue_head_init(&tagger_data->skb_rxtstamp_queue);
+	INIT_WORK(&tagger_data->rxtstamp_work, sja1105_rxtstamp_work);
 
 	/* Connections between dsa_port and sja1105_port */
 	for (i = 0; i < SJA1105_NUM_PORTS; i++) {
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 2c4fce4eaf0d..79435cfc20eb 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -45,6 +45,13 @@ struct sja1105_tagger_data {
 	bool hwts_rx_en;
 };
 
+struct sja1105_skb_cb {
+	u32 meta_tstamp;
+};
+
+#define SJA1105_SKB_CB(skb) \
+	((struct sja1105_skb_cb *)DSA_SKB_CB_PRIV(skb))
+
 struct sja1105_port {
 	struct sja1105_tagger_data *data;
 	struct dsa_port *dp;
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 5b51e96130c7..1d96c9d4a8e9 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -110,6 +110,124 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
 			     ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
 }
 
+static void sja1105_transfer_meta(struct sk_buff *skb,
+				  const struct sja1105_meta *meta)
+{
+	struct ethhdr *hdr = eth_hdr(skb);
+
+	hdr->h_dest[3] = meta->dmac_byte_3;
+	hdr->h_dest[4] = meta->dmac_byte_4;
+	SJA1105_SKB_CB(skb)->meta_tstamp = meta->tstamp;
+}
+
+/* This is a simple state machine which follows the hardware mechanism of
+ * generating RX timestamps:
+ *
+ * After each timestampable skb (all traffic for which send_meta1 and
+ * send_meta0 is true, aka all MAC-filtered link-local traffic) a meta frame
+ * containing a partial timestamp is immediately generated by the switch and
+ * sent as a follow-up to the link-local frame on the CPU port.
+ *
+ * The meta frames have no unique identifier (such as sequence number) by which
+ * one may pair them to the correct timestampable frame.
+ * Instead, the switch has internal logic that ensures no frames are sent on
+ * the CPU port between a link-local timestampable frame and its corresponding
+ * meta follow-up. It also ensures strict ordering between ports (lower ports
+ * have higher priority towards the CPU port). For this reason, a per-port
+ * data structure is not needed/desirable.
+ *
+ * This function pairs the link-local frame with its partial timestamp from the
+ * meta follow-up frame. The full timestamp will be reconstructed later in a
+ * work queue.
+ */
+static struct sk_buff
+*sja1105_rcv_meta_state_machine(struct sk_buff *skb,
+				struct sja1105_meta *meta,
+				bool is_link_local,
+				bool is_meta)
+{
+	struct sja1105_port *sp;
+	struct dsa_port *dp;
+
+	dp = dsa_slave_to_port(skb->dev);
+	sp = dp->priv;
+
+	/* Step 1: A timestampable frame was received.
+	 * Buffer it until we get its meta frame.
+	 */
+	if (is_link_local && sp->data->hwts_rx_en) {
+		spin_lock(&sp->data->meta_lock);
+		/* Was this a link-local frame instead of the meta
+		 * that we were expecting?
+		 */
+		if (sp->data->stampable_skb) {
+			dev_err_ratelimited(dp->ds->dev,
+					    "Expected meta frame, is %12llx "
+					    "in the DSA master multicast filter?\n",
+					    SJA1105_META_DMAC);
+		}
+
+		/* Hold a reference to avoid dsa_switch_rcv
+		 * from freeing the skb.
+		 */
+		sp->data->stampable_skb = skb_get(skb);
+		spin_unlock(&sp->data->meta_lock);
+
+		/* Tell DSA we got nothing */
+		return NULL;
+
+	/* Step 2: The meta frame arrived.
+	 * Time to take the stampable skb out of the closet, annotate it
+	 * with the partial timestamp, and pretend that we received it
+	 * just now (basically masquerade the buffered frame as the meta
+	 * frame, which serves no further purpose).
+	 */
+	} else if (is_meta) {
+		struct sk_buff *stampable_skb;
+
+		spin_lock(&sp->data->meta_lock);
+		stampable_skb = sp->data->stampable_skb;
+		sp->data->stampable_skb = NULL;
+
+		/* Was this a meta frame instead of the link-local
+		 * that we were expecting?
+		 */
+		if (!stampable_skb) {
+			dev_err_ratelimited(dp->ds->dev,
+					    "Unexpected meta frame\n");
+			spin_unlock(&sp->data->meta_lock);
+			return NULL;
+		}
+
+		if (stampable_skb->dev != skb->dev) {
+			dev_err_ratelimited(dp->ds->dev,
+					    "Meta frame on wrong port\n");
+			spin_unlock(&sp->data->meta_lock);
+			return NULL;
+		}
+
+		/* Free the meta frame and give DSA the buffered stampable_skb
+		 * for further processing up the network stack.
+		 */
+		kfree_skb(skb);
+
+		skb = skb_copy(stampable_skb, GFP_ATOMIC);
+		if (!skb) {
+			dev_err_ratelimited(dp->ds->dev,
+					    "Failed to copy stampable skb\n");
+			/* Drop meta_lock on this error path too */
+			spin_unlock(&sp->data->meta_lock);
+			return NULL;
+		}
+		sja1105_transfer_meta(skb, meta);
+		/* The cached copy will be freed now */
+		skb_unref(stampable_skb);
+		spin_unlock(&sp->data->meta_lock);
+	}
+
+	return skb;
+}
+
 static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 				   struct net_device *netdev,
 				   struct packet_type *pt)
@@ -167,7 +285,8 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 	if (is_tagged)
 		skb = dsa_8021q_remove_header(skb);
 
-	return skb;
+	return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local,
+					      is_meta);
 }
 
 static struct dsa_device_ops sja1105_netdev_ops = {
-- 
cgit v1.2.3


From 5237ff79b23901a5326f8e986667976861370445 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Fri, 7 Jun 2019 10:59:28 -0400
Subject: bonding: add slave_foo printk macros

Where possible, we generally want both the bond master and the relevant slave
information in message output. Standardize the format using new slave_*
printk macros.

Suggested-by: Joe Perches <joe@perches.com>
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/bonding.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/bonding.h b/include/net/bonding.h
index b46d68acf701..676e7fae05a3 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -38,6 +38,15 @@
 #define __long_aligned __attribute__((aligned((sizeof(long)))))
 #endif
 
+#define slave_info(bond_dev, slave_dev, fmt, ...) \
+	netdev_info(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__)
+#define slave_warn(bond_dev, slave_dev, fmt, ...) \
+	netdev_warn(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__)
+#define slave_dbg(bond_dev, slave_dev, fmt, ...) \
+	netdev_dbg(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__)
+#define slave_err(bond_dev, slave_dev, fmt, ...) \
+	netdev_err(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__)
+
 #define BOND_MODE(bond) ((bond)->params.mode)
 
 /* slave list primitives */
-- 
cgit v1.2.3


From 5270041d342de6f1e6a3b6634c1ceaa67d1f87ea Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 7 Jun 2019 18:31:07 +0300
Subject: nexthop: off by one in nexthop_mpath_select()

The nhg->nh_entries[] array is allocated in nexthop_grp_alloc() and it
has nhg->num_nh elements so this check should be >= instead of >.

Fixes: 430a049190de ("nexthop: Add support for nexthop groups")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index aff7b2410057..e019ed9b3dc3 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -160,7 +160,7 @@ struct nexthop *nexthop_mpath_select(const struct nexthop *nh, int nhsel)
 	/* for_nexthops macros in fib_semantics.c grabs a pointer to
 	 * the nexthop before checking nhsel
 	 */
-	if (nhsel > nhg->num_nh)
+	if (nhsel >= nhg->num_nh)
 		return NULL;
 
 	return nhg->nh_entries[nhsel].nh;
-- 
cgit v1.2.3


From 6dcdd884e2a4bb57b0ed3654ff28974ae17d2a08 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 7 Jun 2019 21:20:40 +0200
Subject: net: hwbm: Make the hwbm_pool lock a mutex

Based on review, `lock' is only acquired in hwbm_pool_add() which is
invoked via ->probe(), ->resume() and ->ndo_change_mtu(). Based on this
the lock can become a mutex and there is no need to disable interrupts
during the procedure.
Now that the lock is a mutex, hwbm_pool_add() no longer invokes
hwbm_pool_refill() in an atomic context so we can pass GFP_KERNEL to
hwbm_pool_refill() and remove the `gfp' argument from hwbm_pool_add().

Cc: Thomas Petazzoni <thomas.petazzoni@bootlin.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvneta.c    |  2 +-
 drivers/net/ethernet/marvell/mvneta_bm.c |  4 ++--
 include/net/hwbm.h                       |  6 +++---
 net/core/hwbm.c                          | 15 +++++++--------
 4 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 94dc0a272644..895bfed26a8a 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -1119,7 +1119,7 @@ static void mvneta_bm_update_mtu(struct mvneta_port *pp, int mtu)
 			SKB_DATA_ALIGN(MVNETA_RX_BUF_SIZE(bm_pool->pkt_size));
 
 	/* Fill entire long pool */
-	num = hwbm_pool_add(hwbm_pool, hwbm_pool->size, GFP_ATOMIC);
+	num = hwbm_pool_add(hwbm_pool, hwbm_pool->size);
 	if (num != hwbm_pool->size) {
 		WARN(1, "pool %d: %d of %d allocated\n",
 		     bm_pool->id, num, hwbm_pool->size);
diff --git a/drivers/net/ethernet/marvell/mvneta_bm.c b/drivers/net/ethernet/marvell/mvneta_bm.c
index de468e1bdba9..82ee2bcca6fd 100644
--- a/drivers/net/ethernet/marvell/mvneta_bm.c
+++ b/drivers/net/ethernet/marvell/mvneta_bm.c
@@ -190,7 +190,7 @@ struct mvneta_bm_pool *mvneta_bm_pool_use(struct mvneta_bm *priv, u8 pool_id,
 			SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 		hwbm_pool->construct = mvneta_bm_construct;
 		hwbm_pool->priv = new_pool;
-		spin_lock_init(&hwbm_pool->lock);
+		mutex_init(&hwbm_pool->buf_lock);
 
 		/* Create new pool */
 		err = mvneta_bm_pool_create(priv, new_pool);
@@ -201,7 +201,7 @@ struct mvneta_bm_pool *mvneta_bm_pool_use(struct mvneta_bm *priv, u8 pool_id,
 		}
 
 		/* Allocate buffers for this pool */
-		num = hwbm_pool_add(hwbm_pool, hwbm_pool->size, GFP_ATOMIC);
+		num = hwbm_pool_add(hwbm_pool, hwbm_pool->size);
 		if (num != hwbm_pool->size) {
 			WARN(1, "pool %d: %d of %d allocated\n",
 			     new_pool->id, num, hwbm_pool->size);
diff --git a/include/net/hwbm.h b/include/net/hwbm.h
index 89085e2e2da5..81643cf8a1c4 100644
--- a/include/net/hwbm.h
+++ b/include/net/hwbm.h
@@ -12,18 +12,18 @@ struct hwbm_pool {
 	/* constructor called during alocation */
 	int (*construct)(struct hwbm_pool *bm_pool, void *buf);
 	/* protect acces to the buffer counter*/
-	spinlock_t lock;
+	struct mutex buf_lock;
 	/* private data */
 	void *priv;
 };
 #ifdef CONFIG_HWBM
 void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf);
 int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp);
-int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp);
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num);
 #else
 void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf) {}
 int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp) { return 0; }
-int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num)
 { return 0; }
 #endif /* CONFIG_HWBM */
 #endif /* _HWBM_H */
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
index fd822ca5a245..ac1a66df9adc 100644
--- a/net/core/hwbm.c
+++ b/net/core/hwbm.c
@@ -43,34 +43,33 @@ int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
 }
 EXPORT_SYMBOL_GPL(hwbm_pool_refill);
 
-int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num)
 {
 	int err, i;
-	unsigned long flags;
 
-	spin_lock_irqsave(&bm_pool->lock, flags);
+	mutex_lock(&bm_pool->buf_lock);
 	if (bm_pool->buf_num == bm_pool->size) {
 		pr_warn("pool already filled\n");
-		spin_unlock_irqrestore(&bm_pool->lock, flags);
+		mutex_unlock(&bm_pool->buf_lock);
 		return bm_pool->buf_num;
 	}
 
 	if (buf_num + bm_pool->buf_num > bm_pool->size) {
 		pr_warn("cannot allocate %d buffers for pool\n",
 			buf_num);
-		spin_unlock_irqrestore(&bm_pool->lock, flags);
+		mutex_unlock(&bm_pool->buf_lock);
 		return 0;
 	}
 
 	if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
 		pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
 			buf_num,  bm_pool->buf_num);
-		spin_unlock_irqrestore(&bm_pool->lock, flags);
+		mutex_unlock(&bm_pool->buf_lock);
 		return 0;
 	}
 
 	for (i = 0; i < buf_num; i++) {
-		err = hwbm_pool_refill(bm_pool, gfp);
+		err = hwbm_pool_refill(bm_pool, GFP_KERNEL);
 		if (err < 0)
 			break;
 	}
@@ -79,7 +78,7 @@ int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
 	bm_pool->buf_num += i;
 
 	pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num);
-	spin_unlock_irqrestore(&bm_pool->lock, flags);
+	mutex_unlock(&bm_pool->buf_lock);
 
 	return i;
 }
-- 
cgit v1.2.3


From c67b85558ff20cb1ff20874461d12af456bee5d0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 8 Jun 2019 17:58:51 -0700
Subject: ipv6: tcp: send consistent autoflowlabel in TIME_WAIT state

In case autoflowlabel is in action, skb_get_hash_flowi6()
derives a non zero skb->hash to the flowlabel.

If skb->hash is zero, a flow dissection is performed.

Since all TCP skbs sent from ESTABLISHED state inherit their
skb->hash from sk->sk_txhash, we better keep a copy
of sk->sk_txhash into the TIME_WAIT socket.

After this patch, ACK or RST packets sent on behalf of
a TIME_WAIT socket have the flowlabel that was previously
used by the flow.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_timewait_sock.h |  1 +
 net/ipv4/tcp_minisocks.c         |  1 +
 net/ipv6/tcp_ipv6.c              | 13 ++++++++++---
 3 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index c2f756aedc54..aef38c140014 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -70,6 +70,7 @@ struct inet_timewait_sock {
 				tw_flowlabel	: 20,
 				tw_pad		: 2,	/* 2 bits hole */
 				tw_tos		: 8;
+	u32			tw_txhash;
 	struct timer_list	tw_timer;
 	struct inet_bind_bucket	*tw_tb;
 };
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7c35731816e2..11011e8386dc 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -283,6 +283,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
 			tw->tw_tclass = np->tclass;
 			tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
+			tw->tw_txhash = sk->sk_txhash;
 			tw->tw_ipv6only = sk->sk_ipv6only;
 		}
 #endif
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c1da52c7f990..ad7039137a20 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -883,9 +883,16 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 		fl6.flowi6_oif = oif;
 	}
 
-	if (sk)
-		mark = (sk->sk_state == TCP_TIME_WAIT) ?
-			inet_twsk(sk)->tw_mark : sk->sk_mark;
+	if (sk) {
+		if (sk->sk_state == TCP_TIME_WAIT) {
+			mark = inet_twsk(sk)->tw_mark;
+			/* autoflowlabel relies on buff->hash */
+			skb_set_hash(buff, inet_twsk(sk)->tw_txhash,
+				     PKT_HASH_TYPE_L4);
+		} else {
+			mark = sk->sk_mark;
+		}
+	}
 	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
 	fl6.fl6_dport = t1->dest;
 	fl6.fl6_sport = t1->source;
-- 
cgit v1.2.3


From fe03d4745675cbd678cb8c50d951df0abafdcaee Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Mon, 10 Jun 2019 13:00:24 +0200
Subject: Update my email address

It's better to use my kadlec@netfilter.org email address in
the source code. I might not be able to use
kadlec@blackhole.kfki.hu in the future.

Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 CREDITS                                        | 2 +-
 MAINTAINERS                                    | 2 +-
 include/linux/jhash.h                          | 2 +-
 include/linux/netfilter/ipset/ip_set.h         | 2 +-
 include/linux/netfilter/ipset/ip_set_counter.h | 2 +-
 include/linux/netfilter/ipset/ip_set_skbinfo.h | 2 +-
 include/linux/netfilter/ipset/ip_set_timeout.h | 2 +-
 include/uapi/linux/netfilter/ipset/ip_set.h    | 2 +-
 net/ipv4/netfilter/iptable_raw.c               | 2 +-
 net/ipv4/netfilter/nf_nat_h323.c               | 2 +-
 net/ipv6/netfilter/ip6table_raw.c              | 2 +-
 net/netfilter/ipset/ip_set_bitmap_gen.h        | 2 +-
 net/netfilter/ipset/ip_set_bitmap_ip.c         | 4 ++--
 net/netfilter/ipset/ip_set_bitmap_ipmac.c      | 4 ++--
 net/netfilter/ipset/ip_set_bitmap_port.c       | 4 ++--
 net/netfilter/ipset/ip_set_core.c              | 4 ++--
 net/netfilter/ipset/ip_set_getport.c           | 2 +-
 net/netfilter/ipset/ip_set_hash_gen.h          | 2 +-
 net/netfilter/ipset/ip_set_hash_ip.c           | 4 ++--
 net/netfilter/ipset/ip_set_hash_ipmark.c       | 2 +-
 net/netfilter/ipset/ip_set_hash_ipport.c       | 4 ++--
 net/netfilter/ipset/ip_set_hash_ipportip.c     | 4 ++--
 net/netfilter/ipset/ip_set_hash_ipportnet.c    | 4 ++--
 net/netfilter/ipset/ip_set_hash_mac.c          | 4 ++--
 net/netfilter/ipset/ip_set_hash_net.c          | 4 ++--
 net/netfilter/ipset/ip_set_hash_netiface.c     | 4 ++--
 net/netfilter/ipset/ip_set_hash_netnet.c       | 2 +-
 net/netfilter/ipset/ip_set_hash_netport.c      | 4 ++--
 net/netfilter/ipset/ip_set_hash_netportnet.c   | 2 +-
 net/netfilter/ipset/ip_set_list_set.c          | 4 ++--
 net/netfilter/nf_conntrack_h323_main.c         | 2 +-
 net/netfilter/nf_conntrack_proto_tcp.c         | 2 +-
 net/netfilter/xt_iprange.c                     | 4 ++--
 net/netfilter/xt_set.c                         | 4 ++--
 34 files changed, 49 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/CREDITS b/CREDITS
index 8e0342620a06..4200f4f91a16 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1800,7 +1800,7 @@ S: 2300 Copenhagen S.
 S: Denmark
 
 N: Jozsef Kadlecsik
-E: kadlec@blackhole.kfki.hu
+E: kadlec@netfilter.org
 P: 1024D/470DB964 4CB3 1A05 713E 9BF7 FAC5  5809 DD8C B7B1 470D B964
 D: netfilter: TCP window tracking code
 D: netfilter: raw table
diff --git a/MAINTAINERS b/MAINTAINERS
index fcbd648b960e..4c65ce86fc9e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10858,7 +10858,7 @@ F:	drivers/net/ethernet/neterion/
 
 NETFILTER
 M:	Pablo Neira Ayuso <pablo@netfilter.org>
-M:	Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+M:	Jozsef Kadlecsik <kadlec@netfilter.org>
 M:	Florian Westphal <fw@strlen.de>
 L:	netfilter-devel@vger.kernel.org
 L:	coreteam@netfilter.org
diff --git a/include/linux/jhash.h b/include/linux/jhash.h
index 8037850f3104..ba2f6a9776b6 100644
--- a/include/linux/jhash.h
+++ b/include/linux/jhash.h
@@ -17,7 +17,7 @@
  * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
  * the public domain.  It has no warranty.
  *
- * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
+ * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org)
  *
  * I've modified Bob's hash to be useful in the Linux kernel, and
  * any bugs present are my fault.
diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index e499d170f12d..f5c6e7cd6469 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -1,7 +1,7 @@
 /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
  *                         Patrick Schaaf <bof@bof.de>
  *                         Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/include/linux/netfilter/ipset/ip_set_counter.h b/include/linux/netfilter/ipset/ip_set_counter.h
index 3d33a2c3f39f..305aeda2a899 100644
--- a/include/linux/netfilter/ipset/ip_set_counter.h
+++ b/include/linux/netfilter/ipset/ip_set_counter.h
@@ -1,7 +1,7 @@
 #ifndef _IP_SET_COUNTER_H
 #define _IP_SET_COUNTER_H
 
-/* Copyright (C) 2015 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2015 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/include/linux/netfilter/ipset/ip_set_skbinfo.h b/include/linux/netfilter/ipset/ip_set_skbinfo.h
index 29d7ef2bc3fa..fac57ef854c2 100644
--- a/include/linux/netfilter/ipset/ip_set_skbinfo.h
+++ b/include/linux/netfilter/ipset/ip_set_skbinfo.h
@@ -1,7 +1,7 @@
 #ifndef _IP_SET_SKBINFO_H
 #define _IP_SET_SKBINFO_H
 
-/* Copyright (C) 2015 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2015 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h
index 8ce271e187b6..dc74150f3432 100644
--- a/include/linux/netfilter/ipset/ip_set_timeout.h
+++ b/include/linux/netfilter/ipset/ip_set_timeout.h
@@ -1,7 +1,7 @@
 #ifndef _IP_SET_TIMEOUT_H
 #define _IP_SET_TIMEOUT_H
 
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h
index ea69ca21ff23..eea166c52c36 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set.h
@@ -2,7 +2,7 @@
 /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
  *                         Patrick Schaaf <bof@bof.de>
  *                         Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 6eefde5bc468..69697eb4bfc6 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -2,7 +2,7 @@
 /*
  * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT .
  *
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 15f2b2604890..076b6b29d66d 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -7,7 +7,7 @@
  * This source code is licensed under General Public License version 2.
  *
  * Based on the 'brute force' H.323 NAT module by
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Jozsef Kadlecsik <kadlec@netfilter.org>
  */
 
 #include <linux/module.h>
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 3f7d4691c423..a22100b1cf2c 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -2,7 +2,7 @@
 /*
  * IPv6 raw table, a port of the IPv4 raw table to IPv6
  *
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 38ef2ea838cb..29c1e9a50601 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 488d6d05c65c..5a66c5499700 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
  *                         Patrick Schaaf <bof@bof.de>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -31,7 +31,7 @@
 #define IPSET_TYPE_REV_MAX	3	/* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_bitmap:ip");
 
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 980000fc3b50..ec7a8b12642c 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -1,7 +1,7 @@
 /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
  *                         Patrick Schaaf <bof@bof.de>
  *			   Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -31,7 +31,7 @@
 #define IPSET_TYPE_REV_MAX	3	/* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_bitmap:ip,mac");
 
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index b561ca8b3659..18275ec4924c 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -26,7 +26,7 @@
 #define IPSET_TYPE_REV_MAX	3	/* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_bitmap:port");
 
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 039892cd2b7d..18430ad2fdf2 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
  *                         Patrick Schaaf <bof@bof.de>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -51,7 +51,7 @@ static unsigned int max_sets;
 module_param(max_sets, int, 0600);
 MODULE_PARM_DESC(max_sets, "maximal number of sets");
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 MODULE_DESCRIPTION("core IP set support");
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
 
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
index 3f09cdb42562..dc7b46b41354 100644
--- a/net/netfilter/ipset/ip_set_getport.c
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 623e0d675725..07ef941130a6 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 613eb212cb48..7b82bf1104ce 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -30,7 +30,7 @@
 #define IPSET_TYPE_REV_MAX	4	/* skbinfo support  */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:ip");
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index f3ba8348cf9d..7d468f98a252 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net>
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index ddb8039ec1d2..d358ee69d04b 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -32,7 +32,7 @@
 #define IPSET_TYPE_REV_MAX	5 /* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:ip,port");
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index a7f4d7a85420..0a304785f912 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -32,7 +32,7 @@
 #define IPSET_TYPE_REV_MAX	5 /* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:ip,port,ip");
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 88b83d6d3084..245f7d714870 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -34,7 +34,7 @@
 #define IPSET_TYPE_REV_MAX	7 /* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:ip,port,net");
 
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index 4fe5f243d0a3..3d1fc71dac38 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -23,7 +23,7 @@
 #define IPSET_TYPE_REV_MAX	0
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:mac");
 
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 5449e23af13a..470701fda231 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -31,7 +31,7 @@
 #define IPSET_TYPE_REV_MAX	6 /* skbinfo mapping support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:net");
 
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index f5164c1efce2..1df8656ad84d 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -32,7 +32,7 @@
 #define IPSET_TYPE_REV_MAX	6 /* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:net,iface");
 
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index 5a2b923bd81f..e0553be89600 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 1a187be9ebc8..943d55d76fcf 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -33,7 +33,7 @@
 #define IPSET_TYPE_REV_MAX	7 /* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:net,port");
 
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 613e18e720a4..afaff99e578c 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 4f894165cdcd..ed4360072f64 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -22,7 +22,7 @@
 #define IPSET_TYPE_REV_MAX	3 /* skbinfo support added */
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_list:set");
 
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 12de40390e97..1ff66e070cb2 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -7,7 +7,7 @@
  * This source code is licensed under General Public License version 2.
  *
  * Based on the 'brute force' H.323 connection tracking module by
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * For more information, please see http://nath323.sourceforge.net/
  */
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 7ba01d8ee165..60b68400435d 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1,6 +1,6 @@
 /* (C) 1999-2001 Paul `Rusty' Russell
  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
index b46626cddd93..4ab4155706d7 100644
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -1,7 +1,7 @@
 /*
  *	xt_iprange - Netfilter module to match IP address ranges
  *
- *	(C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *	(C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
  *	(C) CC Computer Consultants GmbH, 2008
  *
  *	This program is free software; you can redistribute it and/or modify
@@ -133,7 +133,7 @@ static void __exit iprange_mt_exit(void)
 module_init(iprange_mt_init);
 module_exit(iprange_mt_exit);
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
 MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching");
 MODULE_ALIAS("ipt_iprange");
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index cf67bbe07dc2..f025c51ba375 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -1,7 +1,7 @@
 /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
  *                         Patrick Schaaf <bof@bof.de>
  *                         Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -21,7 +21,7 @@
 #include <uapi/linux/netfilter/xt_set.h>
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
 MODULE_DESCRIPTION("Xtables: IP set match and target module");
 MODULE_ALIAS("xt_SET");
 MODULE_ALIAS("ipt_set");
-- 
cgit v1.2.3


From f88c9aa12fd0cff9cbb74b490350e6f0fac68296 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sat, 8 Jun 2019 14:53:22 -0700
Subject: nexthops: Add ipv6 helper to walk all fib6_nh in a nexthop struct

IPv6 has traditionally had a single fib6_nh per fib6_info. With
nexthops we can have multiple fib6_nh associated with a fib6_info.
Add a nexthop helper to invoke a callback for each fib6_nh in a
'struct nexthop'. If the callback returns non-0, the loop is
stopped and the return value is passed to the caller.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h |  4 ++++
 net/ipv4/nexthop.c    | 31 +++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index e019ed9b3dc3..25f1f9a8419b 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -305,4 +305,8 @@ static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
 		res->nh = &nhi->fib6_nh;
 	}
 }
+
+int nexthop_for_each_fib6_nh(struct nexthop *nh,
+			     int (*cb)(struct fib6_nh *nh, void *arg),
+			     void *arg);
 #endif
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 5e48762b6b5f..49e8adce5b96 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -517,6 +517,37 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
 }
 EXPORT_SYMBOL_GPL(nexthop_select_path);
 
+int nexthop_for_each_fib6_nh(struct nexthop *nh,
+			     int (*cb)(struct fib6_nh *nh, void *arg),
+			     void *arg)
+{
+	struct nh_info *nhi;
+	int err;
+
+	if (nh->is_group) {
+		struct nh_group *nhg;
+		int i;
+
+		nhg = rcu_dereference_rtnl(nh->nh_grp);
+		for (i = 0; i < nhg->num_nh; i++) {
+			struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+			nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
+			err = cb(&nhi->fib6_nh, arg);
+			if (err)
+				return err;
+		}
+	} else {
+		nhi = rcu_dereference_rtnl(nh->nh_info);
+		err = cb(&nhi->fib6_nh, arg);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);
+
 int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
 		       struct netlink_ext_ack *extack)
 {
-- 
cgit v1.2.3


From 493ced1ac47c48bb86d9d4e8e87df8592be85a0e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sat, 8 Jun 2019 14:53:32 -0700
Subject: ipv4: Allow routes to use nexthop objects

Add support for RTA_NH_ID attribute to allow a user to specify a
nexthop id to use with a route. fc_nh_id is added to fib_config to
hold the value passed in the RTA_NH_ID attribute. If a nexthop id
is given, the gateway, device, encap and multipath attributes
cannot be set.

Update fib_nh_match to check ids on a route delete.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |  1 +
 net/ipv4/fib_frontend.c  | 19 +++++++++++++++++++
 net/ipv4/fib_semantics.c | 15 +++++++++++++++
 3 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 4cdf8bc22efd..7e1e621a56df 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -40,6 +40,7 @@ struct fib_config {
 	u32			fc_flags;
 	u32			fc_priority;
 	__be32			fc_prefsrc;
+	u32			fc_nh_id;
 	struct nlattr		*fc_mx;
 	struct rtnexthop	*fc_mp;
 	int			fc_mx_len;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 33b0dbe84aa6..108191667531 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -671,6 +671,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_IP_PROTO]		= { .type = NLA_U8 },
 	[RTA_SPORT]		= { .type = NLA_U16 },
 	[RTA_DPORT]		= { .type = NLA_U16 },
+	[RTA_NH_ID]		= { .type = NLA_U32 },
 };
 
 int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
@@ -808,6 +809,18 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 			if (err < 0)
 				goto errout;
 			break;
+		case RTA_NH_ID:
+			cfg->fc_nh_id = nla_get_u32(attr);
+			break;
+		}
+	}
+
+	if (cfg->fc_nh_id) {
+		if (cfg->fc_oif || cfg->fc_gw_family ||
+		    cfg->fc_encap || cfg->fc_mp) {
+			NL_SET_ERR_MSG(extack,
+				       "Nexthop specification and nexthop id are mutually exclusive");
+			return -EINVAL;
 		}
 	}
 
@@ -834,6 +847,12 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto errout;
 
+	if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
+		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+		err = -EINVAL;
+		goto errout;
+	}
+
 	tb = fib_get_table(net, cfg.fc_table);
 	if (!tb) {
 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index afa4af1f9326..2c24d8e3b126 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -789,6 +789,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
 		return 1;
 
+	if (cfg->fc_nh_id) {
+		if (fi->nh && cfg->fc_nh_id == fi->nh->id)
+			return 0;
+		return 1;
+	}
+
 	if (cfg->fc_oif || cfg->fc_gw_family) {
 		struct fib_nh *nh = fib_info_nh(fi, 0);
 
@@ -1302,6 +1308,15 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 		goto err_inval;
 	}
 
+	if (cfg->fc_nh_id) {
+		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+		if (!nh) {
+			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+			goto err_inval;
+		}
+		nhs = 0;
+	}
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (cfg->fc_mp) {
 		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack);
-- 
cgit v1.2.3


From 5b98324ebe29f4494b0fc45bde2d47ee716518fd Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sat, 8 Jun 2019 14:53:34 -0700
Subject: ipv6: Allow routes to use nexthop objects

Add support for RTA_NH_ID attribute to allow a user to specify a
nexthop id to use with a route. fc_nh_id is added to fib6_config to
hold the value passed in the RTA_NH_ID attribute. If a nexthop id
is given, the gateway, device, encap and multipath attributes
cannot be set.

Update ip6_route_del to check metric and protocol before nexthop
specs. If fc_nh_id is set, then it must match the id in the route
entry. Since IPv6 allows delete of a cached entry (an exception),
add ip6_del_cached_rt_nh to cycle through all of the fib6_nh in
a fib entry if it is using a nexthop.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  1 +
 net/ipv6/route.c      | 89 ++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 82 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index ac0427c096f3..1e92f1500b87 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -49,6 +49,7 @@ struct fib6_config {
 	u16		fc_delete_all_nh : 1,
 			fc_ignore_dev_down:1,
 			__unused : 14;
+	u32		fc_nh_id;
 
 	struct in6_addr	fc_dst;
 	struct in6_addr	fc_src;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f287375fd0b2..f7257a56072a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3531,6 +3531,16 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 	}
 #endif
+	if (cfg->fc_nh_id) {
+		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+		if (!nh) {
+			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+			goto out;
+		}
+		err = fib6_check_nexthop(nh, cfg, extack);
+		if (err)
+			goto out;
+	}
 
 	err = -ENOBUFS;
 	if (cfg->fc_nlinfo.nlh &&
@@ -3762,6 +3772,30 @@ static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
 	return 0;
 }
 
+struct fib6_nh_del_cached_rt_arg {
+	struct fib6_config *cfg;
+	struct fib6_info *f6i;
+};
+
+static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
+{
+	struct fib6_nh_del_cached_rt_arg *arg = _arg;
+	int rc;
+
+	rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
+	return rc != -ESRCH ? rc : 0;
+}
+
+static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
+{
+	struct fib6_nh_del_cached_rt_arg arg = {
+		.cfg = cfg,
+		.f6i = f6i
+	};
+
+	return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
+}
+
 static int ip6_route_del(struct fib6_config *cfg,
 			 struct netlink_ext_ack *extack)
 {
@@ -3787,11 +3821,20 @@ static int ip6_route_del(struct fib6_config *cfg,
 		for_each_fib6_node_rt_rcu(fn) {
 			struct fib6_nh *nh;
 
-			nh = rt->fib6_nh;
-			if (cfg->fc_flags & RTF_CACHE) {
-				int rc;
+			if (rt->nh && rt->nh->id != cfg->fc_nh_id)
+				continue;
 
-				rc = ip6_del_cached_rt(cfg, rt, nh);
+			if (cfg->fc_flags & RTF_CACHE) {
+				int rc = 0;
+
+				if (rt->nh) {
+					rc = ip6_del_cached_rt_nh(cfg, rt);
+				} else if (cfg->fc_nh_id) {
+					continue;
+				} else {
+					nh = rt->fib6_nh;
+					rc = ip6_del_cached_rt(cfg, rt, nh);
+				}
 				if (rc != -ESRCH) {
 					rcu_read_unlock();
 					return rc;
@@ -3799,6 +3842,23 @@ static int ip6_route_del(struct fib6_config *cfg,
 				continue;
 			}
 
+			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
+				continue;
+			if (cfg->fc_protocol &&
+			    cfg->fc_protocol != rt->fib6_protocol)
+				continue;
+
+			if (rt->nh) {
+				if (!fib6_info_hold_safe(rt))
+					continue;
+				rcu_read_unlock();
+
+				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+			}
+			if (cfg->fc_nh_id)
+				continue;
+
+			nh = rt->fib6_nh;
 			if (cfg->fc_ifindex &&
 			    (!nh->fib_nh_dev ||
 			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
@@ -3806,10 +3866,6 @@ static int ip6_route_del(struct fib6_config *cfg,
 			if (cfg->fc_flags & RTF_GATEWAY &&
 			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
 				continue;
-			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
-				continue;
-			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
-				continue;
 			if (!fib6_info_hold_safe(rt))
 				continue;
 			rcu_read_unlock();
@@ -4709,6 +4765,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
 	[RTA_IP_PROTO]		= { .type = NLA_U8 },
 	[RTA_SPORT]		= { .type = NLA_U16 },
 	[RTA_DPORT]		= { .type = NLA_U16 },
+	[RTA_NH_ID]		= { .type = NLA_U32 },
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4755,6 +4812,16 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
 
+	if (tb[RTA_NH_ID]) {
+		if (tb[RTA_GATEWAY]   || tb[RTA_OIF] ||
+		    tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
+			NL_SET_ERR_MSG(extack,
+				       "Nexthop specification and nexthop id are mutually exclusive");
+			goto errout;
+		}
+		cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
+	}
+
 	if (tb[RTA_GATEWAY]) {
 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
 		cfg->fc_flags |= RTF_GATEWAY;
@@ -5089,6 +5156,12 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	if (cfg.fc_nh_id &&
+	    !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
+		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+		return -EINVAL;
+	}
+
 	if (cfg.fc_mp)
 		return ip6_route_multipath_del(&cfg, extack);
 	else {
-- 
cgit v1.2.3


From fada7fdc83c0bf8755956bff707c42b609223301 Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Thu, 6 Jun 2019 13:59:40 -0700
Subject: bpf: Allow bpf_map_lookup_elem() on an xskmap

Currently, the AF_XDP code uses a separate map in order to
determine if an xsk is bound to a queue.  Instead of doing this,
have bpf_map_lookup_elem() return an xdp_sock.

Rearrange some xdp_sock members to eliminate structure holes.

Remove selftest - will be added back in later patch.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |  8 +++++
 include/net/xdp_sock.h                             |  4 +--
 include/uapi/linux/bpf.h                           |  4 +++
 kernel/bpf/verifier.c                              | 26 ++++++++++++--
 kernel/bpf/xskmap.c                                |  7 ++++
 net/core/filter.c                                  | 40 ++++++++++++++++++++++
 .../selftests/bpf/verifier/prevent_map_lookup.c    | 15 --------
 7 files changed, 85 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e5a309e6a400..1fe137afa898 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -280,6 +280,7 @@ enum bpf_reg_type {
 	PTR_TO_TCP_SOCK,	 /* reg points to struct tcp_sock */
 	PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
 	PTR_TO_TP_BUFFER,	 /* reg points to a writable raw tp's buffer */
+	PTR_TO_XDP_SOCK,	 /* reg points to struct xdp_sock */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -727,6 +728,13 @@ void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
 void __cpu_map_flush(struct bpf_map *map);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
+bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+				  struct bpf_insn_access_aux *info);
+u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+				    const struct bpf_insn *si,
+				    struct bpf_insn *insn_buf,
+				    struct bpf_prog *prog,
+				    u32 *target_size);
 
 /* Return map's numa specified by userspace */
 static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index d074b6d60f8a..ae0f368a62bb 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -58,11 +58,11 @@ struct xdp_sock {
 	struct xdp_umem *umem;
 	struct list_head flush_node;
 	u16 queue_id;
-	struct xsk_queue *tx ____cacheline_aligned_in_smp;
-	struct list_head list;
 	bool zc;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
+	struct xsk_queue *tx ____cacheline_aligned_in_smp;
+	struct list_head list;
 	/* Mutual exclusion of NAPI TX thread and sendmsg error paths
 	 * in the SKB destructor callback.
 	 */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7c6aef253173..ae0907d8c03a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3083,6 +3083,10 @@ struct bpf_sock_tuple {
 	};
 };
 
+struct bpf_xdp_sock {
+	__u32 queue_id;
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5c2cb5bd84ce..8d1786357a09 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -334,7 +334,8 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
 {
 	return type == PTR_TO_SOCKET ||
 		type == PTR_TO_SOCK_COMMON ||
-		type == PTR_TO_TCP_SOCK;
+		type == PTR_TO_TCP_SOCK ||
+		type == PTR_TO_XDP_SOCK;
 }
 
 static bool reg_type_may_be_null(enum bpf_reg_type type)
@@ -406,6 +407,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_TCP_SOCK]	= "tcp_sock",
 	[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
 	[PTR_TO_TP_BUFFER]	= "tp_buffer",
+	[PTR_TO_XDP_SOCK]	= "xdp_sock",
 };
 
 static char slot_type_char[] = {
@@ -1363,6 +1365,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_SOCK_COMMON_OR_NULL:
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
+	case PTR_TO_XDP_SOCK:
 		return true;
 	default:
 		return false;
@@ -1843,6 +1846,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
 	case PTR_TO_TCP_SOCK:
 		valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
 		break;
+	case PTR_TO_XDP_SOCK:
+		valid = bpf_xdp_sock_is_valid_access(off, size, t, &info);
+		break;
 	default:
 		valid = false;
 	}
@@ -2007,6 +2013,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	case PTR_TO_TCP_SOCK:
 		pointer_desc = "tcp_sock ";
 		break;
+	case PTR_TO_XDP_SOCK:
+		pointer_desc = "xdp_sock ";
+		break;
 	default:
 		break;
 	}
@@ -2905,10 +2914,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	 * appear.
 	 */
 	case BPF_MAP_TYPE_CPUMAP:
-	case BPF_MAP_TYPE_XSKMAP:
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_XSKMAP:
+		if (func_id != BPF_FUNC_redirect_map &&
+		    func_id != BPF_FUNC_map_lookup_elem)
+			goto error;
+		break;
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 		if (func_id != BPF_FUNC_map_lookup_elem)
@@ -3799,6 +3812,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	case PTR_TO_SOCK_COMMON_OR_NULL:
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
+	case PTR_TO_XDP_SOCK:
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
@@ -5038,6 +5052,9 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 			if (reg->map_ptr->inner_map_meta) {
 				reg->type = CONST_PTR_TO_MAP;
 				reg->map_ptr = reg->map_ptr->inner_map_meta;
+			} else if (reg->map_ptr->map_type ==
+				   BPF_MAP_TYPE_XSKMAP) {
+				reg->type = PTR_TO_XDP_SOCK;
 			} else {
 				reg->type = PTR_TO_MAP_VALUE;
 			}
@@ -6299,6 +6316,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_SOCK_COMMON_OR_NULL:
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
+	case PTR_TO_XDP_SOCK:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
 		 */
@@ -6693,6 +6711,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
 	case PTR_TO_SOCK_COMMON_OR_NULL:
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
+	case PTR_TO_XDP_SOCK:
 		return false;
 	default:
 		return true;
@@ -7826,6 +7845,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		case PTR_TO_TCP_SOCK:
 			convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
 			break;
+		case PTR_TO_XDP_SOCK:
+			convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
+			break;
 		default:
 			continue;
 		}
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 413d75f4fc72..ef7338cebd18 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -151,6 +151,12 @@ void __xsk_map_flush(struct bpf_map *map)
 }
 
 static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	return __xsk_map_lookup_elem(map, *(u32 *)key);
+}
+
+static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
@@ -218,6 +224,7 @@ const struct bpf_map_ops xsk_map_ops = {
 	.map_free = xsk_map_free,
 	.map_get_next_key = xsk_map_get_next_key,
 	.map_lookup_elem = xsk_map_lookup_elem,
+	.map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
 	.map_update_elem = xsk_map_update_elem,
 	.map_delete_elem = xsk_map_delete_elem,
 	.map_check_btf = map_check_no_btf,
diff --git a/net/core/filter.c b/net/core/filter.c
index f2777dc0b624..a5e4ac7fcbe5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5680,6 +5680,46 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
 	return INET_ECN_set_ce(skb);
 }
 
+bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+				  struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	switch (off) {
+	default:
+		return size == sizeof(__u32);
+	}
+}
+
+u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+				    const struct bpf_insn *si,
+				    struct bpf_insn *insn_buf,
+				    struct bpf_prog *prog, u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+#define BPF_XDP_SOCK_GET(FIELD)						\
+	do {								\
+		BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) >	\
+			     FIELD_SIZEOF(struct bpf_xdp_sock, FIELD));	\
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
+				      si->dst_reg, si->src_reg,		\
+				      offsetof(struct xdp_sock, FIELD)); \
+	} while (0)
+
+	switch (si->off) {
+	case offsetof(struct bpf_xdp_sock, queue_id):
+		BPF_XDP_SOCK_GET(queue_id);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
 	.func           = bpf_skb_ecn_set_ce,
 	.gpl_only       = false,
diff --git a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c
index bbdba990fefb..da7a4b37cb98 100644
--- a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c
+++ b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c
@@ -28,21 +28,6 @@
 	.errstr = "cannot pass map_type 18 into func bpf_map_lookup_elem",
 	.prog_type = BPF_PROG_TYPE_SOCK_OPS,
 },
-{
-	"prevent map lookup in xskmap",
-	.insns = {
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
-	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
-	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
-	BPF_LD_MAP_FD(BPF_REG_1, 0),
-	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_EXIT_INSN(),
-	},
-	.fixup_map_xskmap = { 3 },
-	.result = REJECT,
-	.errstr = "cannot pass map_type 17 into func bpf_map_lookup_elem",
-	.prog_type = BPF_PROG_TYPE_XDP,
-},
 {
 	"prevent map lookup in stack trace",
 	.insns = {
-- 
cgit v1.2.3


From 89fec474fa1ab2c754e48d29e1081a2c2bd22dc6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 10 Jun 2019 21:40:00 -0700
Subject: net/tls: pass record number as a byte array

TLS offload code casts record number to a u64.  The buffer
should be aligned to 8 bytes, but it's actually a __be64, and
the rest of the TLS code treats it as big int.  Make the
offload callbacks take a byte array, drivers can make the
choice to do the ugly cast if they want to.

Prepare for copying the record number onto the stack by
defining a constant for max size of the byte array.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c |  3 ++-
 include/net/tls.h                                      |  5 +++--
 net/tls/tls_device.c                                   | 12 +++++++++---
 net/tls/tls_sw.c                                       |  8 ++++----
 4 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
index e88340e196f7..d65150aa8298 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
@@ -161,11 +161,12 @@ static void mlx5e_tls_del(struct net_device *netdev,
 }
 
 static void mlx5e_tls_resync_rx(struct net_device *netdev, struct sock *sk,
-				u32 seq, u64 rcd_sn)
+				u32 seq, u8 *rcd_sn_data)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5e_tls_offload_context_rx *rx_ctx;
+	u64 rcd_sn = *(u64 *)rcd_sn_data;
 
 	rx_ctx = mlx5e_get_tls_rx_context(tls_ctx);
 
diff --git a/include/net/tls.h b/include/net/tls.h
index 3ecf45adb707..25641e2f5b96 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -62,6 +62,7 @@
 #define TLS_DEVICE_NAME_MAX		32
 
 #define MAX_IV_SIZE			16
+#define TLS_MAX_REC_SEQ_SIZE		8
 
 /* For AES-CCM, the full 16-bytes of IV is made of '4' fields of given sizes.
  *
@@ -299,7 +300,7 @@ struct tlsdev_ops {
 			    struct tls_context *ctx,
 			    enum tls_offload_ctx_dir direction);
 	void (*tls_dev_resync_rx)(struct net_device *netdev,
-				  struct sock *sk, u32 seq, u64 rcd_sn);
+				  struct sock *sk, u32 seq, u8 *rcd_sn);
 };
 
 struct tls_offload_context_rx {
@@ -607,6 +608,6 @@ int tls_sw_fallback_init(struct sock *sk,
 int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx);
 
 void tls_device_offload_cleanup_rx(struct sock *sk);
-void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn);
+void handle_device_resync(struct sock *sk, u32 seq);
 
 #endif /* _TLS_OFFLOAD_H */
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 59f0c8dacbcc..16635f0c829c 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -551,7 +551,7 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
 }
 
 static void tls_device_resync_rx(struct tls_context *tls_ctx,
-				 struct sock *sk, u32 seq, u64 rcd_sn)
+				 struct sock *sk, u32 seq, u8 *rcd_sn)
 {
 	struct net_device *netdev;
 
@@ -563,7 +563,7 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx,
 	clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags);
 }
 
-void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
+void handle_device_resync(struct sock *sk, u32 seq)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_offload_context_rx *rx_ctx;
@@ -582,7 +582,7 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 
 	if (unlikely(is_req_pending) && req_seq == seq &&
 	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
-		tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn);
+		tls_device_resync_rx(tls_ctx, sk, seq, tls_ctx->rx.rec_seq);
 }
 
 static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
@@ -760,6 +760,12 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
 		goto free_offload_ctx;
 	}
 
+	/* Sanity-check the rec_seq_size for stack allocations */
+	if (rec_seq_size > TLS_MAX_REC_SEQ_SIZE) {
+		rc = -EINVAL;
+		goto free_offload_ctx;
+	}
+
 	prot->prepend_size = TLS_HEADER_SIZE + nonce_size;
 	prot->tag_size = tag_size;
 	prot->overhead_size = prot->prepend_size + prot->tag_size;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index bef71e54fad0..c1d22290f1d0 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2015,8 +2015,7 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
 		goto read_failure;
 	}
 #ifdef CONFIG_TLS_DEVICE
-	handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset,
-			     *(u64*)tls_ctx->rx.rec_seq);
+	handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset);
 #endif
 	return data_len + TLS_HEADER_SIZE;
 
@@ -2283,8 +2282,9 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		goto free_priv;
 	}
 
-	/* Sanity-check the IV size for stack allocations. */
-	if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) {
+	/* Sanity-check the sizes for stack allocations. */
+	if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE ||
+	    rec_seq_size > TLS_MAX_REC_SEQ_SIZE) {
 		rc = -EINVAL;
 		goto free_priv;
 	}
-- 
cgit v1.2.3


From fe58a5a02cd9f49d5868539b4146ec1e5e5176e4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 10 Jun 2019 21:40:01 -0700
Subject: net/tls: rename handle_device_resync()

handle_device_resync() doesn't describe the function very well.
The function checks if resync should be issued upon parsing of
a new record.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h    | 2 +-
 net/tls/tls_device.c | 2 +-
 net/tls/tls_sw.c     | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 25641e2f5b96..1c512da5e4f4 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -608,6 +608,6 @@ int tls_sw_fallback_init(struct sock *sk,
 int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx);
 
 void tls_device_offload_cleanup_rx(struct sock *sk);
-void handle_device_resync(struct sock *sk, u32 seq);
+void tls_device_rx_resync_new_rec(struct sock *sk, u32 seq);
 
 #endif /* _TLS_OFFLOAD_H */
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 16635f0c829c..0ecfa0ee415d 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -563,7 +563,7 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx,
 	clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags);
 }
 
-void handle_device_resync(struct sock *sk, u32 seq)
+void tls_device_rx_resync_new_rec(struct sock *sk, u32 seq)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_offload_context_rx *rx_ctx;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index c1d22290f1d0..bc3a1b188d4a 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2015,7 +2015,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
 		goto read_failure;
 	}
 #ifdef CONFIG_TLS_DEVICE
-	handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset);
+	tls_device_rx_resync_new_rec(strp->sk,
+				     TCP_SKB_CB(skb)->seq + rxm->offset);
 #endif
 	return data_len + TLS_HEADER_SIZE;
 
-- 
cgit v1.2.3


From f953d33ba1225d68cf8790b4706d8c4410b15926 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 10 Jun 2019 21:40:02 -0700
Subject: net/tls: add kernel-driven TLS RX resync

TLS offload device may lose sync with the TCP stream if packets
arrive out of order.  Drivers can currently request a resync at
a specific TCP sequence number.  When a record is found starting
at that sequence number kernel will inform the device of the
corresponding record number.

This requires the device to constantly scan the stream for a
known pattern (constant bytes of the header) after sync is lost.

This patch adds an alternative approach which is entirely under
the control of the kernel.  Kernel tracks records it had to fully
decrypt, even though TLS socket is in TLS_HW mode.  If multiple
records did not have any decrypted parts - it's a pretty strong
indication that the device is out of sync.

We choose the min number of fully encrypted records to be 2,
which should hopefully be more than will get retransmitted at
a time.

After kernel decides the device is out of sync it schedules a
resync request.  If the TCP socket is empty the resync gets
performed immediately.  If socket is not empty we leave the
record parser to resync when next record comes.

Before resync in message parser we peek at the TCP socket and
don't attempt the sync if the socket already has some of the
next record queued.

On resync failure (encrypted data continues to flow in) we
retry with exponential backoff, up to once every 128 records
(with a 16k record that's at most once every 2M of data).

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/tls-offload.rst |  19 ++++++
 include/net/tls.h                        |  34 +++++++++-
 net/tls/tls_device.c                     | 105 +++++++++++++++++++++++++++----
 net/tls/tls_sw.c                         |   2 +-
 4 files changed, 145 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst
index eb7c9b81ccf5..d134d63307e7 100644
--- a/Documentation/networking/tls-offload.rst
+++ b/Documentation/networking/tls-offload.rst
@@ -268,6 +268,9 @@ Device can only detect that segment 4 also contains a TLS header
 if it knows the length of the previous record from segment 2. In this case
 the device will lose synchronization with the stream.
 
+Stream scan resynchronization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 When the device gets out of sync and the stream reaches TCP sequence
 numbers more than a max size record past the expected TCP sequence number,
 the device starts scanning for a known header pattern. For example
@@ -298,6 +301,22 @@ Special care has to be taken if the confirmation request is passed
 asynchronously to the packet stream and record may get processed
 by the kernel before the confirmation request.
 
+Stack-driven resynchronization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The driver may also request the stack to perform resynchronization
+whenever it sees the records are no longer getting decrypted.
+If the connection is configured in this mode the stack automatically
+schedules resynchronization after it has received two completely encrypted
+records.
+
+The stack waits for the socket to drain and informs the device about
+the next expected record number and its TCP sequence number. If the
+records continue to be received fully encrypted stack retries the
+synchronization with an exponential back off (first after 2 encrypted
+records, then after 4 records, after 8, after 16... up until every
+128 records).
+
 Error handling
 ==============
 
diff --git a/include/net/tls.h b/include/net/tls.h
index 1c512da5e4f4..28eca6a3b615 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -303,10 +303,33 @@ struct tlsdev_ops {
 				  struct sock *sk, u32 seq, u8 *rcd_sn);
 };
 
+enum tls_offload_sync_type {
+	TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ = 0,
+	TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT = 1,
+};
+
+#define TLS_DEVICE_RESYNC_NH_START_IVAL		2
+#define TLS_DEVICE_RESYNC_NH_MAX_IVAL		128
+
 struct tls_offload_context_rx {
 	/* sw must be the first member of tls_offload_context_rx */
 	struct tls_sw_context_rx sw;
-	atomic64_t resync_req;
+	enum tls_offload_sync_type resync_type;
+	/* this member is set regardless of resync_type, to avoid branches */
+	u8 resync_nh_reset:1;
+	/* CORE_NEXT_HINT-only member, but use the hole here */
+	u8 resync_nh_do_now:1;
+	union {
+		/* TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ */
+		struct {
+			atomic64_t resync_req;
+		};
+		/* TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT */
+		struct {
+			u32 decrypted_failed;
+			u32 decrypted_tgt;
+		} resync_nh;
+	};
 	u8 driver_state[] __aligned(8);
 	/* The TLS layer reserves room for driver specific state
 	 * Currently the belief is that there is not enough
@@ -587,6 +610,13 @@ static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq)
 	atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | 1);
 }
 
+static inline void
+tls_offload_rx_resync_set_type(struct sock *sk, enum tls_offload_sync_type type)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+
+	tls_offload_ctx_rx(tls_ctx)->resync_type = type;
+}
 
 int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
 		      unsigned char *record_type);
@@ -608,6 +638,6 @@ int tls_sw_fallback_init(struct sock *sk,
 int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx);
 
 void tls_device_offload_cleanup_rx(struct sock *sk);
-void tls_device_rx_resync_new_rec(struct sock *sk, u32 seq);
+void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq);
 
 #endif /* _TLS_OFFLOAD_H */
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 0ecfa0ee415d..477c869c69c8 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -563,10 +563,12 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx,
 	clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags);
 }
 
-void tls_device_rx_resync_new_rec(struct sock *sk, u32 seq)
+void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_offload_context_rx *rx_ctx;
+	u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE];
+	struct tls_prot_info *prot;
 	u32 is_req_pending;
 	s64 resync_req;
 	u32 req_seq;
@@ -574,15 +576,84 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 seq)
 	if (tls_ctx->rx_conf != TLS_HW)
 		return;
 
+	prot = &tls_ctx->prot_info;
 	rx_ctx = tls_offload_ctx_rx(tls_ctx);
-	resync_req = atomic64_read(&rx_ctx->resync_req);
-	req_seq = resync_req >> 32;
-	seq += TLS_HEADER_SIZE - 1;
-	is_req_pending = resync_req;
-
-	if (unlikely(is_req_pending) && req_seq == seq &&
-	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
-		tls_device_resync_rx(tls_ctx, sk, seq, tls_ctx->rx.rec_seq);
+	memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size);
+
+	switch (rx_ctx->resync_type) {
+	case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ:
+		resync_req = atomic64_read(&rx_ctx->resync_req);
+		req_seq = resync_req >> 32;
+		seq += TLS_HEADER_SIZE - 1;
+		is_req_pending = resync_req;
+
+		if (likely(!is_req_pending) || req_seq != seq ||
+		    !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
+			return;
+		break;
+	case TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT:
+		if (likely(!rx_ctx->resync_nh_do_now))
+			return;
+
+		/* head of next rec is already in, note that the sock_inq will
+		 * include the currently parsed message when called from parser
+		 */
+		if (tcp_inq(sk) > rcd_len)
+			return;
+
+		rx_ctx->resync_nh_do_now = 0;
+		seq += rcd_len;
+		tls_bigint_increment(rcd_sn, prot->rec_seq_size);
+		break;
+	}
+
+	tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn);
+}
+
+static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx,
+					   struct tls_offload_context_rx *ctx,
+					   struct sock *sk, struct sk_buff *skb)
+{
+	struct strp_msg *rxm;
+
+	/* device will request resyncs by itself based on stream scan */
+	if (ctx->resync_type != TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT)
+		return;
+	/* already scheduled */
+	if (ctx->resync_nh_do_now)
+		return;
+	/* seen decrypted fragments since last fully-failed record */
+	if (ctx->resync_nh_reset) {
+		ctx->resync_nh_reset = 0;
+		ctx->resync_nh.decrypted_failed = 1;
+		ctx->resync_nh.decrypted_tgt = TLS_DEVICE_RESYNC_NH_START_IVAL;
+		return;
+	}
+
+	if (++ctx->resync_nh.decrypted_failed <= ctx->resync_nh.decrypted_tgt)
+		return;
+
+	/* doing resync, bump the next target in case it fails */
+	if (ctx->resync_nh.decrypted_tgt < TLS_DEVICE_RESYNC_NH_MAX_IVAL)
+		ctx->resync_nh.decrypted_tgt *= 2;
+	else
+		ctx->resync_nh.decrypted_tgt += TLS_DEVICE_RESYNC_NH_MAX_IVAL;
+
+	rxm = strp_msg(skb);
+
+	/* head of next rec is already in, parser will sync for us */
+	if (tcp_inq(sk) > rxm->full_len) {
+		ctx->resync_nh_do_now = 1;
+	} else {
+		struct tls_prot_info *prot = &tls_ctx->prot_info;
+		u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE];
+
+		memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size);
+		tls_bigint_increment(rcd_sn, prot->rec_seq_size);
+
+		tls_device_resync_rx(tls_ctx, sk, tcp_sk(sk)->copied_seq,
+				     rcd_sn);
+	}
 }
 
 static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
@@ -686,12 +757,21 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
 
 	ctx->sw.decrypted |= is_decrypted;
 
-	/* Return immedeatly if the record is either entirely plaintext or
+	/* Return immediately if the record is either entirely plaintext or
 	 * entirely ciphertext. Otherwise handle reencrypt partially decrypted
 	 * record.
 	 */
-	return (is_encrypted || is_decrypted) ? 0 :
-		tls_device_reencrypt(sk, skb);
+	if (is_decrypted) {
+		ctx->resync_nh_reset = 1;
+		return 0;
+	}
+	if (is_encrypted) {
+		tls_device_core_ctrl_rx_resync(tls_ctx, ctx, sk, skb);
+		return 0;
+	}
+
+	ctx->resync_nh_reset = 1;
+	return tls_device_reencrypt(sk, skb);
 }
 
 static void tls_device_attach(struct tls_context *ctx, struct sock *sk,
@@ -917,6 +997,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
 		rc = -ENOMEM;
 		goto release_netdev;
 	}
+	context->resync_nh_reset = 1;
 
 	ctx->priv_ctx_rx = context;
 	rc = tls_set_sw_offload(sk, ctx, 0);
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index bc3a1b188d4a..533eaa4826e5 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2015,7 +2015,7 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
 		goto read_failure;
 	}
 #ifdef CONFIG_TLS_DEVICE
-	tls_device_rx_resync_new_rec(strp->sk,
+	tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE,
 				     TCP_SKB_CB(skb)->seq + rxm->offset);
 #endif
 	return data_len + TLS_HEADER_SIZE;
-- 
cgit v1.2.3


From eeb2efaf36c75753f9028de3500669bddfac81a8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 10 Jun 2019 21:40:08 -0700
Subject: net/tls: generalize the resync callback

Currently only RX direction is ever resynced, however, TX may
also get out of sequence if packets get dropped on the way to
the driver.  Rename the resync callback and add a direction
parameter.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c | 9 ++++++---
 drivers/net/ethernet/netronome/nfp/crypto/tls.c        | 9 ++++++---
 include/net/tls.h                                      | 5 +++--
 net/tls/tls_device.c                                   | 5 +++--
 4 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
index d65150aa8298..dc15c5c9e557 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
@@ -160,14 +160,17 @@ static void mlx5e_tls_del(struct net_device *netdev,
 				direction == TLS_OFFLOAD_CTX_DIR_TX);
 }
 
-static void mlx5e_tls_resync_rx(struct net_device *netdev, struct sock *sk,
-				u32 seq, u8 *rcd_sn_data)
+static void mlx5e_tls_resync(struct net_device *netdev, struct sock *sk,
+			     u32 seq, u8 *rcd_sn_data,
+			     enum tls_offload_ctx_dir direction)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5e_tls_offload_context_rx *rx_ctx;
 	u64 rcd_sn = *(u64 *)rcd_sn_data;
 
+	if (WARN_ON_ONCE(direction != TLS_OFFLOAD_CTX_DIR_RX))
+		return;
 	rx_ctx = mlx5e_get_tls_rx_context(tls_ctx);
 
 	netdev_info(netdev, "resyncing seq %d rcd %lld\n", seq,
@@ -179,7 +182,7 @@ static void mlx5e_tls_resync_rx(struct net_device *netdev, struct sock *sk,
 static const struct tlsdev_ops mlx5e_tls_ops = {
 	.tls_dev_add = mlx5e_tls_add,
 	.tls_dev_del = mlx5e_tls_del,
-	.tls_dev_resync_rx = mlx5e_tls_resync_rx,
+	.tls_dev_resync = mlx5e_tls_resync,
 };
 
 void mlx5e_tls_build_netdev(struct mlx5e_priv *priv)
diff --git a/drivers/net/ethernet/netronome/nfp/crypto/tls.c b/drivers/net/ethernet/netronome/nfp/crypto/tls.c
index 4427c1d42047..93f87b7633b1 100644
--- a/drivers/net/ethernet/netronome/nfp/crypto/tls.c
+++ b/drivers/net/ethernet/netronome/nfp/crypto/tls.c
@@ -383,14 +383,17 @@ nfp_net_tls_del(struct net_device *netdev, struct tls_context *tls_ctx,
 }
 
 static void
-nfp_net_tls_resync_rx(struct net_device *netdev, struct sock *sk, u32 seq,
-		      u8 *rcd_sn)
+nfp_net_tls_resync(struct net_device *netdev, struct sock *sk, u32 seq,
+		   u8 *rcd_sn, enum tls_offload_ctx_dir direction)
 {
 	struct nfp_net *nn = netdev_priv(netdev);
 	struct nfp_net_tls_offload_ctx *ntls;
 	struct nfp_crypto_req_update *req;
 	struct sk_buff *skb;
 
+	if (WARN_ON_ONCE(direction != TLS_OFFLOAD_CTX_DIR_RX))
+		return;
+
 	skb = nfp_net_tls_alloc_simple(nn, sizeof(*req), GFP_ATOMIC);
 	if (!skb)
 		return;
@@ -411,7 +414,7 @@ nfp_net_tls_resync_rx(struct net_device *netdev, struct sock *sk, u32 seq,
 static const struct tlsdev_ops nfp_net_tls_ops = {
 	.tls_dev_add = nfp_net_tls_add,
 	.tls_dev_del = nfp_net_tls_del,
-	.tls_dev_resync_rx = nfp_net_tls_resync_rx,
+	.tls_dev_resync = nfp_net_tls_resync,
 };
 
 static int nfp_net_tls_reset(struct nfp_net *nn)
diff --git a/include/net/tls.h b/include/net/tls.h
index 28eca6a3b615..9b49baecc4a8 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -299,8 +299,9 @@ struct tlsdev_ops {
 	void (*tls_dev_del)(struct net_device *netdev,
 			    struct tls_context *ctx,
 			    enum tls_offload_ctx_dir direction);
-	void (*tls_dev_resync_rx)(struct net_device *netdev,
-				  struct sock *sk, u32 seq, u8 *rcd_sn);
+	void (*tls_dev_resync)(struct net_device *netdev,
+			       struct sock *sk, u32 seq, u8 *rcd_sn,
+			       enum tls_offload_ctx_dir direction);
 };
 
 enum tls_offload_sync_type {
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 477c869c69c8..b35a3b902bfa 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -559,7 +559,8 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx,
 		return;
 	netdev = READ_ONCE(tls_ctx->netdev);
 	if (netdev)
-		netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, rcd_sn);
+		netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn,
+						   TLS_OFFLOAD_CTX_DIR_RX);
 	clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags);
 }
 
@@ -1105,7 +1106,7 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event,
 	case NETDEV_REGISTER:
 	case NETDEV_FEAT_CHANGE:
 		if ((dev->features & NETIF_F_HW_TLS_RX) &&
-		    !dev->tlsdev_ops->tls_dev_resync_rx)
+		    !dev->tlsdev_ops->tls_dev_resync)
 			return NOTIFY_BAD;
 
 		if  (dev->tlsdev_ops &&
-- 
cgit v1.2.3


From 50180074099fcda752d9d56282d23242b126ebc9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 10 Jun 2019 21:40:09 -0700
Subject: net/tls: add kernel-driven resync mechanism for TX

TLS offload drivers keep track of TCP seq numbers to make sure
the packets are fed into the HW in order.

When packets get dropped on the way through the stack, the driver
will get out of sync and have to use fallback encryption, but unless
TCP seq number is resynced it will never match the packets correctly
(or even worse - use incorrect record sequence number after TCP seq
wraps).

Existing drivers (mlx5) feed the entire record on every out-of-order
event, allowing FW/HW to always be in sync.

This patch adds an alternative, more akin to the RX resync.  When
driver sees a frame which is past its expected sequence number the
stream must have gotten out of order (if the sequence number is
smaller than expected it's likely a retransmission which doesn't
require resync).  Driver will ask the stack to perform TX sync
before it submits the next full record, and fall back to software
crypto until stack has performed the sync.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/tls-offload.rst | 35 +++++++++++++++++++++++++++++++-
 include/net/tls.h                        | 23 +++++++++++++++++++++
 net/tls/tls_device.c                     | 27 ++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst
index d134d63307e7..048e5ca44824 100644
--- a/Documentation/networking/tls-offload.rst
+++ b/Documentation/networking/tls-offload.rst
@@ -206,7 +206,11 @@ TX
 
 Segments transmitted from an offloaded socket can get out of sync
 in similar ways to the receive side-retransmissions - local drops
-are possible, though network reorders are not.
+are possible, though network reorders are not. There are currently
+two mechanisms for dealing with out of order segments.
+
+Crypto state rebuilding
+~~~~~~~~~~~~~~~~~~~~~~~
 
 Whenever an out of order segment is transmitted the driver provides
 the device with enough information to perform cryptographic operations.
@@ -225,6 +229,35 @@ was just a retransmission. The former is simpler, and does not require
 retransmission detection therefore it is the recommended method until
 such time it is proven inefficient.
 
+Next record sync
+~~~~~~~~~~~~~~~~
+
+Whenever an out of order segment is detected the driver requests
+that the ``ktls`` software fallback code encrypt it. If the segment's
+sequence number is lower than expected the driver assumes retransmission
+and doesn't change device state. If the segment is in the future, it
+may imply a local drop, the driver asks the stack to sync the device
+to the next record state and falls back to software.
+
+Resync request is indicated with:
+
+.. code-block:: c
+
+  void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq)
+
+Until resync is complete driver should not access its expected TCP
+sequence number (as it will be updated from a different context).
+Following helper should be used to test if resync is complete:
+
+.. code-block:: c
+
+  bool tls_offload_tx_resync_pending(struct sock *sk)
+
+Next time ``ktls`` pushes a record it will first send its TCP sequence number
+and TLS record number to the driver. Stack will also make sure that
+the new record will start on a segment boundary (like it does when
+the connection is initially added).
+
 RX
 --
 
diff --git a/include/net/tls.h b/include/net/tls.h
index 9b49baecc4a8..63e473420b00 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -212,6 +212,11 @@ struct tls_offload_context_tx {
 
 enum tls_context_flags {
 	TLS_RX_SYNC_RUNNING = 0,
+	/* Unlike RX where resync is driven entirely by the core in TX only
+	 * the driver knows when things went out of sync, so we need the flag
+	 * to be atomic.
+	 */
+	TLS_TX_SYNC_SCHED = 1,
 };
 
 struct cipher_context {
@@ -619,6 +624,24 @@ tls_offload_rx_resync_set_type(struct sock *sk, enum tls_offload_sync_type type)
 	tls_offload_ctx_rx(tls_ctx)->resync_type = type;
 }
 
+static inline void tls_offload_tx_resync_request(struct sock *sk)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+
+	WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags));
+}
+
+/* Driver's seq tracking has to be disabled until resync succeeded */
+static inline bool tls_offload_tx_resync_pending(struct sock *sk)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	bool ret;
+
+	ret = test_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags);
+	smp_mb__after_atomic();
+	return ret;
+}
+
 int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
 		      unsigned char *record_type);
 void tls_register_device(struct tls_device *device);
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index b35a3b902bfa..40076f423dcb 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -209,6 +209,29 @@ void tls_device_free_resources_tx(struct sock *sk)
 	tls_free_partial_record(sk, tls_ctx);
 }
 
+static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx,
+				 u32 seq)
+{
+	struct net_device *netdev;
+	struct sk_buff *skb;
+	u8 *rcd_sn;
+
+	skb = tcp_write_queue_tail(sk);
+	if (skb)
+		TCP_SKB_CB(skb)->eor = 1;
+
+	rcd_sn = tls_ctx->tx.rec_seq;
+
+	down_read(&device_offload_lock);
+	netdev = tls_ctx->netdev;
+	if (netdev)
+		netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn,
+						   TLS_OFFLOAD_CTX_DIR_TX);
+	up_read(&device_offload_lock);
+
+	clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags);
+}
+
 static void tls_append_frag(struct tls_record_info *record,
 			    struct page_frag *pfrag,
 			    int size)
@@ -264,6 +287,10 @@ static int tls_push_record(struct sock *sk,
 	list_add_tail(&record->list, &offload_ctx->records_list);
 	spin_unlock_irq(&offload_ctx->lock);
 	offload_ctx->open_record = NULL;
+
+	if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags))
+		tls_device_resync_tx(sk, ctx, tp->write_seq);
+
 	tls_advance_record_sn(sk, prot, &ctx->tx);
 
 	for (i = 0; i < record->num_frags; i++) {
-- 
cgit v1.2.3


From a842fe1425cb20f457abd3f8ef98b468f83ca98b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 12 Jun 2019 11:57:25 -0700
Subject: tcp: add optional per socket transmit delay

Adding delays to TCP flows is crucial for studying behavior
of TCP stacks, including congestion control modules.

Linux offers the netem module, but it has impractical constraints:
- Need root access to change qdisc
- Hard to setup on egress if combined with non trivial qdisc like FQ
- Single delay for all flows.

EDT (Earliest Departure Time) adoption in TCP stack allows us
to enable a per socket delay at a very small cost.

Networking tools can now establish thousands of flows, each of them
with a different delay, simulating real world conditions.

This requires FQ packet scheduler or a EDT-enabled NIC.

This patch adds the TCP_TX_DELAY socket option, to set a delay in
usec units.

  unsigned int tx_delay = 10000; /* 10 msec */

  setsockopt(fd, SOL_TCP, TCP_TX_DELAY, &tx_delay, sizeof(tx_delay));

Note that FQ packet scheduler limits might need some tweaking :

man tc-fq

PARAMETERS
   limit
       Hard  limit  on  the  real  queue  size. When this limit is
       reached, new packets are dropped. If the value is  lowered,
       packets  are  dropped so that the new limit is met. Default
       is 10000 packets.

   flow_limit
       Hard limit on the maximum  number  of  packets  queued  per
       flow.  Default value is 100.

Use of the TCP_TX_DELAY option will increase the number of skbs in the
FQ qdisc, so packets will be dropped if any of the above limits is hit.

Use of a jump label makes this support runtime-free, for hosts
never using the option.

Also note that TSQ (TCP Small Queues) limits are slightly changed
with this patch: we need to make sure that artificially delayed skbs
won't stop us from providing more skbs to feed the pipe (netem uses
skb_orphan_partial() for this purpose, but FQ cannot use this trick).

Because of that, using big delays might very well trigger
old bugs in TSO auto defer logic and/or sndbuf limited detection.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  2 ++
 include/net/tcp.h        | 19 +++++++++++++++++++
 include/uapi/linux/tcp.h |  3 +++
 net/ipv4/tcp.c           | 24 ++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c      | 10 ++++++----
 net/ipv4/tcp_minisocks.c |  2 +-
 net/ipv4/tcp_output.c    | 23 ++++++++++++++++++++---
 net/ipv6/tcp_ipv6.c      |  1 +
 8 files changed, 76 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 711361af9ce0..c23019a3b264 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -245,6 +245,7 @@ struct tcp_sock {
 		syn_smc:1;	/* SYN includes SMC */
 	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */
 
+	u32	tcp_tx_delay;	/* delay (in usec) added to TX packets */
 	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
 	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
 
@@ -436,6 +437,7 @@ struct tcp_timewait_sock {
 	u32			  tw_last_oow_ack_time;
 
 	int			  tw_ts_recent_stamp;
+	u32			  tw_tx_delay;
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	  *tw_md5_key;
 #endif
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 204328b88412..49a178b8d5b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2232,4 +2232,23 @@ void clean_acked_data_disable(struct inet_connection_sock *icsk);
 void clean_acked_data_flush(void);
 #endif
 
+DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+static inline void tcp_add_tx_delay(struct sk_buff *skb,
+				    const struct tcp_sock *tp)
+{
+	if (static_branch_unlikely(&tcp_tx_delay_enabled))
+		skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
+}
+
+static inline void tcp_set_tx_time(struct sk_buff *skb,
+				   const struct sock *sk)
+{
+	if (static_branch_unlikely(&tcp_tx_delay_enabled)) {
+		u32 delay = (sk->sk_state == TCP_TIME_WAIT) ?
+			tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay;
+
+		skb->skb_mstamp_ns = tcp_clock_ns() + (u64)delay * NSEC_PER_USEC;
+	}
+}
+
 #endif	/* _TCP_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index b521464ea962..b3564f85a762 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -127,6 +127,9 @@ enum {
 
 #define TCP_CM_INQ		TCP_INQ
 
+#define TCP_TX_DELAY		37	/* delay outgoing packets by XX usec */
+
+
 #define TCP_REPAIR_ON		1
 #define TCP_REPAIR_OFF		0
 #define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bd0856ac680a..5542e3d778e6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2736,6 +2736,21 @@ static int tcp_repair_options_est(struct sock *sk,
 	return 0;
 }
 
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+EXPORT_SYMBOL(tcp_tx_delay_enabled);
+
+static void tcp_enable_tx_delay(void)
+{
+	if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+		static int __tcp_tx_delay_enabled = 0;
+
+		if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
+			static_branch_enable(&tcp_tx_delay_enabled);
+			pr_info("TCP_TX_DELAY enabled\n");
+		}
+	}
+}
+
 /*
  *	Socket option code for TCP.
  */
@@ -3087,6 +3102,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			tp->recvmsg_inq = val;
 		break;
+	case TCP_TX_DELAY:
+		if (val)
+			tcp_enable_tx_delay();
+		tp->tcp_tx_delay = val;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -3546,6 +3566,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = tp->fastopen_no_cookie;
 		break;
 
+	case TCP_TX_DELAY:
+		val = tp->tcp_tx_delay;
+		break;
+
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp_raw() + tp->tsoffset;
 		break;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f059fbd81a84..1b7e9e1fbd3b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -767,9 +767,11 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
-	if (sk)
+	if (sk) {
 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+		tcp_set_tx_time(skb, sk);
+	}
 	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
@@ -859,9 +861,9 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
-	if (sk)
-		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
-				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+			   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	tcp_set_tx_time(skb, sk);
 	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 11011e8386dc..8bcaf2586b68 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -274,7 +274,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 		tcptw->tw_ts_offset	= tp->tsoffset;
 		tcptw->tw_last_oow_ack_time = 0;
-
+		tcptw->tw_tx_delay	= tp->tcp_tx_delay;
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f429e856e263..d954ff9069e8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1153,6 +1153,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
 			       sizeof(struct inet6_skb_parm)));
 
+	tcp_add_tx_delay(skb, tp);
+
 	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
 
 	if (unlikely(err > 0)) {
@@ -2234,6 +2236,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 			      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
 	limit <<= factor;
 
+	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
+	    tcp_sk(sk)->tcp_tx_delay) {
+		u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
+
+		/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
+		 * approximate our needs assuming an ~100% skb->truesize overhead.
+		 * USEC_PER_SEC is approximated by 2^20.
+		 * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
+		 */
+		extra_bytes >>= (20 - 1);
+		limit += extra_bytes;
+	}
 	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
 		/* Always send skb if rtx queue is empty.
 		 * No need to wait for TX completion to call us back,
@@ -3212,6 +3226,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	int tcp_header_size;
 	struct tcphdr *th;
 	int mss;
+	u64 now;
 
 	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
 	if (unlikely(!skb)) {
@@ -3243,13 +3258,14 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	memset(&opts, 0, sizeof(opts));
+	now = tcp_clock_ns();
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
 		skb->skb_mstamp_ns = cookie_init_timestamp(req);
 	else
 #endif
 	{
-		skb->skb_mstamp_ns = tcp_clock_ns();
+		skb->skb_mstamp_ns = now;
 		if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
 			tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
 	}
@@ -3292,8 +3308,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	rcu_read_unlock();
 #endif
 
-	/* Do not fool tcpdump (if any), clean our debris */
-	skb->tstamp = 0;
+	skb->skb_mstamp_ns = now;
+	tcp_add_tx_delay(skb, tp);
+
 	return skb;
 }
 EXPORT_SYMBOL(tcp_make_synack);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ad7039137a20..5606b2131b65 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -892,6 +892,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 		} else {
 			mark = sk->sk_mark;
 		}
+		tcp_set_tx_time(buff, sk);
 	}
 	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
 	fl6.fl6_dport = t1->dest;
-- 
cgit v1.2.3


From 86eec50beaf3a45f6432d491072fa5c54284dbca Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Mon, 10 Jun 2019 23:38:19 +0000
Subject: net/mlx5: Support querying max VFs from device

For ECPF with eswitch manager privilege, query the host max VF count
by querying the device using query_functions command.

With this enhancement:
1. flow steering entries are created only for valid vports based on
   the max VF count of the PF.
2. Driver only queries cap of valid vport.

Eswitch requires the max VFs when doing initialization, so do sr-iov
init before eswitch init.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c  | 18 +++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 22 ++++++++++++++++++++++
 include/linux/mlx5/driver.h                     |  7 ++-----
 include/linux/mlx5/mlx5_ifc.h                   |  2 +-
 4 files changed, 34 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 8e96c42d3b84..720f65bfe6a9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -844,32 +844,32 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 		goto err_rl_cleanup;
 	}
 
-	err = mlx5_eswitch_init(dev);
+	err = mlx5_sriov_init(dev);
 	if (err) {
-		mlx5_core_err(dev, "Failed to init eswitch %d\n", err);
+		mlx5_core_err(dev, "Failed to init sriov %d\n", err);
 		goto err_mpfs_cleanup;
 	}
 
-	err = mlx5_sriov_init(dev);
+	err = mlx5_eswitch_init(dev);
 	if (err) {
-		mlx5_core_err(dev, "Failed to init sriov %d\n", err);
-		goto err_eswitch_cleanup;
+		mlx5_core_err(dev, "Failed to init eswitch %d\n", err);
+		goto err_sriov_cleanup;
 	}
 
 	err = mlx5_fpga_init(dev);
 	if (err) {
 		mlx5_core_err(dev, "Failed to init fpga device %d\n", err);
-		goto err_sriov_cleanup;
+		goto err_eswitch_cleanup;
 	}
 
 	dev->tracer = mlx5_fw_tracer_create(dev);
 
 	return 0;
 
-err_sriov_cleanup:
-	mlx5_sriov_cleanup(dev);
 err_eswitch_cleanup:
 	mlx5_eswitch_cleanup(dev->priv.eswitch);
+err_sriov_cleanup:
+	mlx5_sriov_cleanup(dev);
 err_mpfs_cleanup:
 	mlx5_mpfs_cleanup(dev);
 err_rl_cleanup:
@@ -893,8 +893,8 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 {
 	mlx5_fw_tracer_destroy(dev->tracer);
 	mlx5_fpga_cleanup(dev);
-	mlx5_sriov_cleanup(dev);
 	mlx5_eswitch_cleanup(dev->priv.eswitch);
+	mlx5_sriov_cleanup(dev);
 	mlx5_mpfs_cleanup(dev);
 	mlx5_cleanup_rl_table(dev);
 	mlx5_vxlan_destroy(dev->vxlan);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
index a249b3c3843d..2eecb831c499 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
@@ -208,6 +208,27 @@ void mlx5_sriov_detach(struct mlx5_core_dev *dev)
 	mlx5_device_disable_sriov(dev);
 }
 
+static u16 mlx5_get_max_vfs(struct mlx5_core_dev *dev)
+{
+	u32 out[MLX5_ST_SZ_DW(query_esw_functions_out)] = {};
+	u16 host_total_vfs;
+	int err;
+
+	if (mlx5_core_is_ecpf_esw_manager(dev)) {
+		err = mlx5_esw_query_functions(dev, out, sizeof(out));
+		host_total_vfs = MLX5_GET(query_esw_functions_out, out,
+					  host_params_context.host_total_vfs);
+
+		/* Old FW doesn't support getting total_vfs from esw func
+		 * but supports getting it from pci_sriov.
+		 */
+		if (!err && host_total_vfs)
+			return host_total_vfs;
+	}
+
+	return pci_sriov_get_totalvfs(dev->pdev);
+}
+
 int mlx5_sriov_init(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
@@ -218,6 +239,7 @@ int mlx5_sriov_init(struct mlx5_core_dev *dev)
 		return 0;
 
 	total_vfs = pci_sriov_get_totalvfs(pdev);
+	sriov->max_vfs = mlx5_get_max_vfs(dev);
 	sriov->num_vfs = pci_num_vf(pdev);
 	sriov->vfs_ctx = kcalloc(total_vfs, sizeof(*sriov->vfs_ctx), GFP_KERNEL);
 	if (!sriov->vfs_ctx)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b5431f7d97cb..64155fe201ee 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -470,6 +470,7 @@ struct mlx5_core_sriov {
 	struct mlx5_vf_context	*vfs_ctx;
 	int			num_vfs;
 	int			enabled_vfs;
+	u16			max_vfs;
 };
 
 struct mlx5_fc_stats {
@@ -1103,13 +1104,9 @@ static inline bool mlx5_ecpf_vport_exists(struct mlx5_core_dev *dev)
 	return mlx5_core_is_pf(dev) && MLX5_CAP_ESW(dev, ecpf_vport_exists);
 }
 
-#define MLX5_HOST_PF_MAX_VFS	(127u)
 static inline u16 mlx5_core_max_vfs(struct mlx5_core_dev *dev)
 {
-	if (mlx5_core_is_ecpf_esw_manager(dev))
-		return MLX5_HOST_PF_MAX_VFS;
-	else
-		return pci_sriov_get_totalvfs(dev->pdev);
+	return dev->priv.sriov.max_vfs;
 }
 
 static inline int mlx5_get_gid_table_len(u16 param)
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6513b985c5e9..e3c154b573a2 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9711,7 +9711,7 @@ struct mlx5_ifc_host_params_context_bits {
 	u8         reserved_at_8[0x8];
 	u8         host_num_of_vfs[0x10];
 
-	u8         reserved_at_20[0x10];
+	u8         host_total_vfs[0x10];
 	u8         host_pci_bus[0x10];
 
 	u8         reserved_at_40[0x10];
-- 
cgit v1.2.3


From ca390799c2aa03632c294107fa7f647bcbdff428 Mon Sep 17 00:00:00 2001
From: Yuval Avnery <yuvalav@mellanox.com>
Date: Mon, 10 Jun 2019 23:38:23 +0000
Subject: net/mlx5: Change interrupt handler to call chain notifier

Multiple EQs may share the same IRQ in subsequent patches.

Instead of calling the IRQ handler directly, the EQ will register
to an atomic chain notifier.

The Linux built-in shared IRQ is not used because it forces the caller
to disable the IRQ and clear affinity before free_irq() can be called.

This patch is the first step in the separation of IRQ and EQ logic.

Signed-off-by: Yuval Avnery <yuvalav@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h             |   1 +
 drivers/infiniband/hw/mlx5/odp.c                 |  10 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c     | 138 +++++++++++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h |   9 +-
 include/linux/mlx5/eq.h                          |   3 +-
 5 files changed, 105 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 40eb8be482e4..a043af7ee366 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -920,6 +920,7 @@ struct mlx5_ib_lb_state {
 };
 
 struct mlx5_ib_pf_eq {
+	struct notifier_block irq_nb;
 	struct mlx5_ib_dev *dev;
 	struct mlx5_eq *core;
 	struct work_struct work;
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 91507a2e9290..ac40a4fd5598 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1488,9 +1488,11 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
 	mlx5_eq_update_ci(eq->core, cc, 1);
 }
 
-static irqreturn_t mlx5_ib_eq_pf_int(int irq, void *eq_ptr)
+static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
+			     void *data)
 {
-	struct mlx5_ib_pf_eq *eq = eq_ptr;
+	struct mlx5_ib_pf_eq *eq =
+		container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
 	unsigned long flags;
 
 	if (spin_trylock_irqsave(&eq->lock, flags)) {
@@ -1553,12 +1555,12 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 		goto err_mempool;
 	}
 
+	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
 	param = (struct mlx5_eq_param) {
 		.index = MLX5_EQ_PFAULT_IDX,
 		.mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
 		.nent = MLX5_IB_NUM_PF_EQE,
-		.context = eq,
-		.handler = mlx5_ib_eq_pf_int
+		.nb = &eq->irq_nb,
 	};
 	eq->core = mlx5_eq_create_generic(dev->mdev, "mlx5_ib_page_fault_eq", &param);
 	if (IS_ERR(eq->core)) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 28defeaca80a..590c0fefaa25 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -72,16 +72,16 @@ enum {
 static_assert(MLX5_EQ_POLLING_BUDGET <= MLX5_NUM_SPARE_EQE);
 
 struct mlx5_irq_info {
+	struct atomic_notifier_head nh;
 	cpumask_var_t mask;
 	char name[MLX5_MAX_IRQ_NAME];
-	void *context; /* dev_id provided to request_irq */
 };
 
 struct mlx5_eq_table {
 	struct list_head        comp_eqs_list;
-	struct mlx5_eq          pages_eq;
-	struct mlx5_eq	        cmd_eq;
-	struct mlx5_eq          async_eq;
+	struct mlx5_eq_async    pages_eq;
+	struct mlx5_eq_async    cmd_eq;
+	struct mlx5_eq_async    async_eq;
 
 	struct atomic_notifier_head nh[MLX5_EVENT_TYPE_MAX];
 
@@ -109,6 +109,31 @@ struct mlx5_eq_table {
 			       (1ull << MLX5_EVENT_TYPE_SRQ_LAST_WQE)	    | \
 			       (1ull << MLX5_EVENT_TYPE_SRQ_RQ_LIMIT))
 
+static struct mlx5_irq_info *mlx5_irq_get(struct mlx5_core_dev *dev, int vecidx)
+{
+	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
+
+	return &eq_table->irq_info[vecidx];
+}
+
+static int mlx5_irq_attach_nb(struct mlx5_irq_info *irq,
+			      struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&irq->nh, nb);
+}
+
+static int mlx5_irq_detach_nb(struct mlx5_irq_info *irq,
+			      struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&irq->nh, nb);
+}
+
+static irqreturn_t mlx5_irq_int_handler(int irq, void *nh)
+{
+	atomic_notifier_call_chain(nh, 0, NULL);
+	return IRQ_HANDLED;
+}
+
 static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn)
 {
 	u32 out[MLX5_ST_SZ_DW(destroy_eq_out)] = {0};
@@ -134,10 +159,13 @@ static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32 cqn)
 	return cq;
 }
 
-static irqreturn_t mlx5_eq_comp_int(int irq, void *eq_ptr)
+static int mlx5_eq_comp_int(struct notifier_block *nb,
+			    __always_unused unsigned long action,
+			    __always_unused void *data)
 {
-	struct mlx5_eq_comp *eq_comp = eq_ptr;
-	struct mlx5_eq *eq = eq_ptr;
+	struct mlx5_eq_comp *eq_comp =
+		container_of(nb, struct mlx5_eq_comp, irq_nb);
+	struct mlx5_eq *eq = &eq_comp->core;
 	struct mlx5_eqe *eqe;
 	int num_eqes = 0;
 	u32 cqn = -1;
@@ -175,7 +203,7 @@ out:
 	if (cqn != -1)
 		tasklet_schedule(&eq_comp->tasklet_ctx.task);
 
-	return IRQ_HANDLED;
+	return 0;
 }
 
 /* Some architectures don't latch interrupts when they are disabled, so using
@@ -189,16 +217,19 @@ u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq_comp *eq)
 
 	disable_irq(eq->core.irqn);
 	count_eqe = eq->core.cons_index;
-	mlx5_eq_comp_int(eq->core.irqn, eq);
+	mlx5_eq_comp_int(&eq->irq_nb, 0, NULL);
 	count_eqe = eq->core.cons_index - count_eqe;
 	enable_irq(eq->core.irqn);
 
 	return count_eqe;
 }
 
-static irqreturn_t mlx5_eq_async_int(int irq, void *eq_ptr)
+static int mlx5_eq_async_int(struct notifier_block *nb,
+			     unsigned long action, void *data)
 {
-	struct mlx5_eq *eq = eq_ptr;
+	struct mlx5_eq_async *eq_async =
+		container_of(nb, struct mlx5_eq_async, irq_nb);
+	struct mlx5_eq *eq = &eq_async->core;
 	struct mlx5_eq_table *eqt;
 	struct mlx5_core_dev *dev;
 	struct mlx5_eqe *eqe;
@@ -232,7 +263,7 @@ static irqreturn_t mlx5_eq_async_int(int irq, void *eq_ptr)
 out:
 	eq_update_ci(eq, 1);
 
-	return IRQ_HANDLED;
+	return 0;
 }
 
 static void init_eq_buf(struct mlx5_eq *eq)
@@ -254,6 +285,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	struct mlx5_cq_table *cq_table = &eq->cq_table;
 	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
 	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_irq_info *irq_info;
 	u8 vecidx = param->index;
 	__be64 *pas;
 	void *eqc;
@@ -261,9 +293,6 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	u32 *in;
 	int err;
 
-	if (eq_table->irq_info[vecidx].context)
-		return -EEXIST;
-
 	/* Init CQ table */
 	memset(cq_table, 0, sizeof(*cq_table));
 	spin_lock_init(&cq_table->lock);
@@ -306,24 +335,31 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	if (err)
 		goto err_in;
 
-	snprintf(eq_table->irq_info[vecidx].name, MLX5_MAX_IRQ_NAME, "%s@pci:%s",
-		 name, pci_name(dev->pdev));
-	eq_table->irq_info[vecidx].context = param->context;
+	irq_info = mlx5_irq_get(dev, vecidx);
+	ATOMIC_INIT_NOTIFIER_HEAD(&irq_info->nh);
+	snprintf(irq_info->name, MLX5_MAX_IRQ_NAME,
+		 "%s@pci:%s", name, pci_name(dev->pdev));
 
 	eq->vecidx = vecidx;
 	eq->eqn = MLX5_GET(create_eq_out, out, eq_number);
 	eq->irqn = pci_irq_vector(dev->pdev, vecidx);
 	eq->dev = dev;
 	eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET;
-	err = request_irq(eq->irqn, param->handler, 0,
-			  eq_table->irq_info[vecidx].name, param->context);
+	eq->irq_nb = param->nb;
+
+	err = request_irq(eq->irqn, mlx5_irq_int_handler, 0, irq_info->name,
+			  &irq_info->nh);
 	if (err)
 		goto err_eq;
 
-	err = mlx5_debug_eq_add(dev, eq);
+	err = mlx5_irq_attach_nb(irq_info, param->nb);
 	if (err)
 		goto err_irq;
 
+	err = mlx5_debug_eq_add(dev, eq);
+	if (err)
+		goto err_detach;
+
 	/* EQs are created in ARMED state
 	 */
 	eq_update_ci(eq, 1);
@@ -331,8 +367,11 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	kvfree(in);
 	return 0;
 
+err_detach:
+	mlx5_irq_detach_nb(irq_info, param->nb);
+
 err_irq:
-	free_irq(eq->irqn, eq);
+	free_irq(eq->irqn, &eq_table->irq_info[vecidx].nh);
 
 err_eq:
 	mlx5_cmd_destroy_eq(dev, eq->eqn);
@@ -355,9 +394,11 @@ static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 
 	mlx5_debug_eq_remove(dev, eq);
 
-	free_irq(eq->irqn, irq_info->context);
-	irq_info->context = NULL;
-
+	err = mlx5_irq_detach_nb(irq_info, eq->irq_nb);
+	if (err)
+		mlx5_core_warn(eq->dev, "eq failed to detach from irq. err %d",
+			       err);
+	free_irq(eq->irqn, &eq_table->irq_info[eq->vecidx].nh);
 	err = mlx5_cmd_destroy_eq(dev, eq->eqn);
 	if (err)
 		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
@@ -479,7 +520,7 @@ static int cq_err_event_notifier(struct notifier_block *nb,
 	/* type == MLX5_EVENT_TYPE_CQ_ERROR */
 
 	eqt = mlx5_nb_cof(nb, struct mlx5_eq_table, cq_err_nb);
-	eq  = &eqt->async_eq;
+	eq  = &eqt->async_eq.core;
 	eqe = data;
 
 	cqn = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
@@ -548,14 +589,14 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	MLX5_NB_INIT(&table->cq_err_nb, cq_err_event_notifier, CQ_ERROR);
 	mlx5_eq_notifier_register(dev, &table->cq_err_nb);
 
+	table->cmd_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
 		.index = MLX5_EQ_CMD_IDX,
 		.mask = 1ull << MLX5_EVENT_TYPE_CMD,
 		.nent = MLX5_NUM_CMD_EQE,
-		.context = &table->cmd_eq,
-		.handler = mlx5_eq_async_int,
+		.nb = &table->cmd_eq.irq_nb,
 	};
-	err = create_async_eq(dev, "mlx5_cmd_eq", &table->cmd_eq, &param);
+	err = create_async_eq(dev, "mlx5_cmd_eq", &table->cmd_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
 		goto err0;
@@ -563,27 +604,29 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 
 	mlx5_cmd_use_events(dev);
 
+	table->async_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
 		.index = MLX5_EQ_ASYNC_IDX,
 		.mask = gather_async_events_mask(dev),
 		.nent = MLX5_NUM_ASYNC_EQE,
-		.context = &table->async_eq,
-		.handler = mlx5_eq_async_int,
+		.nb = &table->async_eq.irq_nb,
 	};
-	err = create_async_eq(dev, "mlx5_async_eq", &table->async_eq, &param);
+	err = create_async_eq(dev, "mlx5_async_eq",
+			      &table->async_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
 		goto err1;
 	}
 
+	table->pages_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
 		.index = MLX5_EQ_PAGEREQ_IDX,
 		.mask =  1 << MLX5_EVENT_TYPE_PAGE_REQUEST,
 		.nent = /* TODO: sriov max_vf + */ 1,
-		.context = &table->pages_eq,
-		.handler = mlx5_eq_async_int,
+		.nb = &table->pages_eq.irq_nb,
 	};
-	err = create_async_eq(dev, "mlx5_pages_eq", &table->pages_eq, &param);
+	err = create_async_eq(dev, "mlx5_pages_eq",
+			      &table->pages_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
 		goto err2;
@@ -592,11 +635,11 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	return err;
 
 err2:
-	destroy_async_eq(dev, &table->async_eq);
+	destroy_async_eq(dev, &table->async_eq.core);
 
 err1:
 	mlx5_cmd_use_polling(dev);
-	destroy_async_eq(dev, &table->cmd_eq);
+	destroy_async_eq(dev, &table->cmd_eq.core);
 err0:
 	mlx5_eq_notifier_unregister(dev, &table->cq_err_nb);
 	return err;
@@ -607,19 +650,19 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev)
 	struct mlx5_eq_table *table = dev->priv.eq_table;
 	int err;
 
-	err = destroy_async_eq(dev, &table->pages_eq);
+	err = destroy_async_eq(dev, &table->pages_eq.core);
 	if (err)
 		mlx5_core_err(dev, "failed to destroy pages eq, err(%d)\n",
 			      err);
 
-	err = destroy_async_eq(dev, &table->async_eq);
+	err = destroy_async_eq(dev, &table->async_eq.core);
 	if (err)
 		mlx5_core_err(dev, "failed to destroy async eq, err(%d)\n",
 			      err);
 
 	mlx5_cmd_use_polling(dev);
 
-	err = destroy_async_eq(dev, &table->cmd_eq);
+	err = destroy_async_eq(dev, &table->cmd_eq.core);
 	if (err)
 		mlx5_core_err(dev, "failed to destroy command eq, err(%d)\n",
 			      err);
@@ -629,17 +672,17 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev)
 
 struct mlx5_eq *mlx5_get_async_eq(struct mlx5_core_dev *dev)
 {
-	return &dev->priv.eq_table->async_eq;
+	return &dev->priv.eq_table->async_eq.core;
 }
 
 void mlx5_eq_synchronize_async_irq(struct mlx5_core_dev *dev)
 {
-	synchronize_irq(dev->priv.eq_table->async_eq.irqn);
+	synchronize_irq(dev->priv.eq_table->async_eq.core.irqn);
 }
 
 void mlx5_eq_synchronize_cmd_irq(struct mlx5_core_dev *dev)
 {
-	synchronize_irq(dev->priv.eq_table->cmd_eq.irqn);
+	synchronize_irq(dev->priv.eq_table->cmd_eq.core.irqn);
 }
 
 /* Generic EQ API for mlx5_core consumers
@@ -837,12 +880,12 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 		irq_cpu_rmap_add(table->rmap, pci_irq_vector(dev->pdev, vecidx));
 #endif
 		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
+		eq->irq_nb.notifier_call = mlx5_eq_comp_int;
 		param = (struct mlx5_eq_param) {
 			.index = vecidx,
 			.mask = 0,
 			.nent = nent,
-			.context = &eq->core,
-			.handler = mlx5_eq_comp_int
+			.nb = &eq->irq_nb,
 		};
 		err = create_map_eq(dev, &eq->core, name, &param);
 		if (err) {
@@ -940,10 +983,7 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev)
 	mutex_lock(&table->lock); /* sync with create/destroy_async_eq */
 	max_eqs = table->num_comp_vectors + MLX5_EQ_VEC_COMP_BASE;
 	for (i = max_eqs - 1; i >= 0; i--) {
-		if (!table->irq_info[i].context)
-			continue;
-		free_irq(pci_irq_vector(dev->pdev, i), table->irq_info[i].context);
-		table->irq_info[i].context = NULL;
+		free_irq(pci_irq_vector(dev->pdev, i), &table->irq_info[i].nh);
 	}
 	mutex_unlock(&table->lock);
 	pci_free_irq_vectors(dev->pdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
index c0fb6d72b695..adbc228bd55d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
@@ -34,10 +34,17 @@ struct mlx5_eq {
 	u8                      eqn;
 	int                     nent;
 	struct mlx5_rsc_debug   *dbg;
+	struct notifier_block   *irq_nb; /* For destroy only */
+};
+
+struct mlx5_eq_async {
+	struct mlx5_eq          core;
+	struct notifier_block   irq_nb;
 };
 
 struct mlx5_eq_comp {
-	struct mlx5_eq          core; /* Must be first */
+	struct mlx5_eq          core;
+	struct notifier_block   irq_nb;
 	struct mlx5_eq_tasklet  tasklet_ctx;
 	struct list_head        list;
 };
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index 00045cc4ea11..7909f1ff197c 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -26,8 +26,7 @@ struct mlx5_eq_param {
 	u8             index;
 	int            nent;
 	u64            mask;
-	void          *context;
-	irq_handler_t  handler;
+	struct notifier_block *nb;
 };
 
 struct mlx5_eq *
-- 
cgit v1.2.3


From 24163189da487b4caa751eef4e945c9333aae441 Mon Sep 17 00:00:00 2001
From: Yuval Avnery <yuvalav@mellanox.com>
Date: Mon, 10 Jun 2019 23:38:25 +0000
Subject: net/mlx5: Separate IRQ request/free from EQ life cycle

Instead of requesting IRQ with eq creation, IRQs will be requested
before EQ table creation.
Instead of freeing the IRQs after EQ destroy, free IRQs after eq
table destroy.

Signed-off-by: Yuval Avnery <yuvalav@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/odp.c             |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 121 ++++++++++++++++++---------
 include/linux/mlx5/eq.h                      |   3 +-
 3 files changed, 84 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index ac40a4fd5598..7ce7c5bfe685 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1562,7 +1562,7 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 		.nent = MLX5_IB_NUM_PF_EQE,
 		.nb = &eq->irq_nb,
 	};
-	eq->core = mlx5_eq_create_generic(dev->mdev, "mlx5_ib_page_fault_eq", &param);
+	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
 	if (IS_ERR(eq->core)) {
 		err = PTR_ERR(eq->core);
 		goto err_wq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 590c0fefaa25..f187169cbe76 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -134,6 +134,64 @@ static irqreturn_t mlx5_irq_int_handler(int irq, void *nh)
 	return IRQ_HANDLED;
 }
 
+static void irq_set_name(char *name, int vecidx)
+{
+	switch (vecidx) {
+	case MLX5_EQ_CMD_IDX:
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_cmd_eq");
+		break;
+	case MLX5_EQ_ASYNC_IDX:
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async_eq");
+		break;
+	case MLX5_EQ_PAGEREQ_IDX:
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_pages_eq");
+		break;
+	case MLX5_EQ_PFAULT_IDX:
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_ib_page_fault_eq");
+		break;
+	default:
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d",
+			 vecidx - MLX5_EQ_VEC_COMP_BASE);
+		break;
+	}
+}
+
+static int request_irqs(struct mlx5_core_dev *dev, int nvec)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_eq_table *eq_table;
+	char name[MLX5_MAX_IRQ_NAME];
+	int err;
+	int i;
+
+	eq_table = priv->eq_table;
+	for (i = 0; i < nvec; i++) {
+		struct mlx5_irq_info *irq_info = &eq_table->irq_info[i];
+		int irqn = pci_irq_vector(dev->pdev, i);
+
+		irq_set_name(name, i);
+		ATOMIC_INIT_NOTIFIER_HEAD(&irq_info->nh);
+		snprintf(irq_info->name, MLX5_MAX_IRQ_NAME,
+			 "%s@pci:%s", name, pci_name(dev->pdev));
+		err = request_irq(irqn, mlx5_irq_int_handler, 0, irq_info->name,
+				  &irq_info->nh);
+		if (err) {
+			mlx5_core_err(dev, "Failed to request irq\n");
+			goto err_request_irq;
+		}
+	}
+	return 0;
+
+err_request_irq:
+	for (; i >= 0; i--) {
+		struct mlx5_irq_info *irq_info = &eq_table->irq_info[i];
+		int irqn = pci_irq_vector(dev->pdev, i);
+
+		free_irq(irqn, &irq_info->nh);
+	}
+	return  err;
+}
+
 static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn)
 {
 	u32 out[MLX5_ST_SZ_DW(destroy_eq_out)] = {0};
@@ -278,14 +336,12 @@ static void init_eq_buf(struct mlx5_eq *eq)
 }
 
 static int
-create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
+create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	      struct mlx5_eq_param *param)
 {
-	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
 	struct mlx5_cq_table *cq_table = &eq->cq_table;
 	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
 	struct mlx5_priv *priv = &dev->priv;
-	struct mlx5_irq_info *irq_info;
 	u8 vecidx = param->index;
 	__be64 *pas;
 	void *eqc;
@@ -335,11 +391,6 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	if (err)
 		goto err_in;
 
-	irq_info = mlx5_irq_get(dev, vecidx);
-	ATOMIC_INIT_NOTIFIER_HEAD(&irq_info->nh);
-	snprintf(irq_info->name, MLX5_MAX_IRQ_NAME,
-		 "%s@pci:%s", name, pci_name(dev->pdev));
-
 	eq->vecidx = vecidx;
 	eq->eqn = MLX5_GET(create_eq_out, out, eq_number);
 	eq->irqn = pci_irq_vector(dev->pdev, vecidx);
@@ -347,15 +398,10 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET;
 	eq->irq_nb = param->nb;
 
-	err = request_irq(eq->irqn, mlx5_irq_int_handler, 0, irq_info->name,
-			  &irq_info->nh);
+	err = mlx5_irq_attach_nb(mlx5_irq_get(dev, vecidx), param->nb);
 	if (err)
 		goto err_eq;
 
-	err = mlx5_irq_attach_nb(irq_info, param->nb);
-	if (err)
-		goto err_irq;
-
 	err = mlx5_debug_eq_add(dev, eq);
 	if (err)
 		goto err_detach;
@@ -368,10 +414,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, const char *name,
 	return 0;
 
 err_detach:
-	mlx5_irq_detach_nb(irq_info, param->nb);
-
-err_irq:
-	free_irq(eq->irqn, &eq_table->irq_info[vecidx].nh);
+	mlx5_irq_detach_nb(mlx5_irq_get(dev, vecidx), eq->irq_nb);
 
 err_eq:
 	mlx5_cmd_destroy_eq(dev, eq->eqn);
@@ -386,19 +429,14 @@ err_buf:
 
 static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 {
-	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
-	struct mlx5_irq_info *irq_info;
 	int err;
 
-	irq_info = &eq_table->irq_info[eq->vecidx];
-
 	mlx5_debug_eq_remove(dev, eq);
 
-	err = mlx5_irq_detach_nb(irq_info, eq->irq_nb);
+	err = mlx5_irq_detach_nb(mlx5_irq_get(dev, eq->vecidx), eq->irq_nb);
 	if (err)
 		mlx5_core_warn(eq->dev, "eq failed to detach from irq. err %d",
 			       err);
-	free_irq(eq->irqn, &eq_table->irq_info[eq->vecidx].nh);
 	err = mlx5_cmd_destroy_eq(dev, eq->eqn);
 	if (err)
 		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
@@ -479,7 +517,7 @@ void mlx5_eq_table_cleanup(struct mlx5_core_dev *dev)
 
 /* Async EQs */
 
-static int create_async_eq(struct mlx5_core_dev *dev, const char *name,
+static int create_async_eq(struct mlx5_core_dev *dev,
 			   struct mlx5_eq *eq, struct mlx5_eq_param *param)
 {
 	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
@@ -491,7 +529,7 @@ static int create_async_eq(struct mlx5_core_dev *dev, const char *name,
 		goto unlock;
 	}
 
-	err = create_map_eq(dev, eq, name, param);
+	err = create_map_eq(dev, eq, param);
 unlock:
 	mutex_unlock(&eq_table->lock);
 	return err;
@@ -596,7 +634,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		.nent = MLX5_NUM_CMD_EQE,
 		.nb = &table->cmd_eq.irq_nb,
 	};
-	err = create_async_eq(dev, "mlx5_cmd_eq", &table->cmd_eq.core, &param);
+	err = create_async_eq(dev, &table->cmd_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
 		goto err0;
@@ -611,8 +649,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		.nent = MLX5_NUM_ASYNC_EQE,
 		.nb = &table->async_eq.irq_nb,
 	};
-	err = create_async_eq(dev, "mlx5_async_eq",
-			      &table->async_eq.core, &param);
+	err = create_async_eq(dev, &table->async_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
 		goto err1;
@@ -625,8 +662,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		.nent = /* TODO: sriov max_vf + */ 1,
 		.nb = &table->pages_eq.irq_nb,
 	};
-	err = create_async_eq(dev, "mlx5_pages_eq",
-			      &table->pages_eq.core, &param);
+	err = create_async_eq(dev, &table->pages_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
 		goto err2;
@@ -689,7 +725,7 @@ void mlx5_eq_synchronize_cmd_irq(struct mlx5_core_dev *dev)
  * Needed For RDMA ODP EQ for now
  */
 struct mlx5_eq *
-mlx5_eq_create_generic(struct mlx5_core_dev *dev, const char *name,
+mlx5_eq_create_generic(struct mlx5_core_dev *dev,
 		       struct mlx5_eq_param *param)
 {
 	struct mlx5_eq *eq = kvzalloc(sizeof(*eq), GFP_KERNEL);
@@ -698,7 +734,7 @@ mlx5_eq_create_generic(struct mlx5_core_dev *dev, const char *name,
 	if (!eq)
 		return ERR_PTR(-ENOMEM);
 
-	err = create_async_eq(dev, name, eq, param);
+	err = create_async_eq(dev, eq, param);
 	if (err) {
 		kvfree(eq);
 		eq = ERR_PTR(err);
@@ -845,7 +881,6 @@ static void destroy_comp_eqs(struct mlx5_core_dev *dev)
 static int create_comp_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
-	char name[MLX5_MAX_IRQ_NAME];
 	struct mlx5_eq_comp *eq;
 	int ncomp_vec;
 	int nent;
@@ -879,7 +914,6 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 #ifdef CONFIG_RFS_ACCEL
 		irq_cpu_rmap_add(table->rmap, pci_irq_vector(dev->pdev, vecidx));
 #endif
-		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
 		eq->irq_nb.notifier_call = mlx5_eq_comp_int;
 		param = (struct mlx5_eq_param) {
 			.index = vecidx,
@@ -887,7 +921,7 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 			.nent = nent,
 			.nb = &eq->irq_nb,
 		};
-		err = create_map_eq(dev, &eq->core, name, &param);
+		err = create_map_eq(dev, &eq->core, &param);
 		if (err) {
 			kfree(eq);
 			goto clean;
@@ -1018,8 +1052,14 @@ static int alloc_irq_vectors(struct mlx5_core_dev *dev)
 
 	table->num_comp_vectors = nvec - MLX5_EQ_VEC_COMP_BASE;
 
+	err = request_irqs(dev, nvec);
+	if (err)
+		goto err_free_irqs;
+
 	return 0;
 
+err_free_irqs:
+	pci_free_irq_vectors(dev->pdev);
 err_free_irq_info:
 	kfree(table->irq_info);
 	return err;
@@ -1027,10 +1067,13 @@ err_free_irq_info:
 
 static void free_irq_vectors(struct mlx5_core_dev *dev)
 {
-	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_eq_table *table = dev->priv.eq_table;
+	int i;
 
+	for (i = 0; i < table->num_comp_vectors + MLX5_EQ_VEC_COMP_BASE; i++)
+		free_irq(pci_irq_vector(dev->pdev, i), &table->irq_info[i].nh);
 	pci_free_irq_vectors(dev->pdev);
-	kfree(priv->eq_table->irq_info);
+	kfree(table->irq_info);
 }
 
 int mlx5_eq_table_create(struct mlx5_core_dev *dev)
@@ -1039,7 +1082,7 @@ int mlx5_eq_table_create(struct mlx5_core_dev *dev)
 
 	err = alloc_irq_vectors(dev);
 	if (err) {
-		mlx5_core_err(dev, "alloc irq vectors failed\n");
+		mlx5_core_err(dev, "Failed to create IRQ vectors\n");
 		return err;
 	}
 
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index 7909f1ff197c..73ab658af764 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -30,8 +30,7 @@ struct mlx5_eq_param {
 };
 
 struct mlx5_eq *
-mlx5_eq_create_generic(struct mlx5_core_dev *dev, const char *name,
-		       struct mlx5_eq_param *param);
+mlx5_eq_create_generic(struct mlx5_core_dev *dev, struct mlx5_eq_param *param);
 int
 mlx5_eq_destroy_generic(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 
-- 
cgit v1.2.3


From 561aa15ad69e9d1e5a8bb277adb3209bf8091ecb Mon Sep 17 00:00:00 2001
From: Yuval Avnery <yuvalav@mellanox.com>
Date: Mon, 10 Jun 2019 23:38:27 +0000
Subject: net/mlx5: Separate IRQ data from EQ table data

The IRQ table should exist only for the mlx5_core_dev of a PF or VF.
The EQ table of a mediated device should hold a pointer to the IRQ table
of its parent PCI device.

Signed-off-by: Yuval Avnery <yuvalav@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       | 125 ++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  11 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   3 +
 include/linux/mlx5/driver.h                        |   3 +
 4 files changed, 98 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index f187169cbe76..cdfa35ec02fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -77,6 +77,14 @@ struct mlx5_irq_info {
 	char name[MLX5_MAX_IRQ_NAME];
 };
 
+struct mlx5_irq_table {
+	struct mlx5_irq_info *irq_info;
+	int nvec;
+#ifdef CONFIG_RFS_ACCEL
+	struct cpu_rmap *rmap;
+#endif
+};
+
 struct mlx5_eq_table {
 	struct list_head        comp_eqs_list;
 	struct mlx5_eq_async    pages_eq;
@@ -89,11 +97,8 @@ struct mlx5_eq_table {
 	struct mlx5_nb          cq_err_nb;
 
 	struct mutex            lock; /* sync async eqs creations */
-	int			num_comp_vectors;
-	struct mlx5_irq_info	*irq_info;
-#ifdef CONFIG_RFS_ACCEL
-	struct cpu_rmap         *rmap;
-#endif
+	int			num_comp_eqs;
+	struct mlx5_irq_table	*irq_table;
 };
 
 #define MLX5_ASYNC_EVENT_MASK ((1ull << MLX5_EVENT_TYPE_PATH_MIG)	    | \
@@ -109,11 +114,33 @@ struct mlx5_eq_table {
 			       (1ull << MLX5_EVENT_TYPE_SRQ_LAST_WQE)	    | \
 			       (1ull << MLX5_EVENT_TYPE_SRQ_RQ_LIMIT))
 
+int mlx5_irq_table_init(struct mlx5_core_dev *dev)
+{
+	struct mlx5_irq_table *irq_table;
+
+	irq_table = kvzalloc(sizeof(*irq_table), GFP_KERNEL);
+	if (!irq_table)
+		return -ENOMEM;
+
+	dev->priv.irq_table = irq_table;
+	return 0;
+}
+
+void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)
+{
+	kvfree(dev->priv.irq_table);
+}
+
+static int mlx5_irq_get_num_comp(struct mlx5_irq_table *table)
+{
+	return table->nvec - MLX5_EQ_VEC_COMP_BASE;
+}
+
 static struct mlx5_irq_info *mlx5_irq_get(struct mlx5_core_dev *dev, int vecidx)
 {
-	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
+	struct mlx5_irq_table *irq_table = dev->priv.irq_table;
 
-	return &eq_table->irq_info[vecidx];
+	return &irq_table->irq_info[vecidx];
 }
 
 static int mlx5_irq_attach_nb(struct mlx5_irq_info *irq,
@@ -158,15 +185,12 @@ static void irq_set_name(char *name, int vecidx)
 
 static int request_irqs(struct mlx5_core_dev *dev, int nvec)
 {
-	struct mlx5_priv *priv = &dev->priv;
-	struct mlx5_eq_table *eq_table;
 	char name[MLX5_MAX_IRQ_NAME];
 	int err;
 	int i;
 
-	eq_table = priv->eq_table;
 	for (i = 0; i < nvec; i++) {
-		struct mlx5_irq_info *irq_info = &eq_table->irq_info[i];
+		struct mlx5_irq_info *irq_info = mlx5_irq_get(dev, i);
 		int irqn = pci_irq_vector(dev->pdev, i);
 
 		irq_set_name(name, i);
@@ -184,7 +208,7 @@ static int request_irqs(struct mlx5_core_dev *dev, int nvec)
 
 err_request_irq:
 	for (; i >= 0; i--) {
-		struct mlx5_irq_info *irq_info = &eq_table->irq_info[i];
+		struct mlx5_irq_info *irq_info = mlx5_irq_get(dev, i);
 		int irqn = pci_irq_vector(dev->pdev, i);
 
 		free_irq(irqn, &irq_info->nh);
@@ -501,6 +525,7 @@ int mlx5_eq_table_init(struct mlx5_core_dev *dev)
 	for (i = 0; i < MLX5_EVENT_TYPE_MAX; i++)
 		ATOMIC_INIT_NOTIFIER_HEAD(&eq_table->nh[i]);
 
+	eq_table->irq_table = dev->priv.irq_table;
 	return 0;
 
 kvfree_eq_table:
@@ -796,10 +821,13 @@ EXPORT_SYMBOL(mlx5_eq_update_ci);
 
 static int set_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
 {
-	struct mlx5_priv *priv  = &mdev->priv;
 	int vecidx = MLX5_EQ_VEC_COMP_BASE + i;
-	int irq = pci_irq_vector(mdev->pdev, vecidx);
-	struct mlx5_irq_info *irq_info = &priv->eq_table->irq_info[vecidx];
+	struct mlx5_priv *priv  = &mdev->priv;
+	struct mlx5_irq_info *irq_info;
+	int irq;
+
+	irq_info = mlx5_irq_get(mdev, vecidx);
+	irq = pci_irq_vector(mdev->pdev, vecidx);
 
 	if (!zalloc_cpumask_var(&irq_info->mask, GFP_KERNEL)) {
 		mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
@@ -819,20 +847,22 @@ static int set_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
 static void clear_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
 {
 	int vecidx = MLX5_EQ_VEC_COMP_BASE + i;
-	struct mlx5_priv *priv  = &mdev->priv;
-	int irq = pci_irq_vector(mdev->pdev, vecidx);
-	struct mlx5_irq_info *irq_info = &priv->eq_table->irq_info[vecidx];
+	struct mlx5_irq_info *irq_info;
+	int irq;
 
+	irq_info = mlx5_irq_get(mdev, vecidx);
+	irq = pci_irq_vector(mdev->pdev, vecidx);
 	irq_set_affinity_hint(irq, NULL);
 	free_cpumask_var(irq_info->mask);
 }
 
 static int set_comp_irq_affinity_hints(struct mlx5_core_dev *mdev)
 {
+	int nvec = mlx5_irq_get_num_comp(mdev->priv.irq_table);
 	int err;
 	int i;
 
-	for (i = 0; i < mdev->priv.eq_table->num_comp_vectors; i++) {
+	for (i = 0; i < nvec; i++) {
 		err = set_comp_irq_affinity_hint(mdev, i);
 		if (err)
 			goto err_out;
@@ -849,9 +879,10 @@ err_out:
 
 static void clear_comp_irqs_affinity_hints(struct mlx5_core_dev *mdev)
 {
+	int nvec = mlx5_irq_get_num_comp(mdev->priv.irq_table);
 	int i;
 
-	for (i = 0; i < mdev->priv.eq_table->num_comp_vectors; i++)
+	for (i = 0; i < nvec; i++)
 		clear_comp_irq_affinity_hint(mdev, i);
 }
 
@@ -863,9 +894,9 @@ static void destroy_comp_eqs(struct mlx5_core_dev *dev)
 	clear_comp_irqs_affinity_hints(dev);
 
 #ifdef CONFIG_RFS_ACCEL
-	if (table->rmap) {
-		free_irq_cpu_rmap(table->rmap);
-		table->rmap = NULL;
+	if (table->irq_table->rmap) {
+		free_irq_cpu_rmap(table->irq_table->rmap);
+		table->irq_table->rmap = NULL;
 	}
 #endif
 	list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) {
@@ -882,20 +913,20 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
 	struct mlx5_eq_comp *eq;
-	int ncomp_vec;
+	int ncomp_eqs;
 	int nent;
 	int err;
 	int i;
 
 	INIT_LIST_HEAD(&table->comp_eqs_list);
-	ncomp_vec = table->num_comp_vectors;
+	ncomp_eqs = table->num_comp_eqs;
 	nent = MLX5_COMP_EQ_SIZE;
 #ifdef CONFIG_RFS_ACCEL
-	table->rmap = alloc_irq_cpu_rmap(ncomp_vec);
-	if (!table->rmap)
+	table->irq_table->rmap = alloc_irq_cpu_rmap(ncomp_eqs);
+	if (!table->irq_table->rmap)
 		return -ENOMEM;
 #endif
-	for (i = 0; i < ncomp_vec; i++) {
+	for (i = 0; i < ncomp_eqs; i++) {
 		int vecidx = i + MLX5_EQ_VEC_COMP_BASE;
 		struct mlx5_eq_param param = {};
 
@@ -912,7 +943,8 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 			     (unsigned long)&eq->tasklet_ctx);
 
 #ifdef CONFIG_RFS_ACCEL
-		irq_cpu_rmap_add(table->rmap, pci_irq_vector(dev->pdev, vecidx));
+		irq_cpu_rmap_add(table->irq_table->rmap,
+				 pci_irq_vector(dev->pdev, vecidx));
 #endif
 		eq->irq_nb.notifier_call = mlx5_eq_comp_int;
 		param = (struct mlx5_eq_param) {
@@ -967,22 +999,23 @@ EXPORT_SYMBOL(mlx5_vector2eqn);
 
 unsigned int mlx5_comp_vectors_count(struct mlx5_core_dev *dev)
 {
-	return dev->priv.eq_table->num_comp_vectors;
+	return dev->priv.eq_table->num_comp_eqs;
 }
 EXPORT_SYMBOL(mlx5_comp_vectors_count);
 
 struct cpumask *
 mlx5_comp_irq_get_affinity_mask(struct mlx5_core_dev *dev, int vector)
 {
-	/* TODO: consider irq_get_affinity_mask(irq) */
-	return dev->priv.eq_table->irq_info[vector + MLX5_EQ_VEC_COMP_BASE].mask;
+	int vecidx = vector + MLX5_EQ_VEC_COMP_BASE;
+
+	return dev->priv.eq_table->irq_table->irq_info[vecidx].mask;
 }
 EXPORT_SYMBOL(mlx5_comp_irq_get_affinity_mask);
 
 #ifdef CONFIG_RFS_ACCEL
 struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev)
 {
-	return dev->priv.eq_table->rmap;
+	return dev->priv.eq_table->irq_table->rmap;
 }
 #endif
 
@@ -1008,16 +1041,17 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev)
 	clear_comp_irqs_affinity_hints(dev);
 
 #ifdef CONFIG_RFS_ACCEL
-	if (table->rmap) {
-		free_irq_cpu_rmap(table->rmap);
-		table->rmap = NULL;
+	if (table->irq_table->rmap) {
+		free_irq_cpu_rmap(table->irq_table->rmap);
+		table->irq_table->rmap = NULL;
 	}
 #endif
 
 	mutex_lock(&table->lock); /* sync with create/destroy_async_eq */
-	max_eqs = table->num_comp_vectors + MLX5_EQ_VEC_COMP_BASE;
+	max_eqs = table->num_comp_eqs + MLX5_EQ_VEC_COMP_BASE;
 	for (i = max_eqs - 1; i >= 0; i--) {
-		free_irq(pci_irq_vector(dev->pdev, i), &table->irq_info[i].nh);
+		free_irq(pci_irq_vector(dev->pdev, i),
+			 &mlx5_irq_get(dev, i)->nh);
 	}
 	mutex_unlock(&table->lock);
 	pci_free_irq_vectors(dev->pdev);
@@ -1026,7 +1060,7 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev)
 static int alloc_irq_vectors(struct mlx5_core_dev *dev)
 {
 	struct mlx5_priv *priv = &dev->priv;
-	struct mlx5_eq_table *table = priv->eq_table;
+	struct mlx5_irq_table *table = priv->irq_table;
 	int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
 		      MLX5_CAP_GEN(dev, max_num_eqs) :
 		      1 << MLX5_CAP_GEN(dev, log_max_eq);
@@ -1050,7 +1084,7 @@ static int alloc_irq_vectors(struct mlx5_core_dev *dev)
 		goto err_free_irq_info;
 	}
 
-	table->num_comp_vectors = nvec - MLX5_EQ_VEC_COMP_BASE;
+	table->nvec = nvec;
 
 	err = request_irqs(dev, nvec);
 	if (err)
@@ -1067,17 +1101,19 @@ err_free_irq_info:
 
 static void free_irq_vectors(struct mlx5_core_dev *dev)
 {
-	struct mlx5_eq_table *table = dev->priv.eq_table;
+	struct mlx5_irq_table *table = dev->priv.irq_table;
 	int i;
 
-	for (i = 0; i < table->num_comp_vectors + MLX5_EQ_VEC_COMP_BASE; i++)
-		free_irq(pci_irq_vector(dev->pdev, i), &table->irq_info[i].nh);
+	for (i = 0; i < table->nvec; i++)
+		free_irq(pci_irq_vector(dev->pdev, i),
+			 &mlx5_irq_get(dev, i)->nh);
 	pci_free_irq_vectors(dev->pdev);
 	kfree(table->irq_info);
 }
 
 int mlx5_eq_table_create(struct mlx5_core_dev *dev)
 {
+	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
 	int err;
 
 	err = alloc_irq_vectors(dev);
@@ -1086,6 +1122,9 @@ int mlx5_eq_table_create(struct mlx5_core_dev *dev)
 		return err;
 	}
 
+	eq_table->num_comp_eqs =
+		mlx5_irq_get_num_comp(eq_table->irq_table);
+
 	err = create_async_eqs(dev);
 	if (err) {
 		mlx5_core_err(dev, "Failed to create async EQs\n");
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 720f65bfe6a9..be79dceea3c3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -804,10 +804,16 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 		goto err_devcom;
 	}
 
+	err = mlx5_irq_table_init(dev);
+	if (err) {
+		mlx5_core_err(dev, "failed to initialize irq table\n");
+		goto err_devcom;
+	}
+
 	err = mlx5_eq_table_init(dev);
 	if (err) {
 		mlx5_core_err(dev, "failed to initialize eq\n");
-		goto err_devcom;
+		goto err_irq_cleanup;
 	}
 
 	err = mlx5_events_init(dev);
@@ -883,6 +889,8 @@ err_events_cleanup:
 	mlx5_events_cleanup(dev);
 err_eq_cleanup:
 	mlx5_eq_table_cleanup(dev);
+err_irq_cleanup:
+	mlx5_irq_table_cleanup(dev);
 err_devcom:
 	mlx5_devcom_unregister_device(dev->priv.devcom);
 
@@ -905,6 +913,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_cq_debugfs_cleanup(dev);
 	mlx5_events_cleanup(dev);
 	mlx5_eq_table_cleanup(dev);
+	mlx5_irq_table_cleanup(dev);
 	mlx5_devcom_unregister_device(dev->priv.devcom);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 22e69d4813e4..907515f3bfbb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -153,6 +153,9 @@ int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam,
 void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev);
 void mlx5_lag_remove(struct mlx5_core_dev *dev);
 
+int mlx5_irq_table_init(struct mlx5_core_dev *dev);
+void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev);
+
 int mlx5_events_init(struct mlx5_core_dev *dev);
 void mlx5_events_cleanup(struct mlx5_core_dev *dev);
 void mlx5_events_start(struct mlx5_core_dev *dev);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 64155fe201ee..d8ab633406c2 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -492,6 +492,7 @@ struct mlx5_eswitch;
 struct mlx5_lag;
 struct mlx5_devcom;
 struct mlx5_eq_table;
+struct mlx5_irq_table;
 
 struct mlx5_rate_limit {
 	u32			rate;
@@ -521,6 +522,8 @@ struct mlx5_core_roce {
 };
 
 struct mlx5_priv {
+	/* IRQ table valid only for real pci devices PF or VF */
+	struct mlx5_irq_table   *irq_table;
 	struct mlx5_eq_table	*eq_table;
 
 	/* pages stuff */
-- 
cgit v1.2.3


From 81bfa206032a67f0700459a64a5493c246629604 Mon Sep 17 00:00:00 2001
From: Ariel Levkovich <lariel@mellanox.com>
Date: Mon, 10 Jun 2019 23:38:41 +0000
Subject: net/mlx5: Use a single IRQ for all async EQs

The patch modifies the IRQ allocation so that all async EQs are
assigned to the same IRQ resulting in more available IRQs for
completion EQs.

The changes use the support for IRQ sharing and the EQ polling budget
that were introduced in previous patches, so that when the shared
interrupt is triggered, the kernel serially calls the handler of each
of the sharing EQs with a certain budget of EQEs to poll, in order to
prevent starvation.

Signed-off-by: Ariel Levkovich <lariel@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/odp.c                  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c      | 19 ++++++------
 drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 38 +++++++++--------------
 include/linux/mlx5/eq.h                           | 14 ++-------
 4 files changed, 27 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 7ce7c5bfe685..693a0e225093 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1557,7 +1557,7 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 
 	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
 	param = (struct mlx5_eq_param) {
-		.index = MLX5_EQ_PFAULT_IDX,
+		.irq_index = 0,
 		.mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
 		.nent = MLX5_IB_NUM_PF_EQE,
 		.nb = &eq->irq_nb,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 0c72c122daef..0f5846a34928 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -250,7 +250,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	struct mlx5_cq_table *cq_table = &eq->cq_table;
 	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
 	struct mlx5_priv *priv = &dev->priv;
-	u8 vecidx = param->index;
+	u8 vecidx = param->irq_index;
 	__be64 *pas;
 	void *eqc;
 	int inlen;
@@ -435,8 +435,9 @@ static int create_async_eq(struct mlx5_core_dev *dev,
 	int err;
 
 	mutex_lock(&eq_table->lock);
-	if (param->index >= MLX5_EQ_MAX_ASYNC_EQS) {
-		err = -ENOSPC;
+	/* Async EQs must share irq index 0 */
+	if (param->irq_index != 0) {
+		err = -EINVAL;
 		goto unlock;
 	}
 
@@ -540,7 +541,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 
 	table->cmd_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
-		.index = MLX5_EQ_CMD_IDX,
+		.irq_index = 0,
 		.mask = 1ull << MLX5_EVENT_TYPE_CMD,
 		.nent = MLX5_NUM_CMD_EQE,
 		.nb = &table->cmd_eq.irq_nb,
@@ -555,7 +556,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 
 	table->async_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
-		.index = MLX5_EQ_ASYNC_IDX,
+		.irq_index = 0,
 		.mask = gather_async_events_mask(dev),
 		.nent = MLX5_NUM_ASYNC_EQE,
 		.nb = &table->async_eq.irq_nb,
@@ -568,7 +569,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 
 	table->pages_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
-		.index = MLX5_EQ_PAGEREQ_IDX,
+		.irq_index = 0,
 		.mask =  1 << MLX5_EVENT_TYPE_PAGE_REQUEST,
 		.nent = /* TODO: sriov max_vf + */ 1,
 		.nb = &table->pages_eq.irq_nb,
@@ -731,7 +732,7 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 	ncomp_eqs = table->num_comp_eqs;
 	nent = MLX5_COMP_EQ_SIZE;
 	for (i = 0; i < ncomp_eqs; i++) {
-		int vecidx = i + MLX5_EQ_VEC_COMP_BASE;
+		int vecidx = i + MLX5_IRQ_VEC_COMP_BASE;
 		struct mlx5_eq_param param = {};
 
 		eq = kzalloc(sizeof(*eq), GFP_KERNEL);
@@ -748,7 +749,7 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 
 		eq->irq_nb.notifier_call = mlx5_eq_comp_int;
 		param = (struct mlx5_eq_param) {
-			.index = vecidx,
+			.irq_index = vecidx,
 			.mask = 0,
 			.nent = nent,
 			.nb = &eq->irq_nb,
@@ -800,7 +801,7 @@ EXPORT_SYMBOL(mlx5_comp_vectors_count);
 struct cpumask *
 mlx5_comp_irq_get_affinity_mask(struct mlx5_core_dev *dev, int vector)
 {
-	int vecidx = vector + MLX5_EQ_VEC_COMP_BASE;
+	int vecidx = vector + MLX5_IRQ_VEC_COMP_BASE;
 
 	return mlx5_irq_get_affinity_mask(dev->priv.eq_table->irq_table,
 					  vecidx);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index fec861f4fefe..373981a659c7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -45,7 +45,7 @@ void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)
 
 int mlx5_irq_get_num_comp(struct mlx5_irq_table *table)
 {
-	return table->nvec - MLX5_EQ_VEC_COMP_BASE;
+	return table->nvec - MLX5_IRQ_VEC_COMP_BASE;
 }
 
 static struct mlx5_irq *mlx5_irq_get(struct mlx5_core_dev *dev, int vecidx)
@@ -81,24 +81,14 @@ static irqreturn_t mlx5_irq_int_handler(int irq, void *nh)
 
 static void irq_set_name(char *name, int vecidx)
 {
-	switch (vecidx) {
-	case MLX5_EQ_CMD_IDX:
-		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_cmd_eq");
-		break;
-	case MLX5_EQ_ASYNC_IDX:
-		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async_eq");
-		break;
-	case MLX5_EQ_PAGEREQ_IDX:
-		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_pages_eq");
-		break;
-	case MLX5_EQ_PFAULT_IDX:
-		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_ib_page_fault_eq");
-		break;
-	default:
-		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d",
-			 vecidx - MLX5_EQ_VEC_COMP_BASE);
-		break;
+	if (vecidx == 0) {
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async");
+		return;
 	}
+
+	snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d",
+		 vecidx - MLX5_IRQ_VEC_COMP_BASE);
+	return;
 }
 
 static int request_irqs(struct mlx5_core_dev *dev, int nvec)
@@ -159,7 +149,7 @@ static int irq_set_rmap(struct mlx5_core_dev *mdev)
 		goto err_out;
 	}
 
-	vecidx = MLX5_EQ_VEC_COMP_BASE;
+	vecidx = MLX5_IRQ_VEC_COMP_BASE;
 	for (; vecidx < irq_table->nvec; vecidx++) {
 		err = irq_cpu_rmap_add(irq_table->rmap,
 				       pci_irq_vector(mdev->pdev, vecidx));
@@ -182,7 +172,7 @@ err_out:
 
 static int set_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
 {
-	int vecidx = MLX5_EQ_VEC_COMP_BASE + i;
+	int vecidx = MLX5_IRQ_VEC_COMP_BASE + i;
 	struct mlx5_irq *irq;
 	int irqn;
 
@@ -205,7 +195,7 @@ static int set_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
 
 static void clear_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
 {
-	int vecidx = MLX5_EQ_VEC_COMP_BASE + i;
+	int vecidx = MLX5_IRQ_VEC_COMP_BASE + i;
 	struct mlx5_irq *irq;
 	int irqn;
 
@@ -279,16 +269,16 @@ int mlx5_irq_table_create(struct mlx5_core_dev *dev)
 	int err;
 
 	nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() +
-	       MLX5_EQ_VEC_COMP_BASE;
+	       MLX5_IRQ_VEC_COMP_BASE;
 	nvec = min_t(int, nvec, num_eqs);
-	if (nvec <= MLX5_EQ_VEC_COMP_BASE)
+	if (nvec <= MLX5_IRQ_VEC_COMP_BASE)
 		return -ENOMEM;
 
 	table->irq = kcalloc(nvec, sizeof(*table->irq), GFP_KERNEL);
 	if (!table->irq)
 		return -ENOMEM;
 
-	nvec = pci_alloc_irq_vectors(dev->pdev, MLX5_EQ_VEC_COMP_BASE + 1,
+	nvec = pci_alloc_irq_vectors(dev->pdev, MLX5_IRQ_VEC_COMP_BASE + 1,
 				     nvec, PCI_IRQ_MSIX);
 	if (nvec < 0) {
 		err = nvec;
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index 73ab658af764..4a94e04eff0a 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -4,17 +4,7 @@
 #ifndef MLX5_CORE_EQ_H
 #define MLX5_CORE_EQ_H
 
-enum {
-	MLX5_EQ_PAGEREQ_IDX        = 0,
-	MLX5_EQ_CMD_IDX            = 1,
-	MLX5_EQ_ASYNC_IDX          = 2,
-	/* reserved to be used by mlx5_core ulps (mlx5e/mlx5_ib) */
-	MLX5_EQ_PFAULT_IDX         = 3,
-	MLX5_EQ_MAX_ASYNC_EQS,
-	/* completion eqs vector indices start here */
-	MLX5_EQ_VEC_COMP_BASE = MLX5_EQ_MAX_ASYNC_EQS,
-};
-
+#define MLX5_IRQ_VEC_COMP_BASE 1
 #define MLX5_NUM_CMD_EQE   (32)
 #define MLX5_NUM_ASYNC_EQE (0x1000)
 #define MLX5_NUM_SPARE_EQE (0x80)
@@ -23,7 +13,7 @@ struct mlx5_eq;
 struct mlx5_core_dev;
 
 struct mlx5_eq_param {
-	u8             index;
+	u8             irq_index;
 	int            nent;
 	u64            mask;
 	struct notifier_block *nb;
-- 
cgit v1.2.3


From 1f8a7bee27e63d7c5287719049941e285e54d370 Mon Sep 17 00:00:00 2001
From: Yuval Avnery <yuvalav@mellanox.com>
Date: Mon, 10 Jun 2019 23:38:42 +0000
Subject: net/mlx5: Add EQ enable/disable API

Previously, an EQ joined the notifier chain on creation.
This forced the caller to be ready to handle events before creating
the EQ through the eq_create_generic interface.

To help the caller control when the created EQ is attached to the
IRQ, add an enable/disable API.

Signed-off-by: Yuval Avnery <yuvalav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/odp.c                 |   9 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c     | 105 ++++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h |   1 -
 include/linux/mlx5/eq.h                          |   5 +-
 4 files changed, 88 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 693a0e225093..12ccee1eb047 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1560,15 +1560,21 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 		.irq_index = 0,
 		.mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
 		.nent = MLX5_IB_NUM_PF_EQE,
-		.nb = &eq->irq_nb,
 	};
 	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
 	if (IS_ERR(eq->core)) {
 		err = PTR_ERR(eq->core);
 		goto err_wq;
 	}
+	err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
+	if (err) {
+		mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
+		goto err_eq;
+	}
 
 	return 0;
+err_eq:
+	mlx5_eq_destroy_generic(dev->mdev, eq->core);
 err_wq:
 	destroy_workqueue(eq->wq);
 err_mempool:
@@ -1581,6 +1587,7 @@ mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 {
 	int err;
 
+	mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
 	err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
 	cancel_work_sync(&eq->work);
 	destroy_workqueue(eq->wq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 0f5846a34928..58fff2f39b38 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -304,27 +304,14 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	eq->irqn = pci_irq_vector(dev->pdev, vecidx);
 	eq->dev = dev;
 	eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET;
-	eq->irq_nb = param->nb;
-
-	err = mlx5_irq_attach_nb(dev->priv.eq_table->irq_table, vecidx,
-				 param->nb);
-	if (err)
-		goto err_eq;
 
 	err = mlx5_debug_eq_add(dev, eq);
 	if (err)
-		goto err_detach;
-
-	/* EQs are created in ARMED state
-	 */
-	eq_update_ci(eq, 1);
+		goto err_eq;
 
 	kvfree(in);
 	return 0;
 
-err_detach:
-	mlx5_irq_detach_nb(dev->priv.eq_table->irq_table, vecidx, eq->irq_nb);
-
 err_eq:
 	mlx5_cmd_destroy_eq(dev, eq->eqn);
 
@@ -336,17 +323,49 @@ err_buf:
 	return err;
 }
 
+/**
+ * mlx5_eq_enable - Enable EQ for receiving EQEs
+ * @dev: Device which owns the eq
+ * @eq: EQ to enable
+ * @nb: notifier call block
+ * mlx5_eq_enable - must be called after EQ is created in device.
+ */
+int mlx5_eq_enable(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		   struct notifier_block *nb)
+{
+	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
+	int err;
+
+	err = mlx5_irq_attach_nb(eq_table->irq_table, eq->vecidx, nb);
+	if (!err)
+		eq_update_ci(eq, 1);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_eq_enable);
+
+/**
+ * mlx5_eq_disable - Disable EQ for receiving EQEs
+ * @dev: Device which owns the eq
+ * @eq: EQ to disable
+ * @nb: notifier call block
+ * mlx5_eq_disable - must be called before EQ is destroyed.
+ */
+void mlx5_eq_disable(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		     struct notifier_block *nb)
+{
+	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
+
+	mlx5_irq_detach_nb(eq_table->irq_table, eq->vecidx, nb);
+}
+EXPORT_SYMBOL(mlx5_eq_disable);
+
 static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 {
 	int err;
 
 	mlx5_debug_eq_remove(dev, eq);
 
-	err = mlx5_irq_detach_nb(dev->priv.eq_table->irq_table,
-				 eq->vecidx, eq->irq_nb);
-	if (err)
-		mlx5_core_warn(eq->dev, "eq failed to detach from irq. err %d",
-			       err);
 	err = mlx5_cmd_destroy_eq(dev, eq->eqn);
 	if (err)
 		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
@@ -544,14 +563,17 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		.irq_index = 0,
 		.mask = 1ull << MLX5_EVENT_TYPE_CMD,
 		.nent = MLX5_NUM_CMD_EQE,
-		.nb = &table->cmd_eq.irq_nb,
 	};
 	err = create_async_eq(dev, &table->cmd_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
 		goto err0;
 	}
-
+	err = mlx5_eq_enable(dev, &table->cmd_eq.core, &table->cmd_eq.irq_nb);
+	if (err) {
+		mlx5_core_warn(dev, "failed to enable cmd EQ %d\n", err);
+		goto err1;
+	}
 	mlx5_cmd_use_events(dev);
 
 	table->async_eq.irq_nb.notifier_call = mlx5_eq_async_int;
@@ -559,12 +581,17 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		.irq_index = 0,
 		.mask = gather_async_events_mask(dev),
 		.nent = MLX5_NUM_ASYNC_EQE,
-		.nb = &table->async_eq.irq_nb,
 	};
 	err = create_async_eq(dev, &table->async_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
-		goto err1;
+		goto err2;
+	}
+	err = mlx5_eq_enable(dev, &table->async_eq.core,
+			     &table->async_eq.irq_nb);
+	if (err) {
+		mlx5_core_warn(dev, "failed to enable async EQ %d\n", err);
+		goto err3;
 	}
 
 	table->pages_eq.irq_nb.notifier_call = mlx5_eq_async_int;
@@ -572,21 +599,31 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		.irq_index = 0,
 		.mask =  1 << MLX5_EVENT_TYPE_PAGE_REQUEST,
 		.nent = /* TODO: sriov max_vf + */ 1,
-		.nb = &table->pages_eq.irq_nb,
 	};
 	err = create_async_eq(dev, &table->pages_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
-		goto err2;
+		goto err4;
+	}
+	err = mlx5_eq_enable(dev, &table->pages_eq.core,
+			     &table->pages_eq.irq_nb);
+	if (err) {
+		mlx5_core_warn(dev, "failed to enable pages EQ %d\n", err);
+		goto err5;
 	}
 
 	return err;
 
-err2:
+err5:
+	destroy_async_eq(dev, &table->pages_eq.core);
+err4:
+	mlx5_eq_disable(dev, &table->async_eq.core, &table->async_eq.irq_nb);
+err3:
 	destroy_async_eq(dev, &table->async_eq.core);
-
-err1:
+err2:
 	mlx5_cmd_use_polling(dev);
+	mlx5_eq_disable(dev, &table->cmd_eq.core, &table->cmd_eq.irq_nb);
+err1:
 	destroy_async_eq(dev, &table->cmd_eq.core);
 err0:
 	mlx5_eq_notifier_unregister(dev, &table->cq_err_nb);
@@ -598,11 +635,13 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev)
 	struct mlx5_eq_table *table = dev->priv.eq_table;
 	int err;
 
+	mlx5_eq_disable(dev, &table->pages_eq.core, &table->pages_eq.irq_nb);
 	err = destroy_async_eq(dev, &table->pages_eq.core);
 	if (err)
 		mlx5_core_err(dev, "failed to destroy pages eq, err(%d)\n",
 			      err);
 
+	mlx5_eq_disable(dev, &table->async_eq.core, &table->async_eq.irq_nb);
 	err = destroy_async_eq(dev, &table->async_eq.core);
 	if (err)
 		mlx5_core_err(dev, "failed to destroy async eq, err(%d)\n",
@@ -610,6 +649,7 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev)
 
 	mlx5_cmd_use_polling(dev);
 
+	mlx5_eq_disable(dev, &table->cmd_eq.core, &table->cmd_eq.irq_nb);
 	err = destroy_async_eq(dev, &table->cmd_eq.core);
 	if (err)
 		mlx5_core_err(dev, "failed to destroy command eq, err(%d)\n",
@@ -711,6 +751,7 @@ static void destroy_comp_eqs(struct mlx5_core_dev *dev)
 
 	list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) {
 		list_del(&eq->list);
+		mlx5_eq_disable(dev, &eq->core, &eq->irq_nb);
 		if (destroy_unmap_eq(dev, &eq->core))
 			mlx5_core_warn(dev, "failed to destroy comp EQ 0x%x\n",
 				       eq->core.eqn);
@@ -752,13 +793,19 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 			.irq_index = vecidx,
 			.mask = 0,
 			.nent = nent,
-			.nb = &eq->irq_nb,
 		};
 		err = create_map_eq(dev, &eq->core, &param);
 		if (err) {
 			kfree(eq);
 			goto clean;
 		}
+		err = mlx5_eq_enable(dev, &eq->core, &eq->irq_nb);
+		if (err) {
+			destroy_unmap_eq(dev, &eq->core);
+			kfree(eq);
+			goto clean;
+		}
+
 		mlx5_core_dbg(dev, "allocated completion EQN %d\n", eq->core.eqn);
 		/* add tail, to keep the list ordered, for mlx5_vector2eqn to work */
 		list_add_tail(&eq->list, &table->comp_eqs_list);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
index 3836c39b2900..24bd991a727e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
@@ -33,7 +33,6 @@ struct mlx5_eq {
 	u8                      eqn;
 	int                     nent;
 	struct mlx5_rsc_debug   *dbg;
-	struct notifier_block   *irq_nb; /* For destroy only */
 };
 
 struct mlx5_eq_async {
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index 4a94e04eff0a..70e16dcfb4c4 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -16,13 +16,16 @@ struct mlx5_eq_param {
 	u8             irq_index;
 	int            nent;
 	u64            mask;
-	struct notifier_block *nb;
 };
 
 struct mlx5_eq *
 mlx5_eq_create_generic(struct mlx5_core_dev *dev, struct mlx5_eq_param *param);
 int
 mlx5_eq_destroy_generic(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
+int mlx5_eq_enable(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		   struct notifier_block *nb);
+void mlx5_eq_disable(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		     struct notifier_block *nb);
 
 struct mlx5_eqe *mlx5_eq_get_eqe(struct mlx5_eq *eq, u32 cc);
 void mlx5_eq_update_ci(struct mlx5_eq *eq, u32 cc, bool arm);
-- 
cgit v1.2.3


From b25bbc2f24dcab9cd186ef4003c39bf51ad0454c Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Thu, 28 Jun 2018 15:05:58 +0300
Subject: net/mlx5: Add Vendor Specific Capability access gateway

The Vendor Specific Capability (VSC) is used to activate a gateway
interfacing with the device. The gateway is used to read or write
device configurations, which are organized in different domains (spaces).
A configuration access may result in multiple actions, reads, writes.

Example usages are accessing the Crspace domain to read the crspace or
locking a device semaphore using the Semaphore domain.

The configuration access uses pci_cfg_access to prevent parallel access to
the VSC space by the driver and userspace calls.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   3 +-
 .../net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c  | 286 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h  |  24 ++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   3 +
 include/linux/mlx5/driver.h                        |   1 +
 5 files changed, 316 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 9006fda6bd11..8e07354faea1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -15,7 +15,8 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		health.o mcg.o cq.o alloc.o qp.o port.o mr.o pd.o \
 		transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
 		fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
-		lib/devcom.o diag/fs_tracepoint.o diag/fw_tracer.o devlink.o
+		lib/devcom.o lib/pci_vsc.o diag/fs_tracepoint.o \
+		diag/fw_tracer.o devlink.o
 
 #
 # Netdev basic
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c
new file mode 100644
index 000000000000..a27b0119b3d6
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies */
+
+#include <linux/pci.h>
+#include "mlx5_core.h"
+#include "pci_vsc.h"
+
+#define MLX5_EXTRACT_C(source, offset, size)	\
+	((((u32)(source)) >> (offset)) & MLX5_ONES32(size))
+#define MLX5_EXTRACT(src, start, len)		\
+	(((len) == 32) ? (src) : MLX5_EXTRACT_C(src, start, len))
+#define MLX5_ONES32(size)			\
+	((size) ? (0xffffffff >> (32 - (size))) : 0)
+#define MLX5_MASK32(offset, size)		\
+	(MLX5_ONES32(size) << (offset))
+#define MLX5_MERGE_C(rsrc1, rsrc2, start, len)  \
+	((((rsrc2) << (start)) & (MLX5_MASK32((start), (len)))) | \
+	((rsrc1) & (~MLX5_MASK32((start), (len)))))
+#define MLX5_MERGE(rsrc1, rsrc2, start, len)	\
+	(((len) == 32) ? (rsrc2) : MLX5_MERGE_C(rsrc1, rsrc2, start, len))
+#define vsc_read(dev, offset, val) \
+	pci_read_config_dword((dev)->pdev, (dev)->vsc_addr + (offset), (val))
+#define vsc_write(dev, offset, val) \
+	pci_write_config_dword((dev)->pdev, (dev)->vsc_addr + (offset), (val))
+#define VSC_MAX_RETRIES 2048
+
+enum mlx5_vsc_state {
+	MLX5_VSC_UNLOCK,
+	MLX5_VSC_LOCK,
+};
+
+enum {
+	VSC_CTRL_OFFSET = 0x4,
+	VSC_COUNTER_OFFSET = 0x8,
+	VSC_SEMAPHORE_OFFSET = 0xc,
+	VSC_ADDR_OFFSET = 0x10,
+	VSC_DATA_OFFSET = 0x14,
+
+	VSC_FLAG_BIT_OFFS = 31,
+	VSC_FLAG_BIT_LEN = 1,
+
+	VSC_SYND_BIT_OFFS = 30,
+	VSC_SYND_BIT_LEN = 1,
+
+	VSC_ADDR_BIT_OFFS = 0,
+	VSC_ADDR_BIT_LEN = 30,
+
+	VSC_SPACE_BIT_OFFS = 0,
+	VSC_SPACE_BIT_LEN = 16,
+
+	VSC_SIZE_VLD_BIT_OFFS = 28,
+	VSC_SIZE_VLD_BIT_LEN = 1,
+
+	VSC_STATUS_BIT_OFFS = 29,
+	VSC_STATUS_BIT_LEN = 3,
+};
+
+void mlx5_pci_vsc_init(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_core_is_pf(dev))
+		return;
+
+	dev->vsc_addr = pci_find_capability(dev->pdev,
+					    PCI_CAP_ID_VNDR);
+	if (!dev->vsc_addr)
+		mlx5_core_warn(dev, "Failed to get valid vendor specific ID\n");
+}
+
+int mlx5_vsc_gw_lock(struct mlx5_core_dev *dev)
+{
+	u32 counter = 0;
+	int retries = 0;
+	u32 lock_val;
+	int ret;
+
+	pci_cfg_access_lock(dev->pdev);
+	do {
+		if (retries > VSC_MAX_RETRIES) {
+			ret = -EBUSY;
+			goto pci_unlock;
+		}
+
+		/* Check if semaphore is already locked */
+		ret = vsc_read(dev, VSC_SEMAPHORE_OFFSET, &lock_val);
+		if (ret)
+			goto pci_unlock;
+
+		if (lock_val) {
+			retries++;
+			usleep_range(1000, 2000);
+			continue;
+		}
+
+		/* Read and write counter value, if written value is
+		 * the same, semaphore was acquired successfully.
+		 */
+		ret = vsc_read(dev, VSC_COUNTER_OFFSET, &counter);
+		if (ret)
+			goto pci_unlock;
+
+		ret = vsc_write(dev, VSC_SEMAPHORE_OFFSET, counter);
+		if (ret)
+			goto pci_unlock;
+
+		ret = vsc_read(dev, VSC_SEMAPHORE_OFFSET, &lock_val);
+		if (ret)
+			goto pci_unlock;
+
+		retries++;
+	} while (counter != lock_val);
+
+	return 0;
+
+pci_unlock:
+	pci_cfg_access_unlock(dev->pdev);
+	return ret;
+}
+
+int mlx5_vsc_gw_unlock(struct mlx5_core_dev *dev)
+{
+	int ret;
+
+	ret = vsc_write(dev, VSC_SEMAPHORE_OFFSET, MLX5_VSC_UNLOCK);
+	pci_cfg_access_unlock(dev->pdev);
+	return ret;
+}
+
+int mlx5_vsc_gw_set_space(struct mlx5_core_dev *dev, u16 space,
+			  u32 *ret_space_size)
+{
+	int ret;
+	u32 val = 0;
+
+	if (!mlx5_vsc_accessible(dev))
+		return -EINVAL;
+
+	if (ret_space_size)
+		*ret_space_size = 0;
+
+	/* Read the current control register value */
+	ret = vsc_read(dev, VSC_CTRL_OFFSET, &val);
+	if (ret)
+		goto out;
+
+	/* Set the requested space in the control register */
+	val = MLX5_MERGE(val, space, VSC_SPACE_BIT_OFFS, VSC_SPACE_BIT_LEN);
+	ret = vsc_write(dev, VSC_CTRL_OFFSET, val);
+	if (ret)
+		goto out;
+
+	/* Read back the control register to check the status field */
+	ret = vsc_read(dev, VSC_CTRL_OFFSET, &val);
+	if (ret)
+		goto out;
+
+	if (MLX5_EXTRACT(val, VSC_STATUS_BIT_OFFS, VSC_STATUS_BIT_LEN) == 0)
+		return -EINVAL;
+
+	/* Get space max address if indicated by size valid bit */
+	if (ret_space_size &&
+	    MLX5_EXTRACT(val, VSC_SIZE_VLD_BIT_OFFS, VSC_SIZE_VLD_BIT_LEN)) {
+		ret = vsc_read(dev, VSC_ADDR_OFFSET, &val);
+		if (ret) {
+			mlx5_core_warn(dev, "Failed to get max space size\n");
+			goto out;
+		}
+		*ret_space_size = MLX5_EXTRACT(val, VSC_ADDR_BIT_OFFS,
+					       VSC_ADDR_BIT_LEN);
+	}
+	return 0;
+
+out:
+	return ret;
+}
+
+static int mlx5_vsc_wait_on_flag(struct mlx5_core_dev *dev, u8 expected_val)
+{
+	int retries = 0;
+	u32 flag;
+	int ret;
+
+	do {
+		if (retries > VSC_MAX_RETRIES)
+			return -EBUSY;
+
+		ret = vsc_read(dev, VSC_ADDR_OFFSET, &flag);
+		if (ret)
+			return ret;
+		flag = MLX5_EXTRACT(flag, VSC_FLAG_BIT_OFFS, VSC_FLAG_BIT_LEN);
+		retries++;
+
+		if ((retries & 0xf) == 0)
+			usleep_range(1000, 2000);
+
+	} while (flag != expected_val);
+
+	return 0;
+}
+
+static int mlx5_vsc_gw_write(struct mlx5_core_dev *dev, unsigned int address,
+			     u32 data)
+{
+	int ret;
+
+	if (MLX5_EXTRACT(address, VSC_SYND_BIT_OFFS,
+			 VSC_FLAG_BIT_LEN + VSC_SYND_BIT_LEN))
+		return -EINVAL;
+
+	/* Set flag to 0x1 */
+	address = MLX5_MERGE(address, 1, VSC_FLAG_BIT_OFFS, 1);
+	ret = vsc_write(dev, VSC_DATA_OFFSET, data);
+	if (ret)
+		goto out;
+
+	ret = vsc_write(dev, VSC_ADDR_OFFSET, address);
+	if (ret)
+		goto out;
+
+	/* Wait for the flag to be cleared */
+	ret = mlx5_vsc_wait_on_flag(dev, 0);
+
+out:
+	return ret;
+}
+
+static int mlx5_vsc_gw_read(struct mlx5_core_dev *dev, unsigned int address,
+			    u32 *data)
+{
+	int ret;
+
+	if (MLX5_EXTRACT(address, VSC_SYND_BIT_OFFS,
+			 VSC_FLAG_BIT_LEN + VSC_SYND_BIT_LEN))
+		return -EINVAL;
+
+	ret = vsc_write(dev, VSC_ADDR_OFFSET, address);
+	if (ret)
+		goto out;
+
+	ret = mlx5_vsc_wait_on_flag(dev, 1);
+	if (ret)
+		goto out;
+
+	ret = vsc_read(dev, VSC_DATA_OFFSET, data);
+out:
+	return ret;
+}
+
+static int mlx5_vsc_gw_read_fast(struct mlx5_core_dev *dev,
+				 unsigned int read_addr,
+				 unsigned int *next_read_addr,
+				 u32 *data)
+{
+	int ret;
+
+	ret = mlx5_vsc_gw_read(dev, read_addr, data);
+	if (ret)
+		goto out;
+
+	ret = vsc_read(dev, VSC_ADDR_OFFSET, next_read_addr);
+	if (ret)
+		goto out;
+
+	*next_read_addr = MLX5_EXTRACT(*next_read_addr, VSC_ADDR_BIT_OFFS,
+				       VSC_ADDR_BIT_LEN);
+
+	if (*next_read_addr <= read_addr)
+		ret = -EINVAL;
+out:
+	return ret;
+}
+
+int mlx5_vsc_gw_read_block_fast(struct mlx5_core_dev *dev, u32 *data,
+				int length)
+{
+	unsigned int next_read_addr = 0;
+	unsigned int read_addr = 0;
+
+	while (read_addr < length) {
+		if (mlx5_vsc_gw_read_fast(dev, read_addr, &next_read_addr,
+					  &data[(read_addr >> 2)]))
+			return read_addr;
+
+		read_addr = next_read_addr;
+	}
+	return length;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h
new file mode 100644
index 000000000000..28ea6bfa439f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies */
+
+#ifndef __MLX5_PCI_VSC_H__
+#define __MLX5_PCI_VSC_H__
+
+enum {
+	MLX5_VSC_SPACE_SCAN_CRSPACE = 0x7,
+};
+
+void mlx5_pci_vsc_init(struct mlx5_core_dev *dev);
+int mlx5_vsc_gw_lock(struct mlx5_core_dev *dev);
+int mlx5_vsc_gw_unlock(struct mlx5_core_dev *dev);
+int mlx5_vsc_gw_set_space(struct mlx5_core_dev *dev, u16 space,
+			  u32 *ret_space_size);
+int mlx5_vsc_gw_read_block_fast(struct mlx5_core_dev *dev, u32 *data,
+				int length);
+
+static inline bool mlx5_vsc_accessible(struct mlx5_core_dev *dev)
+{
+	return !!dev->vsc_addr;
+}
+
+#endif /* __MLX5_PCI_VSC_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 5ea141893b99..3adc09a1a312 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -66,6 +66,7 @@
 #include "lib/vxlan.h"
 #include "lib/geneve.h"
 #include "lib/devcom.h"
+#include "lib/pci_vsc.h"
 #include "diag/fw_tracer.h"
 #include "ecpf.h"
 
@@ -763,6 +764,8 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev,
 		goto err_clr_master;
 	}
 
+	mlx5_pci_vsc_init(dev);
+
 	return 0;
 
 err_clr_master:
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3a810bf043fe..f732445bcbdb 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -693,6 +693,7 @@ struct mlx5_core_dev {
 	struct mlx5_clock        clock;
 	struct mlx5_ib_clock_info  *clock_info;
 	struct mlx5_fw_tracer   *tracer;
+	u32                      vsc_addr;
 };
 
 struct mlx5_db {
-- 
cgit v1.2.3


From 8b9d8baae1de7400f19058020ee8f0f27d436687 Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Tue, 17 Jul 2018 11:18:26 +0300
Subject: net/mlx5: Add Crdump support

Crdump allows the driver to retrieve a dump of the FW PCI crspace.
This is useful in case of catastrophic issues which may require FW
reset. The crspace dump can be used for later debug.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 .../net/ethernet/mellanox/mlx5/core/diag/crdump.c  | 106 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   5 +
 include/linux/mlx5/driver.h                        |   1 +
 5 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8e07354faea1..5fe2bf916c06 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -16,7 +16,7 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
 		fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
 		lib/devcom.o lib/pci_vsc.o diag/fs_tracepoint.o \
-		diag/fw_tracer.o devlink.o
+		diag/fw_tracer.o diag/crdump.o devlink.o
 
 #
 # Netdev basic
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
new file mode 100644
index 000000000000..dfb34172c69b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies */
+
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+#include "lib/pci_vsc.h"
+#include "lib/mlx5.h"
+
+#define BAD_ACCESS			0xBADACCE5
+#define MLX5_PROTECTED_CR_SCAN_CRSPACE	0x7
+
+static bool mlx5_crdump_enabled(struct mlx5_core_dev *dev)
+{
+	return !!dev->priv.health.crdump_size;
+}
+
+static int mlx5_crdump_fill(struct mlx5_core_dev *dev, u32 *cr_data)
+{
+	u32 crdump_size = dev->priv.health.crdump_size;
+	int i, ret;
+
+	for (i = 0; i < (crdump_size / 4); i++)
+		cr_data[i] = BAD_ACCESS;
+
+	ret = mlx5_vsc_gw_read_block_fast(dev, cr_data, crdump_size);
+	if (ret <= 0) {
+		if (ret == 0)
+			return -EIO;
+		return ret;
+	}
+
+	if (crdump_size != ret) {
+		mlx5_core_warn(dev, "failed to read full dump, read %d out of %u\n",
+			       ret, crdump_size);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data)
+{
+	int ret;
+
+	if (!mlx5_crdump_enabled(dev))
+		return -ENODEV;
+
+	ret = mlx5_vsc_gw_lock(dev);
+	if (ret) {
+		mlx5_core_warn(dev, "crdump: failed to lock vsc gw err %d\n",
+			       ret);
+		return ret;
+	}
+
+	ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL);
+	if (ret)
+		goto unlock;
+
+	ret = mlx5_crdump_fill(dev, cr_data);
+
+unlock:
+	mlx5_vsc_gw_unlock(dev);
+	return ret;
+}
+
+int mlx5_crdump_enable(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	u32 space_size;
+	int ret;
+
+	if (!mlx5_core_is_pf(dev) || !mlx5_vsc_accessible(dev) ||
+	    mlx5_crdump_enabled(dev))
+		return 0;
+
+	ret = mlx5_vsc_gw_lock(dev);
+	if (ret)
+		return ret;
+
+	/* Check if space is supported and get space size */
+	ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE,
+				    &space_size);
+	if (ret) {
+		/* Unlock and mask error since space is not supported */
+		mlx5_vsc_gw_unlock(dev);
+		return 0;
+	}
+
+	if (!space_size) {
+		mlx5_core_warn(dev, "Invalid Crspace size, zero\n");
+		mlx5_vsc_gw_unlock(dev);
+		return -EINVAL;
+	}
+
+	ret = mlx5_vsc_gw_unlock(dev);
+	if (ret)
+		return ret;
+
+	priv->health.crdump_size = space_size;
+	return 0;
+}
+
+void mlx5_crdump_disable(struct mlx5_core_dev *dev)
+{
+	dev->priv.health.crdump_size = 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index 397a2847867a..d918e44491f4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -41,6 +41,9 @@ int  mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count);
 void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count);
 int  mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index);
 void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index);
+int mlx5_crdump_enable(struct mlx5_core_dev *dev);
+void mlx5_crdump_disable(struct mlx5_core_dev *dev);
+int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data);
 
 /* TODO move to lib/events.h */
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 3adc09a1a312..c70e97071b87 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1313,6 +1313,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		goto clean_load;
 
+	err = mlx5_crdump_enable(dev);
+	if (err)
+		dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err);
+
 	pci_save_state(pdev);
 	return 0;
 
@@ -1334,6 +1338,7 @@ static void remove_one(struct pci_dev *pdev)
 	struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
 	struct devlink *devlink = priv_to_devlink(dev);
 
+	mlx5_crdump_disable(dev);
 	mlx5_devlink_unregister(devlink);
 	mlx5_unregister_device(dev);
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f732445bcbdb..4ae533b3da07 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -435,6 +435,7 @@ struct mlx5_core_health {
 	u32				prev;
 	int				miss_counter;
 	bool				sick;
+	u32				crdump_size;
 	/* wq spinlock to synchronize draining */
 	spinlock_t			wq_lock;
 	struct workqueue_struct	       *wq;
-- 
cgit v1.2.3


From 63cbc552eebf08818af2025aef4589a48ef849c0 Mon Sep 17 00:00:00 2001
From: Feras Daoud <ferasda@mellanox.com>
Date: Mon, 12 Nov 2018 15:23:02 +0200
Subject: net/mlx5: Handle SW reset of FW in error flow

New mlx5 adapters allow the driver to reset the FW in the event of an
error; this action is called "SW Reset". When an SW reset is issued on any
PF, all PFs enter reset state, which is a recoverable condition. The
existing recovery flow was designed to allow the recovery of a VF after
a PF driver reload. This patch adds the sw reset to the NIC states
as a preparation for sw reset handling.

When a software reset is issued the following occurs:
1. The NIC interface mode is set to 7 while the reset is in progress.
2. Once the reset completes the NIC interface mode is set to 1.

Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Daniel Jurgens <danielj@mellanox.com>
Reviewed-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_selftest.c  |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/health.c   | 105 +++++++++------------
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   2 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   2 +-
 include/linux/mlx5/driver.h                        |   2 +-
 5 files changed, 48 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
index 4382ef85488c..840ec945ccba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
@@ -64,7 +64,7 @@ static int mlx5e_test_health_info(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_health *health = &priv->mdev->priv.health;
 
-	return health->sick ? 1 : 0;
+	return health->fatal_error ? 1 : 0;
 }
 
 static int mlx5e_test_link_state(struct mlx5e_priv *priv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index a2656f4008d9..737e6d550775 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -62,12 +62,18 @@ enum {
 
 enum {
 	MLX5_DROP_NEW_HEALTH_WORK,
-	MLX5_DROP_NEW_RECOVERY_WORK,
+};
+
+enum  {
+	MLX5_SENSOR_NO_ERR		= 0,
+	MLX5_SENSOR_PCI_COMM_ERR	= 1,
+	MLX5_SENSOR_NIC_DISABLED	= 2,
+	MLX5_SENSOR_NIC_SW_RESET	= 3,
 };
 
 u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)
 {
-	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
+	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7;
 }
 
 void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
@@ -80,18 +86,25 @@ void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
 		    &dev->iseg->cmdq_addr_l_sz);
 }
 
-static int in_fatal(struct mlx5_core_dev *dev)
+static bool sensor_pci_not_working(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 	struct health_buffer __iomem *h = health->health;
 
-	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
-		return 1;
+	/* Offline PCI reads return 0xffffffff */
+	return (ioread32be(&h->fw_ver) == 0xffffffff);
+}
 
-	if (ioread32be(&h->fw_ver) == 0xffffffff)
-		return 1;
+static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
+{
+	if (sensor_pci_not_working(dev))
+		return MLX5_SENSOR_PCI_COMM_ERR;
+	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
+		return MLX5_SENSOR_NIC_DISABLED;
+	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET)
+		return MLX5_SENSOR_NIC_SW_RESET;
 
-	return 0;
+	return MLX5_SENSOR_NO_ERR;
 }
 
 void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
@@ -101,7 +114,8 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
 		goto unlock;
 
 	mlx5_core_err(dev, "start\n");
-	if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) {
+	if (pci_channel_offline(dev->pdev) ||
+	    dev->priv.health.fatal_error != MLX5_SENSOR_NO_ERR || force) {
 		dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
 		mlx5_cmd_flush(dev);
 	}
@@ -137,38 +151,14 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
 	mlx5_disable_device(dev);
 }
 
-static void health_recover(struct work_struct *work)
-{
-	struct mlx5_core_health *health;
-	struct delayed_work *dwork;
-	struct mlx5_core_dev *dev;
-	struct mlx5_priv *priv;
-	u8 nic_state;
-
-	dwork = container_of(work, struct delayed_work, work);
-	health = container_of(dwork, struct mlx5_core_health, recover_work);
-	priv = container_of(health, struct mlx5_priv, health);
-	dev = container_of(priv, struct mlx5_core_dev, priv);
-
-	nic_state = mlx5_get_nic_state(dev);
-	if (nic_state == MLX5_NIC_IFC_INVALID) {
-		mlx5_core_err(dev, "health recovery flow aborted since the nic state is invalid\n");
-		return;
-	}
-
-	mlx5_core_err(dev, "starting health recovery flow\n");
-	mlx5_recover_device(dev);
-}
-
 /* How much time to wait until health resetting the driver (in msecs) */
-#define MLX5_RECOVERY_DELAY_MSECS 60000
+#define MLX5_RECOVERY_WAIT_MSECS 60000
 static void health_care(struct work_struct *work)
 {
-	unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS);
 	struct mlx5_core_health *health;
 	struct mlx5_core_dev *dev;
 	struct mlx5_priv *priv;
-	unsigned long flags;
+	unsigned long end;
 
 	health = container_of(work, struct mlx5_core_health, work);
 	priv = container_of(health, struct mlx5_priv, health);
@@ -176,13 +166,18 @@ static void health_care(struct work_struct *work)
 	mlx5_core_warn(dev, "handling bad device here\n");
 	mlx5_handle_bad_state(dev);
 
-	spin_lock_irqsave(&health->wq_lock, flags);
-	if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags))
-		schedule_delayed_work(&health->recover_work, recover_delay);
-	else
-		mlx5_core_err(dev,
-			      "new health works are not permitted at this stage\n");
-	spin_unlock_irqrestore(&health->wq_lock, flags);
+	end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS);
+	while (sensor_pci_not_working(dev)) {
+		if (time_after(jiffies, end)) {
+			mlx5_core_err(dev,
+				      "health recovery flow aborted, PCI reads still not working\n");
+			return;
+		}
+		msleep(100);
+	}
+
+	mlx5_core_err(dev, "starting health recovery flow\n");
+	mlx5_recover_device(dev);
 }
 
 static const char *hsynd_str(u8 synd)
@@ -274,6 +269,7 @@ static void poll_health(struct timer_list *t)
 {
 	struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
 	struct mlx5_core_health *health = &dev->priv.health;
+	u32 fatal_error;
 	u32 count;
 
 	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
@@ -291,8 +287,11 @@ static void poll_health(struct timer_list *t)
 		print_health_info(dev);
 	}
 
-	if (in_fatal(dev) && !health->sick) {
-		health->sick = true;
+	fatal_error = check_fatal_sensors(dev);
+
+	if (fatal_error && !health->fatal_error) {
+		mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error);
+		dev->priv.health.fatal_error = fatal_error;
 		print_health_info(dev);
 		mlx5_trigger_health_work(dev);
 	}
@@ -306,9 +305,8 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 	struct mlx5_core_health *health = &dev->priv.health;
 
 	timer_setup(&health->timer, poll_health, 0);
-	health->sick = 0;
+	health->fatal_error = MLX5_SENSOR_NO_ERR;
 	clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
-	clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
 	health->health = &dev->iseg->health;
 	health->health_counter = &dev->iseg->health_counter;
 
@@ -324,7 +322,6 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
 	if (disable_health) {
 		spin_lock_irqsave(&health->wq_lock, flags);
 		set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
-		set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
 		spin_unlock_irqrestore(&health->wq_lock, flags);
 	}
 
@@ -338,23 +335,10 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
 
 	spin_lock_irqsave(&health->wq_lock, flags);
 	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
-	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
 	spin_unlock_irqrestore(&health->wq_lock, flags);
-	cancel_delayed_work_sync(&health->recover_work);
 	cancel_work_sync(&health->work);
 }
 
-void mlx5_drain_health_recovery(struct mlx5_core_dev *dev)
-{
-	struct mlx5_core_health *health = &dev->priv.health;
-	unsigned long flags;
-
-	spin_lock_irqsave(&health->wq_lock, flags);
-	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
-	spin_unlock_irqrestore(&health->wq_lock, flags);
-	cancel_delayed_work_sync(&dev->priv.health.recover_work);
-}
-
 void mlx5_health_flush(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
@@ -387,7 +371,6 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 		return -ENOMEM;
 	spin_lock_init(&health->wq_lock);
 	INIT_WORK(&health->work, health_care);
-	INIT_DELAYED_WORK(&health->recover_work, health_recover);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index c70e97071b87..fd0e2949c4f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1191,7 +1191,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, bool cleanup)
 	int err = 0;
 
 	if (cleanup)
-		mlx5_drain_health_recovery(dev);
+		mlx5_drain_health_wq(dev);
 
 	mutex_lock(&dev->intf_state_mutex);
 	if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index d4dd8c1ae55c..97f8cf67ced0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -214,7 +214,7 @@ enum {
 	MLX5_NIC_IFC_FULL		= 0,
 	MLX5_NIC_IFC_DISABLED		= 1,
 	MLX5_NIC_IFC_NO_DRAM_NIC	= 2,
-	MLX5_NIC_IFC_INVALID		= 3
+	MLX5_NIC_IFC_SW_RESET		= 7
 };
 
 u8 mlx5_get_nic_state(struct mlx5_core_dev *dev);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 4ae533b3da07..cc7fd8e62844 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -435,6 +435,7 @@ struct mlx5_core_health {
 	u32				prev;
 	int				miss_counter;
 	bool				sick;
+	u32				fatal_error;
 	u32				crdump_size;
 	/* wq spinlock to synchronize draining */
 	spinlock_t			wq_lock;
@@ -906,7 +907,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev);
 void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health);
 void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
 void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
-void mlx5_drain_health_recovery(struct mlx5_core_dev *dev);
 int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
 			struct mlx5_frag_buf *buf, int node);
 int mlx5_buf_alloc(struct mlx5_core_dev *dev,
-- 
cgit v1.2.3


From 3e5b72ac2f298423902169db7893fef43365e0a6 Mon Sep 17 00:00:00 2001
From: Feras Daoud <ferasda@mellanox.com>
Date: Mon, 12 Nov 2018 16:40:17 +0200
Subject: net/mlx5: Issue SW reset on FW assert

If a FW assert is considered fatal, as indicated by a new bit in the
health buffer, reset the FW. After the reset, go through the normal
recovery flow. Only one PF needs to issue the reset, so an attempt is
made to prevent the 2nd function from also issuing the reset.
It's not an error if that happens; it just slows recovery.

Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/diag/crdump.c  |  13 +-
 drivers/net/ethernet/mellanox/mlx5/core/health.c   | 157 ++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   1 +
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   2 +
 include/linux/mlx5/device.h                        |  10 +-
 include/linux/mlx5/driver.h                        |   1 +
 6 files changed, 176 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
index dfb34172c69b..28d02749d3c4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
@@ -51,14 +51,23 @@ int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data)
 			       ret);
 		return ret;
 	}
+	/* Verify no other PF is running cr-dump or sw reset */
+	ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET,
+				     MLX5_VSC_LOCK);
+	if (ret) {
+		mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n");
+		goto unlock_gw;
+	}
 
 	ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL);
 	if (ret)
-		goto unlock;
+		goto unlock_sem;
 
 	ret = mlx5_crdump_fill(dev, cr_data);
 
-unlock:
+unlock_sem:
+	mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, MLX5_VSC_UNLOCK);
+unlock_gw:
 	mlx5_vsc_gw_unlock(dev);
 	return ret;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 737e6d550775..caf54bd7d538 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -40,6 +40,7 @@
 #include "mlx5_core.h"
 #include "lib/eq.h"
 #include "lib/mlx5.h"
+#include "lib/pci_vsc.h"
 
 enum {
 	MLX5_HEALTH_POLL_INTERVAL	= 2 * HZ,
@@ -67,8 +68,10 @@ enum {
 enum  {
 	MLX5_SENSOR_NO_ERR		= 0,
 	MLX5_SENSOR_PCI_COMM_ERR	= 1,
-	MLX5_SENSOR_NIC_DISABLED	= 2,
-	MLX5_SENSOR_NIC_SW_RESET	= 3,
+	MLX5_SENSOR_PCI_ERR		= 2,
+	MLX5_SENSOR_NIC_DISABLED	= 3,
+	MLX5_SENSOR_NIC_SW_RESET	= 4,
+	MLX5_SENSOR_FW_SYND_RFR		= 5,
 };
 
 u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)
@@ -95,32 +98,162 @@ static bool sensor_pci_not_working(struct mlx5_core_dev *dev)
 	return (ioread32be(&h->fw_ver) == 0xffffffff);
 }
 
+static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
+	u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET;
+	u8 synd = ioread8(&h->synd);
+
+	if (rfr && synd)
+		mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd);
+	return rfr && synd;
+}
+
 static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
 {
 	if (sensor_pci_not_working(dev))
 		return MLX5_SENSOR_PCI_COMM_ERR;
+	if (pci_channel_offline(dev->pdev))
+		return MLX5_SENSOR_PCI_ERR;
 	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
 		return MLX5_SENSOR_NIC_DISABLED;
 	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET)
 		return MLX5_SENSOR_NIC_SW_RESET;
+	if (sensor_fw_synd_rfr(dev))
+		return MLX5_SENSOR_FW_SYND_RFR;
 
 	return MLX5_SENSOR_NO_ERR;
 }
 
+static int lock_sem_sw_reset(struct mlx5_core_dev *dev, bool lock)
+{
+	enum mlx5_vsc_state state;
+	int ret;
+
+	if (!mlx5_core_is_pf(dev))
+		return -EBUSY;
+
+	/* Try to lock GW access, this stage doesn't return
+	 * EBUSY because locked GW does not mean that other PF
+	 * already started the reset.
+	 */
+	ret = mlx5_vsc_gw_lock(dev);
+	if (ret == -EBUSY)
+		return -EINVAL;
+	if (ret)
+		return ret;
+
+	state = lock ? MLX5_VSC_LOCK : MLX5_VSC_UNLOCK;
+	/* At this stage, if the return status == EBUSY, then we know
+	 * for sure that another PF started the reset, so don't allow
+	 * another reset.
+	 */
+	ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, state);
+	if (ret)
+		mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n");
+
+	/* Unlock GW access */
+	mlx5_vsc_gw_unlock(dev);
+
+	return ret;
+}
+
+static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
+{
+	bool supported = (ioread32be(&dev->iseg->initializing) >>
+			  MLX5_FW_RESET_SUPPORTED_OFFSET) & 1;
+	u32 fatal_error;
+
+	if (!supported)
+		return false;
+
+	/* The reset only needs to be issued by one PF. The health buffer is
+	 * shared between all functions, and will be cleared during a reset.
+	 * Check again to avoid a redundant 2nd reset. If the fatal erros was
+	 * PCI related a reset won't help.
+	 */
+	fatal_error = check_fatal_sensors(dev);
+	if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
+	    fatal_error == MLX5_SENSOR_NIC_DISABLED ||
+	    fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
+		mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help.");
+		return false;
+	}
+
+	mlx5_core_warn(dev, "Issuing FW Reset\n");
+	/* Write the NIC interface field to initiate the reset, the command
+	 * interface address also resides here, don't overwrite it.
+	 */
+	mlx5_set_nic_state(dev, MLX5_NIC_IFC_SW_RESET);
+
+	return true;
+}
+
 void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
 {
 	mutex_lock(&dev->intf_state_mutex);
 	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
 		goto unlock;
+	if (dev->state == MLX5_DEVICE_STATE_UNINITIALIZED) {
+		dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+		goto unlock;
+	}
 
-	mlx5_core_err(dev, "start\n");
-	if (pci_channel_offline(dev->pdev) ||
-	    dev->priv.health.fatal_error != MLX5_SENSOR_NO_ERR || force) {
+	if (check_fatal_sensors(dev) || force) {
 		dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
 		mlx5_cmd_flush(dev);
 	}
 
 	mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1);
+unlock:
+	mutex_unlock(&dev->intf_state_mutex);
+}
+
+#define MLX5_CRDUMP_WAIT_MS	60000
+#define MLX5_FW_RESET_WAIT_MS	1000
+void mlx5_error_sw_reset(struct mlx5_core_dev *dev)
+{
+	unsigned long end, delay_ms = MLX5_FW_RESET_WAIT_MS;
+	int lock = -EBUSY;
+
+	mutex_lock(&dev->intf_state_mutex);
+	if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR)
+		goto unlock;
+
+	mlx5_core_err(dev, "start\n");
+
+	if (check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) {
+		/* Get cr-dump and reset FW semaphore */
+		lock = lock_sem_sw_reset(dev, true);
+
+		if (lock == -EBUSY) {
+			delay_ms = MLX5_CRDUMP_WAIT_MS;
+			goto recover_from_sw_reset;
+		}
+		/* Execute SW reset */
+		reset_fw_if_needed(dev);
+	}
+
+recover_from_sw_reset:
+	/* Recover from SW reset */
+	end = jiffies + msecs_to_jiffies(delay_ms);
+	do {
+		if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
+			break;
+
+		cond_resched();
+	} while (!time_after(jiffies, end));
+
+	if (mlx5_get_nic_state(dev) != MLX5_NIC_IFC_DISABLED) {
+		dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n",
+			mlx5_get_nic_state(dev), delay_ms);
+	}
+
+	/* Release FW semaphore if you are the lock owner */
+	if (!lock)
+		lock_sem_sw_reset(dev, false);
+
 	mlx5_core_err(dev, "end\n");
 
 unlock:
@@ -143,6 +276,20 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
 	case MLX5_NIC_IFC_NO_DRAM_NIC:
 		mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n");
 		break;
+
+	case MLX5_NIC_IFC_SW_RESET:
+		/* The IFC mode field is 3 bits, so it will read 0x7 in 2 cases:
+		 * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded
+		 *    and this is a VF), this is not recoverable by SW reset.
+		 *    Logging of this is handled elsewhere.
+		 * 2. FW reset has been issued by another function, driver can
+		 *    be reloaded to recover after the mode switches to
+		 *    MLX5_NIC_IFC_DISABLED.
+		 */
+		if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR)
+			mlx5_core_warn(dev, "NIC SW reset in progress\n");
+		break;
+
 	default:
 		mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n",
 			       nic_interface);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index fd0e2949c4f2..ec5287c51825 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1361,6 +1361,7 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
 	mlx5_core_info(dev, "%s was called\n", __func__);
 
 	mlx5_enter_error_state(dev, false);
+	mlx5_error_sw_reset(dev);
 	mlx5_unload_one(dev, false);
 	/* In case of kernel call drain the health wq */
 	if (state) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 8593c8183d87..29bb61a10289 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -113,6 +113,7 @@ enum {
 
 enum mlx5_semaphore_space_address {
 	MLX5_SEMAPHORE_SPACE_DOMAIN     = 0xA,
+	MLX5_SEMAPHORE_SW_RESET         = 0x20,
 };
 
 int mlx5_query_hca_caps(struct mlx5_core_dev *dev);
@@ -122,6 +123,7 @@ int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev);
 void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force);
+void mlx5_error_sw_reset(struct mlx5_core_dev *dev);
 void mlx5_disable_device(struct mlx5_core_dev *dev);
 void mlx5_recover_device(struct mlx5_core_dev *dev);
 int mlx5_sriov_init(struct mlx5_core_dev *dev);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 5e760067ac41..35ed38c2ae6c 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -510,6 +510,10 @@ struct mlx5_cmd_layout {
 	u8		status_own;
 };
 
+enum mlx5_fatal_assert_bit_offsets {
+	MLX5_RFR_OFFSET = 31,
+};
+
 struct health_buffer {
 	__be32		assert_var[5];
 	__be32		rsvd0[3];
@@ -518,12 +522,16 @@ struct health_buffer {
 	__be32		rsvd1[2];
 	__be32		fw_ver;
 	__be32		hw_id;
-	__be32		rsvd2;
+	__be32		rfr;
 	u8		irisc_index;
 	u8		synd;
 	__be16		ext_synd;
 };
 
+enum mlx5_initializing_bit_offsets {
+	MLX5_FW_RESET_SUPPORTED_OFFSET = 30,
+};
+
 enum mlx5_cmd_addr_l_sz_offset {
 	MLX5_NIC_IFC_OFFSET = 8,
 };
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index cc7fd8e62844..89205b6cc7ef 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -583,6 +583,7 @@ struct mlx5_priv {
 };
 
 enum mlx5_device_state {
+	MLX5_DEVICE_STATE_UNINITIALIZED,
 	MLX5_DEVICE_STATE_UP,
 	MLX5_DEVICE_STATE_INTERNAL_ERROR,
 };
-- 
cgit v1.2.3


From 1e34f3efd413a6318c3edd6e8e7e091f1214b2e6 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Tue, 11 Dec 2018 16:09:53 +0200
Subject: net/mlx5: Create FW devlink_health_reporter

Create mlx5_devlink_health_reporter for the FW reporter. The FW reporter
implements the devlink_health_reporter diagnose callback.

The FW reporter diagnose command can be triggered at any time by the
user to check the current FW status.
In healthy status, it returns a clear (zero) syndrome. Otherwise, it
returns the syndrome and a description of the error type.

Command example and output on healthy status:
$ devlink health diagnose pci/0000:82:00.0 reporter fw
Syndrome: 0

Command example and output on non healthy status:
$ devlink health diagnose pci/0000:82:00.0 reporter fw
Syndrome: 8 Description: unrecoverable hardware error

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 48 ++++++++++++++++++++++++
 include/linux/mlx5/driver.h                      |  2 +
 2 files changed, 50 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index caf54bd7d538..973cc005ae60 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -388,6 +388,51 @@ static void print_health_info(struct mlx5_core_dev *dev)
 	mlx5_core_err(dev, "raw fw_ver 0x%08x\n", fw);
 }
 
+static int
+mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
+			  struct devlink_fmsg *fmsg)
+{
+	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
+	u8 synd;
+	int err;
+
+	synd = ioread8(&h->synd);
+	err = devlink_fmsg_u8_pair_put(fmsg, "Syndrome", synd);
+	if (err || !synd)
+		return err;
+	return devlink_fmsg_string_pair_put(fmsg, "Description", hsynd_str(synd));
+}
+
+static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
+		.name = "fw",
+		.diagnose = mlx5_fw_reporter_diagnose,
+};
+
+static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct devlink *devlink = priv_to_devlink(dev);
+
+	health->fw_reporter =
+		devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
+					       0, false, dev);
+	if (IS_ERR(health->fw_reporter))
+		mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n",
+			       PTR_ERR(health->fw_reporter));
+}
+
+static void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	if (IS_ERR_OR_NULL(health->fw_reporter))
+		return;
+
+	devlink_health_reporter_destroy(health->fw_reporter);
+}
+
 static unsigned long get_next_poll_jiffies(void)
 {
 	unsigned long next;
@@ -498,6 +543,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 	struct mlx5_core_health *health = &dev->priv.health;
 
 	destroy_workqueue(health->wq);
+	mlx5_fw_reporter_destroy(dev);
 }
 
 int mlx5_health_init(struct mlx5_core_dev *dev)
@@ -519,5 +565,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 	spin_lock_init(&health->wq_lock);
 	INIT_WORK(&health->work, health_care);
 
+	mlx5_fw_reporter_create(dev);
+
 	return 0;
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 89205b6cc7ef..8d5d065d1aa6 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -53,6 +53,7 @@
 #include <linux/mlx5/eq.h>
 #include <linux/timecounter.h>
 #include <linux/ptp_clock_kernel.h>
+#include <net/devlink.h>
 
 enum {
 	MLX5_BOARD_ID_LEN = 64,
@@ -443,6 +444,7 @@ struct mlx5_core_health {
 	unsigned long			flags;
 	struct work_struct		work;
 	struct delayed_work		recover_work;
+	struct devlink_health_reporter *fw_reporter;
 };
 
 struct mlx5_qp_table {
-- 
cgit v1.2.3


From d1bf0e2cc4a6e66c2bff48176b8b2930098468ef Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Tue, 11 Dec 2018 16:09:56 +0200
Subject: net/mlx5: Report devlink health on FW issues

Use devlink_health_report() to report any symptom of a FW issue, such as
a FW counter miss or a new health syndrome.
FW issues are detected in mlx5 during poll_health(), which is called in
timer (atomic) context, so the health work queue is used to schedule
the reports.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 33 ++++++++++++++++++++++++
 include/linux/mlx5/driver.h                      |  3 ++-
 2 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 1c20d3f1d238..5e876f1de114 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -515,6 +515,29 @@ mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter,
 	return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg);
 }
 
+static void mlx5_fw_reporter_err_work(struct work_struct *work)
+{
+	struct mlx5_fw_reporter_ctx fw_reporter_ctx;
+	struct mlx5_core_health *health;
+
+	health = container_of(work, struct mlx5_core_health, report_work);
+
+	if (IS_ERR_OR_NULL(health->fw_reporter))
+		return;
+
+	fw_reporter_ctx.err_synd = health->synd;
+	fw_reporter_ctx.miss_counter = health->miss_counter;
+	if (fw_reporter_ctx.err_synd) {
+		devlink_health_report(health->fw_reporter,
+				      "FW syndrom reported", &fw_reporter_ctx);
+		return;
+	}
+	if (fw_reporter_ctx.miss_counter)
+		devlink_health_report(health->fw_reporter,
+				      "FW miss counter reported",
+				      &fw_reporter_ctx);
+}
+
 static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
 		.name = "fw",
 		.diagnose = mlx5_fw_reporter_diagnose,
@@ -572,7 +595,9 @@ static void poll_health(struct timer_list *t)
 {
 	struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
 	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
 	u32 fatal_error;
+	u8 prev_synd;
 	u32 count;
 
 	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
@@ -588,8 +613,14 @@ static void poll_health(struct timer_list *t)
 	if (health->miss_counter == MAX_MISSES) {
 		mlx5_core_err(dev, "device's health compromised - reached miss count\n");
 		print_health_info(dev);
+		queue_work(health->wq, &health->report_work);
 	}
 
+	prev_synd = health->synd;
+	health->synd = ioread8(&h->synd);
+	if (health->synd && health->synd != prev_synd)
+		queue_work(health->wq, &health->report_work);
+
 	fatal_error = check_fatal_sensors(dev);
 
 	if (fatal_error && !health->fatal_error) {
@@ -639,6 +670,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
 	spin_lock_irqsave(&health->wq_lock, flags);
 	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
 	spin_unlock_irqrestore(&health->wq_lock, flags);
+	cancel_work_sync(&health->report_work);
 	cancel_work_sync(&health->work);
 }
 
@@ -675,6 +707,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 		return -ENOMEM;
 	spin_lock_init(&health->wq_lock);
 	INIT_WORK(&health->work, health_care);
+	INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
 
 	mlx5_fw_reporter_create(dev);
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 8d5d065d1aa6..1931a4080d78 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -435,7 +435,7 @@ struct mlx5_core_health {
 	struct timer_list		timer;
 	u32				prev;
 	int				miss_counter;
-	bool				sick;
+	u8				synd;
 	u32				fatal_error;
 	u32				crdump_size;
 	/* wq spinlock to synchronize draining */
@@ -443,6 +443,7 @@ struct mlx5_core_health {
 	struct workqueue_struct	       *wq;
 	unsigned long			flags;
 	struct work_struct		work;
+	struct work_struct		report_work;
 	struct delayed_work		recover_work;
 	struct devlink_health_reporter *fw_reporter;
 };
-- 
cgit v1.2.3


From 96c82cdfe77b5e769624af71ec0554434037b82f Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Tue, 11 Dec 2018 16:09:57 +0200
Subject: net/mlx5: Add fw fatal devlink_health_reporter

Create mlx5_devlink_health_reporter for the fw fatal reporter.
The fw fatal reporter is added in addition to the fw reporter and
implements the recover callback.
The point of having two reporters for FW issues is that we don't want
to run FW recovery on every issue, but only on fatal ones.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 81 ++++++++++++++++++------
 include/linux/mlx5/driver.h                      |  1 +
 2 files changed, 62 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 5e876f1de114..82a658834675 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -301,31 +301,43 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
 
 /* How much time to wait until health resetting the driver (in msecs) */
 #define MLX5_RECOVERY_WAIT_MSECS 60000
-static void health_care(struct work_struct *work)
+static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
 {
-	struct mlx5_core_health *health;
-	struct mlx5_core_dev *dev;
-	struct mlx5_priv *priv;
 	unsigned long end;
 
-	health = container_of(work, struct mlx5_core_health, work);
-	priv = container_of(health, struct mlx5_priv, health);
-	dev = container_of(priv, struct mlx5_core_dev, priv);
 	mlx5_core_warn(dev, "handling bad device here\n");
 	mlx5_handle_bad_state(dev);
-
 	end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS);
 	while (sensor_pci_not_working(dev)) {
 		if (time_after(jiffies, end)) {
 			mlx5_core_err(dev,
 				      "health recovery flow aborted, PCI reads still not working\n");
-			return;
+			return -EIO;
 		}
 		msleep(100);
 	}
 
 	mlx5_core_err(dev, "starting health recovery flow\n");
 	mlx5_recover_device(dev);
+	if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state) ||
+	    check_fatal_sensors(dev)) {
+		mlx5_core_err(dev, "health recovery failed\n");
+		return -EIO;
+	}
+	return 0;
+}
+
+static void health_recover_work(struct work_struct *work)
+{
+	struct mlx5_core_health *health;
+	struct mlx5_core_dev *dev;
+	struct mlx5_priv *priv;
+
+	health = container_of(work, struct mlx5_core_health, work);
+	priv = container_of(health, struct mlx5_priv, health);
+	dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	mlx5_health_try_recover(dev);
 }
 
 static const char *hsynd_str(u8 synd)
@@ -544,7 +556,22 @@ static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
 		.dump = mlx5_fw_reporter_dump,
 };
 
-static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
+static int
+mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter,
+			       void *priv_ctx)
+{
+	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
+
+	return mlx5_health_try_recover(dev);
+}
+
+static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
+		.name = "fw_fatal",
+		.recover = mlx5_fw_fatal_reporter_recover,
+};
+
+#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 1200000
+static void mlx5_fw_reporters_create(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 	struct devlink *devlink = priv_to_devlink(dev);
@@ -555,16 +582,26 @@ static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
 	if (IS_ERR(health->fw_reporter))
 		mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n",
 			       PTR_ERR(health->fw_reporter));
+
+	health->fw_fatal_reporter =
+		devlink_health_reporter_create(devlink,
+					       &mlx5_fw_fatal_reporter_ops,
+					       MLX5_REPORTER_FW_GRACEFUL_PERIOD,
+					       true, dev);
+	if (IS_ERR(health->fw_fatal_reporter))
+		mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n",
+			       PTR_ERR(health->fw_fatal_reporter));
 }
 
-static void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev)
+static void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 
-	if (IS_ERR_OR_NULL(health->fw_reporter))
-		return;
+	if (!IS_ERR_OR_NULL(health->fw_reporter))
+		devlink_health_reporter_destroy(health->fw_reporter);
 
-	devlink_health_reporter_destroy(health->fw_reporter);
+	if (!IS_ERR_OR_NULL(health->fw_fatal_reporter))
+		devlink_health_reporter_destroy(health->fw_fatal_reporter);
 }
 
 static unsigned long get_next_poll_jiffies(void)
@@ -686,7 +723,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 	struct mlx5_core_health *health = &dev->priv.health;
 
 	destroy_workqueue(health->wq);
-	mlx5_fw_reporter_destroy(dev);
+	mlx5_fw_reporters_destroy(dev);
 }
 
 int mlx5_health_init(struct mlx5_core_dev *dev)
@@ -694,22 +731,26 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 	struct mlx5_core_health *health;
 	char *name;
 
+	mlx5_fw_reporters_create(dev);
+
 	health = &dev->priv.health;
 	name = kmalloc(64, GFP_KERNEL);
 	if (!name)
-		return -ENOMEM;
+		goto out_err;
 
 	strcpy(name, "mlx5_health");
 	strcat(name, dev_name(dev->device));
 	health->wq = create_singlethread_workqueue(name);
 	kfree(name);
 	if (!health->wq)
-		return -ENOMEM;
+		goto out_err;
 	spin_lock_init(&health->wq_lock);
-	INIT_WORK(&health->work, health_care);
+	INIT_WORK(&health->work, health_recover_work);
 	INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
 
-	mlx5_fw_reporter_create(dev);
-
 	return 0;
+
+out_err:
+	mlx5_fw_reporters_destroy(dev);
+	return -ENOMEM;
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1931a4080d78..caac96bf9c0d 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -446,6 +446,7 @@ struct mlx5_core_health {
 	struct work_struct		report_work;
 	struct delayed_work		recover_work;
 	struct devlink_health_reporter *fw_reporter;
+	struct devlink_health_reporter *fw_fatal_reporter;
 };
 
 struct mlx5_qp_table {
-- 
cgit v1.2.3


From b3bd076f7501afea2871bb4738ab53498fd32cd5 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Sun, 27 Jan 2019 18:38:39 +0200
Subject: net/mlx5: Report devlink health on FW fatal issues

Report devlink health on FW fatal issues via fw_fatal_reporter. The
driver recovery flow for FW fatal errors is now handled by devlink
health.

With the recovery controlled by devlink health, the user can turn off
auto-recovery for a debug session and run the recovery manually.

Call mlx5_enter_error_state() before calling devlink_health_report() to
ensure the device enters the error state even if auto-recovery is off.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 42 +++++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/main.c   | 10 +++---
 include/linux/mlx5/driver.h                      |  2 +-
 3 files changed, 31 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 4ef62c6c6424..2fe6923f7ce0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -327,19 +327,6 @@ static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
 	return 0;
 }
 
-static void health_recover_work(struct work_struct *work)
-{
-	struct mlx5_core_health *health;
-	struct mlx5_core_dev *dev;
-	struct mlx5_priv *priv;
-
-	health = container_of(work, struct mlx5_core_health, work);
-	priv = container_of(health, struct mlx5_priv, health);
-	dev = container_of(priv, struct mlx5_core_dev, priv);
-
-	mlx5_health_try_recover(dev);
-}
-
 static const char *hsynd_str(u8 synd)
 {
 	switch (synd) {
@@ -614,6 +601,29 @@ free_data:
 	return err;
 }
 
+static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work)
+{
+	struct mlx5_fw_reporter_ctx fw_reporter_ctx;
+	struct mlx5_core_health *health;
+	struct mlx5_core_dev *dev;
+	struct mlx5_priv *priv;
+
+	health = container_of(work, struct mlx5_core_health, fatal_report_work);
+	priv = container_of(health, struct mlx5_priv, health);
+	dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	mlx5_enter_error_state(dev, false);
+	if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) {
+		if (mlx5_health_try_recover(dev))
+			mlx5_core_err(dev, "health recovery failed\n");
+		return;
+	}
+	fw_reporter_ctx.err_synd = health->synd;
+	fw_reporter_ctx.miss_counter = health->miss_counter;
+	devlink_health_report(health->fw_fatal_reporter,
+			      "FW fatal error reported", &fw_reporter_ctx);
+}
+
 static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
 		.name = "fw_fatal",
 		.recover = mlx5_fw_fatal_reporter_recover,
@@ -672,7 +682,7 @@ void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
 
 	spin_lock_irqsave(&health->wq_lock, flags);
 	if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
-		queue_work(health->wq, &health->work);
+		queue_work(health->wq, &health->fatal_report_work);
 	else
 		mlx5_core_err(dev, "new health works are not permitted at this stage\n");
 	spin_unlock_irqrestore(&health->wq_lock, flags);
@@ -758,7 +768,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
 	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
 	spin_unlock_irqrestore(&health->wq_lock, flags);
 	cancel_work_sync(&health->report_work);
-	cancel_work_sync(&health->work);
+	cancel_work_sync(&health->fatal_report_work);
 }
 
 void mlx5_health_flush(struct mlx5_core_dev *dev)
@@ -795,7 +805,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 	if (!health->wq)
 		goto out_err;
 	spin_lock_init(&health->wq_lock);
-	INIT_WORK(&health->work, health_recover_work);
+	INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work);
 	INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index ec5287c51825..998eec938d3c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1363,11 +1363,8 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
 	mlx5_enter_error_state(dev, false);
 	mlx5_error_sw_reset(dev);
 	mlx5_unload_one(dev, false);
-	/* In case of kernel call drain the health wq */
-	if (state) {
-		mlx5_drain_health_wq(dev);
-		mlx5_pci_disable_device(dev);
-	}
+	mlx5_drain_health_wq(dev);
+	mlx5_pci_disable_device(dev);
 
 	return state == pci_channel_io_perm_failure ?
 		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
@@ -1535,7 +1532,8 @@ MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table);
 
 void mlx5_disable_device(struct mlx5_core_dev *dev)
 {
-	mlx5_pci_err_detected(dev->pdev, 0);
+	mlx5_error_sw_reset(dev);
+	mlx5_unload_one(dev, false);
 }
 
 void mlx5_recover_device(struct mlx5_core_dev *dev)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index caac96bf9c0d..25847beabd3f 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -442,7 +442,7 @@ struct mlx5_core_health {
 	spinlock_t			wq_lock;
 	struct workqueue_struct	       *wq;
 	unsigned long			flags;
-	struct work_struct		work;
+	struct work_struct		fatal_report_work;
 	struct work_struct		report_work;
 	struct delayed_work		recover_work;
 	struct devlink_health_reporter *fw_reporter;
-- 
cgit v1.2.3


From 4368dada5b37e74a13b892ca5cef8a7d558e9a5f Mon Sep 17 00:00:00 2001
From: Shalom Toledo <shalomt@mellanox.com>
Date: Tue, 11 Jun 2019 18:45:09 +0300
Subject: ptp: ptp_clock: Publish scaled_ppm_to_ppb

Publish scaled_ppm_to_ppb to allow drivers to use it.

Signed-off-by: Shalom Toledo <shalomt@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_clock.c          | 3 ++-
 include/linux/ptp_clock_kernel.h | 8 ++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index e189fa1be21e..e60eab7f8a61 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -63,7 +63,7 @@ static void enqueue_external_timestamp(struct timestamp_event_queue *queue,
 	spin_unlock_irqrestore(&queue->lock, flags);
 }
 
-static s32 scaled_ppm_to_ppb(long ppm)
+s32 scaled_ppm_to_ppb(long ppm)
 {
 	/*
 	 * The 'freq' field in the 'struct timex' is in parts per
@@ -82,6 +82,7 @@ static s32 scaled_ppm_to_ppb(long ppm)
 	ppb >>= 13;
 	return (s32) ppb;
 }
+EXPORT_SYMBOL(scaled_ppm_to_ppb);
 
 /* posix clock implementation */
 
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 28eb9c792522..93cc4f1d444a 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -212,6 +212,14 @@ extern void ptp_clock_event(struct ptp_clock *ptp,
 
 extern int ptp_clock_index(struct ptp_clock *ptp);
 
+/**
+ * scaled_ppm_to_ppb() - convert scaled ppm to ppb
+ *
+ * @ppm:    Parts per million, but with a 16 bit binary fractional field
+ */
+
+extern s32 scaled_ppm_to_ppb(long ppm);
+
 /**
  * ptp_find_pin() - obtain the pin index of a given auxiliary function
  *
-- 
cgit v1.2.3


From 90cc4bd611903c77b29b94aa03ced6ad79229065 Mon Sep 17 00:00:00 2001
From: Alexander Wetzel <alexander@wetzel-home.de>
Date: Mon, 6 May 2019 21:01:48 +0200
Subject: mac80211: AMPDU handling for Extended Key ID

IEEE 802.11 - 2016 forbids mixing MPDUs with different keyIDs in one
A-MPDU. Drivers supporting A-MPDUs and Extended Key ID must actively
enforce that requirement due to the available two unicast keyIDs.

Allow drivers to signal mac80211 that they will not check the keyID in
MPDUs when aggregating them and that they expect mac80211 to stop Tx
aggregation when rekeying a connection using Extended Key ID.

Signed-off-by: Alexander Wetzel <alexander@wetzel-home.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |   4 ++
 net/mac80211/debugfs.c |   1 +
 net/mac80211/key.c     | 100 +++++++++++++++++++++++++++----------------------
 3 files changed, 61 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 72080d9d617e..b7f0b56a09f4 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2269,6 +2269,9 @@ struct ieee80211_txq {
  * @IEEE80211_HW_EXT_KEY_ID_NATIVE: Driver and hardware are supporting Extended
  *	Key ID and can handle two unicast keys per station for Rx and Tx.
  *
+ * @IEEE80211_HW_NO_AMPDU_KEYBORDER_SUPPORT: The card/driver can't handle
+ *	active Tx A-MPDU sessions with Extended Key IDs during rekey.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2321,6 +2324,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_SUPPORTS_MULTI_BSSID,
 	IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID,
 	IEEE80211_HW_EXT_KEY_ID_NATIVE,
+	IEEE80211_HW_NO_AMPDU_KEYBORDER_SUPPORT,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 0d462206eef6..7c580010836e 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -274,6 +274,7 @@ static const char *hw_flag_names[] = {
 	FLAG(SUPPORTS_MULTI_BSSID),
 	FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID),
 	FLAG(EXT_KEY_ID_NATIVE),
+	FLAG(NO_AMPDU_KEYBORDER_SUPPORT),
 #undef FLAG
 };
 
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 20bf9db7a388..faadfeea7127 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -274,50 +274,61 @@ int ieee80211_set_tx_key(struct ieee80211_key *key)
 
 	old = key_mtx_dereference(local, sta->ptk[sta->ptk_idx]);
 	sta->ptk_idx = key->conf.keyidx;
+
+	if (ieee80211_hw_check(&local->hw, NO_AMPDU_KEYBORDER_SUPPORT))
+		clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
 	ieee80211_check_fast_xmit(sta);
 
 	return 0;
 }
 
-static int ieee80211_hw_key_replace(struct ieee80211_key *old_key,
-				    struct ieee80211_key *new_key,
-				    bool pairwise)
+static void ieee80211_pairwise_rekey(struct ieee80211_key *old,
+				     struct ieee80211_key *new)
 {
-	struct ieee80211_sub_if_data *sdata;
-	struct ieee80211_local *local;
-	struct sta_info *sta;
-	int ret;
-
-	/* Aggregation sessions are OK when running on SW crypto.
-	 * A broken remote STA may cause issues not observed with HW
-	 * crypto, though.
-	 */
-	if (!(old_key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
-		return 0;
+	struct ieee80211_local *local = new->local;
+	struct sta_info *sta = new->sta;
+	int i;
 
-	assert_key_lock(old_key->local);
-	sta = old_key->sta;
+	assert_key_lock(local);
 
-	/* Unicast rekey without Extended Key ID needs special handling */
-	if (new_key && sta && pairwise &&
-	    rcu_access_pointer(sta->ptk[sta->ptk_idx]) == old_key) {
-		local = old_key->local;
-		sdata = old_key->sdata;
+	if (new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) {
+		/* Extended Key ID key install, initial one or rekey */
+
+		if (sta->ptk_idx != INVALID_PTK_KEYIDX &&
+		    ieee80211_hw_check(&local->hw,
+				       NO_AMPDU_KEYBORDER_SUPPORT)) {
+			/* Aggregation Sessions with Extended Key ID must not
+			 * mix MPDUs with different keyIDs within one A-MPDU.
+			 * Tear down any running Tx aggregation and all new
+			 * Rx/Tx aggregation request during rekey if the driver
+			 * asks us to do so. (Blocking Tx only would be
+			 * sufficient but WLAN_STA_BLOCK_BA gets the job done
+			 * for the few ms we need it.)
+			 */
+			set_sta_flag(sta, WLAN_STA_BLOCK_BA);
+			mutex_lock(&sta->ampdu_mlme.mtx);
+			for (i = 0; i <  IEEE80211_NUM_TIDS; i++)
+				___ieee80211_stop_tx_ba_session(sta, i,
+								AGG_STOP_LOCAL_REQUEST);
+			mutex_unlock(&sta->ampdu_mlme.mtx);
+		}
+	} else if (old) {
+		/* Rekey without Extended Key ID.
+		 * Aggregation sessions are OK when running on SW crypto.
+		 * A broken remote STA may cause issues not observed with HW
+		 * crypto, though.
+		 */
+		if (!(old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
+			return;
 
-		/* Stop TX till we are on the new key */
-		old_key->flags |= KEY_FLAG_TAINTED;
+		/* Stop Tx till we are on the new key */
+		old->flags |= KEY_FLAG_TAINTED;
 		ieee80211_clear_fast_xmit(sta);
-
-		/* Aggregation sessions during rekey are complicated due to the
-		 * reorder buffer and retransmits. Side step that by blocking
-		 * aggregation during rekey and tear down running sessions.
-		 */
 		if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) {
 			set_sta_flag(sta, WLAN_STA_BLOCK_BA);
 			ieee80211_sta_tear_down_BA_sessions(sta,
 							    AGG_STOP_LOCAL_REQUEST);
 		}
-
 		if (!wiphy_ext_feature_isset(local->hw.wiphy,
 					     NL80211_EXT_FEATURE_CAN_REPLACE_PTK0)) {
 			pr_warn_ratelimited("Rekeying PTK for STA %pM but driver can't safely do that.",
@@ -325,18 +336,9 @@ static int ieee80211_hw_key_replace(struct ieee80211_key *old_key,
 			/* Flushing the driver queues *may* help prevent
 			 * the clear text leaks and freezes.
 			 */
-			ieee80211_flush_queues(local, sdata, false);
+			ieee80211_flush_queues(local, old->sdata, false);
 		}
 	}
-
-	ieee80211_key_disable_hw_accel(old_key);
-
-	if (new_key)
-		ret = ieee80211_key_enable_hw_accel(new_key);
-	else
-		ret = 0;
-
-	return ret;
 }
 
 static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
@@ -394,7 +396,6 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
 	mutex_unlock(&sdata->local->key_mtx);
 }
 
-
 static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
 				  struct sta_info *sta,
 				  bool pairwise,
@@ -402,7 +403,7 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
 				  struct ieee80211_key *new)
 {
 	int idx;
-	int ret;
+	int ret = 0;
 	bool defunikey, defmultikey, defmgmtkey;
 
 	/* caller must provide at least one old/new */
@@ -414,16 +415,27 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
 
 	WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx);
 
+	if (new && sta && pairwise) {
+		/* Unicast rekey needs special handling. With Extended Key ID
+		 * old is still NULL for the first rekey.
+		 */
+		ieee80211_pairwise_rekey(old, new);
+	}
+
 	if (old) {
 		idx = old->conf.keyidx;
-		ret = ieee80211_hw_key_replace(old, new, pairwise);
+
+		if (old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) {
+			ieee80211_key_disable_hw_accel(old);
+
+			if (new)
+				ret = ieee80211_key_enable_hw_accel(new);
+		}
 	} else {
 		/* new must be provided in case old is not */
 		idx = new->conf.keyidx;
 		if (!new->local->wowlan)
 			ret = ieee80211_key_enable_hw_accel(new);
-		else
-			ret = 0;
 	}
 
 	if (ret)
-- 
cgit v1.2.3


From cc3e14c21ae928b3f8bce584b2c7d53d332b9738 Mon Sep 17 00:00:00 2001
From: Chung-Hsien Hsu <stanley.hsu@cypress.com>
Date: Thu, 9 May 2019 09:49:05 +0000
Subject: nl80211: add WPA3 definition for SAE authentication

Add definition of WPA version 3 for SAE authentication.

Signed-off-by: Chung-Hsien Hsu <stanley.hsu@cypress.com>
Signed-off-by: Chi-Hsien Lin <chi-hsien.lin@cypress.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 1 +
 net/wireless/nl80211.c       | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 6f09d1500960..e9bf3d69d847 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4406,6 +4406,7 @@ enum nl80211_mfp {
 enum nl80211_wpa_versions {
 	NL80211_WPA_VERSION_1 = 1 << 0,
 	NL80211_WPA_VERSION_2 = 1 << 1,
+	NL80211_WPA_VERSION_3 = 1 << 2,
 };
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 1c74ca377bd8..8332a5731c57 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -8736,7 +8736,8 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
 static bool nl80211_valid_wpa_versions(u32 wpa_versions)
 {
 	return !(wpa_versions & ~(NL80211_WPA_VERSION_1 |
-				  NL80211_WPA_VERSION_2));
+				  NL80211_WPA_VERSION_2 |
+				  NL80211_WPA_VERSION_3));
 }
 
 static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
-- 
cgit v1.2.3


From 26f7044e95042daabcf1c71796a0e804a83c979f Mon Sep 17 00:00:00 2001
From: Chung-Hsien Hsu <stanley.hsu@cypress.com>
Date: Thu, 9 May 2019 09:49:06 +0000
Subject: nl80211: add support for SAE authentication offload

Let drivers advertise support for station-mode SAE authentication
offload with a new NL80211_EXT_FEATURE_SAE_OFFLOAD flag.

Signed-off-by: Chung-Hsien Hsu <stanley.hsu@cypress.com>
Signed-off-by: Chi-Hsien Lin <chi-hsien.lin@cypress.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h    |  1 +
 include/net/cfg80211.h       |  5 +++++
 include/uapi/linux/nl80211.h | 19 +++++++++++++++++++
 net/wireless/nl80211.c       | 14 ++++++++++++++
 4 files changed, 39 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 61f0a316c6ac..5dfd949ade25 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2612,6 +2612,7 @@ enum ieee80211_key_len {
 #define FILS_ERP_MAX_RRK_LEN		64
 
 #define PMK_MAX_LEN			64
+#define SAE_PASSWORD_MAX_LEN		128
 
 /* Public action codes (IEEE Std 802.11-2016, 9.6.8.1, Table 9-307) */
 enum ieee80211_pub_actioncode {
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index c19687833493..4b45056dbb25 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -742,6 +742,9 @@ struct survey_info {
  *	CFG80211_MAX_WEP_KEYS WEP keys
  * @wep_tx_key: key index (0..3) of the default TX static WEP key
  * @psk: PSK (for devices supporting 4-way-handshake offload)
+ * @sae_pwd: password for SAE authentication (for devices supporting SAE
+ *	offload)
+ * @sae_pwd_len: length of SAE password (for devices supporting SAE offload)
  */
 struct cfg80211_crypto_settings {
 	u32 wpa_versions;
@@ -757,6 +760,8 @@ struct cfg80211_crypto_settings {
 	struct key_params *wep_keys;
 	int wep_tx_key;
 	const u8 *psk;
+	const u8 *sae_pwd;
+	u8 sae_pwd_len;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index e9bf3d69d847..8b1e43fecd25 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -234,6 +234,15 @@
  * use in a FILS shared key connection with PMKSA caching.
  */
 
+/**
+ * DOC: SAE authentication offload
+ *
+ * By setting @NL80211_EXT_FEATURE_SAE_OFFLOAD flag drivers can indicate they
+ * support offloading SAE authentication for WPA3-Personal networks. In
+ * %NL80211_CMD_CONNECT the password for SAE should be specified using
+ * %NL80211_ATTR_SAE_PASSWORD.
+ */
+
 /**
  * enum nl80211_commands - supported nl80211 commands
  *
@@ -2341,6 +2350,10 @@ enum nl80211_commands {
  *	should be picking up the lowest tx power, either tx power per-interface
  *	or per-station.
  *
+ * @NL80211_ATTR_SAE_PASSWORD: attribute for passing SAE password material. It
+ *	is used with %NL80211_CMD_CONNECT to provide password for offloading
+ *	SAE authentication for WPA3-Personal networks.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2794,6 +2807,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_STA_TX_POWER_SETTING,
 	NL80211_ATTR_STA_TX_POWER,
 
+	NL80211_ATTR_SAE_PASSWORD,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -5423,6 +5438,9 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_STA_TX_PWR: This driver supports controlling tx power
  *	to a station.
  *
+ * @NL80211_EXT_FEATURE_SAE_OFFLOAD: Device wants to do SAE authentication in
+ *	station mode (SAE password is passed as part of the connect command).
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -5467,6 +5485,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD,
 	NL80211_EXT_FEATURE_EXT_KEY_ID,
 	NL80211_EXT_FEATURE_STA_TX_PWR,
+	NL80211_EXT_FEATURE_SAE_OFFLOAD,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 8332a5731c57..80e514872719 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -571,6 +571,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_PEER_MEASUREMENTS] =
 		NLA_POLICY_NESTED(nl80211_pmsr_attr_policy),
 	[NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1),
+	[NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY,
+					.len = SAE_PASSWORD_MAX_LEN },
 };
 
 /* policy for the key attributes */
@@ -4434,6 +4436,8 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
 		return true;
 	case NL80211_CMD_CONNECT:
 		if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
+		    !wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_SAE_OFFLOAD) &&
 		    auth_type == NL80211_AUTHTYPE_SAE)
 			return false;
 
@@ -8973,6 +8977,16 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
 		settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]);
 	}
 
+	if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) {
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_SAE_OFFLOAD))
+			return -EINVAL;
+		settings->sae_pwd =
+			nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
+		settings->sae_pwd_len =
+			nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From d7edf40c15e85b44c4bef146819b664089b827b1 Mon Sep 17 00:00:00 2001
From: John Crispin <john@phrozen.org>
Date: Tue, 21 May 2019 17:02:58 +0200
Subject: mac80211: add ieee80211_get_he_iftype_cap() helper

This function is similar to ieee80211_get_he_sta_cap() but allows passing
the iftype. Also make ieee80211_get_he_sta_cap() use the new helper
rather than duplicating the code.

Signed-off-by: Shashidhar Lakkavalli <slakkavalli@datto.com>
Signed-off-by: John Crispin <john@phrozen.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 4b45056dbb25..20613b35afcd 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -382,16 +382,18 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband,
 }
 
 /**
- * ieee80211_get_he_sta_cap - return HE capabilities for an sband's STA
- * @sband: the sband to search for the STA on
+ * ieee80211_get_he_iftype_cap - return HE capabilities for an sband's iftype
+ * @sband: the sband to search for the iftype on
+ * @iftype: enum nl80211_iftype
  *
  * Return: pointer to the struct ieee80211_sta_he_cap, or NULL is none found
  */
 static inline const struct ieee80211_sta_he_cap *
-ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband)
+ieee80211_get_he_iftype_cap(const struct ieee80211_supported_band *sband,
+			    u8 iftype)
 {
 	const struct ieee80211_sband_iftype_data *data =
-		ieee80211_get_sband_iftype_data(sband, NL80211_IFTYPE_STATION);
+		ieee80211_get_sband_iftype_data(sband, iftype);
 
 	if (data && data->he_cap.has_he)
 		return &data->he_cap;
@@ -399,6 +401,18 @@ ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband)
 	return NULL;
 }
 
+/**
+ * ieee80211_get_he_sta_cap - return HE capabilities for an sband's STA
+ * @sband: the sband to search for the STA on
+ *
+ * Return: pointer to the struct ieee80211_sta_he_cap, or NULL is none found
+ */
+static inline const struct ieee80211_sta_he_cap *
+ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband)
+{
+	return ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_STATION);
+}
+
 /**
  * wiphy_read_of_freq_limits - read frequency limits from device tree
  *
-- 
cgit v1.2.3


From 901bb9891855164fdcfcfdd9c3d25bcc800d3f5b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 28 May 2019 10:56:03 +0200
Subject: nl80211: require and validate vendor command policy

Require that each vendor command give a policy of its sub-attributes
in NL80211_ATTR_VENDOR_DATA, and then (strictly) check the contents,
including the NLA_F_NESTED flag that we couldn't check on the outer
layer because there we don't know yet.

It is possible to use VENDOR_CMD_RAW_DATA for raw data, but then no
nested data can be given (NLA_F_NESTED flag must be clear) and the
data is just passed as is to the command.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/mac80211_hwsim.c |  2 ++
 include/net/cfg80211.h                |  8 +++++++
 include/net/netlink.h                 |  9 ++++++++
 net/wireless/core.c                   | 13 ++++++++++++
 net/wireless/nl80211.c                | 39 +++++++++++++++++++++++++++++++++--
 5 files changed, 69 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 60ca13e0f15b..b88768c661e2 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -457,6 +457,8 @@ static struct wiphy_vendor_command mac80211_hwsim_vendor_commands[] = {
 			  .subcmd = QCA_NL80211_SUBCMD_TEST },
 		.flags = WIPHY_VENDOR_CMD_NEED_NETDEV,
 		.doit = mac80211_hwsim_vendor_cmd_test,
+		.policy = hwsim_vendor_test_policy,
+		.maxattr = QCA_WLAN_VENDOR_ATTR_MAX,
 	}
 };
 
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 20613b35afcd..7c4aa868e7a5 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4170,6 +4170,8 @@ struct sta_opmode_info {
 	u8 rx_nss;
 };
 
+#define VENDOR_CMD_RAW_DATA ((const struct nla_policy *)ERR_PTR(-ENODATA))
+
 /**
  * struct wiphy_vendor_command - vendor command definition
  * @info: vendor command identifying information, as used in nl80211
@@ -4180,6 +4182,10 @@ struct sta_opmode_info {
  * @dumpit: dump callback, for transferring bigger/multiple items. The
  *	@storage points to cb->args[5], ie. is preserved over the multiple
  *	dumpit calls.
+ * @policy: policy pointer for attributes within %NL80211_ATTR_VENDOR_DATA.
+ *	Set this to %VENDOR_CMD_RAW_DATA if no policy can be given and the
+ *	attribute is just raw data (e.g. a firmware command).
+ * @maxattr: highest attribute number in policy
  * It's recommended to not have the same sub command with both @doit and
  * @dumpit, so that userspace can assume certain ones are get and others
  * are used with dump requests.
@@ -4192,6 +4198,8 @@ struct wiphy_vendor_command {
 	int (*dumpit)(struct wiphy *wiphy, struct wireless_dev *wdev,
 		      struct sk_buff *skb, const void *data, int data_len,
 		      unsigned long *storage);
+	const struct nla_policy *policy;
+	unsigned int maxattr;
 };
 
 /**
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 395b4406f4b0..28ece67f5312 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -1754,6 +1754,15 @@ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype,
 			      validate, extack);
 }
 
+static inline int
+nl80211_validate_nested(const struct nlattr *start, int maxtype,
+			const struct nla_policy *policy,
+			struct netlink_ext_ack *extack)
+{
+	return __nla_validate_nested(start, maxtype, policy,
+				     NL_VALIDATE_STRICT, extack);
+}
+
 static inline int
 nla_validate_nested_deprecated(const struct nlattr *start, int maxtype,
 			       const struct nla_policy *policy,
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 037816163e70..fba0915fbd6f 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -859,6 +859,19 @@ int wiphy_register(struct wiphy *wiphy)
 		return -EINVAL;
 	}
 
+	for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) {
+		/*
+		 * Validate we have a policy (can be explicitly set to
+		 * VENDOR_CMD_RAW_DATA which is non-NULL) and also that
+		 * we have at least one of doit/dumpit.
+		 */
+		if (WARN_ON(!rdev->wiphy.vendor_commands[i].policy))
+			return -EINVAL;
+		if (WARN_ON(!rdev->wiphy.vendor_commands[i].doit &&
+			    !rdev->wiphy.vendor_commands[i].dumpit))
+			return -EINVAL;
+	}
+
 #ifdef CONFIG_PM
 	if (WARN_ON(rdev->wiphy.wowlan && rdev->wiphy.wowlan->n_patterns &&
 		    (!rdev->wiphy.wowlan->pattern_min_len ||
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 80e514872719..34e86539552e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -12669,6 +12669,29 @@ static int nl80211_crit_protocol_stop(struct sk_buff *skb,
 	return 0;
 }
 
+static int nl80211_vendor_check_policy(const struct wiphy_vendor_command *vcmd,
+				       struct nlattr *attr,
+				       struct netlink_ext_ack *extack)
+{
+	if (vcmd->policy == VENDOR_CMD_RAW_DATA) {
+		if (attr->nla_type & NLA_F_NESTED) {
+			NL_SET_ERR_MSG_ATTR(extack, attr,
+					    "unexpected nested data");
+			return -EINVAL;
+		}
+
+		return 0;
+	}
+
+	if (!(attr->nla_type & NLA_F_NESTED)) {
+		NL_SET_ERR_MSG_ATTR(extack, attr, "expected nested data");
+		return -EINVAL;
+	}
+
+	return nl80211_validate_nested(attr, vcmd->maxattr, vcmd->policy,
+				       extack);
+}
+
 static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -12727,11 +12750,16 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
 		if (info->attrs[NL80211_ATTR_VENDOR_DATA]) {
 			data = nla_data(info->attrs[NL80211_ATTR_VENDOR_DATA]);
 			len = nla_len(info->attrs[NL80211_ATTR_VENDOR_DATA]);
+
+			err = nl80211_vendor_check_policy(vcmd,
+					info->attrs[NL80211_ATTR_VENDOR_DATA],
+					info->extack);
+			if (err)
+				return err;
 		}
 
 		rdev->cur_cmd_info = info;
-		err = rdev->wiphy.vendor_commands[i].doit(&rdev->wiphy, wdev,
-							  data, len);
+		err = vcmd->doit(&rdev->wiphy, wdev, data, len);
 		rdev->cur_cmd_info = NULL;
 		return err;
 	}
@@ -12818,6 +12846,13 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
 	if (attrbuf[NL80211_ATTR_VENDOR_DATA]) {
 		data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]);
 		data_len = nla_len(attrbuf[NL80211_ATTR_VENDOR_DATA]);
+
+		err = nl80211_vendor_check_policy(
+				&(*rdev)->wiphy.vendor_commands[vcmd_idx],
+				attrbuf[NL80211_ATTR_VENDOR_DATA],
+				cb->extack);
+		if (err)
+			return err;
 	}
 
 	/* 0 is the first index - add 1 to parse only once */
-- 
cgit v1.2.3


From c9d3245e03ce20566e373b68dd24a7f2365d8dda Mon Sep 17 00:00:00 2001
From: John Crispin <john@phrozen.org>
Date: Tue, 28 May 2019 13:49:47 +0200
Subject: mac80211: dynamically enable the TWT requester support on STA
 interfaces

Turn TWT on or off for STA interfaces when they associate and/or receive
a beacon where the twt_responder bit has changed.

Signed-off-by: Shashidhar Lakkavalli <slakkavalli@datto.com>
Signed-off-by: John Crispin <john@phrozen.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  2 ++
 net/mac80211/mlme.c    | 18 ++++++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index b7f0b56a09f4..e8fdb786b228 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -317,6 +317,7 @@ struct ieee80211_vif_chanctx_switch {
  * @BSS_CHANGED_MCAST_RATE: Multicast Rate setting changed for this interface
  * @BSS_CHANGED_FTM_RESPONDER: fime timing reasurement request responder
  *	functionality changed for this BSS (AP mode).
+ * @BSS_CHANGED_TWT: TWT status changed
  *
  */
 enum ieee80211_bss_change {
@@ -347,6 +348,7 @@ enum ieee80211_bss_change {
 	BSS_CHANGED_KEEP_ALIVE		= 1<<24,
 	BSS_CHANGED_MCAST_RATE		= 1<<25,
 	BSS_CHANGED_FTM_RESPONDER	= 1<<26,
+	BSS_CHANGED_TWT			= 1<<27,
 
 	/* when adding here, make sure to change ieee80211_reconfig */
 };
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index b7a9fe3d5fcb..281319c826dd 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3148,6 +3148,19 @@ static bool ieee80211_twt_req_supported(const struct sta_info *sta,
 		IEEE80211_HE_MAC_CAP0_TWT_RES;
 }
 
+static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata,
+				    struct sta_info *sta,
+				    struct ieee802_11_elems *elems)
+{
+	bool twt = ieee80211_twt_req_supported(sta, elems);
+
+	if (sdata->vif.bss_conf.twt_requester != twt) {
+		sdata->vif.bss_conf.twt_requester = twt;
+		return BSS_CHANGED_TWT;
+	}
+	return 0;
+}
+
 static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
 				    struct cfg80211_bss *cbss,
 				    struct ieee80211_mgmt *mgmt, size_t len)
@@ -3330,8 +3343,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
 						  sta);
 
 		bss_conf->he_support = sta->sta.he_cap.has_he;
-		bss_conf->twt_requester =
-			ieee80211_twt_req_supported(sta, &elems);
+		changed |= ieee80211_recalc_twt_req(sdata, sta, &elems);
 	} else {
 		bss_conf->he_support = false;
 		bss_conf->twt_requester = false;
@@ -3991,6 +4003,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 	mutex_lock(&local->sta_mtx);
 	sta = sta_info_get(sdata, bssid);
 
+	changed |= ieee80211_recalc_twt_req(sdata, sta, &elems);
+
 	if (ieee80211_config_bw(sdata, sta,
 				elems.ht_cap_elem, elems.ht_operation,
 				elems.vht_operation, elems.he_operation,
-- 
cgit v1.2.3


From a0de1ca383c77a1ae123d7c0cea45e327b61876a Mon Sep 17 00:00:00 2001
From: John Crispin <john@phrozen.org>
Date: Tue, 28 May 2019 13:49:48 +0200
Subject: mac80211: allow turning TWT responder support on and off via netlink

Allow the userland daemon to en/disable TWT support for an AP.

Signed-off-by: Shashidhar Lakkavalli <slakkavalli@datto.com>
Signed-off-by: John Crispin <john@phrozen.org>
[simplify parsing code]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 2 ++
 include/net/mac80211.h       | 3 +++
 include/uapi/linux/nl80211.h | 4 ++++
 net/mac80211/cfg.c           | 4 +++-
 net/wireless/nl80211.c       | 4 ++++
 5 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7c4aa868e7a5..ac758a54e971 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -897,6 +897,7 @@ enum cfg80211_ap_settings_flags {
  * @he_cap: HE capabilities (or %NULL if HE isn't enabled)
  * @ht_required: stations must support HT
  * @vht_required: stations must support VHT
+ * @twt_responder: Enable Target Wake Time (TWT) responder support
  * @flags: flags, as defined in enum cfg80211_ap_settings_flags
  */
 struct cfg80211_ap_settings {
@@ -923,6 +924,7 @@ struct cfg80211_ap_settings {
 	const struct ieee80211_vht_cap *vht_cap;
 	const struct ieee80211_he_cap_elem *he_cap;
 	bool ht_required, vht_required;
+	bool twt_responder;
 	u32 flags;
 };
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index e8fdb786b228..ed4911306f03 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -506,6 +506,8 @@ struct ieee80211_ftm_responder_params {
  * @he_support: does this BSS support HE
  * @twt_requester: does this BSS support TWT requester (relevant for managed
  *	mode only, set if the AP advertises TWT responder role)
+ * @twt_responder: does this BSS act as TWT responder (set in AP mode from
+ *	the twt_responder setting passed when the AP is started)
  * @assoc: association status
  * @ibss_joined: indicates whether this station is part of an IBSS
  *	or not
@@ -613,6 +615,7 @@ struct ieee80211_bss_conf {
 	u16 frame_time_rts_th;
 	bool he_support;
 	bool twt_requester;
+	bool twt_responder;
 	/* association related data */
 	bool assoc, ibss_joined;
 	bool ibss_creator;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 8b1e43fecd25..8fc3a43cac75 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2354,6 +2354,8 @@ enum nl80211_commands {
  *	is used with %NL80211_CMD_CONNECT to provide password for offloading
  *	SAE authentication for WPA3-Personal networks.
  *
+ * @NL80211_ATTR_TWT_RESPONDER: Enable Target Wake Time responder support.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2809,6 +2811,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_SAE_PASSWORD,
 
+	NL80211_ATTR_TWT_RESPONDER,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 52e6a091b7e4..023e8751d223 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -975,7 +975,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 		      BSS_CHANGED_BEACON |
 		      BSS_CHANGED_SSID |
 		      BSS_CHANGED_P2P_PS |
-		      BSS_CHANGED_TXPOWER;
+		      BSS_CHANGED_TXPOWER |
+		      BSS_CHANGED_TWT;
 	int err;
 	int prev_beacon_int;
 
@@ -1045,6 +1046,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	sdata->vif.bss_conf.dtim_period = params->dtim_period;
 	sdata->vif.bss_conf.enable_beacon = true;
 	sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p;
+	sdata->vif.bss_conf.twt_responder = params->twt_responder;
 
 	sdata->vif.bss_conf.ssid_len = params->ssid_len;
 	if (params->ssid_len)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 34e86539552e..68e5ab5394dd 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -573,6 +573,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1),
 	[NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY,
 					.len = SAE_PASSWORD_MAX_LEN },
+	[NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -4628,6 +4629,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			return PTR_ERR(params.acl);
 	}
 
+	params.twt_responder =
+		    nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]);
+
 	nl80211_calculate_ap_params(&params);
 
 	if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])
-- 
cgit v1.2.3


From 4770c8f902285089ae1911de39808c808766a115 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Wed, 29 May 2019 15:25:32 +0300
Subject: cfg80211: Add a function to iterate all BSS entries

Add a function that iterates over the BSS entries associated with a
given wiphy and calls a callback for each iterated BSS. This can be
used by drivers in various ways, e.g., to evaluate some property for
all the BSSs in the medium.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 20 ++++++++++++++++++++
 net/wireless/scan.c    | 21 +++++++++++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index ac758a54e971..4cd2857c06a4 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5750,6 +5750,26 @@ void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);
  */
 void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);
 
+/**
+ * cfg80211_bss_iter - iterate all BSS entries
+ *
+ * This function iterates over the BSS entries associated with the given wiphy
+ * and calls the callback for the iterated BSS. The iterator function is not
+ * allowed to call functions that might modify the internal state of the BSS DB.
+ *
+ * @wiphy: the wiphy
+ * @chandef: if given, the iterator function will be called only if the channel
+ *     of the currently iterated BSS is a subset of the given channel.
+ * @iter: the iterator function to call
+ * @iter_data: an argument to the iterator function
+ */
+void cfg80211_bss_iter(struct wiphy *wiphy,
+		       struct cfg80211_chan_def *chandef,
+		       void (*iter)(struct wiphy *wiphy,
+				    struct cfg80211_bss *bss,
+				    void *data),
+		       void *iter_data);
+
 static inline enum nl80211_bss_scan_width
 cfg80211_chandef_to_scan_width(const struct cfg80211_chan_def *chandef)
 {
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index f347387f195a..dc1ba21428dd 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1974,6 +1974,27 @@ out:
 }
 EXPORT_SYMBOL(cfg80211_unlink_bss);
 
+void cfg80211_bss_iter(struct wiphy *wiphy,
+		       struct cfg80211_chan_def *chandef,
+		       void (*iter)(struct wiphy *wiphy,
+				    struct cfg80211_bss *bss,
+				    void *data),
+		       void *iter_data)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+	struct cfg80211_internal_bss *bss;
+
+	spin_lock_bh(&rdev->bss_lock);
+
+	list_for_each_entry(bss, &rdev->bss_list, list) {
+		if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel))
+			iter(wiphy, &bss->pub, iter_data);
+	}
+
+	spin_unlock_bh(&rdev->bss_lock);
+}
+EXPORT_SYMBOL(cfg80211_bss_iter);
+
 #ifdef CONFIG_CFG80211_WEXT
 static struct cfg80211_registered_device *
 cfg80211_get_dev_from_ifindex(struct net *net, int ifindex)
-- 
cgit v1.2.3


From cd6f34110285742ec5570f07aa2229e29f4d2092 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Wed, 29 May 2019 15:25:33 +0300
Subject: ieee80211: Add a missing extended capability flag definition

Add the "OBSS Narrow Bandwidth RU In OFDMA Tolerance Support" flag
definition to the definitions of the flags covered by the Extended
Capability IE.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 5dfd949ade25..2dbefeffc43c 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2713,6 +2713,13 @@ enum ieee80211_tdls_actioncode {
 #define WLAN_EXT_CAPA10_TWT_REQUESTER_SUPPORT	BIT(5)
 #define WLAN_EXT_CAPA10_TWT_RESPONDER_SUPPORT	BIT(6)
 
+/*
+ * When set, indicates that the AP is able to tolerate 26-tone RU UL
+ * OFDMA transmissions using HE TB PPDU from OBSS (not falsely classify the
+ * 26-tone RU UL OFDMA transmissions as radar pulses).
+ */
+#define WLAN_EXT_CAPA10_OBSS_NARROW_BW_RU_TOLERANCE_SUPPORT BIT(7)
+
 /* Defines support for enhanced multi-bssid advertisement*/
 #define WLAN_EXT_CAPA11_EMA_SUPPORT	BIT(1)
 
-- 
cgit v1.2.3


From 1e87fec9fa52a6f7c223998d6bfbd3464eb37e31 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 16 May 2019 11:44:52 +0200
Subject: mac80211: call rate_control_send_low() internally

There's no rate control algorithm that *doesn't* want to call
it internally, and calling it internally will let us modify
its behaviour in the future.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 .../driver-api/80211/mac80211-advanced.rst         |  3 ---
 drivers/net/wireless/intel/iwlegacy/3945-rs.c      |  3 ---
 drivers/net/wireless/intel/iwlegacy/4965-rs.c      |  4 ----
 drivers/net/wireless/intel/iwlwifi/dvm/rs.c        |  4 ----
 drivers/net/wireless/intel/iwlwifi/mvm/rs.c        |  4 ----
 drivers/net/wireless/realtek/rtlwifi/rc.c          |  3 ---
 include/net/mac80211.h                             | 23 ----------------------
 net/mac80211/rate.c                                | 13 ++++++------
 net/mac80211/rc80211_minstrel.c                    |  4 ----
 net/mac80211/rc80211_minstrel_ht.c                 |  3 ---
 10 files changed, 7 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/80211/mac80211-advanced.rst b/Documentation/driver-api/80211/mac80211-advanced.rst
index 70a89b2163c2..9f1c5bb7ac35 100644
--- a/Documentation/driver-api/80211/mac80211-advanced.rst
+++ b/Documentation/driver-api/80211/mac80211-advanced.rst
@@ -226,9 +226,6 @@ TBD
 .. kernel-doc:: include/net/mac80211.h
    :functions: ieee80211_tx_rate_control
 
-.. kernel-doc:: include/net/mac80211.h
-   :functions: rate_control_send_low
-
 TBD
 
 This part of the book describes mac80211 internals.
diff --git a/drivers/net/wireless/intel/iwlegacy/3945-rs.c b/drivers/net/wireless/intel/iwlegacy/3945-rs.c
index a697edd46e7f..922f09f7ea3e 100644
--- a/drivers/net/wireless/intel/iwlegacy/3945-rs.c
+++ b/drivers/net/wireless/intel/iwlegacy/3945-rs.c
@@ -646,9 +646,6 @@ il3945_rs_get_rate(void *il_r, struct ieee80211_sta *sta, void *il_sta,
 		il_sta = NULL;
 	}
 
-	if (rate_control_send_low(sta, il_sta, txrc))
-		return;
-
 	rate_mask = sta->supp_rates[sband->band];
 
 	/* get user max rate if set */
diff --git a/drivers/net/wireless/intel/iwlegacy/4965-rs.c b/drivers/net/wireless/intel/iwlegacy/4965-rs.c
index 54ff83829afb..946f352fd9a4 100644
--- a/drivers/net/wireless/intel/iwlegacy/4965-rs.c
+++ b/drivers/net/wireless/intel/iwlegacy/4965-rs.c
@@ -2224,10 +2224,6 @@ il4965_rs_get_rate(void *il_r, struct ieee80211_sta *sta, void *il_sta,
 		il_sta = NULL;
 	}
 
-	/* Send management frames and NO_ACK data using lowest rate. */
-	if (rate_control_send_low(sta, il_sta, txrc))
-		return;
-
 	if (!lq_sta)
 		return;
 
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/rs.c b/drivers/net/wireless/intel/iwlwifi/dvm/rs.c
index ef4b9de256f7..838e76a5db68 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/rs.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/rs.c
@@ -2731,10 +2731,6 @@ static void rs_get_rate(void *priv_r, struct ieee80211_sta *sta, void *priv_sta,
 		priv_sta = NULL;
 	}
 
-	/* Send management frames and NO_ACK data using lowest rate. */
-	if (rate_control_send_low(sta, priv_sta, txrc))
-		return;
-
 	rate_idx  = lq_sta->last_txrate_idx;
 
 	if (lq_sta->last_rate_n_flags & RATE_MCS_HT_MSK) {
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
index c182821ab22b..9107b1698b0f 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
@@ -2960,10 +2960,6 @@ static void rs_drv_get_rate(void *mvm_r, struct ieee80211_sta *sta,
 		mvm_sta = NULL;
 	}
 
-	/* Send management frames and NO_ACK data using lowest rate. */
-	if (rate_control_send_low(sta, mvm_sta, txrc))
-		return;
-
 	if (!mvm_sta)
 		return;
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/rc.c b/drivers/net/wireless/realtek/rtlwifi/rc.c
index cf8e42a01015..0c7d74902d33 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rc.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rc.c
@@ -173,9 +173,6 @@ static void rtl_get_rate(void *ppriv, struct ieee80211_sta *sta,
 	u8 try_per_rate, i, rix;
 	bool not_data = !ieee80211_is_data(fc);
 
-	if (rate_control_send_low(sta, priv_sta, txrc))
-		return;
-
 	rix = _rtl_rc_get_highest_rix(rtlpriv, sta, skb, not_data);
 	try_per_rate = 1;
 	_rtl_rc_rate_set_series(rtlpriv, sta, &rates[0], txrc,
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ed4911306f03..4411120e5a9a 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5960,29 +5960,6 @@ static inline int rate_supported(struct ieee80211_sta *sta,
 	return (sta == NULL || sta->supp_rates[band] & BIT(index));
 }
 
-/**
- * rate_control_send_low - helper for drivers for management/no-ack frames
- *
- * Rate control algorithms that agree to use the lowest rate to
- * send management frames and NO_ACK data with the respective hw
- * retries should use this in the beginning of their mac80211 get_rate
- * callback. If true is returned the rate control can simply return.
- * If false is returned we guarantee that sta and sta and priv_sta is
- * not null.
- *
- * Rate control algorithms wishing to do more intelligent selection of
- * rate for multicast/broadcast frames may choose to not use this.
- *
- * @sta: &struct ieee80211_sta pointer to the target destination. Note
- * 	that this may be null.
- * @priv_sta: private rate control structure. This may be null.
- * @txrc: rate control information we sholud populate for mac80211.
- */
-bool rate_control_send_low(struct ieee80211_sta *sta,
-			   void *priv_sta,
-			   struct ieee80211_tx_rate_control *txrc);
-
-
 static inline s8
 rate_lowest_index(struct ieee80211_supported_band *sband,
 		  struct ieee80211_sta *sta)
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index 76f303fda3ed..09f89d004a70 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -369,9 +369,8 @@ static void __rate_control_send_low(struct ieee80211_hw *hw,
 }
 
 
-bool rate_control_send_low(struct ieee80211_sta *pubsta,
-			   void *priv_sta,
-			   struct ieee80211_tx_rate_control *txrc)
+static bool rate_control_send_low(struct ieee80211_sta *pubsta,
+				  struct ieee80211_tx_rate_control *txrc)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
 	struct ieee80211_supported_band *sband = txrc->sband;
@@ -379,7 +378,7 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta,
 	int mcast_rate;
 	bool use_basicrate = false;
 
-	if (!pubsta || !priv_sta || rc_no_data_or_no_ack_use_min(txrc)) {
+	if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) {
 		__rate_control_send_low(txrc->hw, sband, pubsta, info,
 					txrc->rate_idx_mask);
 
@@ -405,7 +404,6 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta,
 	}
 	return false;
 }
-EXPORT_SYMBOL(rate_control_send_low);
 
 static bool rate_idx_match_legacy_mask(s8 *rate_idx, int n_bitrates, u32 mask)
 {
@@ -902,12 +900,15 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
 	if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL))
 		return;
 
+	if (rate_control_send_low(ista, txrc))
+		return;
+
 	if (ista) {
 		spin_lock_bh(&sta->rate_ctrl_lock);
 		ref->ops->get_rate(ref->priv, ista, priv_sta, txrc);
 		spin_unlock_bh(&sta->rate_ctrl_lock);
 	} else {
-		ref->ops->get_rate(ref->priv, NULL, NULL, txrc);
+		rate_control_send_low(NULL, txrc);
 	}
 
 	if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE))
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index a34e9c2ca626..ee86c3333999 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -340,10 +340,6 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
 	int delta;
 	int sampling_ratio;
 
-	/* management/no-ack frames do not use rate control */
-	if (rate_control_send_low(sta, priv_sta, txrc))
-		return;
-
 	/* check multi-rate-retry capabilities & adjust lookaround_rate */
 	mrr_capable = mp->has_mrr &&
 		      !txrc->rts &&
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 8b168724c5e7..da18c6fb6c1d 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -1098,9 +1098,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
 	struct minstrel_priv *mp = priv;
 	int sample_idx;
 
-	if (rate_control_send_low(sta, priv_sta, txrc))
-		return;
-
 	if (!msp->is_ht)
 		return mac80211_minstrel.get_rate(priv, sta, &msp->legacy, txrc);
 
-- 
cgit v1.2.3


From 1c38c7f22068b54a7ba5f026a45663c6727ab84c Mon Sep 17 00:00:00 2001
From: James Prestwood <james.prestwood@linux.intel.com>
Date: Wed, 12 Jun 2019 12:35:09 -0700
Subject: nl80211: send event when CMD_FRAME duration expires

cfg80211_remain_on_channel_expired is used to notify userspace when
the remain on channel duration expired by sending an event. There is
no such equivalent to CMD_FRAME, where if offchannel and a duration
is provided, the card will go offchannel for that duration. Currently
there is no way for userspace to tell when that duration expired
apart from setting an independent timeout. This timeout is quite
erroneous as the kernel may not immediately send out the frame
because of scheduling or work queue delays. In testing, it was found
this timeout had to be quite large to accommodate any potential delays.

A better solution is to have the kernel send an event when this
duration has expired. There is already NL80211_CMD_FRAME_WAIT_CANCEL
which can be used to cancel a NL80211_CMD_FRAME offchannel. Using this
command matches perfectly to how NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL
works, where it's both used to cancel and notify if the duration has
expired.

Signed-off-by: James Prestwood <james.prestwood@linux.intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 10 ++++++++++
 net/wireless/nl80211.c | 13 +++++++++++++
 net/wireless/trace.h   | 18 ++++++++++++++++++
 3 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 4cd2857c06a4..2d17e32eb438 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6547,6 +6547,16 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie,
 					struct ieee80211_channel *chan,
 					gfp_t gfp);
 
+/**
+ * cfg80211_tx_mgmt_expired - tx_mgmt duration expired
+ * @wdev: wireless device
+ * @cookie: the requested cookie
+ * @chan: The current channel (from tx_mgmt request)
+ * @gfp: allocation flags
+ */
+void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie,
+			      struct ieee80211_channel *chan, gfp_t gfp);
+
 /**
  * cfg80211_sinfo_alloc_tid_stats - allocate per-tid statistics.
  *
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 68e5ab5394dd..ff760ba83449 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -15417,6 +15417,19 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie,
 }
 EXPORT_SYMBOL(cfg80211_remain_on_channel_expired);
 
+void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie,
+					struct ieee80211_channel *chan,
+					gfp_t gfp)
+{
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+	trace_cfg80211_tx_mgmt_expired(wdev, cookie, chan);
+	nl80211_send_remain_on_chan_event(NL80211_CMD_FRAME_WAIT_CANCEL,
+					  rdev, wdev, cookie, chan, 0, gfp);
+}
+EXPORT_SYMBOL(cfg80211_tx_mgmt_expired);
+
 void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
 		      struct station_info *sinfo, gfp_t gfp)
 {
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 2abfff925aac..4fbb91a511ae 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2752,6 +2752,24 @@ TRACE_EVENT(cfg80211_ready_on_channel_expired,
 		  WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG)
 );
 
+TRACE_EVENT(cfg80211_tx_mgmt_expired,
+	TP_PROTO(struct wireless_dev *wdev, u64 cookie,
+		 struct ieee80211_channel *chan),
+	TP_ARGS(wdev, cookie, chan),
+	TP_STRUCT__entry(
+		WDEV_ENTRY
+		__field(u64, cookie)
+		CHAN_ENTRY
+	),
+	TP_fast_assign(
+		WDEV_ASSIGN;
+		__entry->cookie = cookie;
+		CHAN_ASSIGN(chan);
+	),
+	TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT,
+		  WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG)
+);
+
 TRACE_EVENT(cfg80211_new_sta,
 	TP_PROTO(struct net_device *netdev, const u8 *mac_addr,
 		 struct station_info *sinfo),
-- 
cgit v1.2.3


From 99f3a064bc2e4bd5fe50218646c5be342f2ad18c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 13 Jun 2019 15:00:01 -0700
Subject: bpf: net: Add SO_DETACH_REUSEPORT_BPF

There is SO_ATTACH_REUSEPORT_[CE]BPF but there is no DETACH.
This patch adds SO_DETACH_REUSEPORT_BPF sockopt.  The same
sockopt can be used to undo both SO_ATTACH_REUSEPORT_[CE]BPF.

reuseport_detach_prog() is added and it is mostly a mirror
of the existing reuseport_attach_prog().  The differences are,
it does not call reuseport_alloc() and returns -ENOENT when
there is no old prog.

Cc: Craig Gallek <kraig@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/alpha/include/uapi/asm/socket.h  |  2 ++
 arch/mips/include/uapi/asm/socket.h   |  2 ++
 arch/parisc/include/uapi/asm/socket.h |  2 ++
 arch/sparc/include/uapi/asm/socket.h  |  2 ++
 include/net/sock_reuseport.h          |  2 ++
 include/uapi/asm-generic/socket.h     |  2 ++
 net/core/sock.c                       |  4 ++++
 net/core/sock_reuseport.c             | 24 ++++++++++++++++++++++++
 8 files changed, 40 insertions(+)

(limited to 'include')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 976e89b116e5..de6c4df61082 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -122,6 +122,8 @@
 #define SO_RCVTIMEO_NEW         66
 #define SO_SNDTIMEO_NEW         67
 
+#define SO_DETACH_REUSEPORT_BPF 68
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index d41765cfbc6e..d0a9ed2ca2d6 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -133,6 +133,8 @@
 #define SO_RCVTIMEO_NEW         66
 #define SO_SNDTIMEO_NEW         67
 
+#define SO_DETACH_REUSEPORT_BPF 68
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 66c5dd245ac7..10173c32195e 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -114,6 +114,8 @@
 #define SO_RCVTIMEO_NEW         0x4040
 #define SO_SNDTIMEO_NEW         0x4041
 
+#define SO_DETACH_REUSEPORT_BPF 0x4042
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 9265a9eece15..8029b681fc7c 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -115,6 +115,8 @@
 #define SO_RCVTIMEO_NEW          0x0044
 #define SO_SNDTIMEO_NEW          0x0045
 
+#define SO_DETACH_REUSEPORT_BPF  0x0047
+
 #if !defined(__KERNEL__)
 
 
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 8a5f70c7cdf2..d9112de85261 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -35,6 +35,8 @@ extern struct sock *reuseport_select_sock(struct sock *sk,
 					  struct sk_buff *skb,
 					  int hdr_len);
 extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
+extern int reuseport_detach_prog(struct sock *sk);
+
 int reuseport_get_id(struct sock_reuseport *reuse);
 
 #endif  /* _SOCK_REUSEPORT_H */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 8c1391c89171..77f7c1638eb1 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -117,6 +117,8 @@
 #define SO_RCVTIMEO_NEW         66
 #define SO_SNDTIMEO_NEW         67
 
+#define SO_DETACH_REUSEPORT_BPF 68
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/net/core/sock.c b/net/core/sock.c
index 75b1c950b49f..06be30737b69 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1045,6 +1045,10 @@ set_rcvbuf:
 		}
 		break;
 
+	case SO_DETACH_REUSEPORT_BPF:
+		ret = reuseport_detach_prog(sk);
+		break;
+
 	case SO_DETACH_FILTER:
 		ret = sk_detach_filter(sk);
 		break;
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index dc4aefdf2a08..9408f9264d05 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -332,3 +332,27 @@ int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
 	return 0;
 }
 EXPORT_SYMBOL(reuseport_attach_prog);
+
+int reuseport_detach_prog(struct sock *sk)
+{
+	struct sock_reuseport *reuse;
+	struct bpf_prog *old_prog;
+
+	if (!rcu_access_pointer(sk->sk_reuseport_cb))
+		return sk->sk_reuseport ? -ENOENT : -EINVAL;
+
+	old_prog = NULL;
+	spin_lock_bh(&reuseport_lock);
+	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+					  lockdep_is_held(&reuseport_lock));
+	rcu_swap_protected(reuse->prog, old_prog,
+			   lockdep_is_held(&reuseport_lock));
+	spin_unlock_bh(&reuseport_lock);
+
+	if (!old_prog)
+		return -ENOENT;
+
+	sk_reuseport_prog_free(old_prog);
+	return 0;
+}
+EXPORT_SYMBOL(reuseport_detach_prog);
-- 
cgit v1.2.3


From fb85c4a730af221339c1dde1a434b73da0dfc3ed Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 12 Jun 2019 10:30:37 -0700
Subject: bpf: export bpf_sock for BPF_PROG_TYPE_CGROUP_SOCK_ADDR prog type

And let it use bpf_sk_storage_{get,delete} helpers to access socket
storage. Kernel context (struct bpf_sock_addr_kern) already has sk
member, so I just expose it to the BPF hooks. Using PTR_TO_SOCKET
instead of PTR_TO_SOCK_COMMON should be safe because the hook is
called on bind/connect.

Cc: Martin Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  1 +
 net/core/filter.c        | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ae0907d8c03a..8815fc418cde 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3247,6 +3247,7 @@ struct bpf_sock_addr {
 	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read an 4-byte write.
 				 * Stored in network byte order.
 				 */
+	__bpf_md_ptr(struct bpf_sock *, sk);
 };
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/net/core/filter.c b/net/core/filter.c
index a5e4ac7fcbe5..37c4a2fd559b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5922,6 +5922,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skc_lookup_tcp:
 		return &bpf_sock_addr_skc_lookup_tcp_proto;
 #endif /* CONFIG_INET */
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6828,6 +6832,13 @@ static bool sock_addr_is_valid_access(int off, int size,
 		if (size != size_default)
 			return false;
 		break;
+	case offsetof(struct bpf_sock_addr, sk):
+		if (type != BPF_READ)
+			return false;
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_SOCKET;
+		break;
 	default:
 		if (type == BPF_READ) {
 			if (size != size_default)
@@ -7778,6 +7789,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
 			struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
 			s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
 		break;
+	case offsetof(struct bpf_sock_addr, sk):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_addr_kern, sk));
+		break;
 	}
 
 	return insn - insn_buf;
-- 
cgit v1.2.3


From 1314ef561102e534e14cb1d37f89f5c1df0b2ea7 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 12 Jun 2019 10:30:38 -0700
Subject: bpf: export bpf_sock for BPF_PROG_TYPE_SOCK_OPS prog type

And let it use bpf_sk_storage_{get,delete} helpers to access socket
storage. Kernel context (struct bpf_sock_ops_kern) already has sk
member, so I just expose it to the BPF hooks. I use
PTR_TO_SOCKET_OR_NULL and return NULL in !is_fullsock case.

I also export bpf_tcp_sock to make it possible to access tcp socket stats.

Cc: Martin Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  1 +
 net/core/filter.c        | 26 ++++++++++++++++++++++++++
 2 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8815fc418cde..d0a23476f887 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3299,6 +3299,7 @@ struct bpf_sock_ops {
 	__u32 sk_txhash;
 	__u64 bytes_received;
 	__u64 bytes_acked;
+	__bpf_md_ptr(struct bpf_sock *, sk);
 };
 
 /* Definitions for bpf_sock_ops_cb_flags */
diff --git a/net/core/filter.c b/net/core/filter.c
index 37c4a2fd559b..8c18f2781afa 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6147,6 +6147,14 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_local_storage_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_sockopt_event_output_proto;
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_INET
+	case BPF_FUNC_tcp_sock:
+		return &bpf_tcp_sock_proto;
+#endif /* CONFIG_INET */
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6882,6 +6890,11 @@ static bool sock_ops_is_valid_access(int off, int size,
 			if (size != sizeof(__u64))
 				return false;
 			break;
+		case offsetof(struct bpf_sock_ops, sk):
+			if (size != sizeof(__u64))
+				return false;
+			info->reg_type = PTR_TO_SOCKET_OR_NULL;
+			break;
 		default:
 			if (size != size_default)
 				return false;
@@ -8053,6 +8066,19 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
 					  struct sock, type);
 		break;
+	case offsetof(struct bpf_sock_ops, sk):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern,
+						is_fullsock),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern,
+					       is_fullsock));
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
cgit v1.2.3


From 7f94208c8f9a0a6d2ff0e0c0858c00ad8e5c8617 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 12 Jun 2019 17:18:47 +0800
Subject: bpf: Fix build error without CONFIG_INET

If CONFIG_INET is not set, building fails:

kernel/bpf/verifier.o: In function `check_mem_access':
verifier.c: undefined reference to `bpf_xdp_sock_is_valid_access'
kernel/bpf/verifier.o: In function `convert_ctx_accesses':
verifier.c: undefined reference to `bpf_xdp_sock_convert_ctx_access'

Reported-by: Hulk Robot <hulkci@huawei.com>
Fixes: fada7fdc83c0 ("bpf: Allow bpf_map_lookup_elem() on an xskmap")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1fe137afa898..b15fb5fcb741 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -728,13 +728,6 @@ void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
 void __cpu_map_flush(struct bpf_map *map);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
-bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
-				  struct bpf_insn_access_aux *info);
-u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
-				    const struct bpf_insn *si,
-				    struct bpf_insn *insn_buf,
-				    struct bpf_prog *prog,
-				    u32 *target_size);
 
 /* Return map's numa specified by userspace */
 static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
@@ -1110,6 +1103,15 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
 				    struct bpf_insn *insn_buf,
 				    struct bpf_prog *prog,
 				    u32 *target_size);
+
+bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+				  struct bpf_insn_access_aux *info);
+
+u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+				    const struct bpf_insn *si,
+				    struct bpf_insn *insn_buf,
+				    struct bpf_prog *prog,
+				    u32 *target_size);
 #else
 static inline bool bpf_tcp_sock_is_valid_access(int off, int size,
 						enum bpf_access_type type,
@@ -1126,6 +1128,21 @@ static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
 {
 	return 0;
 }
+static inline bool bpf_xdp_sock_is_valid_access(int off, int size,
+						enum bpf_access_type type,
+						struct bpf_insn_access_aux *info)
+{
+	return false;
+}
+
+static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+						  const struct bpf_insn *si,
+						  struct bpf_insn *insn_buf,
+						  struct bpf_prog *prog,
+						  u32 *target_size)
+{
+	return 0;
+}
 #endif /* CONFIG_INET */
 
 #endif /* _LINUX_BPF_H */
-- 
cgit v1.2.3


From 7c86f20d15b7c1132e0c24358ce240ba4cb002b7 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Wed, 12 Jun 2019 21:31:15 +0200
Subject: net: stmmac: use GPIO descriptors in stmmac_mdio_reset

Switch stmmac_mdio_reset to use GPIO descriptors. GPIO core handles the
"snps,reset-gpio" for GPIO descriptors so we don't need to take care of
it inside the driver anymore.

The advantage of this is that we now preserve the GPIO flags which are
passed via devicetree. This is required on some newer Amlogic boards
which use an Open Drain pin for the reset GPIO. This pin can only output
a LOW signal or switch to input mode but it cannot output a HIGH signal.
There are already devicetree bindings for these special cases and GPIO
core already takes care of them but only if we use GPIO descriptors
instead of GPIO numbers.

Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 27 +++++++++++------------
 include/linux/stmmac.h                            |  2 +-
 2 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 093a223fe408..f1c39dd048e7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -20,11 +20,11 @@
   Maintainer: Giuseppe Cavallaro <peppe.cavallaro@st.com>
 *******************************************************************************/
 
+#include <linux/gpio/consumer.h>
 #include <linux/io.h>
 #include <linux/iopoll.h>
 #include <linux/mii.h>
 #include <linux/of.h>
-#include <linux/of_gpio.h>
 #include <linux/of_mdio.h>
 #include <linux/phy.h>
 #include <linux/slab.h>
@@ -251,37 +251,36 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 
 #ifdef CONFIG_OF
 	if (priv->device->of_node) {
+		struct gpio_desc *reset_gpio;
+
 		if (data->reset_gpio < 0) {
 			struct device_node *np = priv->device->of_node;
 
 			if (!np)
 				return 0;
 
-			data->reset_gpio = of_get_named_gpio(np,
-						"snps,reset-gpio", 0);
-			if (data->reset_gpio < 0)
-				return 0;
+			reset_gpio = devm_gpiod_get_optional(priv->device,
+							     "snps,reset",
+							     GPIOD_OUT_LOW);
+			if (IS_ERR(reset_gpio))
+				return PTR_ERR(reset_gpio);
 
-			data->active_low = of_property_read_bool(np,
-						"snps,reset-active-low");
 			of_property_read_u32_array(np,
 				"snps,reset-delays-us", data->delays, 3);
+		} else {
+			reset_gpio = gpio_to_desc(data->reset_gpio);
 
-			if (devm_gpio_request(priv->device, data->reset_gpio,
-					      "mdio-reset"))
-				return 0;
+			gpiod_direction_output(reset_gpio, 0);
 		}
 
-		gpio_direction_output(data->reset_gpio,
-				      data->active_low ? 1 : 0);
 		if (data->delays[0])
 			msleep(DIV_ROUND_UP(data->delays[0], 1000));
 
-		gpio_set_value(data->reset_gpio, data->active_low ? 0 : 1);
+		gpiod_set_value_cansleep(reset_gpio, 1);
 		if (data->delays[1])
 			msleep(DIV_ROUND_UP(data->delays[1], 1000));
 
-		gpio_set_value(data->reset_gpio, data->active_low ? 1 : 0);
+		gpiod_set_value_cansleep(reset_gpio, 0);
 		if (data->delays[2])
 			msleep(DIV_ROUND_UP(data->delays[2], 1000));
 	}
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 4335bd771ce5..816edb545592 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -97,7 +97,7 @@ struct stmmac_mdio_bus_data {
 	int *irqs;
 	int probed_phy_irq;
 #ifdef CONFIG_OF
-	int reset_gpio, active_low;
+	int reset_gpio;
 	u32 delays[3];
 #endif
 };
-- 
cgit v1.2.3


From f01c373fbeed9f5870bb056b65750ccef42f1f20 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 13 Jun 2019 11:08:15 -0400
Subject: locking/static_key: always define static_branch_deferred_inc

This interface is currently only defined if CONFIG_JUMP_LABEL is set.
Make it available also when jump labels are off.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/jump_label_ratelimit.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h
index 42710d5949ba..8c3ee291b2d8 100644
--- a/include/linux/jump_label_ratelimit.h
+++ b/include/linux/jump_label_ratelimit.h
@@ -60,8 +60,6 @@ extern void jump_label_update_timeout(struct work_struct *work);
 						   0),			\
 	}
 
-#define static_branch_deferred_inc(x)	static_branch_inc(&(x)->key)
-
 #else	/* !CONFIG_JUMP_LABEL */
 struct static_key_deferred {
 	struct static_key  key;
@@ -95,4 +93,7 @@ jump_label_rate_limit(struct static_key_deferred *key,
 	STATIC_KEY_CHECK_USE(key);
 }
 #endif	/* CONFIG_JUMP_LABEL */
+
+#define static_branch_deferred_inc(x)	static_branch_inc(&(x)->key)
+
 #endif	/* _LINUX_JUMP_LABEL_RATELIMIT_H */
-- 
cgit v1.2.3


From d6fb396cfaa71afc9f38d573b8ec6409fe3716de Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 13 Jun 2019 21:22:35 -0700
Subject: ipv4: tcp: fix ACK/RST sent with a transmit delay

If we want to set an EDT time for the skb we want to send
via ip_send_unicast_reply(), we have to pass a new parameter
and initialize ipc.sockc.transmit_time with it.

This fixes the EDT time for ACK/RST packets sent on behalf of
a TIME_WAIT socket.

Fixes: a842fe1425cb ("tcp: add optional per socket transmit delay")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h     |  2 +-
 include/net/tcp.h    |  9 ++++++---
 net/ipv4/ip_output.c |  3 ++-
 net/ipv4/tcp_ipv4.c  | 14 +++++++++-----
 net/ipv6/tcp_ipv6.c  |  2 +-
 5 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 6dbf88ea07f1..29d89de39822 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -279,7 +279,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 			   const struct ip_options *sopt,
 			   __be32 daddr, __be32 saddr,
 			   const struct ip_reply_arg *arg,
-			   unsigned int len);
+			   unsigned int len, u64 transmit_time);
 
 #define IP_INC_STATS(net, field)	SNMP_INC_STATS64((net)->mib.ip_statistics, field)
 #define __IP_INC_STATS(net, field)	__SNMP_INC_STATS64((net)->mib.ip_statistics, field)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 49a178b8d5b2..96e0e53ff440 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2240,15 +2240,18 @@ static inline void tcp_add_tx_delay(struct sk_buff *skb,
 		skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
 }
 
-static inline void tcp_set_tx_time(struct sk_buff *skb,
-				   const struct sock *sk)
+/* Compute Earliest Departure Time for some control packets
+ * like ACK or RST for TIME_WAIT or non ESTABLISHED sockets.
+ */
+static inline u64 tcp_transmit_time(const struct sock *sk)
 {
 	if (static_branch_unlikely(&tcp_tx_delay_enabled)) {
 		u32 delay = (sk->sk_state == TCP_TIME_WAIT) ?
 			tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay;
 
-		skb->skb_mstamp_ns = tcp_clock_ns() + (u64)delay * NSEC_PER_USEC;
+		return tcp_clock_ns() + (u64)delay * NSEC_PER_USEC;
 	}
+	return 0;
 }
 
 #endif	/* _TCP_H */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f5636ab0b9c3..e0ac39072a9c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1632,7 +1632,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 			   const struct ip_options *sopt,
 			   __be32 daddr, __be32 saddr,
 			   const struct ip_reply_arg *arg,
-			   unsigned int len)
+			   unsigned int len, u64 transmit_time)
 {
 	struct ip_options_data replyopts;
 	struct ipcm_cookie ipc;
@@ -1648,6 +1648,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 
 	ipcm_init(&ipc);
 	ipc.addr = daddr;
+	ipc.sockc.transmit_time = transmit_time;
 
 	if (replyopts.opt.opt.optlen) {
 		ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1b7e9e1fbd3b..633e8244ed5b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -662,8 +662,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	int genhash;
 	struct sock *sk1 = NULL;
 #endif
-	struct net *net;
+	u64 transmit_time = 0;
 	struct sock *ctl_sk;
+	struct net *net;
 
 	/* Never send a reset in response to a reset. */
 	if (th->rst)
@@ -770,12 +771,13 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	if (sk) {
 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
-		tcp_set_tx_time(skb, sk);
+		transmit_time = tcp_transmit_time(sk);
 	}
 	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
-			      &arg, arg.iov[0].iov_len);
+			      &arg, arg.iov[0].iov_len,
+			      transmit_time);
 
 	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
@@ -810,6 +812,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	struct net *net = sock_net(sk);
 	struct ip_reply_arg arg;
 	struct sock *ctl_sk;
+	u64 transmit_time;
 
 	memset(&rep.th, 0, sizeof(struct tcphdr));
 	memset(&arg, 0, sizeof(arg));
@@ -863,11 +866,12 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
-	tcp_set_tx_time(skb, sk);
+	transmit_time = tcp_transmit_time(sk);
 	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
-			      &arg, arg.iov[0].iov_len);
+			      &arg, arg.iov[0].iov_len,
+			      transmit_time);
 
 	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5606b2131b65..408d9ec26971 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -892,7 +892,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 		} else {
 			mark = sk->sk_mark;
 		}
-		tcp_set_tx_time(buff, sk);
+		buff->tstamp = tcp_transmit_time(sk);
 	}
 	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
 	fl6.fl6_dport = t1->dest;
-- 
cgit v1.2.3


From 68b2d4a844e157c08773fcd8f412ba1a37bf45b8 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Fri, 14 Jun 2019 13:49:20 -0400
Subject: net: dsa: make cpu_dp non const

A port may trigger operations on its dedicated CPU port, so using
cpu_dp as const will raise warnings. Make cpu_dp non const.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 82a2baa2dc48..1e8650fa8acc 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -181,7 +181,7 @@ struct dsa_port {
 	struct dsa_switch	*ds;
 	unsigned int		index;
 	const char		*name;
-	const struct dsa_port	*cpu_dp;
+	struct dsa_port		*cpu_dp;
 	const char		*mac;
 	struct device_node	*dn;
 	unsigned int		ageing_time;
-- 
cgit v1.2.3


From 4838a54050284daac15dfeb1d65677e4dacf1bf5 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 14 Jun 2019 17:06:57 +0200
Subject: net: stmmac: Fix wrapper drivers not detecting PHY

Because of PHYLINK conversion we stopped parsing the phy-handle property
from DT. Unfortunately, some wrapper drivers still rely on this phy
node to configure the PHY.

Let's restore the parsing of PHY handle while these wrapper drivers are
not fully converted to PHYLINK.

Fixes: 74371272f97f ("net: stmmac: Convert to phylink and remove phylib logic")
Reported-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Tested-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c     | 4 ++--
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 9 ++++++++-
 include/linux/stmmac.h                                | 1 +
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ad007d8bf9d7..069951590018 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -958,7 +958,7 @@ static int stmmac_init_phy(struct net_device *dev)
 	struct device_node *node;
 	int ret;
 
-	node = priv->plat->phy_node;
+	node = priv->plat->phylink_node;
 
 	if (node) {
 		ret = phylink_of_phy_connect(priv->phylink, node, 0);
@@ -980,7 +980,7 @@ static int stmmac_init_phy(struct net_device *dev)
 
 static int stmmac_phy_setup(struct stmmac_priv *priv)
 {
-	struct device_node *node = priv->plat->phy_node;
+	struct device_node *node = priv->plat->phylink_node;
 	int mode = priv->plat->interface;
 	struct phylink *phylink;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 898f94aced53..49adda9b0ad8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -381,7 +381,13 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 
 	*mac = of_get_mac_address(np);
 	plat->interface = of_get_phy_mode(np);
-	plat->phy_node = np;
+
+	/* Some wrapper drivers still rely on phy_node. Let's save it while
+	 * they are not converted to phylink. */
+	plat->phy_node = of_parse_phandle(np, "phy-handle", 0);
+
+	/* PHYLINK automatically parses the phy-handle property */
+	plat->phylink_node = np;
 
 	/* Get max speed of operation from device tree */
 	if (of_property_read_u32(np, "max-speed", &plat->max_speed))
@@ -577,6 +583,7 @@ error_pclk_get:
 void stmmac_remove_config_dt(struct platform_device *pdev,
 			     struct plat_stmmacenet_data *plat)
 {
+	of_node_put(plat->phy_node);
 	of_node_put(plat->mdio_node);
 }
 #else
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 816edb545592..a3c2d9945bcf 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -151,6 +151,7 @@ struct plat_stmmacenet_data {
 	int interface;
 	struct stmmac_mdio_bus_data *mdio_bus_data;
 	struct device_node *phy_node;
+	struct device_node *phylink_node;
 	struct device_node *mdio_node;
 	struct stmmac_dma_cfg *dma_cfg;
 	int clk_csr;
-- 
cgit v1.2.3


From a51486266c3ba8e035a47fa96df67f274fe0c7d0 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sat, 15 Jun 2019 11:03:49 +0200
Subject: net: sched: remove NET_CLS_IND config option

This config option makes only a couple of lines optional.
Two small helpers and an int in a couple of cls structs.

Remove the config option and always compile this in.
This saves the user from unexpected surprises when he adds
a filter with ingress device match which is silently ignored
in case the config option is not set.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/mips/configs/malta_defconfig           |  1 -
 arch/mips/configs/malta_kvm_defconfig       |  1 -
 arch/mips/configs/malta_kvm_guest_defconfig |  1 -
 arch/mips/configs/malta_qemu_32r6_defconfig |  1 -
 arch/mips/configs/maltaaprp_defconfig       |  1 -
 arch/mips/configs/maltasmvp_defconfig       |  1 -
 arch/mips/configs/maltasmvp_eva_defconfig   |  1 -
 arch/mips/configs/maltaup_defconfig         |  1 -
 arch/mips/configs/maltaup_xpa_defconfig     |  1 -
 arch/mips/configs/rb532_defconfig           |  1 -
 arch/powerpc/configs/ppc6xx_defconfig       |  1 -
 arch/sh/configs/se7712_defconfig            |  1 -
 arch/sh/configs/se7721_defconfig            |  1 -
 arch/sh/configs/titan_defconfig             |  1 -
 include/net/pkt_cls.h                       |  5 +----
 include/uapi/linux/pkt_cls.h                |  2 +-
 net/sched/Kconfig                           |  8 --------
 net/sched/cls_flower.c                      |  3 +--
 net/sched/cls_fw.c                          | 13 -------------
 net/sched/cls_u32.c                         | 15 ---------------
 tools/include/uapi/linux/pkt_cls.h          |  2 +-
 tools/testing/selftests/tc-testing/config   |  1 -
 22 files changed, 4 insertions(+), 59 deletions(-)

(limited to 'include')

diff --git a/arch/mips/configs/malta_defconfig b/arch/mips/configs/malta_defconfig
index 0ee5e677662e..0de92ac1ca64 100644
--- a/arch/mips/configs/malta_defconfig
+++ b/arch/mips/configs/malta_defconfig
@@ -210,7 +210,6 @@ CONFIG_NET_ACT_NAT=m
 CONFIG_NET_ACT_PEDIT=m
 CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
-CONFIG_NET_CLS_IND=y
 CONFIG_CFG80211=m
 CONFIG_MAC80211=m
 CONFIG_MAC80211_MESH=y
diff --git a/arch/mips/configs/malta_kvm_defconfig b/arch/mips/configs/malta_kvm_defconfig
index 041bffac043b..efc3abace048 100644
--- a/arch/mips/configs/malta_kvm_defconfig
+++ b/arch/mips/configs/malta_kvm_defconfig
@@ -215,7 +215,6 @@ CONFIG_NET_ACT_NAT=m
 CONFIG_NET_ACT_PEDIT=m
 CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
-CONFIG_NET_CLS_IND=y
 CONFIG_CFG80211=m
 CONFIG_MAC80211=m
 CONFIG_MAC80211_MESH=y
diff --git a/arch/mips/configs/malta_kvm_guest_defconfig b/arch/mips/configs/malta_kvm_guest_defconfig
index 511065e62182..c6ceeca4394d 100644
--- a/arch/mips/configs/malta_kvm_guest_defconfig
+++ b/arch/mips/configs/malta_kvm_guest_defconfig
@@ -212,7 +212,6 @@ CONFIG_NET_ACT_NAT=m
 CONFIG_NET_ACT_PEDIT=m
 CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
-CONFIG_NET_CLS_IND=y
 CONFIG_CFG80211=m
 CONFIG_MAC80211=m
 CONFIG_MAC80211_MESH=y
diff --git a/arch/mips/configs/malta_qemu_32r6_defconfig b/arch/mips/configs/malta_qemu_32r6_defconfig
index 299088043164..e6c600dc1814 100644
--- a/arch/mips/configs/malta_qemu_32r6_defconfig
+++ b/arch/mips/configs/malta_qemu_32r6_defconfig
@@ -74,7 +74,6 @@ CONFIG_NET_CLS_RSVP=m
 CONFIG_NET_CLS_RSVP6=m
 CONFIG_NET_CLS_ACT=y
 CONFIG_NET_ACT_POLICE=y
-CONFIG_NET_CLS_IND=y
 # CONFIG_WIRELESS is not set
 CONFIG_DEVTMPFS=y
 CONFIG_BLK_DEV_LOOP=y
diff --git a/arch/mips/configs/maltaaprp_defconfig b/arch/mips/configs/maltaaprp_defconfig
index 2b4b3a24f637..82b44b774553 100644
--- a/arch/mips/configs/maltaaprp_defconfig
+++ b/arch/mips/configs/maltaaprp_defconfig
@@ -76,7 +76,6 @@ CONFIG_NET_CLS_RSVP=m
 CONFIG_NET_CLS_RSVP6=m
 CONFIG_NET_CLS_ACT=y
 CONFIG_NET_ACT_POLICE=y
-CONFIG_NET_CLS_IND=y
 # CONFIG_WIRELESS is not set
 CONFIG_DEVTMPFS=y
 CONFIG_BLK_DEV_LOOP=y
diff --git a/arch/mips/configs/maltasmvp_defconfig b/arch/mips/configs/maltasmvp_defconfig
index 425ddfd7cd78..4190fc6189a0 100644
--- a/arch/mips/configs/maltasmvp_defconfig
+++ b/arch/mips/configs/maltasmvp_defconfig
@@ -77,7 +77,6 @@ CONFIG_NET_CLS_RSVP=m
 CONFIG_NET_CLS_RSVP6=m
 CONFIG_NET_CLS_ACT=y
 CONFIG_NET_ACT_POLICE=y
-CONFIG_NET_CLS_IND=y
 # CONFIG_WIRELESS is not set
 CONFIG_DEVTMPFS=y
 CONFIG_BLK_DEV_LOOP=y
diff --git a/arch/mips/configs/maltasmvp_eva_defconfig b/arch/mips/configs/maltasmvp_eva_defconfig
index 8beaa7ba1e52..a13c10e910ec 100644
--- a/arch/mips/configs/maltasmvp_eva_defconfig
+++ b/arch/mips/configs/maltasmvp_eva_defconfig
@@ -78,7 +78,6 @@ CONFIG_NET_CLS_RSVP=m
 CONFIG_NET_CLS_RSVP6=m
 CONFIG_NET_CLS_ACT=y
 CONFIG_NET_ACT_POLICE=y
-CONFIG_NET_CLS_IND=y
 # CONFIG_WIRELESS is not set
 CONFIG_DEVTMPFS=y
 CONFIG_BLK_DEV_LOOP=y
diff --git a/arch/mips/configs/maltaup_defconfig b/arch/mips/configs/maltaup_defconfig
index 6e8b95ceb54a..b35f1fc690fb 100644
--- a/arch/mips/configs/maltaup_defconfig
+++ b/arch/mips/configs/maltaup_defconfig
@@ -75,7 +75,6 @@ CONFIG_NET_CLS_RSVP=m
 CONFIG_NET_CLS_RSVP6=m
 CONFIG_NET_CLS_ACT=y
 CONFIG_NET_ACT_POLICE=y
-CONFIG_NET_CLS_IND=y
 # CONFIG_WIRELESS is not set
 CONFIG_DEVTMPFS=y
 CONFIG_BLK_DEV_LOOP=y
diff --git a/arch/mips/configs/maltaup_xpa_defconfig b/arch/mips/configs/maltaup_xpa_defconfig
index 6c026db96ff9..56861aef2756 100644
--- a/arch/mips/configs/maltaup_xpa_defconfig
+++ b/arch/mips/configs/maltaup_xpa_defconfig
@@ -212,7 +212,6 @@ CONFIG_NET_ACT_NAT=m
 CONFIG_NET_ACT_PEDIT=m
 CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
-CONFIG_NET_CLS_IND=y
 CONFIG_CFG80211=m
 CONFIG_MAC80211=m
 CONFIG_MAC80211_MESH=y
diff --git a/arch/mips/configs/rb532_defconfig b/arch/mips/configs/rb532_defconfig
index 50632a3103dd..864c70fbe668 100644
--- a/arch/mips/configs/rb532_defconfig
+++ b/arch/mips/configs/rb532_defconfig
@@ -103,7 +103,6 @@ CONFIG_GACT_PROB=y
 CONFIG_NET_ACT_MIRRED=m
 CONFIG_NET_ACT_IPT=m
 CONFIG_NET_ACT_PEDIT=m
-CONFIG_NET_CLS_IND=y
 CONFIG_HAMRADIO=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_MTD=y
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index 7c6baf6df139..aa51b9b66fa2 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -301,7 +301,6 @@ CONFIG_NET_ACT_NAT=m
 CONFIG_NET_ACT_PEDIT=m
 CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
-CONFIG_NET_CLS_IND=y
 CONFIG_IRDA=m
 CONFIG_IRLAN=m
 CONFIG_IRNET=m
diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig
index 5a1097641247..1e116529735f 100644
--- a/arch/sh/configs/se7712_defconfig
+++ b/arch/sh/configs/se7712_defconfig
@@ -63,7 +63,6 @@ CONFIG_NET_SCH_NETEM=y
 CONFIG_NET_CLS_TCINDEX=y
 CONFIG_NET_CLS_ROUTE4=y
 CONFIG_NET_CLS_FW=y
-CONFIG_NET_CLS_IND=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_MTD=y
 CONFIG_MTD_BLOCK=y
diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig
index 9c0ef13bee10..c66e512719ab 100644
--- a/arch/sh/configs/se7721_defconfig
+++ b/arch/sh/configs/se7721_defconfig
@@ -62,7 +62,6 @@ CONFIG_NET_SCH_NETEM=y
 CONFIG_NET_CLS_TCINDEX=y
 CONFIG_NET_CLS_ROUTE4=y
 CONFIG_NET_CLS_FW=y
-CONFIG_NET_CLS_IND=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_MTD=y
 CONFIG_MTD_BLOCK=y
diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
index 822fa9e96f74..171ab05ce4fc 100644
--- a/arch/sh/configs/titan_defconfig
+++ b/arch/sh/configs/titan_defconfig
@@ -142,7 +142,6 @@ CONFIG_GACT_PROB=y
 CONFIG_NET_ACT_MIRRED=m
 CONFIG_NET_ACT_IPT=m
 CONFIG_NET_ACT_PEDIT=m
-CONFIG_NET_CLS_IND=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_FW_LOADER=m
 CONFIG_CONNECTOR=m
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 514e3c80ecc1..720f2b32fc2f 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -7,6 +7,7 @@
 #include <net/sch_generic.h>
 #include <net/act_api.h>
 #include <net/flow_offload.h>
+#include <net/net_namespace.h>
 
 /* TC action not accessible from user space */
 #define TC_ACT_REINSERT		(TC_ACT_VALUE_MAX + 1)
@@ -576,9 +577,6 @@ static inline int tcf_valid_offset(const struct sk_buff *skb,
 		      (ptr <= (ptr + len)));
 }
 
-#ifdef CONFIG_NET_CLS_IND
-#include <net/net_namespace.h>
-
 static inline int
 tcf_change_indev(struct net *net, struct nlattr *indev_tlv,
 		 struct netlink_ext_ack *extack)
@@ -605,7 +603,6 @@ tcf_match_indev(struct sk_buff *skb, int ifindex)
 		return false;
 	return ifindex == skb->skb_iif;
 }
-#endif /* CONFIG_NET_CLS_IND */
 
 int tc_setup_flow_action(struct flow_action *flow_action,
 			 const struct tcf_exts *exts);
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index a93680fc4bfa..8cc6b6777b3c 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -295,7 +295,7 @@ enum {
 	TCA_FW_UNSPEC,
 	TCA_FW_CLASSID,
 	TCA_FW_POLICE,
-	TCA_FW_INDEV, /*  used by CONFIG_NET_CLS_IND */
+	TCA_FW_INDEV,
 	TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */
 	TCA_FW_MASK,
 	__TCA_FW_MAX
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index d104f7ee26c7..360fdd3eaa77 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -941,14 +941,6 @@ config NET_IFE_SKBTCINDEX
         tristate "Support to encoding decoding skb tcindex on IFE action"
         depends on NET_ACT_IFE
 
-config NET_CLS_IND
-	bool "Incoming device classification"
-	depends on NET_CLS_U32 || NET_CLS_FW
-	---help---
-	  Say Y here to extend the u32 and fw classifier to support
-	  classification based on the incoming device. This option is
-	  likely to disappear in favour of the metadata ematch.
-
 endif # NET_SCHED
 
 config NET_SCH_FIFO
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index c388372df0e2..84c7f279855b 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1010,7 +1010,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 {
 	__be16 ethertype;
 	int ret = 0;
-#ifdef CONFIG_NET_CLS_IND
+
 	if (tb[TCA_FLOWER_INDEV]) {
 		int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack);
 		if (err < 0)
@@ -1018,7 +1018,6 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 		key->indev_ifindex = err;
 		mask->indev_ifindex = 0xffffffff;
 	}
-#endif
 
 	fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
 		       mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 4dab833f66cb..c9496c920d6f 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -8,9 +8,6 @@
  * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
  * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
  * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
- *
- * JHS: We should remove the CONFIG_NET_CLS_IND from here
- * eventually when the meta match extension is made available
  */
 
 #include <linux/module.h>
@@ -37,9 +34,7 @@ struct fw_filter {
 	struct fw_filter __rcu	*next;
 	u32			id;
 	struct tcf_result	res;
-#ifdef CONFIG_NET_CLS_IND
 	int			ifindex;
-#endif /* CONFIG_NET_CLS_IND */
 	struct tcf_exts		exts;
 	struct tcf_proto	*tp;
 	struct rcu_work		rwork;
@@ -67,10 +62,8 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		     f = rcu_dereference_bh(f->next)) {
 			if (f->id == id) {
 				*res = f->res;
-#ifdef CONFIG_NET_CLS_IND
 				if (!tcf_match_indev(skb, f->ifindex))
 					continue;
-#endif /* CONFIG_NET_CLS_IND */
 				r = tcf_exts_exec(skb, &f->exts, res);
 				if (r < 0)
 					continue;
@@ -222,7 +215,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
 		tcf_bind_filter(tp, &f->res, base);
 	}
 
-#ifdef CONFIG_NET_CLS_IND
 	if (tb[TCA_FW_INDEV]) {
 		int ret;
 		ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack);
@@ -230,7 +222,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
 			return ret;
 		f->ifindex = ret;
 	}
-#endif /* CONFIG_NET_CLS_IND */
 
 	err = -EINVAL;
 	if (tb[TCA_FW_MASK]) {
@@ -276,9 +267,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 
 		fnew->id = f->id;
 		fnew->res = f->res;
-#ifdef CONFIG_NET_CLS_IND
 		fnew->ifindex = f->ifindex;
-#endif /* CONFIG_NET_CLS_IND */
 		fnew->tp = f->tp;
 
 		err = tcf_exts_init(&fnew->exts, net, TCA_FW_ACT,
@@ -405,14 +394,12 @@ static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh,
 	if (f->res.classid &&
 	    nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))
 		goto nla_put_failure;
-#ifdef CONFIG_NET_CLS_IND
 	if (f->ifindex) {
 		struct net_device *dev;
 		dev = __dev_get_by_index(net, f->ifindex);
 		if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name))
 			goto nla_put_failure;
 	}
-#endif /* CONFIG_NET_CLS_IND */
 	if (head->mask != 0xFFFFFFFF &&
 	    nla_put_u32(skb, TCA_FW_MASK, head->mask))
 		goto nla_put_failure;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index c7727de5e073..be9e46c77e8b 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -20,9 +20,6 @@
  *	pure RSVP doesn't need such a general approach and can use
  *	much simpler (and faster) schemes, sort of cls_rsvp.c.
  *
- *	JHS: We should remove the CONFIG_NET_CLS_IND from here
- *	eventually when the meta match extension is made available
- *
  *	nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
  */
 
@@ -48,9 +45,7 @@ struct tc_u_knode {
 	u32			handle;
 	struct tc_u_hnode __rcu	*ht_up;
 	struct tcf_exts		exts;
-#ifdef CONFIG_NET_CLS_IND
 	int			ifindex;
-#endif
 	u8			fshift;
 	struct tcf_result	res;
 	struct tc_u_hnode __rcu	*ht_down;
@@ -176,12 +171,10 @@ check_terminal:
 			if (n->sel.flags & TC_U32_TERMINAL) {
 
 				*res = n->res;
-#ifdef CONFIG_NET_CLS_IND
 				if (!tcf_match_indev(skb, n->ifindex)) {
 					n = rcu_dereference_bh(n->next);
 					goto next_knode;
 				}
-#endif
 #ifdef CONFIG_CLS_U32_PERF
 				__this_cpu_inc(n->pf->rhit);
 #endif
@@ -761,7 +754,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
 		tcf_bind_filter(tp, &n->res, base);
 	}
 
-#ifdef CONFIG_NET_CLS_IND
 	if (tb[TCA_U32_INDEV]) {
 		int ret;
 		ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack);
@@ -769,7 +761,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
 			return -EINVAL;
 		n->ifindex = ret;
 	}
-#endif
 	return 0;
 }
 
@@ -817,9 +808,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp,
 	new->handle = n->handle;
 	RCU_INIT_POINTER(new->ht_up, n->ht_up);
 
-#ifdef CONFIG_NET_CLS_IND
 	new->ifindex = n->ifindex;
-#endif
 	new->fshift = n->fshift;
 	new->res = n->res;
 	new->flags = n->flags;
@@ -1351,14 +1340,12 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
 		if (tcf_exts_dump(skb, &n->exts) < 0)
 			goto nla_put_failure;
 
-#ifdef CONFIG_NET_CLS_IND
 		if (n->ifindex) {
 			struct net_device *dev;
 			dev = __dev_get_by_index(net, n->ifindex);
 			if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
 				goto nla_put_failure;
 		}
-#endif
 #ifdef CONFIG_CLS_U32_PERF
 		gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
 			      n->sel.nkeys * sizeof(u64),
@@ -1422,9 +1409,7 @@ static int __init init_u32(void)
 #ifdef CONFIG_CLS_U32_PERF
 	pr_info("    Performance counters on\n");
 #endif
-#ifdef CONFIG_NET_CLS_IND
 	pr_info("    input device check on\n");
-#endif
 #ifdef CONFIG_NET_CLS_ACT
 	pr_info("    Actions configured\n");
 #endif
diff --git a/tools/include/uapi/linux/pkt_cls.h b/tools/include/uapi/linux/pkt_cls.h
index 401d0c1e612d..12153771396a 100644
--- a/tools/include/uapi/linux/pkt_cls.h
+++ b/tools/include/uapi/linux/pkt_cls.h
@@ -257,7 +257,7 @@ enum {
 	TCA_FW_UNSPEC,
 	TCA_FW_CLASSID,
 	TCA_FW_POLICE,
-	TCA_FW_INDEV, /*  used by CONFIG_NET_CLS_IND */
+	TCA_FW_INDEV,
 	TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */
 	TCA_FW_MASK,
 	__TCA_FW_MAX
diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config
index b235efd55367..1adc4f9bb795 100644
--- a/tools/testing/selftests/tc-testing/config
+++ b/tools/testing/selftests/tc-testing/config
@@ -45,5 +45,4 @@ CONFIG_NET_ACT_TUNNEL_KEY=m
 CONFIG_NET_IFE_SKBMARK=m
 CONFIG_NET_IFE_SKBPRIO=m
 CONFIG_NET_IFE_SKBTCINDEX=m
-CONFIG_NET_CLS_IND=y
 CONFIG_NET_SCH_FIFO=y
-- 
cgit v1.2.3


From 98fdbea550378e0153092bce21261df86a8ccc57 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Wed, 12 Jun 2019 15:20:11 +0300
Subject: net/mlx5: Declare more strictly devlink encap mode

Devlink has UAPI declaration for encap mode, so there is no
need to be loose on the data get/set by drivers.

Update call sites to use enum devlink_eswitch_encap_mode
instead of plain u8.

Suggested-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Petr Vorel <pvorel@suse.cz>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          | 8 +++++---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 6 ++++--
 include/net/devlink.h                                      | 6 ++++--
 net/core/devlink.c                                         | 6 ++++--
 4 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index e03811be771d..8b9f2cf58e91 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -176,7 +176,7 @@ struct mlx5_esw_offload {
 	const struct mlx5_eswitch_rep_ops *rep_ops[NUM_REP_TYPES];
 	u8 inline_mode;
 	u64 num_flows;
-	u8 encap;
+	enum devlink_eswitch_encap_mode encap;
 };
 
 /* E-Switch MC FDB table hash node */
@@ -357,9 +357,11 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode,
 					 struct netlink_ext_ack *extack);
 int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode);
 int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode);
-int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap,
+int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink,
+					enum devlink_eswitch_encap_mode encap,
 					struct netlink_ext_ack *extack);
-int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap);
+int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink,
+					enum devlink_eswitch_encap_mode *encap);
 void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type);
 
 int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 1638e4cdeb16..17abb98b48af 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2160,7 +2160,8 @@ out:
 	return 0;
 }
 
-int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap,
+int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink,
+					enum devlink_eswitch_encap_mode encap,
 					struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
@@ -2209,7 +2210,8 @@ int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap,
 	return err;
 }
 
-int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap)
+int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink,
+					enum devlink_eswitch_encap_mode *encap)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	struct mlx5_eswitch *esw = dev->priv.eswitch;
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 1c4adfb4195a..7a34fc586def 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -530,8 +530,10 @@ struct devlink_ops {
 	int (*eswitch_inline_mode_get)(struct devlink *devlink, u8 *p_inline_mode);
 	int (*eswitch_inline_mode_set)(struct devlink *devlink, u8 inline_mode,
 				       struct netlink_ext_ack *extack);
-	int (*eswitch_encap_mode_get)(struct devlink *devlink, u8 *p_encap_mode);
-	int (*eswitch_encap_mode_set)(struct devlink *devlink, u8 encap_mode,
+	int (*eswitch_encap_mode_get)(struct devlink *devlink,
+				      enum devlink_eswitch_encap_mode *p_encap_mode);
+	int (*eswitch_encap_mode_set)(struct devlink *devlink,
+				      enum devlink_eswitch_encap_mode encap_mode,
 				      struct netlink_ext_ack *extack);
 	int (*info_get)(struct devlink *devlink, struct devlink_info_req *req,
 			struct netlink_ext_ack *extack);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index d43bc52b8840..47ae69363b07 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1552,7 +1552,8 @@ static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
 				   u32 seq, int flags)
 {
 	const struct devlink_ops *ops = devlink->ops;
-	u8 inline_mode, encap_mode;
+	enum devlink_eswitch_encap_mode encap_mode;
+	u8 inline_mode;
 	void *hdr;
 	int err = 0;
 	u16 mode;
@@ -1628,7 +1629,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
 {
 	struct devlink *devlink = info->user_ptr[0];
 	const struct devlink_ops *ops = devlink->ops;
-	u8 inline_mode, encap_mode;
+	enum devlink_eswitch_encap_mode encap_mode;
+	u8 inline_mode;
 	int err = 0;
 	u16 mode;
 
-- 
cgit v1.2.3


From 82b11f071936a11094e1c44730030cd3d894e0b4 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Wed, 12 Jun 2019 15:20:12 +0300
Subject: net/mlx5: Expose eswitch encap mode

Add API to get the current Eswitch encap mode.
It will be used in downstream patches to check if
flow table can be created with encap support or not.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Reviewed-by: Petr Vorel <pvorel@suse.cz>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 11 +++++++++++
 include/linux/mlx5/eswitch.h                      | 12 ++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index a4df109fbeb7..12010f85fa35 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -2457,6 +2457,17 @@ u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw)
 }
 EXPORT_SYMBOL_GPL(mlx5_eswitch_mode);
 
+enum devlink_eswitch_encap_mode
+mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev)
+{
+	struct mlx5_eswitch *esw;
+
+	esw = dev->priv.eswitch;
+	return ESW_ALLOWED(esw) ? esw->offloads.encap :
+		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
+}
+EXPORT_SYMBOL(mlx5_eswitch_get_encap_mode);
+
 bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1)
 {
 	if ((dev0->priv.eswitch->mode == SRIOV_NONE &&
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index d81ee4df181c..174eec0871d9 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -7,6 +7,7 @@
 #define _MLX5_ESWITCH_
 
 #include <linux/mlx5/driver.h>
+#include <net/devlink.h>
 
 #define MLX5_ESWITCH_MANAGER(mdev) MLX5_CAP_GEN(mdev, eswitch_manager)
 
@@ -62,4 +63,15 @@ u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw);
 struct mlx5_flow_handle *
 mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw,
 				    int vport, u32 sqn);
+
+#ifdef CONFIG_MLX5_ESWITCH
+enum devlink_eswitch_encap_mode
+mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev);
+#else  /* CONFIG_MLX5_ESWITCH */
+static inline enum devlink_eswitch_encap_mode
+mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev)
+{
+	return DEVLINK_ESWITCH_ENCAP_MODE_NONE;
+}
+#endif /* CONFIG_MLX5_ESWITCH */
 #endif
-- 
cgit v1.2.3


From 7e770b252a62e7498cfa9411018100fd86e56d47 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Sat, 15 Jun 2019 12:09:30 +0200
Subject: net: stmmac: drop the reset GPIO from struct stmmac_mdio_bus_data

No platform uses the "reset_gpio" field from stmmac_mdio_bus_data
anymore. Drop it so we don't get any new consumers either.

Plain GPIO numbers are being deprecated in favor of GPIO descriptors. If
needed any new non-OF platform can add a GPIO descriptor lookup table.
devm_gpiod_get_optional() will find the GPIO in that case.

Suggested-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 29 +++++++----------------
 include/linux/stmmac.h                            |  1 -
 2 files changed, 9 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 4614f1f2bffb..459ef8afe4fb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -253,21 +253,15 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 	if (priv->device->of_node) {
 		struct gpio_desc *reset_gpio;
 
-		if (data->reset_gpio < 0) {
-			reset_gpio = devm_gpiod_get_optional(priv->device,
-							     "snps,reset",
-							     GPIOD_OUT_LOW);
-			if (IS_ERR(reset_gpio))
-				return PTR_ERR(reset_gpio);
-
-			device_property_read_u32_array(priv->device,
-						       "snps,reset-delays-us",
-						       data->delays, 3);
-		} else {
-			reset_gpio = gpio_to_desc(data->reset_gpio);
-
-			gpiod_direction_output(reset_gpio, 0);
-		}
+		reset_gpio = devm_gpiod_get_optional(priv->device,
+						     "snps,reset",
+						     GPIOD_OUT_LOW);
+		if (IS_ERR(reset_gpio))
+			return PTR_ERR(reset_gpio);
+
+		device_property_read_u32_array(priv->device,
+					       "snps,reset-delays-us",
+					       data->delays, 3);
 
 		if (data->delays[0])
 			msleep(DIV_ROUND_UP(data->delays[0], 1000));
@@ -323,11 +317,6 @@ int stmmac_mdio_register(struct net_device *ndev)
 	if (mdio_bus_data->irqs)
 		memcpy(new_bus->irq, mdio_bus_data->irqs, sizeof(new_bus->irq));
 
-#ifdef CONFIG_OF
-	if (priv->device->of_node)
-		mdio_bus_data->reset_gpio = -1;
-#endif
-
 	new_bus->name = "stmmac";
 
 	if (priv->plat->has_xgmac) {
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index a3c2d9945bcf..a0cc6fa4965b 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -97,7 +97,6 @@ struct stmmac_mdio_bus_data {
 	int *irqs;
 	int probed_phy_irq;
 #ifdef CONFIG_OF
-	int reset_gpio;
 	u32 delays[3];
 #endif
 };
-- 
cgit v1.2.3


From ce4ab73ab0c27c6a3853695aa8ec0f453c6329cd Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Sat, 15 Jun 2019 12:09:31 +0200
Subject: net: stmmac: drop the reset delays from struct stmmac_mdio_bus_data

Only OF platforms use the reset delays and these delays are only read in
stmmac_mdio_reset(). Move them from struct stmmac_mdio_bus_data to a
stack variable inside stmmac_mdio_reset() because that's the only usage
of these delays.

Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 15 ++++++++-------
 include/linux/stmmac.h                            |  3 ---
 2 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 459ef8afe4fb..c9454cf4f189 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -252,6 +252,7 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 #ifdef CONFIG_OF
 	if (priv->device->of_node) {
 		struct gpio_desc *reset_gpio;
+		u32 delays[3];
 
 		reset_gpio = devm_gpiod_get_optional(priv->device,
 						     "snps,reset",
@@ -261,18 +262,18 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 
 		device_property_read_u32_array(priv->device,
 					       "snps,reset-delays-us",
-					       data->delays, 3);
+					       delays, ARRAY_SIZE(delays));
 
-		if (data->delays[0])
-			msleep(DIV_ROUND_UP(data->delays[0], 1000));
+		if (delays[0])
+			msleep(DIV_ROUND_UP(delays[0], 1000));
 
 		gpiod_set_value_cansleep(reset_gpio, 1);
-		if (data->delays[1])
-			msleep(DIV_ROUND_UP(data->delays[1], 1000));
+		if (delays[1])
+			msleep(DIV_ROUND_UP(delays[1], 1000));
 
 		gpiod_set_value_cansleep(reset_gpio, 0);
-		if (data->delays[2])
-			msleep(DIV_ROUND_UP(data->delays[2], 1000));
+		if (delays[2])
+			msleep(DIV_ROUND_UP(delays[2], 1000));
 	}
 #endif
 
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index a0cc6fa4965b..7c8328edd501 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -96,9 +96,6 @@ struct stmmac_mdio_bus_data {
 	unsigned int phy_mask;
 	int *irqs;
 	int probed_phy_irq;
-#ifdef CONFIG_OF
-	u32 delays[3];
-#endif
 };
 
 struct stmmac_dma_cfg {
-- 
cgit v1.2.3


From fead5b1b5838ba2f231d76e1b8ed31a4e9449382 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Sat, 15 Jun 2019 12:09:32 +0200
Subject: net: stmmac: drop the phy_reset hook from struct stmmac_mdio_bus_data

The phy_reset hook is not set anywhere. Drop it to make
stmmac_mdio_reset() smaller.

Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 6 ------
 include/linux/stmmac.h                            | 1 -
 2 files changed, 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index c9454cf4f189..14aa3ee14082 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -247,7 +247,6 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 	struct net_device *ndev = bus->priv;
 	struct stmmac_priv *priv = netdev_priv(ndev);
 	unsigned int mii_address = priv->hw->mii.addr;
-	struct stmmac_mdio_bus_data *data = priv->plat->mdio_bus_data;
 
 #ifdef CONFIG_OF
 	if (priv->device->of_node) {
@@ -277,11 +276,6 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 	}
 #endif
 
-	if (data->phy_reset) {
-		netdev_dbg(ndev, "stmmac_mdio_reset: calling phy_reset\n");
-		data->phy_reset(priv->plat->bsp_priv);
-	}
-
 	/* This is a workaround for problems with the STE101P PHY.
 	 * It doesn't complete its reset until at least one clock cycle
 	 * on MDC, so perform a dummy mdio read. To be updated for GMAC4
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 7c8328edd501..6dfb5aa75b0c 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -92,7 +92,6 @@
 /* Platfrom data for platform device structure's platform_data field */
 
 struct stmmac_mdio_bus_data {
-	int (*phy_reset)(void *priv);
 	unsigned int phy_mask;
 	int *irqs;
 	int probed_phy_irq;
-- 
cgit v1.2.3


From 857b46027d6f91150797295752581b7155b9d0e1 Mon Sep 17 00:00:00 2001
From: Stéphane Veyret <sveyret@gmail.com>
Date: Sat, 25 May 2019 15:30:58 +0200
Subject: netfilter: nft_ct: add ct expectations support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch allows adding, listing and deleting expectations via the nft
objref infrastructure and assigning these expectations via an nft rule.

This allows manual port triggering when no helper is defined to manage a
specific protocol. For example, if I have an online game whose protocol
is based on an initial connection to TCP port 9753 of the server, and where
the server opens a connection to port 9876, I can set rules as follows:

table ip filter {
    ct expectation mygame {
        protocol udp;
        dport 9876;
        timeout 2m;
        size 1;
    }

    chain input {
        type filter hook input priority 0; policy drop;
        tcp dport 9753 ct expectation set "mygame";
    }

    chain output {
        type filter hook output priority 0; policy drop;
        udp dport 9876 ct status expected accept;
    }
}

Signed-off-by: Stéphane Veyret <sveyret@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  14 +++-
 net/netfilter/nft_ct.c                   | 138 ++++++++++++++++++++++++++++++-
 2 files changed, 149 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 505393c6e959..31a6b8f7ff73 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1445,6 +1445,17 @@ enum nft_ct_timeout_timeout_attributes {
 };
 #define NFTA_CT_TIMEOUT_MAX	(__NFTA_CT_TIMEOUT_MAX - 1)
 
+enum nft_ct_expectation_attributes {
+	NFTA_CT_EXPECT_UNSPEC,
+	NFTA_CT_EXPECT_L3PROTO,
+	NFTA_CT_EXPECT_L4PROTO,
+	NFTA_CT_EXPECT_DPORT,
+	NFTA_CT_EXPECT_TIMEOUT,
+	NFTA_CT_EXPECT_SIZE,
+	__NFTA_CT_EXPECT_MAX,
+};
+#define NFTA_CT_EXPECT_MAX	(__NFTA_CT_EXPECT_MAX - 1)
+
 #define NFT_OBJECT_UNSPEC	0
 #define NFT_OBJECT_COUNTER	1
 #define NFT_OBJECT_QUOTA	2
@@ -1454,7 +1465,8 @@ enum nft_ct_timeout_timeout_attributes {
 #define NFT_OBJECT_TUNNEL	6
 #define NFT_OBJECT_CT_TIMEOUT	7
 #define NFT_OBJECT_SECMARK	8
-#define __NFT_OBJECT_MAX	9
+#define NFT_OBJECT_CT_EXPECT	9
+#define __NFT_OBJECT_MAX	10
 #define NFT_OBJECT_MAX		(__NFT_OBJECT_MAX - 1)
 
 /**
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index f043936763f3..06b52c894573 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -24,6 +24,7 @@
 #include <net/netfilter/nf_conntrack_labels.h>
 #include <net/netfilter/nf_conntrack_timeout.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_expect.h>
 
 struct nft_ct {
 	enum nft_ct_keys	key:8;
@@ -1156,6 +1157,131 @@ static struct nft_object_type nft_ct_helper_obj_type __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
+struct nft_ct_expect_obj {
+	u16		l3num;
+	__be16		dport;
+	u8		l4proto;
+	u8		size;
+	u32		timeout;
+};
+
+static int nft_ct_expect_obj_init(const struct nft_ctx *ctx,
+				  const struct nlattr * const tb[],
+				  struct nft_object *obj)
+{
+	struct nft_ct_expect_obj *priv = nft_obj_data(obj);
+
+	if (!tb[NFTA_CT_EXPECT_L4PROTO] ||
+	    !tb[NFTA_CT_EXPECT_DPORT] ||
+	    !tb[NFTA_CT_EXPECT_TIMEOUT] ||
+	    !tb[NFTA_CT_EXPECT_SIZE])
+		return -EINVAL;
+
+	priv->l3num = ctx->family;
+	if (tb[NFTA_CT_EXPECT_L3PROTO])
+		priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO]));
+
+	priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]);
+	priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]);
+	priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]);
+	priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]);
+
+	return nf_ct_netns_get(ctx->net, ctx->family);
+}
+
+static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx,
+				       struct nft_object *obj)
+{
+	nf_ct_netns_put(ctx->net, ctx->family);
+}
+
+static int nft_ct_expect_obj_dump(struct sk_buff *skb,
+				  struct nft_object *obj, bool reset)
+{
+	const struct nft_ct_expect_obj *priv = nft_obj_data(obj);
+
+	if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) ||
+	    nla_put_u8(skb, NFTA_CT_EXPECT_L4PROTO, priv->l4proto) ||
+	    nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) ||
+	    nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) ||
+	    nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size))
+		return -1;
+
+	return 0;
+}
+
+static void nft_ct_expect_obj_eval(struct nft_object *obj,
+				   struct nft_regs *regs,
+				   const struct nft_pktinfo *pkt)
+{
+	const struct nft_ct_expect_obj *priv = nft_obj_data(obj);
+	struct nf_conntrack_expect *exp;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn_help *help;
+	enum ip_conntrack_dir dir;
+	u16 l3num = priv->l3num;
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(pkt->skb, &ctinfo);
+	if (!ct || ctinfo == IP_CT_UNTRACKED) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+	dir = CTINFO2DIR(ctinfo);
+
+	help = nfct_help(ct);
+	if (!help)
+		help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
+
+	if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+	if (l3num == NFPROTO_INET)
+		l3num = nf_ct_l3num(ct);
+
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL) {
+		regs->verdict.code = NF_DROP;
+		return;
+	}
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num,
+		          &ct->tuplehash[!dir].tuple.src.u3,
+		          &ct->tuplehash[!dir].tuple.dst.u3,
+		          priv->l4proto, NULL, &priv->dport);
+	exp->timeout.expires = jiffies + priv->timeout * HZ;
+
+	if (nf_ct_expect_related(exp) != 0)
+		regs->verdict.code = NF_DROP;
+}
+
+static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = {
+	[NFTA_CT_EXPECT_L3PROTO]	= { .type = NLA_U16 },
+	[NFTA_CT_EXPECT_L4PROTO]	= { .type = NLA_U8 },
+	[NFTA_CT_EXPECT_DPORT]		= { .type = NLA_U16 },
+	[NFTA_CT_EXPECT_TIMEOUT]	= { .type = NLA_U32 },
+	[NFTA_CT_EXPECT_SIZE]		= { .type = NLA_U8 },
+};
+
+static struct nft_object_type nft_ct_expect_obj_type;
+
+static const struct nft_object_ops nft_ct_expect_obj_ops = {
+	.type		= &nft_ct_expect_obj_type,
+	.size		= sizeof(struct nft_ct_expect_obj),
+	.eval		= nft_ct_expect_obj_eval,
+	.init		= nft_ct_expect_obj_init,
+	.destroy	= nft_ct_expect_obj_destroy,
+	.dump		= nft_ct_expect_obj_dump,
+};
+
+static struct nft_object_type nft_ct_expect_obj_type __read_mostly = {
+	.type		= NFT_OBJECT_CT_EXPECT,
+	.ops		= &nft_ct_expect_obj_ops,
+	.maxattr	= NFTA_CT_EXPECT_MAX,
+	.policy		= nft_ct_expect_policy,
+	.owner		= THIS_MODULE,
+};
+
 static int __init nft_ct_module_init(void)
 {
 	int err;
@@ -1173,17 +1299,23 @@ static int __init nft_ct_module_init(void)
 	err = nft_register_obj(&nft_ct_helper_obj_type);
 	if (err < 0)
 		goto err2;
+
+	err = nft_register_obj(&nft_ct_expect_obj_type);
+	if (err < 0)
+		goto err3;
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	err = nft_register_obj(&nft_ct_timeout_obj_type);
 	if (err < 0)
-		goto err3;
+		goto err4;
 #endif
 	return 0;
 
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+err4:
+	nft_unregister_obj(&nft_ct_expect_obj_type);
+#endif
 err3:
 	nft_unregister_obj(&nft_ct_helper_obj_type);
-#endif
 err2:
 	nft_unregister_expr(&nft_notrack_type);
 err1:
@@ -1196,6 +1328,7 @@ static void __exit nft_ct_module_exit(void)
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	nft_unregister_obj(&nft_ct_timeout_obj_type);
 #endif
+	nft_unregister_obj(&nft_ct_expect_obj_type);
 	nft_unregister_obj(&nft_ct_helper_obj_type);
 	nft_unregister_expr(&nft_notrack_type);
 	nft_unregister_expr(&nft_ct_type);
@@ -1210,3 +1343,4 @@ MODULE_ALIAS_NFT_EXPR("ct");
 MODULE_ALIAS_NFT_EXPR("notrack");
 MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER);
 MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT);
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT);
-- 
cgit v1.2.3


From 87e389b4c20091b562bd65d90272f9d7c67eb437 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 4 Jun 2019 14:14:04 +0200
Subject: netfilter: conntrack: small conntrack lookup optimization

____nf_conntrack_find() performs checks on the conntrack objects in
this order:

1. if (nf_ct_is_expired(ct))

This fetches ct->timeout, in third cache line.

The hnnode that is used to store the list pointers resides in the first
(origin) or second (reply tuple) cache lines.

This test rarely passes, but it's necessary to reap obsolete entries.

2. if (nf_ct_is_dying(ct))

This fetches ct->status, also in third cache line.

The test is useless, and can be removed:
  Consider:
     cpu0                                           cpu1
    ct = ____nf_conntrack_find()
    atomic_inc_not_zero(ct) -> ok
    nf_ct_key_equal -> ok
    is_dying -> DYING bit not set, ok
                                                    set_bit(ct, DYING);
						    ... unhash ... etc.
    return ct
    -> returning a ct with dying bit set, despite
    having a test for it.

This (unlikely) case is fine - refcount prevents ct from getting free'd.

3. if (nf_ct_key_equal(h, tuple, zone, net))

nf_ct_key_equal checks in following order:

1. Tuple equal (first or second cacheline)
2. Zone equal (third cacheline)
3. confirmed bit set (->status, third cacheline)
4. net namespace match (third cacheline).

Swapping "timeout" and "cpu" places timeout in the first cacheline.
This has two advantages:

1. For a conntrack that won't even match the original tuple,
   we will now only fetch the first and maybe the second cacheline
   instead of always accessing the 3rd one as well.

2.  in case of TCP ct->timeout changes frequently because we
    reduce/increase it when there are packets outstanding in the network.

The first cacheline contains both the reference count and the ct spinlock,
i.e. moving timeout there avoids writes to 3rd cacheline.

The restart sequence in __nf_conntrack_find() is removed, if we found a
candidate, but then fail to increment the refcount or discover the tuple
has changed (object recycling), just pretend we did not find an entry.

A second lookup won't find anything until another CPU adds a new conntrack
with identical tuple into the hash table, which is very unlikely.

We have the confirmation-time checks (when we hold hash lock) that deal
with identical entries and even perform clash resolution in some cases.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h |  7 +++----
 net/netfilter/nf_conntrack_core.c    | 25 +++++++++++++------------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 5cb19ce454d1..c86657d99630 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -70,7 +70,8 @@ struct nf_conn {
 	struct nf_conntrack ct_general;
 
 	spinlock_t	lock;
-	u16		cpu;
+	/* jiffies32 when this ct is considered dead */
+	u32 timeout;
 
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 	struct nf_conntrack_zone zone;
@@ -82,9 +83,7 @@ struct nf_conn {
 	/* Have we seen traffic both ways yet? (bitset) */
 	unsigned long status;
 
-	/* jiffies32 when this ct is considered dead */
-	u32 timeout;
-
+	u16		cpu;
 	possible_net_t ct_net;
 
 #if IS_ENABLED(CONFIG_NF_NAT)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 2a714527cde1..2855a2e39fc4 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -752,9 +752,6 @@ begin:
 			continue;
 		}
 
-		if (nf_ct_is_dying(ct))
-			continue;
-
 		if (nf_ct_key_equal(h, tuple, zone, net))
 			return h;
 	}
@@ -780,20 +777,24 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 	struct nf_conn *ct;
 
 	rcu_read_lock();
-begin:
+
 	h = ____nf_conntrack_find(net, zone, tuple, hash);
 	if (h) {
+		/* We have a candidate that matches the tuple we're interested
+		 * in, try to obtain a reference and re-check tuple
+		 */
 		ct = nf_ct_tuplehash_to_ctrack(h);
-		if (unlikely(nf_ct_is_dying(ct) ||
-			     !atomic_inc_not_zero(&ct->ct_general.use)))
-			h = NULL;
-		else {
-			if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
-				nf_ct_put(ct);
-				goto begin;
-			}
+		if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
+			if (likely(nf_ct_key_equal(h, tuple, zone, net)))
+				goto found;
+
+			/* TYPESAFE_BY_RCU recycled the candidate */
+			nf_ct_put(ct);
 		}
+
+		h = NULL;
 	}
+found:
 	rcu_read_unlock();
 
 	return h;
-- 
cgit v1.2.3


From 9911c1139fd072594ac259c2ce055b004ca92f49 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 7 Jun 2019 16:37:30 +0200
Subject: netfilter: xt_owner: bail out with EINVAL in case of unsupported
 flags

Reject flags that are not supported with EINVAL.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/xt_owner.h | 5 +++++
 net/netfilter/xt_owner.c                | 3 +++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/xt_owner.h b/include/uapi/linux/netfilter/xt_owner.h
index 9e98c09eda32..5108df4d0313 100644
--- a/include/uapi/linux/netfilter/xt_owner.h
+++ b/include/uapi/linux/netfilter/xt_owner.h
@@ -11,6 +11,11 @@ enum {
 	XT_OWNER_SUPPL_GROUPS = 1 << 3,
 };
 
+#define XT_OWNER_MASK	(XT_OWNER_UID | 	\
+			 XT_OWNER_GID | 	\
+			 XT_OWNER_SOCKET |	\
+			 XT_OWNER_SUPPL_GROUPS)
+
 struct xt_owner_match_info {
 	__u32 uid_min, uid_max;
 	__u32 gid_min, gid_max;
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index a8784502aca6..ee597fdc5db7 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -25,6 +25,9 @@ static int owner_check(const struct xt_mtchk_param *par)
 	struct xt_owner_match_info *info = par->matchinfo;
 	struct net *net = par->net;
 
+	if (info->match & ~XT_OWNER_MASK)
+		return -EINVAL;
+
 	/* Only allow the common case where the userns of the writer
 	 * matches the userns of the network namespace.
 	 */
-- 
cgit v1.2.3


From ff6d090d0db41425aef0cfe5dc58bb3cc12514a2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Mon, 10 Jun 2019 23:26:05 +0200
Subject: netfilter: bridge: port sysctls to use brnf_net

This ports the sysctls to use struct brnf_net.

With this patch we make it possible to namespace the br_netfilter module in
the following patch.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/br_netfilter.h |   3 +-
 net/bridge/br_netfilter_hooks.c      | 162 ++++++++++++++++++++++-------------
 net/bridge/br_netfilter_ipv6.c       |   2 +-
 3 files changed, 107 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h
index 89808ce293c4..302fcd3aade2 100644
--- a/include/net/netfilter/br_netfilter.h
+++ b/include/net/netfilter/br_netfilter.h
@@ -42,7 +42,8 @@ static inline struct rtable *bridge_parent_rtable(const struct net_device *dev)
 	return port ? &port->br->fake_rtable : NULL;
 }
 
-struct net_device *setup_pre_routing(struct sk_buff *skb);
+struct net_device *setup_pre_routing(struct sk_buff *skb,
+				     const struct net *net);
 
 #if IS_ENABLED(CONFIG_IPV6)
 int br_validate_ipv6(struct net *net, struct sk_buff *skb);
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 22afa566cbce..3c67754d8075 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -49,27 +49,24 @@
 
 static unsigned int brnf_net_id __read_mostly;
 
-struct brnf_net {
-	bool enabled;
-};
-
 #ifdef CONFIG_SYSCTL
 static struct ctl_table_header *brnf_sysctl_header;
-static int brnf_call_iptables __read_mostly = 1;
-static int brnf_call_ip6tables __read_mostly = 1;
-static int brnf_call_arptables __read_mostly = 1;
-static int brnf_filter_vlan_tagged __read_mostly;
-static int brnf_filter_pppoe_tagged __read_mostly;
-static int brnf_pass_vlan_indev __read_mostly;
-#else
-#define brnf_call_iptables 1
-#define brnf_call_ip6tables 1
-#define brnf_call_arptables 1
-#define brnf_filter_vlan_tagged 0
-#define brnf_filter_pppoe_tagged 0
-#define brnf_pass_vlan_indev 0
 #endif
 
+struct brnf_net {
+	bool enabled;
+
+	/* default value is 1 */
+	int call_iptables;
+	int call_ip6tables;
+	int call_arptables;
+
+	/* default value is 0 */
+	int filter_vlan_tagged;
+	int filter_pppoe_tagged;
+	int pass_vlan_indev;
+};
+
 #define IS_IP(skb) \
 	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
 
@@ -89,17 +86,28 @@ static inline __be16 vlan_proto(const struct sk_buff *skb)
 		return 0;
 }
 
-#define IS_VLAN_IP(skb) \
-	(vlan_proto(skb) == htons(ETH_P_IP) && \
-	 brnf_filter_vlan_tagged)
+static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+	return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged;
+}
+
+static inline bool is_vlan_ipv6(const struct sk_buff *skb,
+				const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
 
-#define IS_VLAN_IPV6(skb) \
-	(vlan_proto(skb) == htons(ETH_P_IPV6) && \
-	 brnf_filter_vlan_tagged)
+	return vlan_proto(skb) == htons(ETH_P_IPV6) &&
+	       brnet->filter_vlan_tagged;
+}
 
-#define IS_VLAN_ARP(skb) \
-	(vlan_proto(skb) == htons(ETH_P_ARP) &&	\
-	 brnf_filter_vlan_tagged)
+static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+	return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged;
+}
 
 static inline __be16 pppoe_proto(const struct sk_buff *skb)
 {
@@ -107,15 +115,23 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
 			    sizeof(struct pppoe_hdr)));
 }
 
-#define IS_PPPOE_IP(skb) \
-	(skb->protocol == htons(ETH_P_PPP_SES) && \
-	 pppoe_proto(skb) == htons(PPP_IP) && \
-	 brnf_filter_pppoe_tagged)
+static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+	return skb->protocol == htons(ETH_P_PPP_SES) &&
+	       pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged;
+}
+
+static inline bool is_pppoe_ipv6(const struct sk_buff *skb,
+				 const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
 
-#define IS_PPPOE_IPV6(skb) \
-	(skb->protocol == htons(ETH_P_PPP_SES) && \
-	 pppoe_proto(skb) == htons(PPP_IPV6) && \
-	 brnf_filter_pppoe_tagged)
+	return skb->protocol == htons(ETH_P_PPP_SES) &&
+	       pppoe_proto(skb) == htons(PPP_IPV6) &&
+	       brnet->filter_pppoe_tagged;
+}
 
 /* largest possible L2 header, see br_nf_dev_queue_xmit() */
 #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
@@ -412,12 +428,16 @@ bridged_dnat:
 	return 0;
 }
 
-static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
+					       const struct net_device *dev,
+					       const struct net *net)
 {
 	struct net_device *vlan, *br;
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
 
 	br = bridge_parent(dev);
-	if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+
+	if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
 		return br;
 
 	vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
@@ -427,7 +447,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct
 }
 
 /* Some common code for IPv4/IPv6 */
-struct net_device *setup_pre_routing(struct sk_buff *skb)
+struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
 {
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 
@@ -438,7 +458,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb)
 
 	nf_bridge->in_prerouting = 1;
 	nf_bridge->physindev = skb->dev;
-	skb->dev = brnf_get_logical_dev(skb, skb->dev);
+	skb->dev = brnf_get_logical_dev(skb, skb->dev, net);
 
 	if (skb->protocol == htons(ETH_P_8021Q))
 		nf_bridge->orig_proto = BRNF_PROTO_8021Q;
@@ -464,6 +484,7 @@ static unsigned int br_nf_pre_routing(void *priv,
 	struct net_bridge_port *p;
 	struct net_bridge *br;
 	__u32 len = nf_bridge_encap_header_len(skb);
+	struct brnf_net *brnet;
 
 	if (unlikely(!pskb_may_pull(skb, len)))
 		return NF_DROP;
@@ -473,8 +494,10 @@ static unsigned int br_nf_pre_routing(void *priv,
 		return NF_DROP;
 	br = p->br;
 
-	if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
-		if (!brnf_call_ip6tables &&
+	brnet = net_generic(state->net, brnf_net_id);
+	if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+	    is_pppoe_ipv6(skb, state->net)) {
+		if (!brnet->call_ip6tables &&
 		    !br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
 			return NF_ACCEPT;
 
@@ -482,10 +505,11 @@ static unsigned int br_nf_pre_routing(void *priv,
 		return br_nf_pre_routing_ipv6(priv, skb, state);
 	}
 
-	if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
+	if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
 		return NF_ACCEPT;
 
-	if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
+	if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) &&
+	    !is_pppoe_ip(skb, state->net))
 		return NF_ACCEPT;
 
 	nf_bridge_pull_encap_header_rcsum(skb);
@@ -495,7 +519,7 @@ static unsigned int br_nf_pre_routing(void *priv,
 
 	if (!nf_bridge_alloc(skb))
 		return NF_DROP;
-	if (!setup_pre_routing(skb))
+	if (!setup_pre_routing(skb, state->net))
 		return NF_DROP;
 
 	nf_bridge = nf_bridge_info_get(skb);
@@ -518,7 +542,7 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 	struct net_device *in;
 
-	if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
+	if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) {
 
 		if (skb->protocol == htons(ETH_P_IP))
 			nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
@@ -573,9 +597,11 @@ static unsigned int br_nf_forward_ip(void *priv,
 	if (!parent)
 		return NF_DROP;
 
-	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+	if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+	    is_pppoe_ip(skb, state->net))
 		pf = NFPROTO_IPV4;
-	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+	else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+		 is_pppoe_ipv6(skb, state->net))
 		pf = NFPROTO_IPV6;
 	else
 		return NF_ACCEPT;
@@ -606,7 +632,7 @@ static unsigned int br_nf_forward_ip(void *priv,
 		skb->protocol = htons(ETH_P_IPV6);
 
 	NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
-		brnf_get_logical_dev(skb, state->in),
+		brnf_get_logical_dev(skb, state->in, state->net),
 		parent,	br_nf_forward_finish);
 
 	return NF_STOLEN;
@@ -619,23 +645,25 @@ static unsigned int br_nf_forward_arp(void *priv,
 	struct net_bridge_port *p;
 	struct net_bridge *br;
 	struct net_device **d = (struct net_device **)(skb->cb);
+	struct brnf_net *brnet;
 
 	p = br_port_get_rcu(state->out);
 	if (p == NULL)
 		return NF_ACCEPT;
 	br = p->br;
 
-	if (!brnf_call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
+	brnet = net_generic(state->net, brnf_net_id);
+	if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
 		return NF_ACCEPT;
 
 	if (!IS_ARP(skb)) {
-		if (!IS_VLAN_ARP(skb))
+		if (!is_vlan_arp(skb, state->net))
 			return NF_ACCEPT;
 		nf_bridge_pull_encap_header(skb);
 	}
 
 	if (arp_hdr(skb)->ar_pln != 4) {
-		if (IS_VLAN_ARP(skb))
+		if (is_vlan_arp(skb, state->net))
 			nf_bridge_push_encap_header(skb);
 		return NF_ACCEPT;
 	}
@@ -795,9 +823,11 @@ static unsigned int br_nf_post_routing(void *priv,
 	if (!realoutdev)
 		return NF_DROP;
 
-	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+	if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+	    is_pppoe_ip(skb, state->net))
 		pf = NFPROTO_IPV4;
-	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+	else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+		 is_pppoe_ipv6(skb, state->net))
 		pf = NFPROTO_IPV6;
 	else
 		return NF_ACCEPT;
@@ -1025,53 +1055,59 @@ int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
 static struct ctl_table brnf_table[] = {
 	{
 		.procname	= "bridge-nf-call-arptables",
-		.data		= &brnf_call_arptables,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-call-iptables",
-		.data		= &brnf_call_iptables,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-call-ip6tables",
-		.data		= &brnf_call_ip6tables,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-filter-vlan-tagged",
-		.data		= &brnf_filter_vlan_tagged,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-filter-pppoe-tagged",
-		.data		= &brnf_filter_pppoe_tagged,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-pass-vlan-input-dev",
-		.data		= &brnf_pass_vlan_indev,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{ }
 };
+
+static inline void br_netfilter_sysctl_default(struct brnf_net *brnf)
+{
+	brnf->call_iptables = 1;
+	brnf->call_ip6tables = 1;
+	brnf->call_arptables = 1;
+	brnf->filter_vlan_tagged = 0;
+	brnf->filter_pppoe_tagged = 0;
+	brnf->pass_vlan_indev = 0;
+}
+
 #endif
 
 static int __init br_netfilter_init(void)
 {
 	int ret;
+	struct brnf_net *brnet;
 
 	ret = register_pernet_subsys(&brnf_net_ops);
 	if (ret < 0)
@@ -1084,6 +1120,16 @@ static int __init br_netfilter_init(void)
 	}
 
 #ifdef CONFIG_SYSCTL
+	brnet = net_generic(&init_net, brnf_net_id);
+	brnf_table[0].data = &brnet->call_arptables;
+	brnf_table[1].data = &brnet->call_iptables;
+	brnf_table[2].data = &brnet->call_ip6tables;
+	brnf_table[3].data = &brnet->filter_vlan_tagged;
+	brnf_table[4].data = &brnet->filter_pppoe_tagged;
+	brnf_table[5].data = &brnet->pass_vlan_indev;
+
+	br_netfilter_sysctl_default(brnet);
+
 	brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
 	if (brnf_sysctl_header == NULL) {
 		printk(KERN_WARNING
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index e88d6641647b..d77304e4e31a 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -228,7 +228,7 @@ unsigned int br_nf_pre_routing_ipv6(void *priv,
 	nf_bridge = nf_bridge_alloc(skb);
 	if (!nf_bridge)
 		return NF_DROP;
-	if (!setup_pre_routing(skb))
+	if (!setup_pre_routing(skb, state->net))
 		return NF_DROP;
 
 	nf_bridge = nf_bridge_info_get(skb);
-- 
cgit v1.2.3


From 5fcc88ecf681b64da6c2c918352e2451db6a97ec Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Fri, 7 Jun 2019 02:36:02 +0200
Subject: netfilter: synproxy: add common uapi for SYNPROXY infrastructure

This new UAPI file is going to be used by the xt and nft common SYNPROXY
infrastructure. It is needed to avoid duplicated code.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_SYNPROXY.h | 19 +++++++++++++++++++
 include/uapi/linux/netfilter/xt_SYNPROXY.h | 18 +++++++-----------
 2 files changed, 26 insertions(+), 11 deletions(-)
 create mode 100644 include/uapi/linux/netfilter/nf_SYNPROXY.h

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_SYNPROXY.h b/include/uapi/linux/netfilter/nf_SYNPROXY.h
new file mode 100644
index 000000000000..068d1b3a6f06
--- /dev/null
+++ b/include/uapi/linux/netfilter/nf_SYNPROXY.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NF_SYNPROXY_H
+#define _NF_SYNPROXY_H
+
+#include <linux/types.h>
+
+#define NF_SYNPROXY_OPT_MSS		0x01
+#define NF_SYNPROXY_OPT_WSCALE		0x02
+#define NF_SYNPROXY_OPT_SACK_PERM	0x04
+#define NF_SYNPROXY_OPT_TIMESTAMP	0x08
+#define NF_SYNPROXY_OPT_ECN		0x10
+
+struct nf_synproxy_info {
+	__u8	options;
+	__u8	wscale;
+	__u16	mss;
+};
+
+#endif /* _NF_SYNPROXY_H */
diff --git a/include/uapi/linux/netfilter/xt_SYNPROXY.h b/include/uapi/linux/netfilter/xt_SYNPROXY.h
index ea5eba15d4c1..4d5611d647df 100644
--- a/include/uapi/linux/netfilter/xt_SYNPROXY.h
+++ b/include/uapi/linux/netfilter/xt_SYNPROXY.h
@@ -2,18 +2,14 @@
 #ifndef _XT_SYNPROXY_H
 #define _XT_SYNPROXY_H
 
-#include <linux/types.h>
+#include <linux/netfilter/nf_SYNPROXY.h>
 
-#define XT_SYNPROXY_OPT_MSS		0x01
-#define XT_SYNPROXY_OPT_WSCALE		0x02
-#define XT_SYNPROXY_OPT_SACK_PERM	0x04
-#define XT_SYNPROXY_OPT_TIMESTAMP	0x08
-#define XT_SYNPROXY_OPT_ECN		0x10
+#define XT_SYNPROXY_OPT_MSS		NF_SYNPROXY_OPT_MSS
+#define XT_SYNPROXY_OPT_WSCALE		NF_SYNPROXY_OPT_WSCALE
+#define XT_SYNPROXY_OPT_SACK_PERM	NF_SYNPROXY_OPT_SACK_PERM
+#define XT_SYNPROXY_OPT_TIMESTAMP	NF_SYNPROXY_OPT_TIMESTAMP
+#define XT_SYNPROXY_OPT_ECN		NF_SYNPROXY_OPT_ECN
 
-struct xt_synproxy_info {
-	__u8	options;
-	__u8	wscale;
-	__u16	mss;
-};
+#define xt_synproxy_info		nf_synproxy_info
 
 #endif /* _XT_SYNPROXY_H */
-- 
cgit v1.2.3


From 3006a5224f15cf68edc4878799ac6d6089861518 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Fri, 7 Jun 2019 02:36:05 +0200
Subject: netfilter: synproxy: remove module dependency on IPv6 SYNPROXY

This is a prerequisite for the infrastructure module NETFILTER_SYNPROXY.
The new module is needed to avoid duplicated code for the SYNPROXY
nftables support.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h | 36 ++++++++++++++++++++++++++++++++++++
 net/ipv6/netfilter.c           |  2 ++
 2 files changed, 38 insertions(+)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 3a3dc4b1f0e7..35b12525ee45 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -8,6 +8,7 @@
 #define __LINUX_IP6_NETFILTER_H
 
 #include <uapi/linux/netfilter_ipv6.h>
+#include <net/tcp.h>
 
 /* Extra routing may needed on local out, as the QUEUE target never returns
  * control to the table.
@@ -35,6 +36,10 @@ struct nf_ipv6_ops {
 		       struct in6_addr *saddr);
 	int (*route)(struct net *net, struct dst_entry **dst, struct flowi *fl,
 		     bool strict);
+	u32 (*cookie_init_sequence)(const struct ipv6hdr *iph,
+				    const struct tcphdr *th, u16 *mssp);
+	int (*cookie_v6_check)(const struct ipv6hdr *iph,
+			       const struct tcphdr *th, __u32 cookie);
 #endif
 	void (*route_input)(struct sk_buff *skb);
 	int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb,
@@ -154,6 +159,37 @@ static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb)
 #endif
 }
 
+static inline u32 nf_ipv6_cookie_init_sequence(const struct ipv6hdr *iph,
+					       const struct tcphdr *th,
+					       u16 *mssp)
+{
+#if IS_MODULE(CONFIG_IPV6)
+	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
+
+	if (v6_ops)
+		return v6_ops->cookie_init_sequence(iph, th, mssp);
+
+	return 0;
+#else
+	return __cookie_v6_init_sequence(iph, th, mssp);
+#endif
+}
+
+static inline int nf_cookie_v6_check(const struct ipv6hdr *iph,
+				     const struct tcphdr *th, __u32 cookie)
+{
+#if IS_MODULE(CONFIG_IPV6)
+	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
+
+	if (v6_ops)
+		return v6_ops->cookie_v6_check(iph, th, cookie);
+
+	return 0;
+#else
+	return __cookie_v6_check(iph, th, cookie);
+#endif
+}
+
 __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
 			unsigned int dataoff, u_int8_t protocol);
 
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 86048dce301b..dffb10fdc3e8 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -234,6 +234,8 @@ static const struct nf_ipv6_ops ipv6ops = {
 	.route_me_harder	= ip6_route_me_harder,
 	.dev_get_saddr		= ipv6_dev_get_saddr,
 	.route			= __nf_ip6_route,
+	.cookie_init_sequence	= __cookie_v6_init_sequence,
+	.cookie_v6_check	= __cookie_v6_check,
 #endif
 	.route_input		= ip6_route_input,
 	.fragment		= ip6_fragment,
-- 
cgit v1.2.3


From d7f9b2f18eaef74b4f948c7e24e3a8f796f0c90d Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Fri, 7 Jun 2019 02:36:07 +0200
Subject: netfilter: synproxy: extract SYNPROXY infrastructure from {ipt,
 ip6t}_SYNPROXY

Add common functions into nf_synproxy_core.c to prepare for nftables support.
The prototypes of the functions used by {ipt, ip6t}_SYNPROXY are in the new
file nf_synproxy.h.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_synproxy.h |  13 +-
 include/net/netfilter/nf_synproxy.h           |  44 ++
 net/ipv4/netfilter/ipt_SYNPROXY.c             | 394 +----------
 net/ipv6/netfilter/ip6t_SYNPROXY.c            | 420 +-----------
 net/netfilter/nf_synproxy_core.c              | 896 ++++++++++++++++++++++++--
 5 files changed, 920 insertions(+), 847 deletions(-)
 create mode 100644 include/net/netfilter/nf_synproxy.h

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h
index 2c7559a54092..c5659dcf5b1a 100644
--- a/include/net/netfilter/nf_conntrack_synproxy.h
+++ b/include/net/netfilter/nf_conntrack_synproxy.h
@@ -72,21 +72,12 @@ struct synproxy_options {
 };
 
 struct tcphdr;
-struct xt_synproxy_info;
+struct nf_synproxy_info;
 bool synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
 			    const struct tcphdr *th,
 			    struct synproxy_options *opts);
-unsigned int synproxy_options_size(const struct synproxy_options *opts);
-void synproxy_build_options(struct tcphdr *th,
-			    const struct synproxy_options *opts);
 
-void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
+void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info,
 				    struct synproxy_options *opts);
-void synproxy_check_timestamp_cookie(struct synproxy_options *opts);
-
-unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
-				    struct tcphdr *th, struct nf_conn *ct,
-				    enum ip_conntrack_info ctinfo,
-				    const struct nf_conn_synproxy *synproxy);
 
 #endif /* _NF_CONNTRACK_SYNPROXY_H */
diff --git a/include/net/netfilter/nf_synproxy.h b/include/net/netfilter/nf_synproxy.h
new file mode 100644
index 000000000000..3e8b3f03b687
--- /dev/null
+++ b/include/net/netfilter/nf_synproxy.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NF_SYNPROXY_SHARED_H
+#define _NF_SYNPROXY_SHARED_H
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ip6_checksum.h>
+#include <net/ip6_route.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+void synproxy_send_client_synack(struct net *net, const struct sk_buff *skb,
+				 const struct tcphdr *th,
+				 const struct synproxy_options *opts);
+
+bool synproxy_recv_client_ack(struct net *net,
+			      const struct sk_buff *skb,
+			      const struct tcphdr *th,
+			      struct synproxy_options *opts, u32 recv_seq);
+
+unsigned int ipv4_synproxy_hook(void *priv, struct sk_buff *skb,
+				const struct nf_hook_state *nhs);
+int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net);
+void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net);
+
+#if IS_ENABLED(CONFIG_IPV6)
+void synproxy_send_client_synack_ipv6(struct net *net,
+				      const struct sk_buff *skb,
+				      const struct tcphdr *th,
+				      const struct synproxy_options *opts);
+
+bool synproxy_recv_client_ack_ipv6(struct net *net, const struct sk_buff *skb,
+				   const struct tcphdr *th,
+				   struct synproxy_options *opts, u32 recv_seq);
+
+unsigned int ipv6_synproxy_hook(void *priv, struct sk_buff *skb,
+				const struct nf_hook_state *nhs);
+int nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net);
+void nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net);
+#endif /* CONFIG_IPV6 */
+
+#endif /* _NF_SYNPROXY_SHARED_H */
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 690b17ef6a44..7f7979734fb4 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -6,258 +6,11 @@
  * published by the Free Software Foundation.
  */
 
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/tcp.h>
-
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_SYNPROXY.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_synproxy.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-
-static struct iphdr *
-synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
-		  __be32 daddr)
-{
-	struct iphdr *iph;
-
-	skb_reset_network_header(skb);
-	iph = skb_put(skb, sizeof(*iph));
-	iph->version	= 4;
-	iph->ihl	= sizeof(*iph) / 4;
-	iph->tos	= 0;
-	iph->id		= 0;
-	iph->frag_off	= htons(IP_DF);
-	iph->ttl	= net->ipv4.sysctl_ip_default_ttl;
-	iph->protocol	= IPPROTO_TCP;
-	iph->check	= 0;
-	iph->saddr	= saddr;
-	iph->daddr	= daddr;
-
-	return iph;
-}
-
-static void
-synproxy_send_tcp(struct net *net,
-		  const struct sk_buff *skb, struct sk_buff *nskb,
-		  struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
-		  struct iphdr *niph, struct tcphdr *nth,
-		  unsigned int tcp_hdr_size)
-{
-	nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
-	nskb->ip_summed   = CHECKSUM_PARTIAL;
-	nskb->csum_start  = (unsigned char *)nth - nskb->head;
-	nskb->csum_offset = offsetof(struct tcphdr, check);
-
-	skb_dst_set_noref(nskb, skb_dst(skb));
-	nskb->protocol = htons(ETH_P_IP);
-	if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
-		goto free_nskb;
-
-	if (nfct) {
-		nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
-		nf_conntrack_get(nfct);
-	}
-
-	ip_local_out(net, nskb->sk, nskb);
-	return;
-
-free_nskb:
-	kfree_skb(nskb);
-}
-
-static void
-synproxy_send_client_synack(struct net *net,
-			    const struct sk_buff *skb, const struct tcphdr *th,
-			    const struct synproxy_options *opts)
-{
-	struct sk_buff *nskb;
-	struct iphdr *iph, *niph;
-	struct tcphdr *nth;
-	unsigned int tcp_hdr_size;
-	u16 mss = opts->mss;
-
-	iph = ip_hdr(skb);
-
-	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
-	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
-			 GFP_ATOMIC);
-	if (nskb == NULL)
-		return;
-	skb_reserve(nskb, MAX_TCP_HEADER);
-
-	niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
-
-	skb_reset_transport_header(nskb);
-	nth = skb_put(nskb, tcp_hdr_size);
-	nth->source	= th->dest;
-	nth->dest	= th->source;
-	nth->seq	= htonl(__cookie_v4_init_sequence(iph, th, &mss));
-	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
-	tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
-	if (opts->options & XT_SYNPROXY_OPT_ECN)
-		tcp_flag_word(nth) |= TCP_FLAG_ECE;
-	nth->doff	= tcp_hdr_size / 4;
-	nth->window	= 0;
-	nth->check	= 0;
-	nth->urg_ptr	= 0;
-
-	synproxy_build_options(nth, opts);
-
-	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
-			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_syn(struct net *net,
-			 const struct sk_buff *skb, const struct tcphdr *th,
-			 const struct synproxy_options *opts, u32 recv_seq)
-{
-	struct synproxy_net *snet = synproxy_pernet(net);
-	struct sk_buff *nskb;
-	struct iphdr *iph, *niph;
-	struct tcphdr *nth;
-	unsigned int tcp_hdr_size;
-
-	iph = ip_hdr(skb);
-
-	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
-	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
-			 GFP_ATOMIC);
-	if (nskb == NULL)
-		return;
-	skb_reserve(nskb, MAX_TCP_HEADER);
-
-	niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
-
-	skb_reset_transport_header(nskb);
-	nth = skb_put(nskb, tcp_hdr_size);
-	nth->source	= th->source;
-	nth->dest	= th->dest;
-	nth->seq	= htonl(recv_seq - 1);
-	/* ack_seq is used to relay our ISN to the synproxy hook to initialize
-	 * sequence number translation once a connection tracking entry exists.
-	 */
-	nth->ack_seq	= htonl(ntohl(th->ack_seq) - 1);
-	tcp_flag_word(nth) = TCP_FLAG_SYN;
-	if (opts->options & XT_SYNPROXY_OPT_ECN)
-		tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
-	nth->doff	= tcp_hdr_size / 4;
-	nth->window	= th->window;
-	nth->check	= 0;
-	nth->urg_ptr	= 0;
-
-	synproxy_build_options(nth, opts);
-
-	synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
-			  niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_ack(struct net *net,
-			 const struct ip_ct_tcp *state,
-			 const struct sk_buff *skb, const struct tcphdr *th,
-			 const struct synproxy_options *opts)
-{
-	struct sk_buff *nskb;
-	struct iphdr *iph, *niph;
-	struct tcphdr *nth;
-	unsigned int tcp_hdr_size;
-
-	iph = ip_hdr(skb);
-
-	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
-	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
-			 GFP_ATOMIC);
-	if (nskb == NULL)
-		return;
-	skb_reserve(nskb, MAX_TCP_HEADER);
-
-	niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
 
-	skb_reset_transport_header(nskb);
-	nth = skb_put(nskb, tcp_hdr_size);
-	nth->source	= th->dest;
-	nth->dest	= th->source;
-	nth->seq	= htonl(ntohl(th->ack_seq));
-	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
-	tcp_flag_word(nth) = TCP_FLAG_ACK;
-	nth->doff	= tcp_hdr_size / 4;
-	nth->window	= htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
-	nth->check	= 0;
-	nth->urg_ptr	= 0;
-
-	synproxy_build_options(nth, opts);
-
-	synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_client_ack(struct net *net,
-			 const struct sk_buff *skb, const struct tcphdr *th,
-			 const struct synproxy_options *opts)
-{
-	struct sk_buff *nskb;
-	struct iphdr *iph, *niph;
-	struct tcphdr *nth;
-	unsigned int tcp_hdr_size;
-
-	iph = ip_hdr(skb);
-
-	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
-	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
-			 GFP_ATOMIC);
-	if (nskb == NULL)
-		return;
-	skb_reserve(nskb, MAX_TCP_HEADER);
-
-	niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
-
-	skb_reset_transport_header(nskb);
-	nth = skb_put(nskb, tcp_hdr_size);
-	nth->source	= th->source;
-	nth->dest	= th->dest;
-	nth->seq	= htonl(ntohl(th->seq) + 1);
-	nth->ack_seq	= th->ack_seq;
-	tcp_flag_word(nth) = TCP_FLAG_ACK;
-	nth->doff	= tcp_hdr_size / 4;
-	nth->window	= htons(ntohs(th->window) >> opts->wscale);
-	nth->check	= 0;
-	nth->urg_ptr	= 0;
-
-	synproxy_build_options(nth, opts);
-
-	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
-			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static bool
-synproxy_recv_client_ack(struct net *net,
-			 const struct sk_buff *skb, const struct tcphdr *th,
-			 struct synproxy_options *opts, u32 recv_seq)
-{
-	struct synproxy_net *snet = synproxy_pernet(net);
-	int mss;
-
-	mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
-	if (mss == 0) {
-		this_cpu_inc(snet->stats->cookie_invalid);
-		return false;
-	}
-
-	this_cpu_inc(snet->stats->cookie_valid);
-	opts->mss = mss;
-	opts->options |= XT_SYNPROXY_OPT_MSS;
-
-	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
-		synproxy_check_timestamp_cookie(opts);
-
-	synproxy_send_server_syn(net, skb, th, opts, recv_seq);
-	return true;
-}
+#include <net/netfilter/nf_synproxy.h>
 
 static unsigned int
 synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
@@ -309,135 +62,6 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static unsigned int ipv4_synproxy_hook(void *priv,
-				       struct sk_buff *skb,
-				       const struct nf_hook_state *nhs)
-{
-	struct net *net = nhs->net;
-	struct synproxy_net *snet = synproxy_pernet(net);
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct;
-	struct nf_conn_synproxy *synproxy;
-	struct synproxy_options opts = {};
-	const struct ip_ct_tcp *state;
-	struct tcphdr *th, _th;
-	unsigned int thoff;
-
-	ct = nf_ct_get(skb, &ctinfo);
-	if (ct == NULL)
-		return NF_ACCEPT;
-
-	synproxy = nfct_synproxy(ct);
-	if (synproxy == NULL)
-		return NF_ACCEPT;
-
-	if (nf_is_loopback_packet(skb) ||
-	    ip_hdr(skb)->protocol != IPPROTO_TCP)
-		return NF_ACCEPT;
-
-	thoff = ip_hdrlen(skb);
-	th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
-	if (th == NULL)
-		return NF_DROP;
-
-	state = &ct->proto.tcp;
-	switch (state->state) {
-	case TCP_CONNTRACK_CLOSE:
-		if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
-			nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
-						      ntohl(th->seq) + 1);
-			break;
-		}
-
-		if (!th->syn || th->ack ||
-		    CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
-			break;
-
-		/* Reopened connection - reset the sequence number and timestamp
-		 * adjustments, they will get initialized once the connection is
-		 * reestablished.
-		 */
-		nf_ct_seqadj_init(ct, ctinfo, 0);
-		synproxy->tsoff = 0;
-		this_cpu_inc(snet->stats->conn_reopened);
-
-		/* fall through */
-	case TCP_CONNTRACK_SYN_SENT:
-		if (!synproxy_parse_options(skb, thoff, th, &opts))
-			return NF_DROP;
-
-		if (!th->syn && th->ack &&
-		    CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
-			/* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
-			 * therefore we need to add 1 to make the SYN sequence
-			 * number match the one of first SYN.
-			 */
-			if (synproxy_recv_client_ack(net, skb, th, &opts,
-						     ntohl(th->seq) + 1)) {
-				this_cpu_inc(snet->stats->cookie_retrans);
-				consume_skb(skb);
-				return NF_STOLEN;
-			} else {
-				return NF_DROP;
-			}
-		}
-
-		synproxy->isn = ntohl(th->ack_seq);
-		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
-			synproxy->its = opts.tsecr;
-
-		nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
-		break;
-	case TCP_CONNTRACK_SYN_RECV:
-		if (!th->syn || !th->ack)
-			break;
-
-		if (!synproxy_parse_options(skb, thoff, th, &opts))
-			return NF_DROP;
-
-		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) {
-			synproxy->tsoff = opts.tsval - synproxy->its;
-			nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
-		}
-
-		opts.options &= ~(XT_SYNPROXY_OPT_MSS |
-				  XT_SYNPROXY_OPT_WSCALE |
-				  XT_SYNPROXY_OPT_SACK_PERM);
-
-		swap(opts.tsval, opts.tsecr);
-		synproxy_send_server_ack(net, state, skb, th, &opts);
-
-		nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
-		nf_conntrack_event_cache(IPCT_SEQADJ, ct);
-
-		swap(opts.tsval, opts.tsecr);
-		synproxy_send_client_ack(net, skb, th, &opts);
-
-		consume_skb(skb);
-		return NF_STOLEN;
-	default:
-		break;
-	}
-
-	synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
-	return NF_ACCEPT;
-}
-
-static const struct nf_hook_ops ipv4_synproxy_ops[] = {
-	{
-		.hook		= ipv4_synproxy_hook,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_IN,
-		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
-	},
-	{
-		.hook		= ipv4_synproxy_hook,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_POST_ROUTING,
-		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
-	},
-};
-
 static int synproxy_tg4_check(const struct xt_tgchk_param *par)
 {
 	struct synproxy_net *snet = synproxy_pernet(par->net);
@@ -452,13 +76,10 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par)
 	if (err)
 		return err;
 
-	if (snet->hook_ref4 == 0) {
-		err = nf_register_net_hooks(par->net, ipv4_synproxy_ops,
-					    ARRAY_SIZE(ipv4_synproxy_ops));
-		if (err) {
-			nf_ct_netns_put(par->net, par->family);
-			return err;
-		}
+	err = nf_synproxy_ipv4_init(snet, par->net);
+	if (err) {
+		nf_ct_netns_put(par->net, par->family);
+		return err;
 	}
 
 	snet->hook_ref4++;
@@ -469,10 +90,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
 {
 	struct synproxy_net *snet = synproxy_pernet(par->net);
 
-	snet->hook_ref4--;
-	if (snet->hook_ref4 == 0)
-		nf_unregister_net_hooks(par->net, ipv4_synproxy_ops,
-					ARRAY_SIZE(ipv4_synproxy_ops));
+	nf_synproxy_ipv4_fini(snet, par->net);
 	nf_ct_netns_put(par->net, par->family);
 }
 
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index cb6d42b03cb5..55a9b92d0a1f 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -6,272 +6,11 @@
  * published by the Free Software Foundation.
  */
 
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/ip6_checksum.h>
-#include <net/ip6_route.h>
-#include <net/tcp.h>
-
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_SYNPROXY.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_synproxy.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-
-static struct ipv6hdr *
-synproxy_build_ip(struct net *net, struct sk_buff *skb,
-		  const struct in6_addr *saddr,
-		  const struct in6_addr *daddr)
-{
-	struct ipv6hdr *iph;
-
-	skb_reset_network_header(skb);
-	iph = skb_put(skb, sizeof(*iph));
-	ip6_flow_hdr(iph, 0, 0);
-	iph->hop_limit	= net->ipv6.devconf_all->hop_limit;
-	iph->nexthdr	= IPPROTO_TCP;
-	iph->saddr	= *saddr;
-	iph->daddr	= *daddr;
-
-	return iph;
-}
-
-static void
-synproxy_send_tcp(struct net *net,
-		  const struct sk_buff *skb, struct sk_buff *nskb,
-		  struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
-		  struct ipv6hdr *niph, struct tcphdr *nth,
-		  unsigned int tcp_hdr_size)
-{
-	struct dst_entry *dst;
-	struct flowi6 fl6;
-
-	nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0);
-	nskb->ip_summed   = CHECKSUM_PARTIAL;
-	nskb->csum_start  = (unsigned char *)nth - nskb->head;
-	nskb->csum_offset = offsetof(struct tcphdr, check);
-
-	memset(&fl6, 0, sizeof(fl6));
-	fl6.flowi6_proto = IPPROTO_TCP;
-	fl6.saddr = niph->saddr;
-	fl6.daddr = niph->daddr;
-	fl6.fl6_sport = nth->source;
-	fl6.fl6_dport = nth->dest;
-	security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6));
-	dst = ip6_route_output(net, NULL, &fl6);
-	if (dst->error) {
-		dst_release(dst);
-		goto free_nskb;
-	}
-	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
-	if (IS_ERR(dst))
-		goto free_nskb;
-
-	skb_dst_set(nskb, dst);
-
-	if (nfct) {
-		nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
-		nf_conntrack_get(nfct);
-	}
-
-	ip6_local_out(net, nskb->sk, nskb);
-	return;
-
-free_nskb:
-	kfree_skb(nskb);
-}
-
-static void
-synproxy_send_client_synack(struct net *net,
-			    const struct sk_buff *skb, const struct tcphdr *th,
-			    const struct synproxy_options *opts)
-{
-	struct sk_buff *nskb;
-	struct ipv6hdr *iph, *niph;
-	struct tcphdr *nth;
-	unsigned int tcp_hdr_size;
-	u16 mss = opts->mss;
-
-	iph = ipv6_hdr(skb);
-
-	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
-	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
-			 GFP_ATOMIC);
-	if (nskb == NULL)
-		return;
-	skb_reserve(nskb, MAX_TCP_HEADER);
-
-	niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr);
-
-	skb_reset_transport_header(nskb);
-	nth = skb_put(nskb, tcp_hdr_size);
-	nth->source	= th->dest;
-	nth->dest	= th->source;
-	nth->seq	= htonl(__cookie_v6_init_sequence(iph, th, &mss));
-	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
-	tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
-	if (opts->options & XT_SYNPROXY_OPT_ECN)
-		tcp_flag_word(nth) |= TCP_FLAG_ECE;
-	nth->doff	= tcp_hdr_size / 4;
-	nth->window	= 0;
-	nth->check	= 0;
-	nth->urg_ptr	= 0;
-
-	synproxy_build_options(nth, opts);
-
-	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
-			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
 
-static void
-synproxy_send_server_syn(struct net *net,
-			 const struct sk_buff *skb, const struct tcphdr *th,
-			 const struct synproxy_options *opts, u32 recv_seq)
-{
-	struct synproxy_net *snet = synproxy_pernet(net);
-	struct sk_buff *nskb;
-	struct ipv6hdr *iph, *niph;
-	struct tcphdr *nth;
-	unsigned int tcp_hdr_size;
-
-	iph = ipv6_hdr(skb);
-
-	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
-	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
-			 GFP_ATOMIC);
-	if (nskb == NULL)
-		return;
-	skb_reserve(nskb, MAX_TCP_HEADER);
-
-	niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr);
-
-	skb_reset_transport_header(nskb);
-	nth = skb_put(nskb, tcp_hdr_size);
-	nth->source	= th->source;
-	nth->dest	= th->dest;
-	nth->seq	= htonl(recv_seq - 1);
-	/* ack_seq is used to relay our ISN to the synproxy hook to initialize
-	 * sequence number translation once a connection tracking entry exists.
-	 */
-	nth->ack_seq	= htonl(ntohl(th->ack_seq) - 1);
-	tcp_flag_word(nth) = TCP_FLAG_SYN;
-	if (opts->options & XT_SYNPROXY_OPT_ECN)
-		tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
-	nth->doff	= tcp_hdr_size / 4;
-	nth->window	= th->window;
-	nth->check	= 0;
-	nth->urg_ptr	= 0;
-
-	synproxy_build_options(nth, opts);
-
-	synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
-			  niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_ack(struct net *net,
-			 const struct ip_ct_tcp *state,
-			 const struct sk_buff *skb, const struct tcphdr *th,
-			 const struct synproxy_options *opts)
-{
-	struct sk_buff *nskb;
-	struct ipv6hdr *iph, *niph;
-	struct tcphdr *nth;
-	unsigned int tcp_hdr_size;
-
-	iph = ipv6_hdr(skb);
-
-	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
-	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
-			 GFP_ATOMIC);
-	if (nskb == NULL)
-		return;
-	skb_reserve(nskb, MAX_TCP_HEADER);
-
-	niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr);
-
-	skb_reset_transport_header(nskb);
-	nth = skb_put(nskb, tcp_hdr_size);
-	nth->source	= th->dest;
-	nth->dest	= th->source;
-	nth->seq	= htonl(ntohl(th->ack_seq));
-	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
-	tcp_flag_word(nth) = TCP_FLAG_ACK;
-	nth->doff	= tcp_hdr_size / 4;
-	nth->window	= htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
-	nth->check	= 0;
-	nth->urg_ptr	= 0;
-
-	synproxy_build_options(nth, opts);
-
-	synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_client_ack(struct net *net,
-			 const struct sk_buff *skb, const struct tcphdr *th,
-			 const struct synproxy_options *opts)
-{
-	struct sk_buff *nskb;
-	struct ipv6hdr *iph, *niph;
-	struct tcphdr *nth;
-	unsigned int tcp_hdr_size;
-
-	iph = ipv6_hdr(skb);
-
-	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
-	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
-			 GFP_ATOMIC);
-	if (nskb == NULL)
-		return;
-	skb_reserve(nskb, MAX_TCP_HEADER);
-
-	niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr);
-
-	skb_reset_transport_header(nskb);
-	nth = skb_put(nskb, tcp_hdr_size);
-	nth->source	= th->source;
-	nth->dest	= th->dest;
-	nth->seq	= htonl(ntohl(th->seq) + 1);
-	nth->ack_seq	= th->ack_seq;
-	tcp_flag_word(nth) = TCP_FLAG_ACK;
-	nth->doff	= tcp_hdr_size / 4;
-	nth->window	= htons(ntohs(th->window) >> opts->wscale);
-	nth->check	= 0;
-	nth->urg_ptr	= 0;
-
-	synproxy_build_options(nth, opts);
-
-	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
-			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static bool
-synproxy_recv_client_ack(struct net *net,
-			 const struct sk_buff *skb, const struct tcphdr *th,
-			 struct synproxy_options *opts, u32 recv_seq)
-{
-	struct synproxy_net *snet = synproxy_pernet(net);
-	int mss;
-
-	mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
-	if (mss == 0) {
-		this_cpu_inc(snet->stats->cookie_invalid);
-		return false;
-	}
-
-	this_cpu_inc(snet->stats->cookie_valid);
-	opts->mss = mss;
-	opts->options |= XT_SYNPROXY_OPT_MSS;
-
-	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
-		synproxy_check_timestamp_cookie(opts);
-
-	synproxy_send_server_syn(net, skb, th, opts, recv_seq);
-	return true;
-}
+#include <net/netfilter/nf_synproxy.h>
 
 static unsigned int
 synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
@@ -307,13 +46,14 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 					  XT_SYNPROXY_OPT_SACK_PERM |
 					  XT_SYNPROXY_OPT_ECN);
 
-		synproxy_send_client_synack(net, skb, th, &opts);
+		synproxy_send_client_synack_ipv6(net, skb, th, &opts);
 		consume_skb(skb);
 		return NF_STOLEN;
 
 	} else if (th->ack && !(th->fin || th->rst || th->syn)) {
 		/* ACK from client */
-		if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) {
+		if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts,
+						  ntohl(th->seq))) {
 			consume_skb(skb);
 			return NF_STOLEN;
 		} else {
@@ -324,141 +64,6 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static unsigned int ipv6_synproxy_hook(void *priv,
-				       struct sk_buff *skb,
-				       const struct nf_hook_state *nhs)
-{
-	struct net *net = nhs->net;
-	struct synproxy_net *snet = synproxy_pernet(net);
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct;
-	struct nf_conn_synproxy *synproxy;
-	struct synproxy_options opts = {};
-	const struct ip_ct_tcp *state;
-	struct tcphdr *th, _th;
-	__be16 frag_off;
-	u8 nexthdr;
-	int thoff;
-
-	ct = nf_ct_get(skb, &ctinfo);
-	if (ct == NULL)
-		return NF_ACCEPT;
-
-	synproxy = nfct_synproxy(ct);
-	if (synproxy == NULL)
-		return NF_ACCEPT;
-
-	if (nf_is_loopback_packet(skb))
-		return NF_ACCEPT;
-
-	nexthdr = ipv6_hdr(skb)->nexthdr;
-	thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
-				 &frag_off);
-	if (thoff < 0 || nexthdr != IPPROTO_TCP)
-		return NF_ACCEPT;
-
-	th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
-	if (th == NULL)
-		return NF_DROP;
-
-	state = &ct->proto.tcp;
-	switch (state->state) {
-	case TCP_CONNTRACK_CLOSE:
-		if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
-			nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
-						      ntohl(th->seq) + 1);
-			break;
-		}
-
-		if (!th->syn || th->ack ||
-		    CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
-			break;
-
-		/* Reopened connection - reset the sequence number and timestamp
-		 * adjustments, they will get initialized once the connection is
-		 * reestablished.
-		 */
-		nf_ct_seqadj_init(ct, ctinfo, 0);
-		synproxy->tsoff = 0;
-		this_cpu_inc(snet->stats->conn_reopened);
-
-		/* fall through */
-	case TCP_CONNTRACK_SYN_SENT:
-		if (!synproxy_parse_options(skb, thoff, th, &opts))
-			return NF_DROP;
-
-		if (!th->syn && th->ack &&
-		    CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
-			/* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
-			 * therefore we need to add 1 to make the SYN sequence
-			 * number match the one of first SYN.
-			 */
-			if (synproxy_recv_client_ack(net, skb, th, &opts,
-						     ntohl(th->seq) + 1)) {
-				this_cpu_inc(snet->stats->cookie_retrans);
-				consume_skb(skb);
-				return NF_STOLEN;
-			} else {
-				return NF_DROP;
-			}
-		}
-
-		synproxy->isn = ntohl(th->ack_seq);
-		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
-			synproxy->its = opts.tsecr;
-
-		nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
-		break;
-	case TCP_CONNTRACK_SYN_RECV:
-		if (!th->syn || !th->ack)
-			break;
-
-		if (!synproxy_parse_options(skb, thoff, th, &opts))
-			return NF_DROP;
-
-		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) {
-			synproxy->tsoff = opts.tsval - synproxy->its;
-			nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
-		}
-
-		opts.options &= ~(XT_SYNPROXY_OPT_MSS |
-				  XT_SYNPROXY_OPT_WSCALE |
-				  XT_SYNPROXY_OPT_SACK_PERM);
-
-		swap(opts.tsval, opts.tsecr);
-		synproxy_send_server_ack(net, state, skb, th, &opts);
-
-		nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
-		nf_conntrack_event_cache(IPCT_SEQADJ, ct);
-
-		swap(opts.tsval, opts.tsecr);
-		synproxy_send_client_ack(net, skb, th, &opts);
-
-		consume_skb(skb);
-		return NF_STOLEN;
-	default:
-		break;
-	}
-
-	synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
-	return NF_ACCEPT;
-}
-
-static const struct nf_hook_ops ipv6_synproxy_ops[] = {
-	{
-		.hook		= ipv6_synproxy_hook,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_LOCAL_IN,
-		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
-	},
-	{
-		.hook		= ipv6_synproxy_hook,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_POST_ROUTING,
-		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
-	},
-};
-
 static int synproxy_tg6_check(const struct xt_tgchk_param *par)
 {
 	struct synproxy_net *snet = synproxy_pernet(par->net);
@@ -474,16 +79,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par)
 	if (err)
 		return err;
 
-	if (snet->hook_ref6 == 0) {
-		err = nf_register_net_hooks(par->net, ipv6_synproxy_ops,
-					    ARRAY_SIZE(ipv6_synproxy_ops));
-		if (err) {
-			nf_ct_netns_put(par->net, par->family);
-			return err;
-		}
+	err = nf_synproxy_ipv6_init(snet, par->net);
+	if (err) {
+		nf_ct_netns_put(par->net, par->family);
+		return err;
 	}
 
-	snet->hook_ref6++;
 	return err;
 }
 
@@ -491,10 +92,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
 {
 	struct synproxy_net *snet = synproxy_pernet(par->net);
 
-	snet->hook_ref6--;
-	if (snet->hook_ref6 == 0)
-		nf_unregister_net_hooks(par->net, ipv6_synproxy_ops,
-					ARRAY_SIZE(ipv6_synproxy_ops));
+	nf_synproxy_ipv6_fini(snet, par->net);
 	nf_ct_netns_put(par->net, par->family);
 }
 
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 3d58a9e93e5a..50677285f82e 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -13,16 +13,16 @@
 #include <net/netns/generic.h>
 #include <linux/proc_fs.h>
 
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter/xt_tcpudp.h>
-#include <linux/netfilter/xt_SYNPROXY.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nf_SYNPROXY.h>
 
 #include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_synproxy.h>
 #include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_synproxy.h>
 
 unsigned int synproxy_net_id;
 EXPORT_SYMBOL_GPL(synproxy_net_id);
@@ -60,7 +60,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
 			case TCPOPT_MSS:
 				if (opsize == TCPOLEN_MSS) {
 					opts->mss = get_unaligned_be16(ptr);
-					opts->options |= XT_SYNPROXY_OPT_MSS;
+					opts->options |= NF_SYNPROXY_OPT_MSS;
 				}
 				break;
 			case TCPOPT_WINDOW:
@@ -68,19 +68,19 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
 					opts->wscale = *ptr;
 					if (opts->wscale > TCP_MAX_WSCALE)
 						opts->wscale = TCP_MAX_WSCALE;
-					opts->options |= XT_SYNPROXY_OPT_WSCALE;
+					opts->options |= NF_SYNPROXY_OPT_WSCALE;
 				}
 				break;
 			case TCPOPT_TIMESTAMP:
 				if (opsize == TCPOLEN_TIMESTAMP) {
 					opts->tsval = get_unaligned_be32(ptr);
 					opts->tsecr = get_unaligned_be32(ptr + 4);
-					opts->options |= XT_SYNPROXY_OPT_TIMESTAMP;
+					opts->options |= NF_SYNPROXY_OPT_TIMESTAMP;
 				}
 				break;
 			case TCPOPT_SACK_PERM:
 				if (opsize == TCPOLEN_SACK_PERM)
-					opts->options |= XT_SYNPROXY_OPT_SACK_PERM;
+					opts->options |= NF_SYNPROXY_OPT_SACK_PERM;
 				break;
 			}
 
@@ -92,36 +92,36 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
 }
 EXPORT_SYMBOL_GPL(synproxy_parse_options);
 
-unsigned int synproxy_options_size(const struct synproxy_options *opts)
+static unsigned int
+synproxy_options_size(const struct synproxy_options *opts)
 {
 	unsigned int size = 0;
 
-	if (opts->options & XT_SYNPROXY_OPT_MSS)
+	if (opts->options & NF_SYNPROXY_OPT_MSS)
 		size += TCPOLEN_MSS_ALIGNED;
-	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+	if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
 		size += TCPOLEN_TSTAMP_ALIGNED;
-	else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+	else if (opts->options & NF_SYNPROXY_OPT_SACK_PERM)
 		size += TCPOLEN_SACKPERM_ALIGNED;
-	if (opts->options & XT_SYNPROXY_OPT_WSCALE)
+	if (opts->options & NF_SYNPROXY_OPT_WSCALE)
 		size += TCPOLEN_WSCALE_ALIGNED;
 
 	return size;
 }
-EXPORT_SYMBOL_GPL(synproxy_options_size);
 
-void
+static void
 synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
 {
 	__be32 *ptr = (__be32 *)(th + 1);
 	u8 options = opts->options;
 
-	if (options & XT_SYNPROXY_OPT_MSS)
+	if (options & NF_SYNPROXY_OPT_MSS)
 		*ptr++ = htonl((TCPOPT_MSS << 24) |
 			       (TCPOLEN_MSS << 16) |
 			       opts->mss);
 
-	if (options & XT_SYNPROXY_OPT_TIMESTAMP) {
-		if (options & XT_SYNPROXY_OPT_SACK_PERM)
+	if (options & NF_SYNPROXY_OPT_TIMESTAMP) {
+		if (options & NF_SYNPROXY_OPT_SACK_PERM)
 			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
 				       (TCPOLEN_SACK_PERM << 16) |
 				       (TCPOPT_TIMESTAMP << 8) |
@@ -134,58 +134,56 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
 
 		*ptr++ = htonl(opts->tsval);
 		*ptr++ = htonl(opts->tsecr);
-	} else if (options & XT_SYNPROXY_OPT_SACK_PERM)
+	} else if (options & NF_SYNPROXY_OPT_SACK_PERM)
 		*ptr++ = htonl((TCPOPT_NOP << 24) |
 			       (TCPOPT_NOP << 16) |
 			       (TCPOPT_SACK_PERM << 8) |
 			       TCPOLEN_SACK_PERM);
 
-	if (options & XT_SYNPROXY_OPT_WSCALE)
+	if (options & NF_SYNPROXY_OPT_WSCALE)
 		*ptr++ = htonl((TCPOPT_NOP << 24) |
 			       (TCPOPT_WINDOW << 16) |
 			       (TCPOLEN_WINDOW << 8) |
 			       opts->wscale);
 }
-EXPORT_SYMBOL_GPL(synproxy_build_options);
 
-void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
+void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info,
 				    struct synproxy_options *opts)
 {
 	opts->tsecr = opts->tsval;
 	opts->tsval = tcp_time_stamp_raw() & ~0x3f;
 
-	if (opts->options & XT_SYNPROXY_OPT_WSCALE) {
+	if (opts->options & NF_SYNPROXY_OPT_WSCALE) {
 		opts->tsval |= opts->wscale;
 		opts->wscale = info->wscale;
 	} else
 		opts->tsval |= 0xf;
 
-	if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+	if (opts->options & NF_SYNPROXY_OPT_SACK_PERM)
 		opts->tsval |= 1 << 4;
 
-	if (opts->options & XT_SYNPROXY_OPT_ECN)
+	if (opts->options & NF_SYNPROXY_OPT_ECN)
 		opts->tsval |= 1 << 5;
 }
 EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie);
 
-void synproxy_check_timestamp_cookie(struct synproxy_options *opts)
+static void
+synproxy_check_timestamp_cookie(struct synproxy_options *opts)
 {
 	opts->wscale = opts->tsecr & 0xf;
 	if (opts->wscale != 0xf)
-		opts->options |= XT_SYNPROXY_OPT_WSCALE;
+		opts->options |= NF_SYNPROXY_OPT_WSCALE;
 
-	opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0;
+	opts->options |= opts->tsecr & (1 << 4) ? NF_SYNPROXY_OPT_SACK_PERM : 0;
 
-	opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0;
+	opts->options |= opts->tsecr & (1 << 5) ? NF_SYNPROXY_OPT_ECN : 0;
 }
-EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie);
 
-unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
-				    unsigned int protoff,
-				    struct tcphdr *th,
-				    struct nf_conn *ct,
-				    enum ip_conntrack_info ctinfo,
-				    const struct nf_conn_synproxy *synproxy)
+static unsigned int
+synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
+		       struct tcphdr *th, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       const struct nf_conn_synproxy *synproxy)
 {
 	unsigned int optoff, optend;
 	__be32 *ptr, old;
@@ -235,7 +233,6 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
 	}
 	return 1;
 }
-EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust);
 
 static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
 	.len		= sizeof(struct nf_conn_synproxy),
@@ -416,5 +413,830 @@ static void __exit synproxy_core_exit(void)
 module_init(synproxy_core_init);
 module_exit(synproxy_core_exit);
 
+static struct iphdr *
+synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
+		  __be32 daddr)
+{
+	struct iphdr *iph;
+
+	skb_reset_network_header(skb);
+	iph = skb_put(skb, sizeof(*iph));
+	iph->version	= 4;
+	iph->ihl	= sizeof(*iph) / 4;
+	iph->tos	= 0;
+	iph->id		= 0;
+	iph->frag_off	= htons(IP_DF);
+	iph->ttl	= net->ipv4.sysctl_ip_default_ttl;
+	iph->protocol	= IPPROTO_TCP;
+	iph->check	= 0;
+	iph->saddr	= saddr;
+	iph->daddr	= daddr;
+
+	return iph;
+}
+
+static void
+synproxy_send_tcp(struct net *net,
+		  const struct sk_buff *skb, struct sk_buff *nskb,
+		  struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+		  struct iphdr *niph, struct tcphdr *nth,
+		  unsigned int tcp_hdr_size)
+{
+	nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
+	nskb->ip_summed   = CHECKSUM_PARTIAL;
+	nskb->csum_start  = (unsigned char *)nth - nskb->head;
+	nskb->csum_offset = offsetof(struct tcphdr, check);
+
+	skb_dst_set_noref(nskb, skb_dst(skb));
+	nskb->protocol = htons(ETH_P_IP);
+	if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
+		goto free_nskb;
+
+	if (nfct) {
+		nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
+		nf_conntrack_get(nfct);
+	}
+
+	ip_local_out(net, nskb->sk, nskb);
+	return;
+
+free_nskb:
+	kfree_skb(nskb);
+}
+
+void
+synproxy_send_client_synack(struct net *net,
+			    const struct sk_buff *skb, const struct tcphdr *th,
+			    const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct iphdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+	u16 mss = opts->mss;
+
+	iph = ip_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (!nskb)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
+
+	skb_reset_transport_header(nskb);
+	nth = skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->dest;
+	nth->dest	= th->source;
+	nth->seq	= htonl(__cookie_v4_init_sequence(iph, th, &mss));
+	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
+	tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+	if (opts->options & NF_SYNPROXY_OPT_ECN)
+		tcp_flag_word(nth) |= TCP_FLAG_ECE;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= 0;
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
+}
+EXPORT_SYMBOL_GPL(synproxy_send_client_synack);
+
+static void
+synproxy_send_server_syn(struct net *net,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 const struct synproxy_options *opts, u32 recv_seq)
+{
+	struct synproxy_net *snet = synproxy_pernet(net);
+	struct sk_buff *nskb;
+	struct iphdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ip_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (!nskb)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
+
+	skb_reset_transport_header(nskb);
+	nth = skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->source;
+	nth->dest	= th->dest;
+	nth->seq	= htonl(recv_seq - 1);
+	/* ack_seq is used to relay our ISN to the synproxy hook to initialize
+	 * sequence number translation once a connection tracking entry exists.
+	 */
+	nth->ack_seq	= htonl(ntohl(th->ack_seq) - 1);
+	tcp_flag_word(nth) = TCP_FLAG_SYN;
+	if (opts->options & NF_SYNPROXY_OPT_ECN)
+		tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= th->window;
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+			  niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack(struct net *net,
+			 const struct ip_ct_tcp *state,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct iphdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ip_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (!nskb)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
+
+	skb_reset_transport_header(nskb);
+	nth = skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->dest;
+	nth->dest	= th->source;
+	nth->seq	= htonl(ntohl(th->ack_seq));
+	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
+	tcp_flag_word(nth) = TCP_FLAG_ACK;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack(struct net *net,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct iphdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ip_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (!nskb)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
+
+	skb_reset_transport_header(nskb);
+	nth = skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->source;
+	nth->dest	= th->dest;
+	nth->seq	= htonl(ntohl(th->seq) + 1);
+	nth->ack_seq	= th->ack_seq;
+	tcp_flag_word(nth) = TCP_FLAG_ACK;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= htons(ntohs(th->window) >> opts->wscale);
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
+}
+
+bool
+synproxy_recv_client_ack(struct net *net,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 struct synproxy_options *opts, u32 recv_seq)
+{
+	struct synproxy_net *snet = synproxy_pernet(net);
+	int mss;
+
+	mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
+	if (mss == 0) {
+		this_cpu_inc(snet->stats->cookie_invalid);
+		return false;
+	}
+
+	this_cpu_inc(snet->stats->cookie_valid);
+	opts->mss = mss;
+	opts->options |= NF_SYNPROXY_OPT_MSS;
+
+	if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
+		synproxy_check_timestamp_cookie(opts);
+
+	synproxy_send_server_syn(net, skb, th, opts, recv_seq);
+	return true;
+}
+EXPORT_SYMBOL_GPL(synproxy_recv_client_ack);
+
+unsigned int
+ipv4_synproxy_hook(void *priv, struct sk_buff *skb,
+		   const struct nf_hook_state *nhs)
+{
+	struct net *net = nhs->net;
+	struct synproxy_net *snet = synproxy_pernet(net);
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	struct nf_conn_synproxy *synproxy;
+	struct synproxy_options opts = {};
+	const struct ip_ct_tcp *state;
+	struct tcphdr *th, _th;
+	unsigned int thoff;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return NF_ACCEPT;
+
+	synproxy = nfct_synproxy(ct);
+	if (!synproxy)
+		return NF_ACCEPT;
+
+	if (nf_is_loopback_packet(skb) ||
+	    ip_hdr(skb)->protocol != IPPROTO_TCP)
+		return NF_ACCEPT;
+
+	thoff = ip_hdrlen(skb);
+	th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+	if (!th)
+		return NF_DROP;
+
+	state = &ct->proto.tcp;
+	switch (state->state) {
+	case TCP_CONNTRACK_CLOSE:
+		if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+			nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+						      ntohl(th->seq) + 1);
+			break;
+		}
+
+		if (!th->syn || th->ack ||
+		    CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+			break;
+
+		/* Reopened connection - reset the sequence number and timestamp
+		 * adjustments, they will get initialized once the connection is
+		 * reestablished.
+		 */
+		nf_ct_seqadj_init(ct, ctinfo, 0);
+		synproxy->tsoff = 0;
+		this_cpu_inc(snet->stats->conn_reopened);
+
+		/* fall through */
+	case TCP_CONNTRACK_SYN_SENT:
+		if (!synproxy_parse_options(skb, thoff, th, &opts))
+			return NF_DROP;
+
+		if (!th->syn && th->ack &&
+		    CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+			/* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+			 * therefore we need to add 1 to make the SYN sequence
+			 * number match the one of first SYN.
+			 */
+			if (synproxy_recv_client_ack(net, skb, th, &opts,
+						     ntohl(th->seq) + 1)) {
+				this_cpu_inc(snet->stats->cookie_retrans);
+				consume_skb(skb);
+				return NF_STOLEN;
+			} else {
+				return NF_DROP;
+			}
+		}
+
+		synproxy->isn = ntohl(th->ack_seq);
+		if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP)
+			synproxy->its = opts.tsecr;
+
+		nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+		break;
+	case TCP_CONNTRACK_SYN_RECV:
+		if (!th->syn || !th->ack)
+			break;
+
+		if (!synproxy_parse_options(skb, thoff, th, &opts))
+			return NF_DROP;
+
+		if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) {
+			synproxy->tsoff = opts.tsval - synproxy->its;
+			nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+		}
+
+		opts.options &= ~(NF_SYNPROXY_OPT_MSS |
+				  NF_SYNPROXY_OPT_WSCALE |
+				  NF_SYNPROXY_OPT_SACK_PERM);
+
+		swap(opts.tsval, opts.tsecr);
+		synproxy_send_server_ack(net, state, skb, th, &opts);
+
+		nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+		nf_conntrack_event_cache(IPCT_SEQADJ, ct);
+
+		swap(opts.tsval, opts.tsecr);
+		synproxy_send_client_ack(net, skb, th, &opts);
+
+		consume_skb(skb);
+		return NF_STOLEN;
+	default:
+		break;
+	}
+
+	synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+	return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(ipv4_synproxy_hook);
+
+static const struct nf_hook_ops ipv4_synproxy_ops[] = {
+	{
+		.hook		= ipv4_synproxy_hook,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+	},
+	{
+		.hook		= ipv4_synproxy_hook,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+	},
+};
+
+int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net)
+{
+	int err;
+
+	if (snet->hook_ref4 == 0) {
+		err = nf_register_net_hooks(net, ipv4_synproxy_ops,
+					    ARRAY_SIZE(ipv4_synproxy_ops));
+		if (err)
+			return err;
+	}
+	/* Take a reference; err is unset if the hooks were already there. */
+	snet->hook_ref4++;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init);
+
+void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net)
+{
+	snet->hook_ref4--;
+	if (snet->hook_ref4 == 0)
+		nf_unregister_net_hooks(net, ipv4_synproxy_ops,
+					ARRAY_SIZE(ipv4_synproxy_ops));
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini);
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct ipv6hdr *
+synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb,
+		       const struct in6_addr *saddr,
+		       const struct in6_addr *daddr)
+{
+	struct ipv6hdr *iph;
+
+	skb_reset_network_header(skb);
+	iph = skb_put(skb, sizeof(*iph));
+	ip6_flow_hdr(iph, 0, 0);
+	iph->hop_limit	= net->ipv6.devconf_all->hop_limit;
+	iph->nexthdr	= IPPROTO_TCP;
+	iph->saddr	= *saddr;
+	iph->daddr	= *daddr;
+
+	return iph;
+}
+
+static void
+synproxy_send_tcp_ipv6(struct net *net,
+		       const struct sk_buff *skb, struct sk_buff *nskb,
+		       struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+		       struct ipv6hdr *niph, struct tcphdr *nth,
+		       unsigned int tcp_hdr_size)
+{
+	struct dst_entry *dst;
+	struct flowi6 fl6;
+	int err;
+
+	nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0);
+	nskb->ip_summed   = CHECKSUM_PARTIAL;
+	nskb->csum_start  = (unsigned char *)nth - nskb->head;
+	nskb->csum_offset = offsetof(struct tcphdr, check);
+
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_proto = IPPROTO_TCP;
+	fl6.saddr = niph->saddr;
+	fl6.daddr = niph->daddr;
+	fl6.fl6_sport = nth->source;
+	fl6.fl6_dport = nth->dest;
+	security_skb_classify_flow((struct sk_buff *)skb,
+				   flowi6_to_flowi(&fl6));
+	err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
+	if (err) {
+		goto free_nskb;
+	}
+
+	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+	if (IS_ERR(dst))
+		goto free_nskb;
+
+	skb_dst_set(nskb, dst);
+
+	if (nfct) {
+		nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
+		nf_conntrack_get(nfct);
+	}
+
+	ip6_local_out(net, nskb->sk, nskb);
+	return;
+
+free_nskb:
+	kfree_skb(nskb);
+}
+
+void
+synproxy_send_client_synack_ipv6(struct net *net,
+				 const struct sk_buff *skb,
+				 const struct tcphdr *th,
+				 const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct ipv6hdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+	u16 mss = opts->mss;
+
+	iph = ipv6_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (!nskb)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr);
+
+	skb_reset_transport_header(nskb);
+	nth = skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->dest;
+	nth->dest	= th->source;
+	nth->seq	= htonl(nf_ipv6_cookie_init_sequence(iph, th, &mss));
+	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
+	tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+	if (opts->options & NF_SYNPROXY_OPT_ECN)
+		tcp_flag_word(nth) |= TCP_FLAG_ECE;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= 0;
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb),
+			       IP_CT_ESTABLISHED_REPLY, niph, nth,
+			       tcp_hdr_size);
+}
+EXPORT_SYMBOL_GPL(synproxy_send_client_synack_ipv6);
+
+static void
+synproxy_send_server_syn_ipv6(struct net *net, const struct sk_buff *skb,
+			      const struct tcphdr *th,
+			      const struct synproxy_options *opts, u32 recv_seq)
+{
+	struct synproxy_net *snet = synproxy_pernet(net);
+	struct sk_buff *nskb;
+	struct ipv6hdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ipv6_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (!nskb)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr);
+
+	skb_reset_transport_header(nskb);
+	nth = skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->source;
+	nth->dest	= th->dest;
+	nth->seq	= htonl(recv_seq - 1);
+	/* ack_seq is used to relay our ISN to the synproxy hook to initialize
+	 * sequence number translation once a connection tracking entry exists.
+	 */
+	nth->ack_seq	= htonl(ntohl(th->ack_seq) - 1);
+	tcp_flag_word(nth) = TCP_FLAG_SYN;
+	if (opts->options & NF_SYNPROXY_OPT_ECN)
+		tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= th->window;
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp_ipv6(net, skb, nskb, &snet->tmpl->ct_general,
+			       IP_CT_NEW, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack_ipv6(struct net *net, const struct ip_ct_tcp *state,
+			      const struct sk_buff *skb,
+			      const struct tcphdr *th,
+			      const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct ipv6hdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ipv6_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (!nskb)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr);
+
+	skb_reset_transport_header(nskb);
+	nth = skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->dest;
+	nth->dest	= th->source;
+	nth->seq	= htonl(ntohl(th->ack_seq));
+	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
+	tcp_flag_word(nth) = TCP_FLAG_ACK;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp_ipv6(net, skb, nskb, NULL, 0, niph, nth,
+			       tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack_ipv6(struct net *net, const struct sk_buff *skb,
+			      const struct tcphdr *th,
+			      const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct ipv6hdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ipv6_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (!nskb)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr);
+
+	skb_reset_transport_header(nskb);
+	nth = skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->source;
+	nth->dest	= th->dest;
+	nth->seq	= htonl(ntohl(th->seq) + 1);
+	nth->ack_seq	= th->ack_seq;
+	tcp_flag_word(nth) = TCP_FLAG_ACK;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= htons(ntohs(th->window) >> opts->wscale);
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb),
+			       IP_CT_ESTABLISHED_REPLY, niph, nth,
+			       tcp_hdr_size);
+}
+
+bool
+synproxy_recv_client_ack_ipv6(struct net *net,
+			      const struct sk_buff *skb,
+			      const struct tcphdr *th,
+			      struct synproxy_options *opts, u32 recv_seq)
+{
+	struct synproxy_net *snet = synproxy_pernet(net);
+	int mss;
+
+	mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
+	if (mss == 0) {
+		this_cpu_inc(snet->stats->cookie_invalid);
+		return false;
+	}
+
+	this_cpu_inc(snet->stats->cookie_valid);
+	opts->mss = mss;
+	opts->options |= NF_SYNPROXY_OPT_MSS;
+
+	if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
+		synproxy_check_timestamp_cookie(opts);
+
+	synproxy_send_server_syn_ipv6(net, skb, th, opts, recv_seq);
+	return true;
+}
+EXPORT_SYMBOL_GPL(synproxy_recv_client_ack_ipv6);
+
+unsigned int
+ipv6_synproxy_hook(void *priv, struct sk_buff *skb,
+		   const struct nf_hook_state *nhs)
+{
+	struct net *net = nhs->net;
+	struct synproxy_net *snet = synproxy_pernet(net);
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	struct nf_conn_synproxy *synproxy;
+	struct synproxy_options opts = {};
+	const struct ip_ct_tcp *state;
+	struct tcphdr *th, _th;
+	__be16 frag_off;
+	u8 nexthdr;
+	int thoff;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return NF_ACCEPT;
+
+	synproxy = nfct_synproxy(ct);
+	if (!synproxy)
+		return NF_ACCEPT;
+
+	if (nf_is_loopback_packet(skb))
+		return NF_ACCEPT;
+
+	nexthdr = ipv6_hdr(skb)->nexthdr;
+	thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+				 &frag_off);
+	if (thoff < 0 || nexthdr != IPPROTO_TCP)
+		return NF_ACCEPT;
+
+	th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+	if (!th)
+		return NF_DROP;
+
+	state = &ct->proto.tcp;
+	switch (state->state) {
+	case TCP_CONNTRACK_CLOSE:
+		if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+			nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+						      ntohl(th->seq) + 1);
+			break;
+		}
+
+		if (!th->syn || th->ack ||
+		    CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+			break;
+
+		/* Reopened connection - reset the sequence number and timestamp
+		 * adjustments, they will get initialized once the connection is
+		 * reestablished.
+		 */
+		nf_ct_seqadj_init(ct, ctinfo, 0);
+		synproxy->tsoff = 0;
+		this_cpu_inc(snet->stats->conn_reopened);
+
+		/* fall through */
+	case TCP_CONNTRACK_SYN_SENT:
+		if (!synproxy_parse_options(skb, thoff, th, &opts))
+			return NF_DROP;
+
+		if (!th->syn && th->ack &&
+		    CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+			/* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+			 * therefore we need to add 1 to make the SYN sequence
+			 * number match the one of first SYN.
+			 */
+			if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts,
+							  ntohl(th->seq) + 1)) {
+				this_cpu_inc(snet->stats->cookie_retrans);
+				consume_skb(skb);
+				return NF_STOLEN;
+			} else {
+				return NF_DROP;
+			}
+		}
+
+		synproxy->isn = ntohl(th->ack_seq);
+		if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP)
+			synproxy->its = opts.tsecr;
+
+		nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+		break;
+	case TCP_CONNTRACK_SYN_RECV:
+		if (!th->syn || !th->ack)
+			break;
+
+		if (!synproxy_parse_options(skb, thoff, th, &opts))
+			return NF_DROP;
+
+		if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) {
+			synproxy->tsoff = opts.tsval - synproxy->its;
+			nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+		}
+
+		opts.options &= ~(NF_SYNPROXY_OPT_MSS |
+				  NF_SYNPROXY_OPT_WSCALE |
+				  NF_SYNPROXY_OPT_SACK_PERM);
+
+		swap(opts.tsval, opts.tsecr);
+		synproxy_send_server_ack_ipv6(net, state, skb, th, &opts);
+
+		nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+		nf_conntrack_event_cache(IPCT_SEQADJ, ct);
+
+		swap(opts.tsval, opts.tsecr);
+		synproxy_send_client_ack_ipv6(net, skb, th, &opts);
+
+		consume_skb(skb);
+		return NF_STOLEN;
+	default:
+		break;
+	}
+
+	synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+	return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(ipv6_synproxy_hook);
+
+static const struct nf_hook_ops ipv6_synproxy_ops[] = {
+	{
+		.hook		= ipv6_synproxy_hook,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+	},
+	{
+		.hook		= ipv6_synproxy_hook,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+	},
+};
+
+int
+nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net)
+{
+	int err;
+
+	if (snet->hook_ref6 == 0) {
+		err = nf_register_net_hooks(net, ipv6_synproxy_ops,
+					    ARRAY_SIZE(ipv6_synproxy_ops));
+		if (err)
+			return err;
+	}
+	/* Take a reference; err is unset if the hooks were already there. */
+	snet->hook_ref6++;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init);
+
+void
+nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net)
+{
+	snet->hook_ref6--;
+	if (snet->hook_ref6 == 0)
+		nf_unregister_net_hooks(net, ipv6_synproxy_ops,
+					ARRAY_SIZE(ipv6_synproxy_ops));
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini);
+#endif /* CONFIG_IPV6 */
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-- 
cgit v1.2.3


From c681edae33e86ff27be2d6cc717663d91df20b0e Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Mon, 17 Jun 2019 10:09:33 +0200
Subject: net: ipv4: move tcp_fastopen server side code to SipHash library

Using a bare block cipher in non-crypto code is almost always a bad idea,
not only for security reasons (and we've seen some examples of this in
the kernel in the past), but also for performance reasons.

In the TCP fastopen case, we call into the bare AES block cipher one or
two times (depending on whether the connection is IPv4 or IPv6). On most
systems, this results in a call chain such as

  crypto_cipher_encrypt_one(ctx, dst, src)
    crypto_cipher_crt(tfm)->cit_encrypt_one(crypto_cipher_tfm(tfm), ...);
      aesni_encrypt
        kernel_fpu_begin();
        aesni_enc(ctx, dst, src); // asm routine
        kernel_fpu_end();

It is highly unlikely that the use of special AES instructions has a
benefit in this case, especially since we are doing the above twice
for IPv6 connections, instead of using a transform which can process
the entire input in one go.

We could switch to the cbcmac(aes) shash, which would at least get
rid of the duplicated overhead in *some* cases (i.e., today, only
arm64 has an accelerated implementation of cbcmac(aes), while x86 will
end up using the generic cbcmac template wrapping the AES-NI cipher,
which basically ends up doing exactly the above). However, in the given
context, it makes more sense to use a light-weight MAC algorithm that
is more suitable for the purpose at hand, such as SipHash.

Since the output size of SipHash already matches our chosen value for
TCP_FASTOPEN_COOKIE_SIZE, and given that it accepts arbitrary input
sizes, this greatly simplifies the code as well.

NOTE: Server farms backing a single server IP for load balancing purposes
      and sharing a single fastopen key will be adversely affected by
      this change unless all systems in the pool receive their kernel
      upgrades at the same time.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h     |  7 +---
 include/net/tcp.h       | 10 ++---
 net/Kconfig             |  2 -
 net/ipv4/tcp_fastopen.c | 97 ++++++++++++++++---------------------------------
 4 files changed, 36 insertions(+), 80 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index c23019a3b264..9ea0e71f5c6a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -58,12 +58,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 
 /* TCP Fast Open Cookie as stored in memory */
 struct tcp_fastopen_cookie {
-	union {
-		u8	val[TCP_FASTOPEN_COOKIE_MAX];
-#if IS_ENABLED(CONFIG_IPV6)
-		struct in6_addr addr;
-#endif
-	};
+	u64	val[TCP_FASTOPEN_COOKIE_MAX / sizeof(u64)];
 	s8	len;
 	bool	exp;	/* In RFC6994 experimental option format */
 };
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 96e0e53ff440..184930b02779 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1628,9 +1628,9 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
 
 /* Fastopen key context */
 struct tcp_fastopen_context {
-	struct crypto_cipher	*tfm[TCP_FASTOPEN_KEY_MAX];
-	__u8			key[TCP_FASTOPEN_KEY_BUF_LENGTH];
-	struct rcu_head		rcu;
+	__u8		key[TCP_FASTOPEN_KEY_MAX][TCP_FASTOPEN_KEY_LENGTH];
+	int		num;
+	struct rcu_head	rcu;
 };
 
 extern unsigned int sysctl_tcp_fastopen_blackhole_timeout;
@@ -1665,9 +1665,7 @@ bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc,
 static inline
 int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx)
 {
-	if (ctx->tfm[1])
-		return 2;
-	return 1;
+	return ctx->num;
 }
 
 /* Latencies incurred by various limits for a sender. They are
diff --git a/net/Kconfig b/net/Kconfig
index d122f53c6fa2..57f51a279ad6 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -67,8 +67,6 @@ source "net/xdp/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
-	select CRYPTO
-	select CRYPTO_AES
 	---help---
 	  These are the protocols used on the Internet and on most local
 	  Ethernets. It is highly recommended to say Y here (this will enlarge
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 7d19fa4c8121..46b67128e1ca 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -7,6 +7,7 @@
 #include <linux/tcp.h>
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
+#include <linux/siphash.h>
 #include <net/inetpeer.h>
 #include <net/tcp.h>
 
@@ -37,14 +38,8 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head)
 {
 	struct tcp_fastopen_context *ctx =
 	    container_of(head, struct tcp_fastopen_context, rcu);
-	int i;
 
-	/* We own ctx, thus no need to hold the Fastopen-lock */
-	for (i = 0; i < TCP_FASTOPEN_KEY_MAX; i++) {
-		if (ctx->tfm[i])
-			crypto_free_cipher(ctx->tfm[i]);
-	}
-	kfree(ctx);
+	kzfree(ctx);
 }
 
 void tcp_fastopen_destroy_cipher(struct sock *sk)
@@ -72,41 +67,6 @@ void tcp_fastopen_ctx_destroy(struct net *net)
 		call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
 }
 
-static struct tcp_fastopen_context *tcp_fastopen_alloc_ctx(void *primary_key,
-							   void *backup_key,
-							   unsigned int len)
-{
-	struct tcp_fastopen_context *new_ctx;
-	void *key = primary_key;
-	int err, i;
-
-	new_ctx = kmalloc(sizeof(*new_ctx), GFP_KERNEL);
-	if (!new_ctx)
-		return ERR_PTR(-ENOMEM);
-	for (i = 0; i < TCP_FASTOPEN_KEY_MAX; i++)
-		new_ctx->tfm[i] = NULL;
-	for (i = 0; i < (backup_key ? 2 : 1); i++) {
-		new_ctx->tfm[i] = crypto_alloc_cipher("aes", 0, 0);
-		if (IS_ERR(new_ctx->tfm[i])) {
-			err = PTR_ERR(new_ctx->tfm[i]);
-			new_ctx->tfm[i] = NULL;
-			pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
-			goto out;
-		}
-		err = crypto_cipher_setkey(new_ctx->tfm[i], key, len);
-		if (err) {
-			pr_err("TCP: TFO cipher key error: %d\n", err);
-			goto out;
-		}
-		memcpy(&new_ctx->key[i * TCP_FASTOPEN_KEY_LENGTH], key, len);
-		key = backup_key;
-	}
-	return new_ctx;
-out:
-	tcp_fastopen_ctx_free(&new_ctx->rcu);
-	return ERR_PTR(err);
-}
-
 int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
 			      void *primary_key, void *backup_key,
 			      unsigned int len)
@@ -115,11 +75,20 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
 	struct fastopen_queue *q;
 	int err = 0;
 
-	ctx = tcp_fastopen_alloc_ctx(primary_key, backup_key, len);
-	if (IS_ERR(ctx)) {
-		err = PTR_ERR(ctx);
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx) {
+		err = -ENOMEM;
 		goto out;
 	}
+
+	memcpy(ctx->key[0], primary_key, len);
+	if (backup_key) {
+		memcpy(ctx->key[1], backup_key, len);
+		ctx->num = 2;
+	} else {
+		ctx->num = 1;
+	}
+
 	spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
 	if (sk) {
 		q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
@@ -141,31 +110,30 @@ out:
 
 static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
 					     struct sk_buff *syn,
-					     struct crypto_cipher *tfm,
+					     const u8 *key,
 					     struct tcp_fastopen_cookie *foc)
 {
+	BUILD_BUG_ON(TCP_FASTOPEN_KEY_LENGTH != sizeof(siphash_key_t));
+	BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64));
+
 	if (req->rsk_ops->family == AF_INET) {
 		const struct iphdr *iph = ip_hdr(syn);
-		__be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
 
-		crypto_cipher_encrypt_one(tfm, foc->val, (void *)path);
+		foc->val[0] = siphash(&iph->saddr,
+				      sizeof(iph->saddr) +
+				      sizeof(iph->daddr),
+				      (const siphash_key_t *)key);
 		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
 		return true;
 	}
-
 #if IS_ENABLED(CONFIG_IPV6)
 	if (req->rsk_ops->family == AF_INET6) {
 		const struct ipv6hdr *ip6h = ipv6_hdr(syn);
-		struct tcp_fastopen_cookie tmp;
-		struct in6_addr *buf;
-		int i;
-
-		crypto_cipher_encrypt_one(tfm, tmp.val,
-					  (void *)&ip6h->saddr);
-		buf = &tmp.addr;
-		for (i = 0; i < 4; i++)
-			buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
-		crypto_cipher_encrypt_one(tfm, foc->val, (void *)buf);
+
+		foc->val[0] = siphash(&ip6h->saddr,
+				      sizeof(ip6h->saddr) +
+				      sizeof(ip6h->daddr),
+				      (const siphash_key_t *)key);
 		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
 		return true;
 	}
@@ -173,11 +141,8 @@ static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
 	return false;
 }
 
-/* Generate the fastopen cookie by doing aes128 encryption on both
- * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
- * addresses. For the longer IPv6 addresses use CBC-MAC.
- *
- * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
+/* Generate the fastopen cookie by applying SipHash to both the source and
+ * destination addresses.
  */
 static void tcp_fastopen_cookie_gen(struct sock *sk,
 				    struct request_sock *req,
@@ -189,7 +154,7 @@ static void tcp_fastopen_cookie_gen(struct sock *sk,
 	rcu_read_lock();
 	ctx = tcp_fastopen_get_ctx(sk);
 	if (ctx)
-		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->tfm[0], foc);
+		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->key[0], foc);
 	rcu_read_unlock();
 }
 
@@ -253,7 +218,7 @@ static int tcp_fastopen_cookie_gen_check(struct sock *sk,
 	if (!ctx)
 		goto out;
 	for (i = 0; i < tcp_fastopen_context_len(ctx); i++) {
-		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->tfm[i], foc);
+		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->key[i], foc);
 		if (tcp_fastopen_cookie_match(foc, orig)) {
 			ret = i + 1;
 			goto out;
-- 
cgit v1.2.3


From 3de205cde4ee8e36416f8b1a1510658abb14f408 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Tue, 18 Jun 2019 18:12:43 +0300
Subject: netlink: Document all fields of 'struct nl_info'

Some fields were not documented. Add documentation.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 28ece67f5312..ce66e43b9b6a 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -378,7 +378,9 @@ struct nla_policy {
 /**
  * struct nl_info - netlink source information
  * @nlh: Netlink message header of original request
+ * @nl_net: Network namespace
  * @portid: Netlink PORTID of requesting application
+ * @skip_notify: Skip netlink notifications to user space
  */
 struct nl_info {
 	struct nlmsghdr		*nlh;
-- 
cgit v1.2.3


From c82481f7ea21be8ec960a28aef07bf258f6820b7 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Tue, 18 Jun 2019 18:12:44 +0300
Subject: netlink: Add field to skip in-kernel notifications

The struct includes a 'skip_notify' flag that indicates if netlink
notifications to user space should be suppressed. As explained in commit
3b1137fe7482 ("net: ipv6: Change notifications for multipath add to
RTA_MULTIPATH"), this is useful to suppress per-nexthop RTM_NEWROUTE
notifications when an IPv6 multipath route is added / deleted. Instead,
one notification is sent for the entire multipath route.

This concept is also useful for in-kernel notifications. Sending one
in-kernel notification for the addition / deletion of an IPv6 multipath
route - instead of one per-nexthop - provides a significant increase in
the insertion / deletion rate to underlying devices.

Add a 'skip_notify_kernel' flag to suppress in-kernel notifications.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index ce66e43b9b6a..e4650e5b64a1 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -381,12 +381,14 @@ struct nla_policy {
  * @nl_net: Network namespace
  * @portid: Netlink PORTID of requesting application
  * @skip_notify: Skip netlink notifications to user space
+ * @skip_notify_kernel: Skip selected in-kernel notifications
  */
 struct nl_info {
 	struct nlmsghdr		*nlh;
 	struct net		*nl_net;
 	u32			portid;
-	bool			skip_notify;
+	u8			skip_notify:1,
+				skip_notify_kernel:1;
 };
 
 /**
-- 
cgit v1.2.3


From d4b96c7b51e8fe9bcf94c8ab8cd5717d2f005b04 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Tue, 18 Jun 2019 18:12:45 +0300
Subject: ipv6: Extend notifier info for multipath routes

Extend the IPv6 FIB notifier info with number of sibling routes being
notified.

This will later allow listeners to process one notification for a
multipath route instead of N, where N is the number of nexthops.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  7 +++++++
 net/ipv6/ip6_fib.c    | 17 +++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1e92f1500b87..7c3d5ab05879 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -377,6 +377,8 @@ typedef struct rt6_info *(*pol_lookup_t)(struct net *,
 struct fib6_entry_notifier_info {
 	struct fib_notifier_info info; /* must be first */
 	struct fib6_info *rt;
+	unsigned int nsiblings;
+	bool multipath_rt;
 };
 
 /*
@@ -450,6 +452,11 @@ int call_fib6_entry_notifiers(struct net *net,
 			      enum fib_event_type event_type,
 			      struct fib6_info *rt,
 			      struct netlink_ext_ack *extack);
+int call_fib6_multipath_entry_notifiers(struct net *net,
+					enum fib_event_type event_type,
+					struct fib6_info *rt,
+					unsigned int nsiblings,
+					struct netlink_ext_ack *extack);
 void fib6_rt_update(struct net *net, struct fib6_info *rt,
 		    struct nl_info *info);
 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1cce2082279c..df08ba8fe6fc 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -381,6 +381,23 @@ int call_fib6_entry_notifiers(struct net *net,
 	return call_fib6_notifiers(net, event_type, &info.info);
 }
 
+int call_fib6_multipath_entry_notifiers(struct net *net,
+					enum fib_event_type event_type,
+					struct fib6_info *rt,
+					unsigned int nsiblings,
+					struct netlink_ext_ack *extack)
+{
+	struct fib6_entry_notifier_info info = {
+		.info.extack = extack,
+		.rt = rt,
+		.nsiblings = nsiblings,
+		.multipath_rt = true,
+	};
+
+	rt->fib6_table->fib_seq++;
+	return call_fib6_notifiers(net, event_type, &info.info);
+}
+
 struct fib6_dump_arg {
 	struct net *net;
 	struct notifier_block *nb;
-- 
cgit v1.2.3


From d5382fef70ce273608d6fc652c24f075de3737ef Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Tue, 18 Jun 2019 18:12:57 +0300
Subject: ipv6: Stop sending in-kernel notifications for each nexthop

Both listeners - mlxsw and netdevsim - of IPv6 FIB notifications are now
ready to handle IPv6 multipath notifications.

Therefore, stop ignoring such notifications in both drivers and stop
sending a notification for each added / deleted nexthop.

v2:
* Remove 'multipath_rt' from 'struct fib6_entry_notifier_info'

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |  2 --
 drivers/net/netdevsim/fib.c                        |  7 ------
 include/net/ip6_fib.h                              |  1 -
 net/ipv6/ip6_fib.c                                 | 29 +++++++++++++---------
 4 files changed, 17 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 92ec65188e9a..e618be7ce6c6 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -6294,8 +6294,6 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 				NL_SET_ERR_MSG_MOD(info->extack, "IPv6 route with nexthop objects is not supported");
 				return notifier_from_errno(-EINVAL);
 			}
-			if (fen6_info->multipath_rt)
-				return NOTIFY_DONE;
 		}
 		break;
 	}
diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c
index 83ba5113210d..8c57ba747772 100644
--- a/drivers/net/netdevsim/fib.c
+++ b/drivers/net/netdevsim/fib.c
@@ -190,13 +190,6 @@ static int nsim_fib_event_nb(struct notifier_block *nb, unsigned long event,
 
 	case FIB_EVENT_ENTRY_ADD:  /* fall through */
 	case FIB_EVENT_ENTRY_DEL:
-		if (info->family == AF_INET6) {
-			struct fib6_entry_notifier_info *fen6_info = ptr;
-
-			if (fen6_info->multipath_rt)
-				return NOTIFY_DONE;
-		}
-
 		err = nsim_fib_event(data, info,
 				     event == FIB_EVENT_ENTRY_ADD);
 		break;
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 7c3d5ab05879..87331f2c4af0 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -378,7 +378,6 @@ struct fib6_entry_notifier_info {
 	struct fib_notifier_info info; /* must be first */
 	struct fib6_info *rt;
 	unsigned int nsiblings;
-	bool multipath_rt;
 };
 
 /*
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index df08ba8fe6fc..1d16a01eccf5 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -391,7 +391,6 @@ int call_fib6_multipath_entry_notifiers(struct net *net,
 		.info.extack = extack,
 		.rt = rt,
 		.nsiblings = nsiblings,
-		.multipath_rt = true,
 	};
 
 	rt->fib6_table->fib_seq++;
@@ -1140,11 +1139,13 @@ next_iter:
 add:
 		nlflags |= NLM_F_CREATE;
 
-		err = call_fib6_entry_notifiers(info->nl_net,
-						FIB_EVENT_ENTRY_ADD,
-						rt, extack);
-		if (err)
-			return err;
+		if (!info->skip_notify_kernel) {
+			err = call_fib6_entry_notifiers(info->nl_net,
+							FIB_EVENT_ENTRY_ADD,
+							rt, extack);
+			if (err)
+				return err;
+		}
 
 		rcu_assign_pointer(rt->fib6_next, iter);
 		fib6_info_hold(rt);
@@ -1169,11 +1170,13 @@ add:
 			return -ENOENT;
 		}
 
-		err = call_fib6_entry_notifiers(info->nl_net,
-						FIB_EVENT_ENTRY_REPLACE,
-						rt, extack);
-		if (err)
-			return err;
+		if (!info->skip_notify_kernel) {
+			err = call_fib6_entry_notifiers(info->nl_net,
+							FIB_EVENT_ENTRY_REPLACE,
+							rt, extack);
+			if (err)
+				return err;
+		}
 
 		fib6_info_hold(rt);
 		rcu_assign_pointer(rt->fib6_node, fn);
@@ -1856,9 +1859,11 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 
 	fib6_purge_rt(rt, fn, net);
 
-	call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
+	if (!info->skip_notify_kernel)
+		call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
 	if (!info->skip_notify)
 		inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
+
 	fib6_info_release(rt);
 }
 
-- 
cgit v1.2.3


From 75345f888f700c4ab2448287e35d48c760b202e6 Mon Sep 17 00:00:00 2001
From: Denis Kirjanov <kda@linux-powerpc.org>
Date: Mon, 17 Jun 2019 10:53:41 +0200
Subject: ipoib: show VF broadcast address

In the IPoIB case we can't see the broadcast address for a VF, but
we can see it for the PF.

Before:
11: ib1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 2044 qdisc pfifo_fast
state UP mode DEFAULT group default qlen 256
    link/infiniband
80:00:00:66:fe:80:00:00:00:00:00:00:24:8a:07:03:00:a4:3e:7c brd
00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff
    vf 0 MAC 14:80:00:00:66:fe, spoof checking off, link-state disable,
trust off, query_rss off
...

After:
11: ib1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 2044 qdisc pfifo_fast
state UP mode DEFAULT group default qlen 256
    link/infiniband
80:00:00:66:fe:80:00:00:00:00:00:00:24:8a:07:03:00:a4:3e:7c brd
00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff
    vf 0     link/infiniband
80:00:00:66:fe:80:00:00:00:00:00:00:24:8a:07:03:00:a4:3e:7c brd
00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff, spoof
checking off, link-state disable, trust off, query_rss off

v1->v2: add the IFLA_VF_BROADCAST constant
v2->v3: put IFLA_VF_BROADCAST at the end
to avoid KABI breakage and set NLA_REJECT
for dev_setlink

Signed-off-by: Denis Kirjanov <kda@linux-powerpc.org>
Acked-by: Doug Ledford <dledford@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 5 +++++
 net/core/rtnetlink.c         | 5 +++++
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 5b225ff63b48..6f75bda2c2d7 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -694,6 +694,7 @@ enum {
 	IFLA_VF_IB_NODE_GUID,	/* VF Infiniband node GUID */
 	IFLA_VF_IB_PORT_GUID,	/* VF Infiniband port GUID */
 	IFLA_VF_VLAN_LIST,	/* nested list of vlans, option for QinQ */
+	IFLA_VF_BROADCAST,	/* VF broadcast */
 	__IFLA_VF_MAX,
 };
 
@@ -704,6 +705,10 @@ struct ifla_vf_mac {
 	__u8 mac[32]; /* MAX_ADDR_LEN */
 };
 
+struct ifla_vf_broadcast {
+	__u8 broadcast[32];
+};
+
 struct ifla_vf_vlan {
 	__u32 vf;
 	__u32 vlan; /* 0 - 4095, 0 disables VLAN filter */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index cec60583931f..8ac81630ab5c 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -908,6 +908,7 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
 		size += num_vfs *
 			(nla_total_size(0) +
 			 nla_total_size(sizeof(struct ifla_vf_mac)) +
+			 nla_total_size(sizeof(struct ifla_vf_broadcast)) +
 			 nla_total_size(sizeof(struct ifla_vf_vlan)) +
 			 nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */
 			 nla_total_size(MAX_VLAN_LIST_LEN *
@@ -1197,6 +1198,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	struct ifla_vf_vlan vf_vlan;
 	struct ifla_vf_rate vf_rate;
 	struct ifla_vf_mac vf_mac;
+	struct ifla_vf_broadcast vf_broadcast;
 	struct ifla_vf_info ivi;
 
 	memset(&ivi, 0, sizeof(ivi));
@@ -1231,6 +1233,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 		vf_trust.vf = ivi.vf;
 
 	memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+	memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len);
 	vf_vlan.vlan = ivi.vlan;
 	vf_vlan.qos = ivi.qos;
 	vf_vlan_info.vlan = ivi.vlan;
@@ -1247,6 +1250,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	if (!vf)
 		goto nla_put_vfinfo_failure;
 	if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
+	    nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) ||
 	    nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
 	    nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
 		    &vf_rate) ||
@@ -1753,6 +1757,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
 
 static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
 	[IFLA_VF_MAC]		= { .len = sizeof(struct ifla_vf_mac) },
+	[IFLA_VF_BROADCAST]	= { .type = NLA_REJECT },
 	[IFLA_VF_VLAN]		= { .len = sizeof(struct ifla_vf_vlan) },
 	[IFLA_VF_VLAN_LIST]     = { .type = NLA_NESTED },
 	[IFLA_VF_TX_RATE]	= { .len = sizeof(struct ifla_vf_tx_rate) },
-- 
cgit v1.2.3


From 2589726d12a1b12eaaa93c7f1ea64287e383c7a5 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Sat, 15 Jun 2019 12:12:20 -0700
Subject: bpf: introduce bounded loops

Allow the verifier to validate the loops by simulating their execution.
Existing programs have used '#pragma unroll' to unroll the loops
by the compiler. Instead let the verifier simulate all iterations
of the loop.
In order to do that introduce parentage chain of bpf_verifier_state and
'branches' counter for the number of branches left to explore.
See more detailed algorithm description in bpf_verifier.h

This algorithm borrows the key idea from Edward Cree's approach:
https://patchwork.ozlabs.org/patch/877222/
Additional state pruning heuristics make such brute force loop walk
practical even for large loops.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  51 ++++++++++++++-
 kernel/bpf/verifier.c        | 143 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 181 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 704ed7971472..03037373b447 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -194,6 +194,53 @@ struct bpf_func_state {
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
+	struct bpf_verifier_state *parent;
+	/*
+	 * 'branches' field is the number of branches left to explore:
+	 * 0 - all possible paths from this state reached bpf_exit or
+	 * were safely pruned
+	 * 1 - at least one path is being explored.
+	 * This state hasn't reached bpf_exit
+	 * 2 - at least two paths are being explored.
+	 * This state is an immediate parent of two children.
+	 * One is fallthrough branch with branches==1 and another
+	 * state is pushed into stack (to be explored later) also with
+	 * branches==1. The parent of this state has branches==1.
+	 * The verifier state tree connected via 'parent' pointer looks like:
+	 * 1
+	 * 1
+	 * 2 -> 1 (first 'if' pushed into stack)
+	 * 1
+	 * 2 -> 1 (second 'if' pushed into stack)
+	 * 1
+	 * 1
+	 * 1 bpf_exit.
+	 *
+	 * Once do_check() reaches bpf_exit, it calls update_branch_counts()
+	 * and the verifier state tree will look:
+	 * 1
+	 * 1
+	 * 2 -> 1 (first 'if' pushed into stack)
+	 * 1
+	 * 1 -> 1 (second 'if' pushed into stack)
+	 * 0
+	 * 0
+	 * 0 bpf_exit.
+	 * After pop_stack() the do_check() will resume at second 'if'.
+	 *
+	 * If is_state_visited() sees a state with branches > 0 it means
+	 * there is a loop. If such state is exactly equal to the current state
+	 * it's an infinite loop. Note states_equal() checks for states
+	 * equvalency, so two states being 'states_equal' does not mean
+	 * infinite loop. The exact comparison is provided by
+	 * states_maybe_looping() function. It's a stronger pre-check and
+	 * much faster than states_equal().
+	 *
+	 * This algorithm may not find all possible infinite loops or
+	 * loop iteration count may be too high.
+	 * In such cases BPF_COMPLEXITY_LIMIT_INSNS limit kicks in.
+	 */
+	u32 branches;
 	u32 insn_idx;
 	u32 curframe;
 	u32 active_spin_lock;
@@ -312,7 +359,9 @@ struct bpf_verifier_env {
 	} cfg;
 	u32 subprog_cnt;
 	/* number of instructions analyzed by the verifier */
-	u32 insn_processed;
+	u32 prev_insn_processed, insn_processed;
+	/* number of jmps, calls, exits analyzed so far */
+	u32 prev_jmps_processed, jmps_processed;
 	/* total verification time */
 	u64 verification_time;
 	/* maximum number of verifier states kept in 'branching' instructions */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8d3a4ef1d969..25baa3c8cdd2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -721,6 +721,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->speculative = src->speculative;
 	dst_state->curframe = src->curframe;
 	dst_state->active_spin_lock = src->active_spin_lock;
+	dst_state->branches = src->branches;
+	dst_state->parent = src->parent;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -736,6 +738,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	return 0;
 }
 
+static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+	while (st) {
+		u32 br = --st->branches;
+
+		/* WARN_ON(br > 1) technically makes sense here,
+		 * but see comment in push_stack(), hence:
+		 */
+		WARN_ONCE((int)br < 0,
+			  "BUG update_branch_counts:branches_to_explore=%d\n",
+			  br);
+		if (br)
+			break;
+		st = st->parent;
+	}
+}
+
 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
 		     int *insn_idx)
 {
@@ -789,6 +808,18 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 			env->stack_size);
 		goto err;
 	}
+	if (elem->st.parent) {
+		++elem->st.parent->branches;
+		/* WARN_ON(branches > 2) technically makes sense here,
+		 * but
+		 * 1. speculative states will bump 'branches' for non-branch
+		 * instructions
+		 * 2. is_state_visited() heuristics may decide not to create
+		 * a new state for a sequence of branches and all such current
+		 * and cloned states will be pointing to a single parent state
+		 * which might have large 'branches' count.
+		 */
+	}
 	return &elem->st;
 err:
 	free_verifier_state(env->cur_state, true);
@@ -5682,7 +5713,8 @@ static void init_explored_state(struct bpf_verifier_env *env, int idx)
  * w - next instruction
  * e - edge
  */
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
+		     bool loop_ok)
 {
 	int *insn_stack = env->cfg.insn_stack;
 	int *insn_state = env->cfg.insn_state;
@@ -5712,6 +5744,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 		insn_stack[env->cfg.cur_stack++] = w;
 		return 1;
 	} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+		if (loop_ok && env->allow_ptr_leaks)
+			return 0;
 		verbose_linfo(env, t, "%d: ", t);
 		verbose_linfo(env, w, "%d: ", w);
 		verbose(env, "back-edge from insn %d to %d\n", t, w);
@@ -5763,7 +5797,7 @@ peek_stack:
 		if (opcode == BPF_EXIT) {
 			goto mark_explored;
 		} else if (opcode == BPF_CALL) {
-			ret = push_insn(t, t + 1, FALLTHROUGH, env);
+			ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
 			if (ret == 1)
 				goto peek_stack;
 			else if (ret < 0)
@@ -5772,7 +5806,8 @@ peek_stack:
 				init_explored_state(env, t + 1);
 			if (insns[t].src_reg == BPF_PSEUDO_CALL) {
 				init_explored_state(env, t);
-				ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+				ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
+						env, false);
 				if (ret == 1)
 					goto peek_stack;
 				else if (ret < 0)
@@ -5785,7 +5820,7 @@ peek_stack:
 			}
 			/* unconditional jump with single edge */
 			ret = push_insn(t, t + insns[t].off + 1,
-					FALLTHROUGH, env);
+					FALLTHROUGH, env, true);
 			if (ret == 1)
 				goto peek_stack;
 			else if (ret < 0)
@@ -5798,13 +5833,13 @@ peek_stack:
 		} else {
 			/* conditional jump with two edges */
 			init_explored_state(env, t);
-			ret = push_insn(t, t + 1, FALLTHROUGH, env);
+			ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
 			if (ret == 1)
 				goto peek_stack;
 			else if (ret < 0)
 				goto err_free;
 
-			ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
+			ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
 			if (ret == 1)
 				goto peek_stack;
 			else if (ret < 0)
@@ -5814,7 +5849,7 @@ peek_stack:
 		/* all other non-branch instructions with single
 		 * fall-through edge
 		 */
-		ret = push_insn(t, t + 1, FALLTHROUGH, env);
+		ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
 		if (ret == 1)
 			goto peek_stack;
 		else if (ret < 0)
@@ -6247,6 +6282,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
 
 	sl = *explored_state(env, insn);
 	while (sl) {
+		if (sl->state.branches)
+			goto next;
 		if (sl->state.insn_idx != insn ||
 		    sl->state.curframe != cur->curframe)
 			goto next;
@@ -6611,12 +6648,32 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static bool states_maybe_looping(struct bpf_verifier_state *old,
+				 struct bpf_verifier_state *cur)
+{
+	struct bpf_func_state *fold, *fcur;
+	int i, fr = cur->curframe;
+
+	if (old->curframe != fr)
+		return false;
+
+	fold = old->frame[fr];
+	fcur = cur->frame[fr];
+	for (i = 0; i < MAX_BPF_REG; i++)
+		if (memcmp(&fold->regs[i], &fcur->regs[i],
+			   offsetof(struct bpf_reg_state, parent)))
+			return false;
+	return true;
+}
+
+
 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl, **pprev;
 	struct bpf_verifier_state *cur = env->cur_state, *new;
 	int i, j, err, states_cnt = 0;
+	bool add_new_state = false;
 
 	if (!env->insn_aux_data[insn_idx].prune_point)
 		/* this 'insn_idx' instruction wasn't marked, so we will not
@@ -6624,6 +6681,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		 */
 		return 0;
 
+	/* bpf progs typically have pruning point every 4 instructions
+	 * http://vger.kernel.org/bpfconf2019.html#session-1
+	 * Do not add new state for future pruning if the verifier hasn't seen
+	 * at least 2 jumps and at least 8 instructions.
+	 * This heuristics helps decrease 'total_states' and 'peak_states' metric.
+	 * In tests that amounts to up to 50% reduction into total verifier
+	 * memory consumption and 20% verifier time speedup.
+	 */
+	if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
+	    env->insn_processed - env->prev_insn_processed >= 8)
+		add_new_state = true;
+
 	pprev = explored_state(env, insn_idx);
 	sl = *pprev;
 
@@ -6633,6 +6702,30 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		states_cnt++;
 		if (sl->state.insn_idx != insn_idx)
 			goto next;
+		if (sl->state.branches) {
+			if (states_maybe_looping(&sl->state, cur) &&
+			    states_equal(env, &sl->state, cur)) {
+				verbose_linfo(env, insn_idx, "; ");
+				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+				return -EINVAL;
+			}
+			/* if the verifier is processing a loop, avoid adding new state
+			 * too often, since different loop iterations have distinct
+			 * states and may not help future pruning.
+			 * This threshold shouldn't be too low to make sure that
+			 * a loop with large bound will be rejected quickly.
+			 * The most abusive loop will be:
+			 * r1 += 1
+			 * if r1 < 1000000 goto pc-2
+			 * 1M insn_procssed limit / 100 == 10k peak states.
+			 * This threshold shouldn't be too high either, since states
+			 * at the end of the loop are likely to be useful in pruning.
+			 */
+			if (env->jmps_processed - env->prev_jmps_processed < 20 &&
+			    env->insn_processed - env->prev_insn_processed < 100)
+				add_new_state = false;
+			goto miss;
+		}
 		if (states_equal(env, &sl->state, cur)) {
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -6650,7 +6743,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				return err;
 			return 1;
 		}
-		sl->miss_cnt++;
+miss:
+		/* when new state is not going to be added do not increase miss count.
+		 * Otherwise several loop iterations will remove the state
+		 * recorded earlier. The goal of these heuristics is to have
+		 * states from some iterations of the loop (some in the beginning
+		 * and some at the end) to help pruning.
+		 */
+		if (add_new_state)
+			sl->miss_cnt++;
 		/* heuristic to determine whether this state is beneficial
 		 * to keep checking from state equivalence point of view.
 		 * Higher numbers increase max_states_per_insn and verification time,
@@ -6662,6 +6763,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 */
 			*pprev = sl->next;
 			if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
+				u32 br = sl->state.branches;
+
+				WARN_ONCE(br,
+					  "BUG live_done but branches_to_explore %d\n",
+					  br);
 				free_verifier_state(&sl->state, false);
 				kfree(sl);
 				env->peak_states--;
@@ -6687,18 +6793,25 @@ next:
 	if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
 		return 0;
 
-	/* there were no equivalent states, remember current one.
-	 * technically the current state is not proven to be safe yet,
+	if (!add_new_state)
+		return 0;
+
+	/* There were no equivalent states, remember the current one.
+	 * Technically the current state is not proven to be safe yet,
 	 * but it will either reach outer most bpf_exit (which means it's safe)
-	 * or it will be rejected. Since there are no loops, we won't be
+	 * or it will be rejected. When there are no loops the verifier won't be
 	 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
-	 * again on the way to bpf_exit
+	 * again on the way to bpf_exit.
+	 * When looping the sl->state.branches will be > 0 and this state
+	 * will not be considered for equivalence until branches == 0.
 	 */
 	new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
 	if (!new_sl)
 		return -ENOMEM;
 	env->total_states++;
 	env->peak_states++;
+	env->prev_jmps_processed = env->jmps_processed;
+	env->prev_insn_processed = env->insn_processed;
 
 	/* add new state to the head of linked list */
 	new = &new_sl->state;
@@ -6709,6 +6822,9 @@ next:
 		return err;
 	}
 	new->insn_idx = insn_idx;
+	WARN_ONCE(new->branches != 1,
+		  "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
+	cur->parent = new;
 	new_sl->next = *explored_state(env, insn_idx);
 	*explored_state(env, insn_idx) = new_sl;
 	/* connect new state to parentage chain. Current frame needs all
@@ -6795,6 +6911,7 @@ static int do_check(struct bpf_verifier_env *env)
 		return -ENOMEM;
 	state->curframe = 0;
 	state->speculative = false;
+	state->branches = 1;
 	state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
 	if (!state->frame[0]) {
 		kfree(state);
@@ -7001,6 +7118,7 @@ static int do_check(struct bpf_verifier_env *env)
 		} else if (class == BPF_JMP || class == BPF_JMP32) {
 			u8 opcode = BPF_OP(insn->code);
 
+			env->jmps_processed++;
 			if (opcode == BPF_CALL) {
 				if (BPF_SRC(insn->code) != BPF_K ||
 				    insn->off != 0 ||
@@ -7086,6 +7204,7 @@ static int do_check(struct bpf_verifier_env *env)
 				if (err)
 					return err;
 process_bpf_exit:
+				update_branch_counts(env, env->cur_state);
 				err = pop_stack(env, &env->prev_insn_idx,
 						&env->insn_idx);
 				if (err < 0) {
-- 
cgit v1.2.3


From b5dc0163d8fd78e64a7e21f309cf932fda34353e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Sat, 15 Jun 2019 12:12:25 -0700
Subject: bpf: precise scalar_value tracking

Introduce precision tracking logic that
helps cilium programs the most:
                  old clang  old clang    new clang  new clang
                          with all patches         with all patches
bpf_lb-DLB_L3.o      1838     2283         1923       1863
bpf_lb-DLB_L4.o      3218     2657         3077       2468
bpf_lb-DUNKNOWN.o    1064     545          1062       544
bpf_lxc-DDROP_ALL.o  26935    23045        166729     22629
bpf_lxc-DUNKNOWN.o   34439    35240        174607     28805
bpf_netdev.o         9721     8753         8407       6801
bpf_overlay.o        6184     7901         5420       4754
bpf_lxc_jit.o        39389    50925        39389      50925

Consider code:
654: (85) call bpf_get_hash_recalc#34
655: (bf) r7 = r0
656: (15) if r8 == 0x0 goto pc+29
657: (bf) r2 = r10
658: (07) r2 += -48
659: (18) r1 = 0xffff8881e41e1b00
661: (85) call bpf_map_lookup_elem#1
662: (15) if r0 == 0x0 goto pc+23
663: (69) r1 = *(u16 *)(r0 +0)
664: (15) if r1 == 0x0 goto pc+21
665: (bf) r8 = r7
666: (57) r8 &= 65535
667: (bf) r2 = r8
668: (3f) r2 /= r1
669: (2f) r2 *= r1
670: (bf) r1 = r8
671: (1f) r1 -= r2
672: (57) r1 &= 255
673: (25) if r1 > 0x1e goto pc+12
 R0=map_value(id=0,off=0,ks=20,vs=64,imm=0) R1_w=inv(id=0,umax_value=30,var_off=(0x0; 0x1f))
674: (67) r1 <<= 1
675: (0f) r0 += r1

At this point the verifier will notice that scalar R1 is used in map pointer adjustment.
R1 has to be precise for later operations on R0 to be validated properly.

The verifier will backtrack the above code in the following way:
last_idx 675 first_idx 664
regs=2 stack=0 before 675: (0f) r0 += r1         // started backtracking R1 regs=2 is a bitmask
regs=2 stack=0 before 674: (67) r1 <<= 1
regs=2 stack=0 before 673: (25) if r1 > 0x1e goto pc+12
regs=2 stack=0 before 672: (57) r1 &= 255
regs=2 stack=0 before 671: (1f) r1 -= r2         // now both R1 and R2 have to be precise -> regs=6 mask
regs=6 stack=0 before 670: (bf) r1 = r8          // after this insn R8 and R2 have to be precise
regs=104 stack=0 before 669: (2f) r2 *= r1       // after this one R8, R2, and R1
regs=106 stack=0 before 668: (3f) r2 /= r1
regs=106 stack=0 before 667: (bf) r2 = r8
regs=102 stack=0 before 666: (57) r8 &= 65535
regs=102 stack=0 before 665: (bf) r8 = r7
regs=82 stack=0 before 664: (15) if r1 == 0x0 goto pc+21
 // this is the end of verifier state. The following regs will be marked precise:
 R1_rw=invP(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R7_rw=invP(id=0)
parent didn't have regs=82 stack=0 marks         // so backtracking continues into parent state
last_idx 663 first_idx 655
regs=82 stack=0 before 663: (69) r1 = *(u16 *)(r0 +0)   // R1 was assigned no need to track it further
regs=80 stack=0 before 662: (15) if r0 == 0x0 goto pc+23    // keep tracking R7
regs=80 stack=0 before 661: (85) call bpf_map_lookup_elem#1  // keep tracking R7
regs=80 stack=0 before 659: (18) r1 = 0xffff8881e41e1b00
regs=80 stack=0 before 658: (07) r2 += -48
regs=80 stack=0 before 657: (bf) r2 = r10
regs=80 stack=0 before 656: (15) if r8 == 0x0 goto pc+29
regs=80 stack=0 before 655: (bf) r7 = r0                // here the assignment into R7
 // mark R0 to be precise:
 R0_rw=invP(id=0)
parent didn't have regs=1 stack=0 marks                 // regs=1 -> tracking R0
last_idx 654 first_idx 644
regs=1 stack=0 before 654: (85) call bpf_get_hash_recalc#34 // and in the parent frame it was a return value
  // nothing further to backtrack

Two scalar registers not marked precise are equivalent from state pruning point of view.
More details in the patch comments.

It doesn't support bpf2bpf calls yet and is enabled for root only.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  18 ++
 kernel/bpf/verifier.c        | 491 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 498 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 03037373b447..19393b0964a8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -139,6 +139,8 @@ struct bpf_reg_state {
 	 */
 	s32 subreg_def;
 	enum bpf_reg_liveness live;
+	/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
+	bool precise;
 };
 
 enum bpf_stack_slot_type {
@@ -190,6 +192,11 @@ struct bpf_func_state {
 	struct bpf_stack_state *stack;
 };
 
+struct bpf_idx_pair {
+	u32 prev_idx;
+	u32 idx;
+};
+
 #define MAX_CALL_FRAMES 8
 struct bpf_verifier_state {
 	/* call stack tracking */
@@ -245,6 +252,17 @@ struct bpf_verifier_state {
 	u32 curframe;
 	u32 active_spin_lock;
 	bool speculative;
+
+	/* first and last insn idx of this verifier state */
+	u32 first_insn_idx;
+	u32 last_insn_idx;
+	/* jmp history recorded from first to last.
+	 * backtracking is using it to go from last to first.
+	 * For most states jmp_history_cnt is [0-3].
+	 * For loops can go up to ~40.
+	 */
+	struct bpf_idx_pair *jmp_history;
+	u32 jmp_history_cnt;
 };
 
 #define bpf_get_spilled_reg(slot, frame)				\
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 870c8f19ce80..709ce4cef8ba 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -455,12 +455,12 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 		verbose(env, " R%d", i);
 		print_liveness(env, reg->live);
 		verbose(env, "=%s", reg_type_str[t]);
+		if (t == SCALAR_VALUE && reg->precise)
+			verbose(env, "P");
 		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
 		    tnum_is_const(reg->var_off)) {
 			/* reg->off should be 0 for SCALAR_VALUE */
 			verbose(env, "%lld", reg->var_off.value + reg->off);
-			if (t == PTR_TO_STACK)
-				verbose(env, ",call_%d", func(env, reg)->callsite);
 		} else {
 			verbose(env, "(id=%d", reg->id);
 			if (reg_type_may_be_refcounted_or_null(t))
@@ -522,11 +522,17 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			continue;
 		verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
 		print_liveness(env, state->stack[i].spilled_ptr.live);
-		if (state->stack[i].slot_type[0] == STACK_SPILL)
-			verbose(env, "=%s",
-				reg_type_str[state->stack[i].spilled_ptr.type]);
-		else
+		if (state->stack[i].slot_type[0] == STACK_SPILL) {
+			reg = &state->stack[i].spilled_ptr;
+			t = reg->type;
+			verbose(env, "=%s", reg_type_str[t]);
+			if (t == SCALAR_VALUE && reg->precise)
+				verbose(env, "P");
+			if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
+				verbose(env, "%lld", reg->var_off.value + reg->off);
+		} else {
 			verbose(env, "=%s", types_buf);
+		}
 	}
 	if (state->acquired_refs && state->refs[0].id) {
 		verbose(env, " refs=%d", state->refs[0].id);
@@ -675,6 +681,13 @@ static void free_func_state(struct bpf_func_state *state)
 	kfree(state);
 }
 
+static void clear_jmp_history(struct bpf_verifier_state *state)
+{
+	kfree(state->jmp_history);
+	state->jmp_history = NULL;
+	state->jmp_history_cnt = 0;
+}
+
 static void free_verifier_state(struct bpf_verifier_state *state,
 				bool free_self)
 {
@@ -684,6 +697,7 @@ static void free_verifier_state(struct bpf_verifier_state *state,
 		free_func_state(state->frame[i]);
 		state->frame[i] = NULL;
 	}
+	clear_jmp_history(state);
 	if (free_self)
 		kfree(state);
 }
@@ -711,8 +725,18 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 			       const struct bpf_verifier_state *src)
 {
 	struct bpf_func_state *dst;
+	u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt;
 	int i, err;
 
+	if (dst_state->jmp_history_cnt < src->jmp_history_cnt) {
+		kfree(dst_state->jmp_history);
+		dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER);
+		if (!dst_state->jmp_history)
+			return -ENOMEM;
+	}
+	memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz);
+	dst_state->jmp_history_cnt = src->jmp_history_cnt;
+
 	/* if dst has more stack frames then src frame, free them */
 	for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
 		free_func_state(dst_state->frame[i]);
@@ -723,6 +747,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->active_spin_lock = src->active_spin_lock;
 	dst_state->branches = src->branches;
 	dst_state->parent = src->parent;
+	dst_state->first_insn_idx = src->first_insn_idx;
+	dst_state->last_insn_idx = src->last_insn_idx;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -967,6 +993,9 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
 	reg->smax_value = S64_MAX;
 	reg->umin_value = 0;
 	reg->umax_value = U64_MAX;
+
+	/* constant backtracking is enabled for root only for now */
+	reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
 }
 
 /* Mark a register as having a completely unknown (scalar) value. */
@@ -1378,6 +1407,389 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 	return 0;
 }
 
+/* for any branch, call, exit record the history of jmps in the given state */
+static int push_jmp_history(struct bpf_verifier_env *env,
+			    struct bpf_verifier_state *cur)
+{
+	u32 cnt = cur->jmp_history_cnt;
+	struct bpf_idx_pair *p;
+
+	cnt++;
+	p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
+	if (!p)
+		return -ENOMEM;
+	p[cnt - 1].idx = env->insn_idx;
+	p[cnt - 1].prev_idx = env->prev_insn_idx;
+	cur->jmp_history = p;
+	cur->jmp_history_cnt = cnt;
+	return 0;
+}
+
+/* Backtrack one insn at a time. If idx is not at the top of recorded
+ * history then previous instruction came from straight line execution.
+ */
+static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
+			     u32 *history)
+{
+	u32 cnt = *history;
+
+	if (cnt && st->jmp_history[cnt - 1].idx == i) {
+		i = st->jmp_history[cnt - 1].prev_idx;
+		(*history)--;
+	} else {
+		i--;
+	}
+	return i;
+}
+
+/* For given verifier state backtrack_insn() is called from the last insn to
+ * the first insn. Its purpose is to compute a bitmask of registers and
+ * stack slots that needs precision in the parent verifier state.
+ */
+static int backtrack_insn(struct bpf_verifier_env *env, int idx,
+			  u32 *reg_mask, u64 *stack_mask)
+{
+	const struct bpf_insn_cbs cbs = {
+		.cb_print	= verbose,
+		.private_data	= env,
+	};
+	struct bpf_insn *insn = env->prog->insnsi + idx;
+	u8 class = BPF_CLASS(insn->code);
+	u8 opcode = BPF_OP(insn->code);
+	u8 mode = BPF_MODE(insn->code);
+	u32 dreg = 1u << insn->dst_reg;
+	u32 sreg = 1u << insn->src_reg;
+	u32 spi;
+
+	if (insn->code == 0)
+		return 0;
+	if (env->log.level & BPF_LOG_LEVEL) {
+		verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
+		verbose(env, "%d: ", idx);
+		print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+	}
+
+	if (class == BPF_ALU || class == BPF_ALU64) {
+		if (!(*reg_mask & dreg))
+			return 0;
+		if (opcode == BPF_MOV) {
+			if (BPF_SRC(insn->code) == BPF_X) {
+				/* dreg = sreg
+				 * dreg needs precision after this insn
+				 * sreg needs precision before this insn
+				 */
+				*reg_mask &= ~dreg;
+				*reg_mask |= sreg;
+			} else {
+				/* dreg = K
+				 * dreg needs precision after this insn.
+				 * Corresponding register is already marked
+				 * as precise=true in this verifier state.
+				 * No further markings in parent are necessary
+				 */
+				*reg_mask &= ~dreg;
+			}
+		} else {
+			if (BPF_SRC(insn->code) == BPF_X) {
+				/* dreg += sreg
+				 * both dreg and sreg need precision
+				 * before this insn
+				 */
+				*reg_mask |= sreg;
+			} /* else dreg += K
+			   * dreg still needs precision before this insn
+			   */
+		}
+	} else if (class == BPF_LDX) {
+		if (!(*reg_mask & dreg))
+			return 0;
+		*reg_mask &= ~dreg;
+
+		/* scalars can only be spilled into stack w/o losing precision.
+		 * Load from any other memory can be zero extended.
+		 * The desire to keep that precision is already indicated
+		 * by 'precise' mark in corresponding register of this state.
+		 * No further tracking necessary.
+		 */
+		if (insn->src_reg != BPF_REG_FP)
+			return 0;
+		if (BPF_SIZE(insn->code) != BPF_DW)
+			return 0;
+
+		/* dreg = *(u64 *)[fp - off] was a fill from the stack.
+		 * that [fp - off] slot contains scalar that needs to be
+		 * tracked with precision
+		 */
+		spi = (-insn->off - 1) / BPF_REG_SIZE;
+		if (spi >= 64) {
+			verbose(env, "BUG spi %d\n", spi);
+			WARN_ONCE(1, "verifier backtracking bug");
+			return -EFAULT;
+		}
+		*stack_mask |= 1ull << spi;
+	} else if (class == BPF_STX) {
+		if (*reg_mask & dreg)
+			/* stx shouldn't be using _scalar_ dst_reg
+			 * to access memory. It means backtracking
+			 * encountered a case of pointer subtraction.
+			 */
+			return -ENOTSUPP;
+		/* scalars can only be spilled into stack */
+		if (insn->dst_reg != BPF_REG_FP)
+			return 0;
+		if (BPF_SIZE(insn->code) != BPF_DW)
+			return 0;
+		spi = (-insn->off - 1) / BPF_REG_SIZE;
+		if (spi >= 64) {
+			verbose(env, "BUG spi %d\n", spi);
+			WARN_ONCE(1, "verifier backtracking bug");
+			return -EFAULT;
+		}
+		if (!(*stack_mask & (1ull << spi)))
+			return 0;
+		*stack_mask &= ~(1ull << spi);
+		*reg_mask |= sreg;
+	} else if (class == BPF_JMP || class == BPF_JMP32) {
+		if (opcode == BPF_CALL) {
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				return -ENOTSUPP;
+			/* regular helper call sets R0 */
+			*reg_mask &= ~1;
+			if (*reg_mask & 0x3f) {
+				/* if backtracking was looking for registers R1-R5
+				 * they should have been found already.
+				 */
+				verbose(env, "BUG regs %x\n", *reg_mask);
+				WARN_ONCE(1, "verifier backtracking bug");
+				return -EFAULT;
+			}
+		} else if (opcode == BPF_EXIT) {
+			return -ENOTSUPP;
+		}
+	} else if (class == BPF_LD) {
+		if (!(*reg_mask & dreg))
+			return 0;
+		*reg_mask &= ~dreg;
+		/* It's ld_imm64 or ld_abs or ld_ind.
+		 * For ld_imm64 no further tracking of precision
+		 * into parent is necessary
+		 */
+		if (mode == BPF_IND || mode == BPF_ABS)
+			/* to be analyzed */
+			return -ENOTSUPP;
+	} else if (class == BPF_ST) {
+		if (*reg_mask & dreg)
+			/* likely pointer subtraction */
+			return -ENOTSUPP;
+	}
+	return 0;
+}
+
+/* the scalar precision tracking algorithm:
+ * . at the start all registers have precise=false.
+ * . scalar ranges are tracked as normal through alu and jmp insns.
+ * . once precise value of the scalar register is used in:
+ *   .  ptr + scalar alu
+ *   . if (scalar cond K|scalar)
+ *   .  helper_call(.., scalar, ...) where ARG_CONST is expected
+ *   backtrack through the verifier states and mark all registers and
+ *   stack slots with spilled constants that these scalar registers
+ *   should be precise.
+ * . during state pruning two registers (or spilled stack slots)
+ *   are equivalent if both are not precise.
+ *
+ * Note the verifier cannot simply walk register parentage chain,
+ * since many different registers and stack slots could have been
+ * used to compute single precise scalar.
+ *
+ * The approach of starting with precise=true for all registers and then
+ * backtrack to mark a register as not precise when the verifier detects
+ * that program doesn't care about specific value (e.g., when helper
+ * takes register as ARG_ANYTHING parameter) is not safe.
+ *
+ * It's ok to walk single parentage chain of the verifier states.
+ * It's possible that this backtracking will go all the way till 1st insn.
+ * All other branches will be explored for needing precision later.
+ *
+ * The backtracking needs to deal with cases like:
+ *   R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
+ * r9 -= r8
+ * r5 = r9
+ * if r5 > 0x79f goto pc+7
+ *    R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
+ * r5 += 1
+ * ...
+ * call bpf_perf_event_output#25
+ *   where .arg5_type = ARG_CONST_SIZE_OR_ZERO
+ *
+ * and this case:
+ * r6 = 1
+ * call foo // uses callee's r6 inside to compute r0
+ * r0 += r6
+ * if r0 == 0 goto
+ *
+ * to track above reg_mask/stack_mask needs to be independent for each frame.
+ *
+ * Also if parent's curframe > frame where backtracking started,
+ * the verifier needs to mark registers in both frames, otherwise callees
+ * may incorrectly prune callers. This is similar to
+ * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
+ *
+ * For now backtracking falls back into conservative marking.
+ */
+static void mark_all_scalars_precise(struct bpf_verifier_env *env,
+				     struct bpf_verifier_state *st)
+{
+	struct bpf_func_state *func;
+	struct bpf_reg_state *reg;
+	int i, j;
+
+	/* big hammer: mark all scalars precise in this path.
+	 * pop_stack may still get !precise scalars.
+	 */
+	for (; st; st = st->parent)
+		for (i = 0; i <= st->curframe; i++) {
+			func = st->frame[i];
+			for (j = 0; j < BPF_REG_FP; j++) {
+				reg = &func->regs[j];
+				if (reg->type != SCALAR_VALUE)
+					continue;
+				reg->precise = true;
+			}
+			for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+				if (func->stack[j].slot_type[0] != STACK_SPILL)
+					continue;
+				reg = &func->stack[j].spilled_ptr;
+				if (reg->type != SCALAR_VALUE)
+					continue;
+				reg->precise = true;
+			}
+		}
+}
+
+static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+{
+	struct bpf_verifier_state *st = env->cur_state;
+	int first_idx = st->first_insn_idx;
+	int last_idx = env->insn_idx;
+	struct bpf_func_state *func;
+	struct bpf_reg_state *reg;
+	u32 reg_mask = 1u << regno;
+	u64 stack_mask = 0;
+	bool skip_first = true;
+	int i, err;
+
+	if (!env->allow_ptr_leaks)
+		/* backtracking is root only for now */
+		return 0;
+
+	func = st->frame[st->curframe];
+	reg = &func->regs[regno];
+	if (reg->type != SCALAR_VALUE) {
+		WARN_ONCE(1, "backtracing misuse");
+		return -EFAULT;
+	}
+	if (reg->precise)
+		return 0;
+	func->regs[regno].precise = true;
+
+	for (;;) {
+		DECLARE_BITMAP(mask, 64);
+		bool new_marks = false;
+		u32 history = st->jmp_history_cnt;
+
+		if (env->log.level & BPF_LOG_LEVEL)
+			verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
+		for (i = last_idx;;) {
+			if (skip_first) {
+				err = 0;
+				skip_first = false;
+			} else {
+				err = backtrack_insn(env, i, &reg_mask, &stack_mask);
+			}
+			if (err == -ENOTSUPP) {
+				mark_all_scalars_precise(env, st);
+				return 0;
+			} else if (err) {
+				return err;
+			}
+			if (!reg_mask && !stack_mask)
+				/* Found assignment(s) into tracked register in this state.
+				 * Since this state is already marked, just return.
+				 * Nothing to be tracked further in the parent state.
+				 */
+				return 0;
+			if (i == first_idx)
+				break;
+			i = get_prev_insn_idx(st, i, &history);
+			if (i >= env->prog->len) {
+				/* This can happen if backtracking reached insn 0
+				 * and there are still reg_mask or stack_mask
+				 * to backtrack.
+				 * It means the backtracking missed the spot where
+				 * particular register was initialized with a constant.
+				 */
+				verbose(env, "BUG backtracking idx %d\n", i);
+				WARN_ONCE(1, "verifier backtracking bug");
+				return -EFAULT;
+			}
+		}
+		st = st->parent;
+		if (!st)
+			break;
+
+		func = st->frame[st->curframe];
+		bitmap_from_u64(mask, reg_mask);
+		for_each_set_bit(i, mask, 32) {
+			reg = &func->regs[i];
+			if (reg->type != SCALAR_VALUE)
+				continue;
+			if (!reg->precise)
+				new_marks = true;
+			reg->precise = true;
+		}
+
+		bitmap_from_u64(mask, stack_mask);
+		for_each_set_bit(i, mask, 64) {
+			if (i >= func->allocated_stack / BPF_REG_SIZE) {
+				/* This can happen if backtracking
+				 * is propagating stack precision where
+				 * caller has larger stack frame
+				 * than callee, but backtrack_insn() should
+				 * have returned -ENOTSUPP.
+				 */
+				verbose(env, "BUG spi %d stack_size %d\n",
+					i, func->allocated_stack);
+				WARN_ONCE(1, "verifier backtracking bug");
+				return -EFAULT;
+			}
+
+			if (func->stack[i].slot_type[0] != STACK_SPILL)
+				continue;
+			reg = &func->stack[i].spilled_ptr;
+			if (reg->type != SCALAR_VALUE)
+				continue;
+			if (!reg->precise)
+				new_marks = true;
+			reg->precise = true;
+		}
+		if (env->log.level & BPF_LOG_LEVEL) {
+			print_verifier_state(env, func);
+			verbose(env, "parent %s regs=%x stack=%llx marks\n",
+				new_marks ? "didn't have" : "already had",
+				reg_mask, stack_mask);
+		}
+
+		if (!new_marks)
+			break;
+
+		last_idx = st->last_insn_idx;
+		first_idx = st->first_insn_idx;
+	}
+	return 0;
+}
+
+
 static bool is_spillable_regtype(enum bpf_reg_type type)
 {
 	switch (type) {
@@ -1435,6 +1847,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
 {
 	struct bpf_func_state *cur; /* state of the current function */
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+	u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
 	struct bpf_reg_state *reg = NULL;
 
 	err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
@@ -1457,6 +1870,17 @@ static int check_stack_write(struct bpf_verifier_env *env,
 
 	if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
 	    !register_is_null(reg) && env->allow_ptr_leaks) {
+		if (dst_reg != BPF_REG_FP) {
+			/* The backtracking logic can only recognize explicit
+			 * stack slot address like [fp - 8]. Other spill of
+			 * scalar via different register has to be conservative.
+			 * Backtrack from here and mark all registers as precise
+			 * that contributed into 'reg' being a constant.
+			 */
+			err = mark_chain_precision(env, value_regno);
+			if (err)
+				return err;
+		}
 		save_register_state(state, spi, reg);
 	} else if (reg && is_spillable_regtype(reg->type)) {
 		/* register containing pointer is being spilled into stack */
@@ -1529,8 +1953,13 @@ static int check_stack_write(struct bpf_verifier_env *env,
 			state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
 
 		/* when we zero initialize stack slots mark them as such */
-		if (reg && register_is_null(reg))
+		if (reg && register_is_null(reg)) {
+			/* backtracking doesn't work for STACK_ZERO yet. */
+			err = mark_chain_precision(env, value_regno);
+			if (err)
+				return err;
 			type = STACK_ZERO;
+		}
 
 		/* Mark slots affected by this stack write. */
 		for (i = 0; i < size; i++)
@@ -1610,6 +2039,17 @@ static int check_stack_read(struct bpf_verifier_env *env,
 				 * so the whole register == const_zero
 				 */
 				__mark_reg_const_zero(&state->regs[value_regno]);
+				/* backtracking doesn't support STACK_ZERO yet,
+				 * so mark it precise here, so that later
+				 * backtracking can stop here.
+				 * Backtracking may not need this if this register
+				 * doesn't participate in pointer adjustment.
+				 * Forward propagation of precise flag is not
+				 * necessary either. This mark is only to stop
+				 * backtracking. Any register that contributed
+				 * to const 0 was marked precise before spill.
+				 */
+				state->regs[value_regno].precise = true;
 			} else {
 				/* have read misc data from the stack */
 				mark_reg_unknown(env, state->regs, value_regno);
@@ -2925,6 +3365,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		err = check_helper_mem_access(env, regno - 1,
 					      reg->umax_value,
 					      zero_size_allowed, meta);
+		if (!err)
+			err = mark_chain_precision(env, regno);
 	} else if (arg_type_is_int_ptr(arg_type)) {
 		int size = int_ptr_type_to_size(arg_type);
 
@@ -4361,6 +4803,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 	struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
 	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
 	u8 opcode = BPF_OP(insn->code);
+	int err;
 
 	dst_reg = &regs[insn->dst_reg];
 	src_reg = NULL;
@@ -4387,11 +4830,17 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 				 * This is legal, but we have to reverse our
 				 * src/dest handling in computing the range
 				 */
+				err = mark_chain_precision(env, insn->dst_reg);
+				if (err)
+					return err;
 				return adjust_ptr_min_max_vals(env, insn,
 							       src_reg, dst_reg);
 			}
 		} else if (ptr_reg) {
 			/* pointer += scalar */
+			err = mark_chain_precision(env, insn->src_reg);
+			if (err)
+				return err;
 			return adjust_ptr_min_max_vals(env, insn,
 						       dst_reg, src_reg);
 		}
@@ -5348,6 +5797,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		 tnum_is_const(src_reg->var_off))
 		pred = is_branch_taken(dst_reg, src_reg->var_off.value,
 				       opcode, is_jmp32);
+	if (pred >= 0) {
+		err = mark_chain_precision(env, insn->dst_reg);
+		if (BPF_SRC(insn->code) == BPF_X && !err)
+			err = mark_chain_precision(env, insn->src_reg);
+		if (err)
+			return err;
+	}
 	if (pred == 1) {
 		/* only follow the goto, ignore fall-through */
 		*insn_idx += insn->off;
@@ -5825,6 +6281,11 @@ peek_stack:
 				goto peek_stack;
 			else if (ret < 0)
 				goto err_free;
+			/* unconditional jmp is not a good pruning point,
+			 * but it's marked, since backtracking needs
+			 * to record jmp history in is_state_visited().
+			 */
+			init_explored_state(env, t + insns[t].off + 1);
 			/* tell verifier to check for equivalent states
 			 * after every call and jump
 			 */
@@ -6325,6 +6786,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	switch (rold->type) {
 	case SCALAR_VALUE:
 		if (rcur->type == SCALAR_VALUE) {
+			if (!rold->precise && !rcur->precise)
+				return true;
 			/* new val must satisfy old val knowledge */
 			return range_within(rold, rcur) &&
 			       tnum_in(rold->var_off, rcur->var_off);
@@ -6675,6 +7138,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	int i, j, err, states_cnt = 0;
 	bool add_new_state = false;
 
+	cur->last_insn_idx = env->prev_insn_idx;
 	if (!env->insn_aux_data[insn_idx].prune_point)
 		/* this 'insn_idx' instruction wasn't marked, so we will not
 		 * be doing state search here
@@ -6791,10 +7255,10 @@ next:
 		env->max_states_per_insn = states_cnt;
 
 	if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
-		return 0;
+		return push_jmp_history(env, cur);
 
 	if (!add_new_state)
-		return 0;
+		return push_jmp_history(env, cur);
 
 	/* There were no equivalent states, remember the current one.
 	 * Technically the current state is not proven to be safe yet,
@@ -6824,7 +7288,10 @@ next:
 	new->insn_idx = insn_idx;
 	WARN_ONCE(new->branches != 1,
 		  "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
+
 	cur->parent = new;
+	cur->first_insn_idx = insn_idx;
+	clear_jmp_history(cur);
 	new_sl->next = *explored_state(env, insn_idx);
 	*explored_state(env, insn_idx) = new_sl;
 	/* connect new state to parentage chain. Current frame needs all
@@ -6904,6 +7371,7 @@ static int do_check(struct bpf_verifier_env *env)
 	struct bpf_reg_state *regs;
 	int insn_cnt = env->prog->len;
 	bool do_print_state = false;
+	int prev_insn_idx = -1;
 
 	env->prev_linfo = NULL;
 
@@ -6929,6 +7397,7 @@ static int do_check(struct bpf_verifier_env *env)
 		u8 class;
 		int err;
 
+		env->prev_insn_idx = prev_insn_idx;
 		if (env->insn_idx >= insn_cnt) {
 			verbose(env, "invalid insn idx %d insn_cnt %d\n",
 				env->insn_idx, insn_cnt);
@@ -7001,6 +7470,7 @@ static int do_check(struct bpf_verifier_env *env)
 
 		regs = cur_regs(env);
 		env->insn_aux_data[env->insn_idx].seen = true;
+		prev_insn_idx = env->insn_idx;
 
 		if (class == BPF_ALU || class == BPF_ALU64) {
 			err = check_alu_op(env, insn);
@@ -7174,7 +7644,6 @@ static int do_check(struct bpf_verifier_env *env)
 
 				if (state->curframe) {
 					/* exit from nested function */
-					env->prev_insn_idx = env->insn_idx;
 					err = prepare_func_exit(env, &env->insn_idx);
 					if (err)
 						return err;
@@ -7206,7 +7675,7 @@ static int do_check(struct bpf_verifier_env *env)
 					return err;
 process_bpf_exit:
 				update_branch_counts(env, env->cur_state);
-				err = pop_stack(env, &env->prev_insn_idx,
+				err = pop_stack(env, &prev_insn_idx,
 						&env->insn_idx);
 				if (err < 0) {
 					if (err != -ENOENT)
-- 
cgit v1.2.3


From 82828b88f081a0084cd65f90a4a1d3652f5adb66 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 19 Jun 2019 09:41:02 +0300
Subject: flow_dissector: add support for ingress ifindex dissection

Add a new meta key that contains the ingress ifindex value, and add a
function to dissect it from the skb. The key and function are prepared to
cover dissection of other potential skb metadata values.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       |  4 ++++
 include/net/flow_dissector.h |  9 +++++++++
 net/core/flow_dissector.c    | 16 ++++++++++++++++
 3 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 28bdaf978e72..b5d427b149c9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1320,6 +1320,10 @@ skb_flow_dissect_flow_keys_basic(const struct net *net,
 				  data, proto, nhoff, hlen, flags);
 }
 
+void skb_flow_dissect_meta(const struct sk_buff *skb,
+			   struct flow_dissector *flow_dissector,
+			   void *target_container);
+
 void
 skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 			     struct flow_dissector *flow_dissector,
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index d7ce647a8ca9..02478e48fae4 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -200,6 +200,14 @@ struct flow_dissector_key_ip {
 	__u8	ttl;
 };
 
+/**
+ * struct flow_dissector_key_meta:
+ * @ingress_ifindex: ingress ifindex
+ */
+struct flow_dissector_key_meta {
+	int ingress_ifindex;
+};
+
 enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
 	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
@@ -225,6 +233,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_vlan */
 	FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */
 	FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */
+	FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c0559af9e5e5..01ad60b5aa75 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -199,6 +199,22 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 }
 EXPORT_SYMBOL(__skb_flow_get_ports);
 
+void skb_flow_dissect_meta(const struct sk_buff *skb,
+			   struct flow_dissector *flow_dissector,
+			   void *target_container)
+{
+	struct flow_dissector_key_meta *meta;
+
+	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META))
+		return;
+
+	meta = skb_flow_dissector_target(flow_dissector,
+					 FLOW_DISSECTOR_KEY_META,
+					 target_container);
+	meta->ingress_ifindex = skb->skb_iif;
+}
+EXPORT_SYMBOL(skb_flow_dissect_meta);
+
 static void
 skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
 				   struct flow_dissector *flow_dissector,
-- 
cgit v1.2.3


From 9558a83aee62be7c3ce9eddd6484a5da16aad4cf Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 19 Jun 2019 09:41:04 +0300
Subject: net: flow_offload: implement support for meta key

Implement support for previously added flow dissector meta key.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 6 ++++++
 net/core/flow_offload.c    | 7 +++++++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 36fdb85c974d..36127c1858a4 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -10,6 +10,10 @@ struct flow_match {
 	void			*key;
 };
 
+struct flow_match_meta {
+	struct flow_dissector_key_meta *key, *mask;
+};
+
 struct flow_match_basic {
 	struct flow_dissector_key_basic *key, *mask;
 };
@@ -64,6 +68,8 @@ struct flow_match_enc_opts {
 
 struct flow_rule;
 
+void flow_rule_match_meta(const struct flow_rule *rule,
+			  struct flow_match_meta *out);
 void flow_rule_match_basic(const struct flow_rule *rule,
 			   struct flow_match_basic *out);
 void flow_rule_match_control(const struct flow_rule *rule,
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 3d93e51b83e0..f52fe0bc4017 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -25,6 +25,13 @@ EXPORT_SYMBOL(flow_rule_alloc);
 	(__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key);	\
 	(__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask);	\
 
+void flow_rule_match_meta(const struct flow_rule *rule,
+			  struct flow_match_meta *out)
+{
+	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out);
+}
+EXPORT_SYMBOL(flow_rule_match_meta);
+
 void flow_rule_match_basic(const struct flow_rule *rule,
 			   struct flow_match_basic *out)
 {
-- 
cgit v1.2.3


From 23cdf8752b26d4edbd60a6293bca492d83192d4d Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 19 Jun 2019 10:12:58 -0400
Subject: act_ctinfo: Don't use BIT() in UAPI headers.

Use _BITUL() instead.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_ctinfo.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/tc_act/tc_ctinfo.h b/include/uapi/linux/tc_act/tc_ctinfo.h
index da803e05a89b..32337304fbe5 100644
--- a/include/uapi/linux/tc_act/tc_ctinfo.h
+++ b/include/uapi/linux/tc_act/tc_ctinfo.h
@@ -27,8 +27,8 @@ enum {
 #define TCA_CTINFO_MAX (__TCA_CTINFO_MAX - 1)
 
 enum {
-	CTINFO_MODE_DSCP	= BIT(0),
-	CTINFO_MODE_CPMARK	= BIT(1)
+	CTINFO_MODE_DSCP	= _BITUL(0),
+	CTINFO_MODE_CPMARK	= _BITUL(1)
 };
 
 #endif
-- 
cgit v1.2.3


From 0afdeeed08b5f59148069e9f7a52d2ab0eba5d49 Mon Sep 17 00:00:00 2001
From: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Date: Tue, 18 Jun 2019 15:05:12 +0200
Subject: net: page_pool: add helper function to retrieve dma addresses

On a previous patch dma addr was stored in 'struct page'.
Use that to retrieve DMA addresses used by network drivers

Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 694d055e01ef..b885d86cb7a1 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -132,6 +132,11 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	__page_pool_put_page(pool, page, true);
 }
 
+static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
+{
+	return page->dma_addr;
+}
+
 static inline bool is_page_pool_compiled_in(void)
 {
 #ifdef CONFIG_PAGE_POOL
-- 
cgit v1.2.3


From a25d50bfe645b3ed6b2cb3773e7025db14a608f3 Mon Sep 17 00:00:00 2001
From: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Date: Tue, 18 Jun 2019 15:05:17 +0200
Subject: net: page_pool: add helper function to unmap dma addresses

On a previous patch dma addr was stored in 'struct page'.
Use that to unmap DMA addresses used by network drivers

Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 1 +
 net/core/page_pool.c    | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index b885d86cb7a1..ad218cef88c5 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -110,6 +110,7 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
 void page_pool_destroy(struct page_pool *pool);
+void page_pool_unmap_page(struct page_pool *pool, struct page *page);
 
 /* Never call this directly, use helpers below */
 void __page_pool_put_page(struct page_pool *pool,
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5b2252c6d49b..205af7bd6d09 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -190,6 +190,13 @@ static void __page_pool_clean_page(struct page_pool *pool,
 	page->dma_addr = 0;
 }
 
+/* unmap the page and clean our state */
+void page_pool_unmap_page(struct page_pool *pool, struct page *page)
+{
+	__page_pool_clean_page(pool, page);
+}
+EXPORT_SYMBOL(page_pool_unmap_page);
+
 /* Return a page to the page allocator, cleaning up our state */
 static void __page_pool_return_page(struct page_pool *pool, struct page *page)
 {
-- 
cgit v1.2.3


From 6bf071bf09d4b2ff3ee8783531e2ce814f0870cb Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 18 Jun 2019 15:05:27 +0200
Subject: xdp: page_pool related fix to cpumap

When converting an xdp_frame into an SKB and sending it into the network
stack, the underlying XDP memory model needs to release associated
resources, because the network stack doesn't have callbacks for XDP memory
models.  The only memory model that needs this is page_pool, when a driver
uses the DMA-mapping feature.

Introduce page_pool_release_page(), which basically does the same as
page_pool_unmap_page(). Add xdp_release_frame() as the XDP memory model
interface for calling it, if the memory model match MEM_TYPE_PAGE_POOL, to
save the function call overhead for others. Have cpumap call
xdp_release_frame() before xdp_scrub_frame().

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 15 ++++++++++++++-
 include/net/xdp.h       | 15 +++++++++++++++
 kernel/bpf/cpumap.c     |  3 +++
 net/core/xdp.c          | 15 +++++++++++++++
 4 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index ad218cef88c5..e240fac4c5b9 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -110,7 +110,6 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
 void page_pool_destroy(struct page_pool *pool);
-void page_pool_unmap_page(struct page_pool *pool, struct page *page);
 
 /* Never call this directly, use helpers below */
 void __page_pool_put_page(struct page_pool *pool,
@@ -133,6 +132,20 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	__page_pool_put_page(pool, page, true);
 }
 
+/* Disconnects a page (from a page_pool).  API users can have a need
+ * to disconnect a page (from a page_pool), to allow it to be used as
+ * a regular page (that will eventually be returned to the normal
+ * page-allocator via put_page).
+ */
+void page_pool_unmap_page(struct page_pool *pool, struct page *page);
+static inline void page_pool_release_page(struct page_pool *pool,
+					  struct page *page)
+{
+#ifdef CONFIG_PAGE_POOL
+	page_pool_unmap_page(pool, page);
+#endif
+}
+
 static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 {
 	return page->dma_addr;
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 8e0deddef35c..40c6d3398458 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -129,6 +129,21 @@ void xdp_return_frame(struct xdp_frame *xdpf);
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
 
+/* When sending xdp_frame into the network stack, then there is no
+ * return point callback, which is needed to release e.g. DMA-mapping
+ * resources with page_pool.  Thus, have explicit function to release
+ * frame resources.
+ */
+void __xdp_release_frame(void *data, struct xdp_mem_info *mem);
+static inline void xdp_release_frame(struct xdp_frame *xdpf)
+{
+	struct xdp_mem_info *mem = &xdpf->mem;
+
+	/* Currently only page_pool needs this */
+	if (mem->type == MEM_TYPE_PAGE_POOL)
+		__xdp_release_frame(xdpf->data, mem);
+}
+
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		     struct net_device *dev, u32 queue_index);
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8ee5532cf6a6..8dff08768087 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -208,6 +208,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * - RX ring dev queue index	(skb_record_rx_queue)
 	 */
 
+	/* Until page_pool get SKB return path, release DMA here */
+	xdp_release_frame(xdpf);
+
 	/* Allow SKB to reuse area used by xdp_frame */
 	xdp_scrub_frame(xdpf);
 
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 1d5f2292962c..0fcc32340c4e 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -381,6 +381,21 @@ void xdp_return_buff(struct xdp_buff *xdp)
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);
 
+/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
+void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
+{
+	struct xdp_mem_allocator *xa;
+	struct page *page;
+
+	rcu_read_lock();
+	xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+	page = virt_to_head_page(data);
+	if (xa)
+		page_pool_release_page(xa->page_pool, page);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(__xdp_release_frame);
+
 int xdp_attachment_query(struct xdp_attachment_info *info,
 			 struct netdev_bpf *bpf)
 {
-- 
cgit v1.2.3


From e54cfd7e1745e52eb6c67ee9c77aefb8e4666a88 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 18 Jun 2019 15:05:37 +0200
Subject: page_pool: introduce page_pool_free and use in mlx5

In case driver fails to register the page_pool with XDP return API (via
xdp_rxq_info_reg_mem_model()), then the driver can free the page_pool
resources more directly than calling page_pool_destroy(), which does an
unnecessary RCU free procedure.

This patch is preparing for removing page_pool_destroy(), from driver
invocation.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  6 +++---
 include/net/page_pool.h                           | 11 +++++++++++
 net/core/page_pool.c                              | 15 +++++++++++----
 3 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index a8e8350b38aa..46323709ad47 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -545,8 +545,10 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	}
 	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
 					 MEM_TYPE_PAGE_POOL, rq->page_pool);
-	if (err)
+	if (err) {
+		page_pool_free(rq->page_pool);
 		goto err_free;
+	}
 
 	for (i = 0; i < wq_sz; i++) {
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
@@ -611,8 +613,6 @@ err_rq_wq_destroy:
 	if (rq->xdp_prog)
 		bpf_prog_put(rq->xdp_prog);
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
-	if (rq->page_pool)
-		page_pool_destroy(rq->page_pool);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 
 	return err;
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index e240fac4c5b9..754d980700df 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -111,6 +111,17 @@ struct page_pool *page_pool_create(const struct page_pool_params *params);
 
 void page_pool_destroy(struct page_pool *pool);
 
+void __page_pool_free(struct page_pool *pool);
+static inline void page_pool_free(struct page_pool *pool)
+{
+	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
+	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
+	 */
+#ifdef CONFIG_PAGE_POOL
+	__page_pool_free(pool);
+#endif
+}
+
 /* Never call this directly, use helpers below */
 void __page_pool_put_page(struct page_pool *pool,
 			  struct page *page, bool allow_direct);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 205af7bd6d09..41391b5dc14c 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -292,17 +292,24 @@ static void __page_pool_empty_ring(struct page_pool *pool)
 	}
 }
 
+void __page_pool_free(struct page_pool *pool)
+{
+	WARN(pool->alloc.count, "API usage violation");
+	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
+
+	ptr_ring_cleanup(&pool->ring, NULL);
+	kfree(pool);
+}
+EXPORT_SYMBOL(__page_pool_free);
+
 static void __page_pool_destroy_rcu(struct rcu_head *rcu)
 {
 	struct page_pool *pool;
 
 	pool = container_of(rcu, struct page_pool, rcu);
 
-	WARN(pool->alloc.count, "API usage violation");
-
 	__page_pool_empty_ring(pool);
-	ptr_ring_cleanup(&pool->ring, NULL);
-	kfree(pool);
+	__page_pool_free(pool);
 }
 
 /* Cleanup and release resources */
-- 
cgit v1.2.3


From 99c07c43c4ea0bc101331401a0fabfc51933c6a3 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 18 Jun 2019 15:05:47 +0200
Subject: xdp: tracking page_pool resources and safe removal

This patch is needed before we can allow drivers to use page_pool for
DMA-mappings. Today with page_pool and XDP return API, it is possible to
remove the page_pool object (from rhashtable), while there are still
in-flight packet-pages. This is safely handled via RCU and failed lookups in
__xdp_return() fallback to call put_page(), when page_pool object is gone.
In-case page is still DMA mapped, this will result in page not getting
correctly DMA unmapped.

To solve this, the page_pool is extended with tracking in-flight pages. And
XDP disconnect system queries page_pool and waits, via workqueue, for all
in-flight pages to be returned.

To avoid killing performance when tracking in-flight pages, the implementation
uses two (unsigned) counters, that are placed on different cache-lines, and
can be used to deduce in-flight packets. This is done by mapping the
unsigned "sequence" counters onto signed Two's complement arithmetic
operations. This is e.g. used by kernel's time_after macros, described in
kernel commit 1ba3aab3033b and 5a581b367b5, and also explained in RFC1982.

The trick is these two incrementing counters only need to be read and
compared, when checking if it's safe to free the page_pool structure. Which
will only happen when the driver has disconnected the RX/alloc side. Thus, on a
non-fast-path.

It is chosen that page_pool tracking is also enabled for the non-DMA
use-case, as this can be used for statistics later.

After this patch, using page_pool requires more strict resource "release",
e.g. via page_pool_release_page() that was introduced in this patchset, and
previous patches implement/fix this more strict requirement.

Drivers no-longer call page_pool_destroy(). Drivers already call
xdp_rxq_info_unreg() which calls xdp_rxq_info_unreg_mem_model(), which will
attempt to disconnect the mem id, and if attempt fails schedule the
disconnect for later via delayed workqueue.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  3 --
 include/net/page_pool.h                           | 41 ++++++++++----
 net/core/page_pool.c                              | 62 +++++++++++++++------
 net/core/xdp.c                                    | 65 ++++++++++++++++++++---
 4 files changed, 136 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 46b6a47bd1e3..5e40db8f92e6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -643,9 +643,6 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 	}
 
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
-	if (rq->page_pool)
-		page_pool_destroy(rq->page_pool);
-
 	mlx5_wq_destroy(&rq->wq_ctrl);
 }
 
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 754d980700df..f09b3f1994e6 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -16,14 +16,16 @@
  * page_pool_alloc_pages() call.  Drivers should likely use
  * page_pool_dev_alloc_pages() replacing dev_alloc_pages().
  *
- * If page_pool handles DMA mapping (use page->private), then API user
- * is responsible for invoking page_pool_put_page() once.  In-case of
- * elevated refcnt, the DMA state is released, assuming other users of
- * the page will eventually call put_page().
+ * API keeps track of in-flight pages, in-order to let API user know
+ * when it is safe to dealloactor page_pool object.  Thus, API users
+ * must make sure to call page_pool_release_page() when a page is
+ * "leaving" the page_pool.  Or call page_pool_put_page() where
+ * appropiate.  For maintaining correct accounting.
  *
- * If no DMA mapping is done, then it can act as shim-layer that
- * fall-through to alloc_page.  As no state is kept on the page, the
- * regular put_page() call is sufficient.
+ * API user must only call page_pool_put_page() once on a page, as it
+ * will either recycle the page, or in case of elevated refcnt, it
+ * will release the DMA mapping and in-flight state accounting.  We
+ * hope to lift this requirement in the future.
  */
 #ifndef _NET_PAGE_POOL_H
 #define _NET_PAGE_POOL_H
@@ -66,9 +68,10 @@ struct page_pool_params {
 };
 
 struct page_pool {
-	struct rcu_head rcu;
 	struct page_pool_params p;
 
+        u32 pages_state_hold_cnt;
+
 	/*
 	 * Data structure for allocation side
 	 *
@@ -96,6 +99,8 @@ struct page_pool {
 	 * TODO: Implement bulk return pages into this structure.
 	 */
 	struct ptr_ring ring;
+
+	atomic_t pages_state_release_cnt;
 };
 
 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
@@ -109,8 +114,6 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
-void page_pool_destroy(struct page_pool *pool);
-
 void __page_pool_free(struct page_pool *pool);
 static inline void page_pool_free(struct page_pool *pool)
 {
@@ -143,6 +146,24 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	__page_pool_put_page(pool, page, true);
 }
 
+/* API user MUST have disconnected alloc-side (not allowed to call
+ * page_pool_alloc_pages()) before calling this.  The free-side can
+ * still run concurrently, to handle in-flight packet-pages.
+ *
+ * A request to shutdown can fail (with false) if there are still
+ * in-flight packet-pages.
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool);
+static inline bool page_pool_request_shutdown(struct page_pool *pool)
+{
+	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
+	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
+	 */
+#ifdef CONFIG_PAGE_POOL
+	return __page_pool_request_shutdown(pool);
+#endif
+}
+
 /* Disconnects a page (from a page_pool).  API users can have a need
  * to disconnect a page (from a page_pool), to allow it to be used as
  * a regular page (that will eventually be returned to the normal
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 41391b5dc14c..8679e24fd665 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -43,6 +43,8 @@ static int page_pool_init(struct page_pool *pool,
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 
+	atomic_set(&pool->pages_state_release_cnt, 0);
+
 	return 0;
 }
 
@@ -151,6 +153,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	page->dma_addr = dma;
 
 skip_dma_map:
+	/* Track how many pages are held 'in-flight' */
+	pool->pages_state_hold_cnt++;
+
 	/* When page just alloc'ed is should/must have refcnt 1. */
 	return page;
 }
@@ -173,6 +178,33 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
 
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b)	(s32)((a) - (b))
+
+static s32 page_pool_inflight(struct page_pool *pool)
+{
+	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+	s32 distance;
+
+	distance = _distance(hold_cnt, release_cnt);
+
+	/* TODO: Add tracepoint here */
+	return distance;
+}
+
+static bool __page_pool_safe_to_destroy(struct page_pool *pool)
+{
+	s32 inflight = page_pool_inflight(pool);
+
+	/* The distance should not be able to become negative */
+	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+
+	return (inflight == 0);
+}
+
 /* Cleanup page_pool state from page */
 static void __page_pool_clean_page(struct page_pool *pool,
 				   struct page *page)
@@ -180,7 +212,7 @@ static void __page_pool_clean_page(struct page_pool *pool,
 	dma_addr_t dma;
 
 	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
-		return;
+		goto skip_dma_unmap;
 
 	dma = page->dma_addr;
 	/* DMA unmap */
@@ -188,11 +220,16 @@ static void __page_pool_clean_page(struct page_pool *pool,
 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 			     DMA_ATTR_SKIP_CPU_SYNC);
 	page->dma_addr = 0;
+skip_dma_unmap:
+	atomic_inc(&pool->pages_state_release_cnt);
 }
 
 /* unmap the page and clean our state */
 void page_pool_unmap_page(struct page_pool *pool, struct page *page)
 {
+	/* When page is unmapped, this implies page will not be
+	 * returned to page_pool.
+	 */
 	__page_pool_clean_page(pool, page);
 }
 EXPORT_SYMBOL(page_pool_unmap_page);
@@ -201,6 +238,7 @@ EXPORT_SYMBOL(page_pool_unmap_page);
 static void __page_pool_return_page(struct page_pool *pool, struct page *page)
 {
 	__page_pool_clean_page(pool, page);
+
 	put_page(page);
 	/* An optimization would be to call __free_pages(page, pool->p.order)
 	 * knowing page is not part of page-cache (thus avoiding a
@@ -296,24 +334,17 @@ void __page_pool_free(struct page_pool *pool)
 {
 	WARN(pool->alloc.count, "API usage violation");
 	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
+	WARN(!__page_pool_safe_to_destroy(pool), "still in-flight pages");
 
 	ptr_ring_cleanup(&pool->ring, NULL);
 	kfree(pool);
 }
 EXPORT_SYMBOL(__page_pool_free);
 
-static void __page_pool_destroy_rcu(struct rcu_head *rcu)
-{
-	struct page_pool *pool;
-
-	pool = container_of(rcu, struct page_pool, rcu);
-
-	__page_pool_empty_ring(pool);
-	__page_pool_free(pool);
-}
-
-/* Cleanup and release resources */
-void page_pool_destroy(struct page_pool *pool)
+/* Request to shutdown: release pages cached by page_pool, and check
+ * for in-flight pages
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool)
 {
 	struct page *page;
 
@@ -331,7 +362,6 @@ void page_pool_destroy(struct page_pool *pool)
 	 */
 	__page_pool_empty_ring(pool);
 
-	/* An xdp_mem_allocator can still ref page_pool pointer */
-	call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+	return __page_pool_safe_to_destroy(pool);
 }
-EXPORT_SYMBOL(page_pool_destroy);
+EXPORT_SYMBOL(__page_pool_request_shutdown);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 0fcc32340c4e..aae665ccee3f 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -38,6 +38,7 @@ struct xdp_mem_allocator {
 	};
 	struct rhash_head node;
 	struct rcu_head rcu;
+	struct delayed_work defer_wq;
 };
 
 static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
@@ -79,13 +80,13 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 
 	xa = container_of(rcu, struct xdp_mem_allocator, rcu);
 
+	/* Allocator have indicated safe to remove before this is called */
+	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+		page_pool_free(xa->page_pool);
+
 	/* Allow this ID to be reused */
 	ida_simple_remove(&mem_id_pool, xa->mem.id);
 
-	/* Notice, driver is expected to free the *allocator,
-	 * e.g. page_pool, and MUST also use RCU free.
-	 */
-
 	/* Poison memory */
 	xa->mem.id = 0xFFFF;
 	xa->mem.type = 0xF0F0;
@@ -94,6 +95,46 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 	kfree(xa);
 }
 
+bool __mem_id_disconnect(int id)
+{
+	struct xdp_mem_allocator *xa;
+	bool safe_to_remove = true;
+
+	mutex_lock(&mem_id_lock);
+
+	xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+	if (!xa) {
+		mutex_unlock(&mem_id_lock);
+		WARN(1, "Request remove non-existing id(%d), driver bug?", id);
+		return true;
+	}
+
+	/* Detects in-flight packet-pages for page_pool */
+	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+		safe_to_remove = page_pool_request_shutdown(xa->page_pool);
+
+	if (safe_to_remove &&
+	    !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+	mutex_unlock(&mem_id_lock);
+	return safe_to_remove;
+}
+
+#define DEFER_TIME (msecs_to_jiffies(1000))
+
+static void mem_id_disconnect_defer_retry(struct work_struct *wq)
+{
+	struct delayed_work *dwq = to_delayed_work(wq);
+	struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
+
+	if (__mem_id_disconnect(xa->mem.id))
+		return;
+
+	/* Still not ready to be disconnected, retry later */
+	schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+}
+
 void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 {
 	struct xdp_mem_allocator *xa;
@@ -112,16 +153,28 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 	if (id == 0)
 		return;
 
+	if (__mem_id_disconnect(id))
+		return;
+
+	/* Could not disconnect, defer new disconnect attempt to later */
 	mutex_lock(&mem_id_lock);
 
 	xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
-	if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
-		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+	if (!xa) {
+		mutex_unlock(&mem_id_lock);
+		return;
+	}
 
+	INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry);
 	mutex_unlock(&mem_id_lock);
+	schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
 
+/* This unregister operation will also cleanup and destroy the
+ * allocator. The page_pool_free() operation is first called when it's
+ * safe to remove, possibly deferred to a workqueue.
+ */
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 {
 	/* Simplify driver cleanup code paths, allow unreg "unused" */
-- 
cgit v1.2.3


From f033b688c1ede5ec78c9a718fa9f0b374049bc31 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 18 Jun 2019 15:05:58 +0200
Subject: xdp: add tracepoints for XDP mem

These tracepoints make it easier to troubleshoot XDP mem id disconnect.

The xdp:mem_disconnect tracepoint cannot be replaced via kprobe. It is
placed at the last stable place for the pointer to struct xdp_mem_allocator,
just before it's scheduled for RCU removal. It also extracts info on
'safe_to_remove' and 'force'.

Detailed info about in-flight pages is not available at this layer. The next
patch will add the tracepoints needed at the page_pool layer for this.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xdp_priv.h     |  23 +++++++++
 include/trace/events/xdp.h | 115 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/xdp.c             |  21 ++-------
 3 files changed, 143 insertions(+), 16 deletions(-)
 create mode 100644 include/net/xdp_priv.h

(limited to 'include')

diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h
new file mode 100644
index 000000000000..6a8cba6ea79a
--- /dev/null
+++ b/include/net/xdp_priv.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_NET_XDP_PRIV_H__
+#define __LINUX_NET_XDP_PRIV_H__
+
+#include <linux/rhashtable.h>
+
+/* Private to net/core/xdp.c, but used by trace/events/xdp.h */
+struct xdp_mem_allocator {
+	struct xdp_mem_info mem;
+	union {
+		void *allocator;
+		struct page_pool *page_pool;
+		struct zero_copy_allocator *zc_alloc;
+	};
+	int disconnect_cnt;
+	unsigned long defer_start;
+	struct rhash_head node;
+	struct rcu_head rcu;
+	struct delayed_work defer_wq;
+	unsigned long defer_warn;
+};
+
+#endif /* __LINUX_NET_XDP_PRIV_H__ */
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index e95cb86b65cf..bb5e380e2ef3 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -269,6 +269,121 @@ TRACE_EVENT(xdp_devmap_xmit,
 		  __entry->from_ifindex, __entry->to_ifindex, __entry->err)
 );
 
+/* Expect users already include <net/xdp.h>, but not xdp_priv.h */
+#include <net/xdp_priv.h>
+
+#define __MEM_TYPE_MAP(FN)	\
+	FN(PAGE_SHARED)		\
+	FN(PAGE_ORDER0)		\
+	FN(PAGE_POOL)		\
+	FN(ZERO_COPY)
+
+#define __MEM_TYPE_TP_FN(x)	\
+	TRACE_DEFINE_ENUM(MEM_TYPE_##x);
+#define __MEM_TYPE_SYM_FN(x)	\
+	{ MEM_TYPE_##x, #x },
+#define __MEM_TYPE_SYM_TAB	\
+	__MEM_TYPE_MAP(__MEM_TYPE_SYM_FN) { -1, 0 }
+__MEM_TYPE_MAP(__MEM_TYPE_TP_FN)
+
+TRACE_EVENT(mem_disconnect,
+
+	TP_PROTO(const struct xdp_mem_allocator *xa,
+		 bool safe_to_remove, bool force),
+
+	TP_ARGS(xa, safe_to_remove, force),
+
+	TP_STRUCT__entry(
+		__field(const struct xdp_mem_allocator *,	xa)
+		__field(u32,		mem_id)
+		__field(u32,		mem_type)
+		__field(const void *,	allocator)
+		__field(bool,		safe_to_remove)
+		__field(bool,		force)
+		__field(int,		disconnect_cnt)
+	),
+
+	TP_fast_assign(
+		__entry->xa		= xa;
+		__entry->mem_id		= xa->mem.id;
+		__entry->mem_type	= xa->mem.type;
+		__entry->allocator	= xa->allocator;
+		__entry->safe_to_remove	= safe_to_remove;
+		__entry->force		= force;
+		__entry->disconnect_cnt	= xa->disconnect_cnt;
+	),
+
+	TP_printk("mem_id=%d mem_type=%s allocator=%p"
+		  " safe_to_remove=%s force=%s disconnect_cnt=%d",
+		  __entry->mem_id,
+		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
+		  __entry->allocator,
+		  __entry->safe_to_remove ? "true" : "false",
+		  __entry->force ? "true" : "false",
+		  __entry->disconnect_cnt
+	)
+);
+
+TRACE_EVENT(mem_connect,
+
+	TP_PROTO(const struct xdp_mem_allocator *xa,
+		 const struct xdp_rxq_info *rxq),
+
+	TP_ARGS(xa, rxq),
+
+	TP_STRUCT__entry(
+		__field(const struct xdp_mem_allocator *,	xa)
+		__field(u32,		mem_id)
+		__field(u32,		mem_type)
+		__field(const void *,	allocator)
+		__field(const struct xdp_rxq_info *,		rxq)
+		__field(int,		ifindex)
+	),
+
+	TP_fast_assign(
+		__entry->xa		= xa;
+		__entry->mem_id		= xa->mem.id;
+		__entry->mem_type	= xa->mem.type;
+		__entry->allocator	= xa->allocator;
+		__entry->rxq		= rxq;
+		__entry->ifindex	= rxq->dev->ifindex;
+	),
+
+	TP_printk("mem_id=%d mem_type=%s allocator=%p"
+		  " ifindex=%d",
+		  __entry->mem_id,
+		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
+		  __entry->allocator,
+		  __entry->ifindex
+	)
+);
+
+TRACE_EVENT(mem_return_failed,
+
+	TP_PROTO(const struct xdp_mem_info *mem,
+		 const struct page *page),
+
+	TP_ARGS(mem, page),
+
+	TP_STRUCT__entry(
+		__field(const struct page *,	page)
+		__field(u32,		mem_id)
+		__field(u32,		mem_type)
+	),
+
+	TP_fast_assign(
+		__entry->page		= page;
+		__entry->mem_id		= mem->id;
+		__entry->mem_type	= mem->type;
+	),
+
+	TP_printk("mem_id=%d mem_type=%s page=%p",
+		  __entry->mem_id,
+		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
+		  __entry->page
+	)
+);
+
 #endif /* _TRACE_XDP_H */
 
 #include <trace/define_trace.h>
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 622c81dc7ba8..b29d7b513a18 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -14,6 +14,8 @@
 #include <net/page_pool.h>
 
 #include <net/xdp.h>
+#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
+#include <trace/events/xdp.h>
 
 #define REG_STATE_NEW		0x0
 #define REG_STATE_REGISTERED	0x1
@@ -29,21 +31,6 @@ static int mem_id_next = MEM_ID_MIN;
 static bool mem_id_init; /* false */
 static struct rhashtable *mem_id_ht;
 
-struct xdp_mem_allocator {
-	struct xdp_mem_info mem;
-	union {
-		void *allocator;
-		struct page_pool *page_pool;
-		struct zero_copy_allocator *zc_alloc;
-	};
-	struct rhash_head node;
-	struct rcu_head rcu;
-	struct delayed_work defer_wq;
-	unsigned long defer_start;
-	unsigned long defer_warn;
-	int disconnect_cnt;
-};
-
 static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
 {
 	const u32 *k = data;
@@ -117,7 +104,7 @@ bool __mem_id_disconnect(int id, bool force)
 	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
 		safe_to_remove = page_pool_request_shutdown(xa->page_pool);
 
-	/* TODO: Tracepoint will be added here in next-patch */
+	trace_mem_disconnect(xa, safe_to_remove, force);
 
 	if ((safe_to_remove || force) &&
 	    !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
@@ -385,6 +372,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 
 	mutex_unlock(&mem_id_lock);
 
+	trace_mem_connect(xdp_alloc, xdp_rxq);
 	return 0;
 err:
 	mutex_unlock(&mem_id_lock);
@@ -417,6 +405,7 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		} else {
 			/* Hopefully stack show who to blame for late return */
 			WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id);
+			trace_mem_return_failed(mem, page);
 			put_page(page);
 		}
 		rcu_read_unlock();
-- 
cgit v1.2.3


From 32c28f7e413981c7dd4a3ad9bbb1151e4b654261 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 18 Jun 2019 15:06:03 +0200
Subject: page_pool: add tracepoints for page_pool with details need by XDP

The xdp tracepoints for mem id disconnect don't carry information about why
it was not safe_to_remove.  The tracepoint page_pool:page_pool_inflight in
this patch can be used to extract this info for further debugging.

This patch also adds tracepoints for the pages_state_* release/hold
transitions, including a pointer to the page.  This can be used for stats
about in-flight pages, or used to debug page leakage via keeping track of
page pointer and combining this with kprobe for __put_page().

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/page_pool.h | 87 ++++++++++++++++++++++++++++++++++++++++
 net/core/net-traces.c            |  4 ++
 net/core/page_pool.c             |  9 ++++-
 3 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 include/trace/events/page_pool.h

(limited to 'include')

diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h
new file mode 100644
index 000000000000..47b5ee880aa9
--- /dev/null
+++ b/include/trace/events/page_pool.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM page_pool
+
+#if !defined(_TRACE_PAGE_POOL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define      _TRACE_PAGE_POOL_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#include <net/page_pool.h>
+
+TRACE_EVENT(page_pool_inflight,
+
+	TP_PROTO(const struct page_pool *pool,
+		 s32 inflight, u32 hold, u32 release),
+
+	TP_ARGS(pool, inflight, hold, release),
+
+	TP_STRUCT__entry(
+		__field(const struct page_pool *, pool)
+		__field(s32,	inflight)
+		__field(u32,	hold)
+		__field(u32,	release)
+	),
+
+	TP_fast_assign(
+		__entry->pool		= pool;
+		__entry->inflight	= inflight;
+		__entry->hold		= hold;
+		__entry->release	= release;
+	),
+
+	TP_printk("page_pool=%p inflight=%d hold=%u release=%u",
+	  __entry->pool, __entry->inflight, __entry->hold, __entry->release)
+);
+
+TRACE_EVENT(page_pool_state_release,
+
+	TP_PROTO(const struct page_pool *pool,
+		 const struct page *page, u32 release),
+
+	TP_ARGS(pool, page, release),
+
+	TP_STRUCT__entry(
+		__field(const struct page_pool *,	pool)
+		__field(const struct page *,		page)
+		__field(u32,				release)
+	),
+
+	TP_fast_assign(
+		__entry->pool		= pool;
+		__entry->page		= page;
+		__entry->release	= release;
+	),
+
+	TP_printk("page_pool=%p page=%p release=%u",
+		  __entry->pool, __entry->page, __entry->release)
+);
+
+TRACE_EVENT(page_pool_state_hold,
+
+	TP_PROTO(const struct page_pool *pool,
+		 const struct page *page, u32 hold),
+
+	TP_ARGS(pool, page, hold),
+
+	TP_STRUCT__entry(
+		__field(const struct page_pool *,	pool)
+		__field(const struct page *,		page)
+		__field(u32,				hold)
+	),
+
+	TP_fast_assign(
+		__entry->pool	= pool;
+		__entry->page	= page;
+		__entry->hold	= hold;
+	),
+
+	TP_printk("page_pool=%p page=%p hold=%u",
+		  __entry->pool, __entry->page, __entry->hold)
+);
+
+#endif /* _TRACE_PAGE_POOL_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 470b179d599e..283ddb2dbc7d 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -43,6 +43,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
 EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
 #endif
 
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+#include <trace/events/page_pool.h>
+#endif
+
 #include <trace/events/neigh.h>
 EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update);
 EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 42c3b0a5a259..f55ab055d543 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -4,6 +4,7 @@
  *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
  *	Copyright (C) 2016 Red Hat, Inc.
  */
+
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -14,6 +15,8 @@
 #include <linux/page-flags.h>
 #include <linux/mm.h> /* for __put_page() */
 
+#include <trace/events/page_pool.h>
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -156,6 +159,8 @@ skip_dma_map:
 	/* Track how many pages are held 'in-flight' */
 	pool->pages_state_hold_cnt++;
 
+	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+
 	/* When page just alloc'ed is should/must have refcnt 1. */
 	return page;
 }
@@ -191,7 +196,7 @@ static s32 page_pool_inflight(struct page_pool *pool)
 
 	distance = _distance(hold_cnt, release_cnt);
 
-	/* TODO: Add tracepoint here */
+	trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
 	return distance;
 }
 
@@ -222,6 +227,8 @@ static void __page_pool_clean_page(struct page_pool *pool,
 	page->dma_addr = 0;
 skip_dma_unmap:
 	atomic_inc(&pool->pages_state_release_cnt);
+	trace_page_pool_state_release(pool, page,
+			      atomic_read(&pool->pages_state_release_cnt));
 }
 
 /* unmap the page and clean our state */
-- 
cgit v1.2.3


From d7d99872c144a2c2f5d9c9d83627fa833836cba5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 18 Jun 2019 11:08:59 -0700
Subject: netns: add pre_exit method to struct pernet_operations

Current struct pernet_operations exit() handlers are highly
discouraged to call synchronize_rcu().

There are cases where we need them, and exit_batch() does
not help the common case where a single netns is dismantled.

This patch leverages the existing synchronize_rcu() call
in cleanup_net()

Calling optional ->pre_exit() method before ->exit() or
->exit_batch() allows to benefit from a single synchronize_rcu()
call.

Note that the synchronize_rcu() calls added in this patch
are only in error paths or slow paths.

Tested:

$ time for i in {1..1000}; do unshare -n /bin/false;done

real	0m2.612s
user	0m0.171s
sys	0m2.216s

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h |  5 +++++
 net/core/net_namespace.c    | 28 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

(limited to 'include')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index abb4f92456e1..ad9243afac67 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -355,8 +355,13 @@ struct pernet_operations {
 	 * synchronize_rcu() related to these pernet_operations,
 	 * instead of separate synchronize_rcu() for every net.
 	 * Please, avoid synchronize_rcu() at all, where it's possible.
+	 *
+	 * Note that a combination of pre_exit() and exit() can
+	 * be used, since a synchronize_rcu() is guaranteed between
+	 * the calls.
 	 */
 	int (*init)(struct net *net);
+	void (*pre_exit)(struct net *net);
 	void (*exit)(struct net *net);
 	void (*exit_batch)(struct list_head *net_exit_list);
 	unsigned int *id;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 15f68842ac6b..89dc99a28978 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -145,6 +145,17 @@ static void ops_free(const struct pernet_operations *ops, struct net *net)
 	}
 }
 
+static void ops_pre_exit_list(const struct pernet_operations *ops,
+			      struct list_head *net_exit_list)
+{
+	struct net *net;
+
+	if (ops->pre_exit) {
+		list_for_each_entry(net, net_exit_list, exit_list)
+			ops->pre_exit(net);
+	}
+}
+
 static void ops_exit_list(const struct pernet_operations *ops,
 			  struct list_head *net_exit_list)
 {
@@ -328,6 +339,12 @@ out_undo:
 	 * for the pernet modules whose init functions did not fail.
 	 */
 	list_add(&net->exit_list, &net_exit_list);
 	saved_ops = ops;
+	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+		ops_pre_exit_list(ops, &net_exit_list);
+
+	synchronize_rcu();
+
+	ops = saved_ops;
 	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
 		ops_exit_list(ops, &net_exit_list);
@@ -541,10 +558,15 @@ static void cleanup_net(struct work_struct *work)
 		list_add_tail(&net->exit_list, &net_exit_list);
 	}
 
+	/* Run all of the network namespace pre_exit methods */
+	list_for_each_entry_reverse(ops, &pernet_list, list)
+		ops_pre_exit_list(ops, &net_exit_list);
+
 	/*
 	 * Another CPU might be rcu-iterating the list, wait for it.
 	 * This needs to be before calling the exit() notifiers, so
 	 * the rcu_barrier() below isn't sufficient alone.
+	 * Also the pre_exit() and exit() methods need this barrier.
 	 */
 	synchronize_rcu();
 
@@ -1101,6 +1123,8 @@ static int __register_pernet_operations(struct list_head *list,
 out_undo:
 	/* If I have an error cleanup all namespaces I initialized */
 	list_del(&ops->list);
+	ops_pre_exit_list(ops, &net_exit_list);
+	synchronize_rcu();
 	ops_exit_list(ops, &net_exit_list);
 	ops_free_list(ops, &net_exit_list);
 	return error;
@@ -1115,6 +1139,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
 	/* See comment in __register_pernet_operations() */
 	for_each_net(net)
 		list_add_tail(&net->exit_list, &net_exit_list);
+	ops_pre_exit_list(ops, &net_exit_list);
+	synchronize_rcu();
 	ops_exit_list(ops, &net_exit_list);
 	ops_free_list(ops, &net_exit_list);
 }
@@ -1139,6 +1165,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
 	} else {
 		LIST_HEAD(net_exit_list);
 		list_add(&init_net.exit_list, &net_exit_list);
+		ops_pre_exit_list(ops, &net_exit_list);
+		synchronize_rcu();
 		ops_exit_list(ops, &net_exit_list);
 		ops_free_list(ops, &net_exit_list);
 	}
-- 
cgit v1.2.3


From d5dd88794a13c2f24cce31abad7a0a6c5e0ed2db Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 18 Jun 2019 11:09:00 -0700
Subject: inet: fix various use-after-free in defrags units

syzbot reported another issue caused by my recent patches. [1]

The issue here is that fqdir_exit() is initiating a work queue
and immediately returns. A bit later cleanup_net() was able
to free the MIB (percpu data) and the whole struct net was freed,
but we had active frag timers that fired and triggered use-after-free.

We need to make sure that timers can catch fqdir->dead being set,
to bail out.

Since RCU is used for the reader side, this means
we want to respect an RCU grace period between these operations :

1) fqdir->dead = 1;

2) netns dismantle (freeing of various data structure)

This patch uses the new (struct pernet_operations)->pre_exit
infrastructure to ensure a full RCU grace period
happens between fqdir_pre_exit() and fqdir_exit().

This also means we can use a regular work queue, we no
longer need rcu_work.

Tested:

$ time for i in {1..1000}; do unshare -n /bin/false;done

real	0m2.585s
user	0m0.160s
sys	0m2.214s

[1]

BUG: KASAN: use-after-free in ip_expire+0x73e/0x800 net/ipv4/ip_fragment.c:152
Read of size 8 at addr ffff88808b9fe330 by task syz-executor.4/11860

CPU: 1 PID: 11860 Comm: syz-executor.4 Not tainted 5.2.0-rc2+ #22
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188
 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317
 kasan_report+0x12/0x20 mm/kasan/common.c:614
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
 ip_expire+0x73e/0x800 net/ipv4/ip_fragment.c:152
 call_timer_fn+0x193/0x720 kernel/time/timer.c:1322
 expire_timers kernel/time/timer.c:1366 [inline]
 __run_timers kernel/time/timer.c:1685 [inline]
 __run_timers kernel/time/timer.c:1653 [inline]
 run_timer_softirq+0x66f/0x1740 kernel/time/timer.c:1698
 __do_softirq+0x25c/0x94c kernel/softirq.c:293
 invoke_softirq kernel/softirq.c:374 [inline]
 irq_exit+0x180/0x1d0 kernel/softirq.c:414
 exiting_irq arch/x86/include/asm/apic.h:536 [inline]
 smp_apic_timer_interrupt+0x13b/0x550 arch/x86/kernel/apic/apic.c:1068
 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:806
 </IRQ>
RIP: 0010:tomoyo_domain_quota_is_ok+0x131/0x540 security/tomoyo/util.c:1035
Code: 24 4c 3b 65 d0 0f 84 9c 00 00 00 e8 19 1d 73 fe 49 8d 7c 24 18 48 ba 00 00 00 00 00 fc ff df 48 89 f8 48 c1 e8 03 0f b6 04 10 <48> 89 fa 83 e2 07 38 d0 7f 08 84 c0 0f 85 69 03 00 00 41 0f b6 5c
RSP: 0018:ffff88806ae079c0 EFLAGS: 00000a02 ORIG_RAX: ffffffffffffff13
RAX: 0000000000000000 RBX: 0000000000000010 RCX: ffffc9000e655000
RDX: dffffc0000000000 RSI: ffffffff82fd88a7 RDI: ffff888086202398
RBP: ffff88806ae07a00 R08: ffff88808b6c8700 R09: ffffed100d5c0f4d
R10: ffffed100d5c0f4c R11: 0000000000000000 R12: ffff888086202380
R13: 0000000000000030 R14: 00000000000000d3 R15: 0000000000000000
 tomoyo_supervisor+0x2e8/0xef0 security/tomoyo/common.c:2087
 tomoyo_audit_path_number_log security/tomoyo/file.c:235 [inline]
 tomoyo_path_number_perm+0x42f/0x520 security/tomoyo/file.c:734
 tomoyo_file_ioctl+0x23/0x30 security/tomoyo/tomoyo.c:335
 security_file_ioctl+0x77/0xc0 security/security.c:1370
 ksys_ioctl+0x57/0xd0 fs/ioctl.c:711
 __do_sys_ioctl fs/ioctl.c:720 [inline]
 __se_sys_ioctl fs/ioctl.c:718 [inline]
 __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4592c9
Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f8db5e44c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004592c9
RDX: 0000000020000080 RSI: 00000000000089f1 RDI: 0000000000000006
RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f8db5e456d4
R13: 00000000004cc770 R14: 00000000004d5cd8 R15: 00000000ffffffff

Allocated by task 9047:
 save_stack+0x23/0x90 mm/kasan/common.c:71
 set_track mm/kasan/common.c:79 [inline]
 __kasan_kmalloc mm/kasan/common.c:489 [inline]
 __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:462
 kasan_slab_alloc+0xf/0x20 mm/kasan/common.c:497
 slab_post_alloc_hook mm/slab.h:437 [inline]
 slab_alloc mm/slab.c:3326 [inline]
 kmem_cache_alloc+0x11a/0x6f0 mm/slab.c:3488
 kmem_cache_zalloc include/linux/slab.h:732 [inline]
 net_alloc net/core/net_namespace.c:386 [inline]
 copy_net_ns+0xed/0x340 net/core/net_namespace.c:426
 create_new_namespaces+0x400/0x7b0 kernel/nsproxy.c:107
 unshare_nsproxy_namespaces+0xc2/0x200 kernel/nsproxy.c:206
 ksys_unshare+0x440/0x980 kernel/fork.c:2692
 __do_sys_unshare kernel/fork.c:2760 [inline]
 __se_sys_unshare kernel/fork.c:2758 [inline]
 __x64_sys_unshare+0x31/0x40 kernel/fork.c:2758
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 2541:
 save_stack+0x23/0x90 mm/kasan/common.c:71
 set_track mm/kasan/common.c:79 [inline]
 __kasan_slab_free+0x102/0x150 mm/kasan/common.c:451
 kasan_slab_free+0xe/0x10 mm/kasan/common.c:459
 __cache_free mm/slab.c:3432 [inline]
 kmem_cache_free+0x86/0x260 mm/slab.c:3698
 net_free net/core/net_namespace.c:402 [inline]
 net_drop_ns.part.0+0x70/0x90 net/core/net_namespace.c:409
 net_drop_ns net/core/net_namespace.c:408 [inline]
 cleanup_net+0x538/0x960 net/core/net_namespace.c:571
 process_one_work+0x989/0x1790 kernel/workqueue.c:2269
 worker_thread+0x98/0xe40 kernel/workqueue.c:2415
 kthread+0x354/0x420 kernel/kthread.c:255
 ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352

The buggy address belongs to the object at ffff88808b9fe100
 which belongs to the cache net_namespace of size 6784
The buggy address is located 560 bytes inside of
 6784-byte region [ffff88808b9fe100, ffff88808b9ffb80)
The buggy address belongs to the page:
page:ffffea00022e7f80 refcount:1 mapcount:0 mapping:ffff88821b6f60c0 index:0x0 compound_mapcount: 0
flags: 0x1fffc0000010200(slab|head)
raw: 01fffc0000010200 ffffea000256f288 ffffea0001bbef08 ffff88821b6f60c0
raw: 0000000000000000 ffff88808b9fe100 0000000100000001 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff88808b9fe200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff88808b9fe280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff88808b9fe300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                     ^
 ffff88808b9fe380: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff88808b9fe400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

Fixes: 3c8fc8782044 ("inet: frags: rework rhashtable dismantle")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 |  8 +++++++-
 include/net/ipv6_frag.h                 |  2 ++
 net/ieee802154/6lowpan/reassembly.c     | 13 +++++++++++--
 net/ipv4/inet_fragment.c                | 19 ++++---------------
 net/ipv4/ip_fragment.c                  | 14 ++++++++++++--
 net/ipv6/netfilter/nf_conntrack_reasm.c | 10 ++++++++--
 net/ipv6/reassembly.c                   | 10 ++++++++--
 7 files changed, 52 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index e91b79ad4e4a..46574d996f1d 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -20,7 +20,7 @@ struct fqdir {
 
 	/* Keep atomic mem on separate cachelines in structs that include it */
 	atomic_long_t		mem ____cacheline_aligned_in_smp;
-	struct rcu_work		destroy_rwork;
+	struct work_struct	destroy_work;
 };
 
 /**
@@ -113,6 +113,12 @@ int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
 int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net);
+
+static inline void fqdir_pre_exit(struct fqdir *fqdir)
+{
+	fqdir->high_thresh = 0; /* prevent creation of new frags */
+	fqdir->dead = true;
+}
 void fqdir_exit(struct fqdir *fqdir);
 
 void inet_frag_kill(struct inet_frag_queue *q);
diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h
index 1f77fb4dc79d..a21e8b1381a1 100644
--- a/include/net/ipv6_frag.h
+++ b/include/net/ipv6_frag.h
@@ -67,6 +67,8 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
 	struct sk_buff *head;
 
 	rcu_read_lock();
+	if (fq->q.fqdir->dead)
+		goto out_rcu_unlock;
 	spin_lock(&fq->q.lock);
 
 	if (fq->q.flags & INET_FRAG_COMPLETE)
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index a0ed13cd120e..e4aba5d485be 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -459,6 +459,14 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 	return res;
 }
 
+static void __net_exit lowpan_frags_pre_exit_net(struct net *net)
+{
+	struct netns_ieee802154_lowpan *ieee802154_lowpan =
+		net_ieee802154_lowpan(net);
+
+	fqdir_pre_exit(ieee802154_lowpan->fqdir);
+}
+
 static void __net_exit lowpan_frags_exit_net(struct net *net)
 {
 	struct netns_ieee802154_lowpan *ieee802154_lowpan =
@@ -469,8 +477,9 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
 }
 
 static struct pernet_operations lowpan_frags_ops = {
-	.init = lowpan_frags_init_net,
-	.exit = lowpan_frags_exit_net,
+	.init		= lowpan_frags_init_net,
+	.pre_exit	= lowpan_frags_pre_exit_net,
+	.exit		= lowpan_frags_exit_net,
 };
 
 static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5c25727d491e..d666756be5f1 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -145,10 +145,9 @@ static void inet_frags_free_cb(void *ptr, void *arg)
 		inet_frag_destroy(fq);
 }
 
-static void fqdir_rwork_fn(struct work_struct *work)
+static void fqdir_work_fn(struct work_struct *work)
 {
-	struct fqdir *fqdir = container_of(to_rcu_work(work),
-					   struct fqdir, destroy_rwork);
+	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
 	struct inet_frags *f = fqdir->f;
 
 	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
@@ -187,18 +186,8 @@ EXPORT_SYMBOL(fqdir_init);
 
 void fqdir_exit(struct fqdir *fqdir)
 {
-	fqdir->high_thresh = 0; /* prevent creation of new frags */
-
-	fqdir->dead = true;
-
-	/* call_rcu is supposed to provide memory barrier semantics,
-	 * separating the setting of fqdir->dead with the destruction
-	 * work.  This implicit barrier is paired with inet_frag_kill().
-	 */
-
-	INIT_RCU_WORK(&fqdir->destroy_rwork, fqdir_rwork_fn);
-	queue_rcu_work(system_wq, &fqdir->destroy_rwork);
-
+	INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
+	queue_work(system_wq, &fqdir->destroy_work);
 }
 EXPORT_SYMBOL(fqdir_exit);
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1ffaec056821..4385eb9e781f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -143,6 +143,10 @@ static void ip_expire(struct timer_list *t)
 	net = qp->q.fqdir->net;
 
 	rcu_read_lock();
+
+	if (qp->q.fqdir->dead)
+		goto out_rcu_unlock;
+
 	spin_lock(&qp->q.lock);
 
 	if (qp->q.flags & INET_FRAG_COMPLETE)
@@ -676,6 +680,11 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 	return res;
 }
 
+static void __net_exit ipv4_frags_pre_exit_net(struct net *net)
+{
+	fqdir_pre_exit(net->ipv4.fqdir);
+}
+
 static void __net_exit ipv4_frags_exit_net(struct net *net)
 {
 	ip4_frags_ns_ctl_unregister(net);
@@ -683,8 +692,9 @@ static void __net_exit ipv4_frags_exit_net(struct net *net)
 }
 
 static struct pernet_operations ip4_frags_ops = {
-	.init = ipv4_frags_init_net,
-	.exit = ipv4_frags_exit_net,
+	.init		= ipv4_frags_init_net,
+	.pre_exit	= ipv4_frags_pre_exit_net,
+	.exit		= ipv4_frags_exit_net,
 };
 
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index b8962d4d6ae6..3299a389d166 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -499,6 +499,11 @@ static int nf_ct_net_init(struct net *net)
 	return res;
 }
 
+static void nf_ct_net_pre_exit(struct net *net)
+{
+	fqdir_pre_exit(net->nf_frag.fqdir);
+}
+
 static void nf_ct_net_exit(struct net *net)
 {
 	nf_ct_frags6_sysctl_unregister(net);
@@ -506,8 +511,9 @@ static void nf_ct_net_exit(struct net *net)
 }
 
 static struct pernet_operations nf_ct_net_ops = {
-	.init = nf_ct_net_init,
-	.exit = nf_ct_net_exit,
+	.init		= nf_ct_net_init,
+	.pre_exit	= nf_ct_net_pre_exit,
+	.exit		= nf_ct_net_exit,
 };
 
 static const struct rhashtable_params nfct_rhash_params = {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 0c9fd8a7c4e7..ca05b16f1bb9 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -520,6 +520,11 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 	return res;
 }
 
+static void __net_exit ipv6_frags_pre_exit_net(struct net *net)
+{
+	fqdir_pre_exit(net->ipv6.fqdir);
+}
+
 static void __net_exit ipv6_frags_exit_net(struct net *net)
 {
 	ip6_frags_ns_sysctl_unregister(net);
@@ -527,8 +532,9 @@ static void __net_exit ipv6_frags_exit_net(struct net *net)
 }
 
 static struct pernet_operations ip6_frags_ops = {
-	.init = ipv6_frags_init_net,
-	.exit = ipv6_frags_exit_net,
+	.init		= ipv6_frags_init_net,
+	.pre_exit	= ipv6_frags_pre_exit_net,
+	.exit		= ipv6_frags_exit_net,
 };
 
 static const struct rhashtable_params ip6_rhash_params = {
-- 
cgit v1.2.3


From 79ebb5bb4e38a58ca796dd242b855a4982e101d7 Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Tue, 18 Jun 2019 11:11:02 +0200
Subject: netfilter: nf_tables: enable set expiration time for set elements

Currently, the expiration of every element in a set or map
is a read-only parameter generated at kernel side.

This change permits setting a specific expiration time
per element, which is required, for example, during
stateful replication among several nodes.

This patch handles the NFTA_SET_ELEM_EXPIRATION in order
to configure the expiration parameter per element, or
will use the timeout in the case that the expiration
is not set.

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  2 +-
 net/netfilter/nf_tables_api.c     | 26 ++++++++++++++++++++------
 net/netfilter/nft_dynset.c        |  2 +-
 3 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 5b8624ae4a27..9e8493aad49d 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -636,7 +636,7 @@ static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext)
 void *nft_set_elem_init(const struct nft_set *set,
 			const struct nft_set_ext_tmpl *tmpl,
 			const u32 *key, const u32 *data,
-			u64 timeout, gfp_t gfp);
+			u64 timeout, u64 expiration, gfp_t gfp);
 void nft_set_elem_destroy(const struct nft_set *set, void *elem,
 			  bool destroy_expr);
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d444405211c5..412bb85e9d29 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3873,6 +3873,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
 	[NFTA_SET_ELEM_DATA]		= { .type = NLA_NESTED },
 	[NFTA_SET_ELEM_FLAGS]		= { .type = NLA_U32 },
 	[NFTA_SET_ELEM_TIMEOUT]		= { .type = NLA_U64 },
+	[NFTA_SET_ELEM_EXPIRATION]	= { .type = NLA_U64 },
 	[NFTA_SET_ELEM_USERDATA]	= { .type = NLA_BINARY,
 					    .len = NFT_USERDATA_MAXLEN },
 	[NFTA_SET_ELEM_EXPR]		= { .type = NLA_NESTED },
@@ -4326,7 +4327,7 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
 void *nft_set_elem_init(const struct nft_set *set,
 			const struct nft_set_ext_tmpl *tmpl,
 			const u32 *key, const u32 *data,
-			u64 timeout, gfp_t gfp)
+			u64 timeout, u64 expiration, gfp_t gfp)
 {
 	struct nft_set_ext *ext;
 	void *elem;
@@ -4341,9 +4342,11 @@ void *nft_set_elem_init(const struct nft_set *set,
 	memcpy(nft_set_ext_key(ext), key, set->klen);
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
 		memcpy(nft_set_ext_data(ext), data, set->dlen);
-	if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION))
-		*nft_set_ext_expiration(ext) =
-			get_jiffies_64() + timeout;
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
+		*nft_set_ext_expiration(ext) = get_jiffies_64() + expiration;
+		if (expiration == 0)
+			*nft_set_ext_expiration(ext) += timeout;
+	}
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
 		*nft_set_ext_timeout(ext) = timeout;
 
@@ -4408,6 +4411,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	struct nft_trans *trans;
 	u32 flags = 0;
 	u64 timeout;
+	u64 expiration;
 	u8 ulen;
 	int err;
 
@@ -4451,6 +4455,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		timeout = set->timeout;
 	}
 
+	expiration = 0;
+	if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) {
+		if (!(set->flags & NFT_SET_TIMEOUT))
+			return -EINVAL;
+		err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION],
+					    &expiration);
+		if (err)
+			return err;
+	}
+
 	err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1,
 			    nla[NFTA_SET_ELEM_KEY]);
 	if (err < 0)
@@ -4533,7 +4547,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 
 	err = -ENOMEM;
 	elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data,
-				      timeout, GFP_KERNEL);
+				      timeout, expiration, GFP_KERNEL);
 	if (elem.priv == NULL)
 		goto err3;
 
@@ -4735,7 +4749,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 
 	err = -ENOMEM;
 	elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0,
-				      GFP_KERNEL);
+				      0, GFP_KERNEL);
 	if (elem.priv == NULL)
 		goto err2;
 
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 8394560aa695..bfb9f7463b03 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -60,7 +60,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
 	elem = nft_set_elem_init(set, &priv->tmpl,
 				 &regs->data[priv->sreg_key],
 				 &regs->data[priv->sreg_data],
-				 timeout, GFP_ATOMIC);
+				 timeout, 0, GFP_ATOMIC);
 	if (elem == NULL)
 		goto err1;
 
-- 
cgit v1.2.3


From 16e5a266f51639492ac30761d043525d7d43f4c8 Mon Sep 17 00:00:00 2001
From: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
Date: Wed, 19 Jun 2019 18:41:10 +0100
Subject: net: sched: act_ctinfo: tidy UAPI definition

Remove some enums from the UAPI definition that were only used
internally and are NOT part of the UAPI.

Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_ctinfo.h        | 5 +++++
 include/uapi/linux/tc_act/tc_ctinfo.h | 5 -----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h
index d6a688571672..f071c1d70a25 100644
--- a/include/net/tc_act/tc_ctinfo.h
+++ b/include/net/tc_act/tc_ctinfo.h
@@ -23,6 +23,11 @@ struct tcf_ctinfo {
 	u64 stats_cpmark_set;
 };
 
+enum {
+	CTINFO_MODE_DSCP	= BIT(0),
+	CTINFO_MODE_CPMARK	= BIT(1)
+};
+
 #define to_ctinfo(a) ((struct tcf_ctinfo *)a)
 
 #endif /* __NET_TC_CTINFO_H */
diff --git a/include/uapi/linux/tc_act/tc_ctinfo.h b/include/uapi/linux/tc_act/tc_ctinfo.h
index 32337304fbe5..f5f26d95d0e7 100644
--- a/include/uapi/linux/tc_act/tc_ctinfo.h
+++ b/include/uapi/linux/tc_act/tc_ctinfo.h
@@ -26,9 +26,4 @@ enum {
 
 #define TCA_CTINFO_MAX (__TCA_CTINFO_MAX - 1)
 
-enum {
-	CTINFO_MODE_DSCP	= _BITUL(0),
-	CTINFO_MODE_CPMARK	= _BITUL(1)
-};
-
 #endif
-- 
cgit v1.2.3


From 497ad9f5b2dc86b733761b9afa44ecfa2f17be65 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 20 Jun 2019 00:15:52 +0200
Subject: page_pool: fix compile warning when CONFIG_PAGE_POOL is disabled

Kbuild test robot reported compile warning:
 warning: no return statement in function returning non-void
in function page_pool_request_shutdown, when CONFIG_PAGE_POOL is disabled.

The fix makes the code a little more verbose, with a descriptive variable.

Fixes: 99c07c43c4ea ("xdp: tracking page_pool resources and safe removal")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index f09b3f1994e6..f07c518ef8a5 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -156,12 +156,12 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 bool __page_pool_request_shutdown(struct page_pool *pool);
 static inline bool page_pool_request_shutdown(struct page_pool *pool)
 {
-	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
-	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
-	 */
+	bool safe_to_remove = false;
+
 #ifdef CONFIG_PAGE_POOL
-	return __page_pool_request_shutdown(pool);
+	safe_to_remove = __page_pool_request_shutdown(pool);
 #endif
+	return safe_to_remove;
 }
 
 /* Disconnects a page (from a page_pool).  API users can have a need
-- 
cgit v1.2.3


From 8527fa6cc68a489f735823e61b31ec6cb266274a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 19 Jun 2019 14:54:36 +0200
Subject: netfilter: synproxy: fix building syncookie calls

When either CONFIG_IPV6 or CONFIG_SYN_COOKIES are disabled, the kernel
fails to build:

include/linux/netfilter_ipv6.h:180:9: error: implicit declaration of function '__cookie_v6_init_sequence'
      [-Werror,-Wimplicit-function-declaration]
        return __cookie_v6_init_sequence(iph, th, mssp);
include/linux/netfilter_ipv6.h:194:9: error: implicit declaration of function '__cookie_v6_check'
      [-Werror,-Wimplicit-function-declaration]
        return __cookie_v6_check(iph, th, cookie);
net/ipv6/netfilter.c:237:26: error: use of undeclared identifier '__cookie_v6_init_sequence'; did you mean 'cookie_init_sequence'?
net/ipv6/netfilter.c:238:21: error: use of undeclared identifier '__cookie_v6_check'; did you mean '__cookie_v4_check'?

Fix the IS_ENABLED() checks to match the function declaration
and definitions for these.

Fixes: 3006a5224f15 ("netfilter: synproxy: remove module dependency on IPv6 SYNPROXY")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h | 14 ++++++++------
 net/ipv6/netfilter.c           |  2 ++
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 35b12525ee45..22e6398bc482 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -163,31 +163,33 @@ static inline u32 nf_ipv6_cookie_init_sequence(const struct ipv6hdr *iph,
 					       const struct tcphdr *th,
 					       u16 *mssp)
 {
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
 #if IS_MODULE(CONFIG_IPV6)
 	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
 
 	if (v6_ops)
 		return v6_ops->cookie_init_sequence(iph, th, mssp);
-
-	return 0;
-#else
+#elif IS_BUILTIN(CONFIG_IPV6)
 	return __cookie_v6_init_sequence(iph, th, mssp);
 #endif
+#endif
+	return 0;
 }
 
 static inline int nf_cookie_v6_check(const struct ipv6hdr *iph,
 				     const struct tcphdr *th, __u32 cookie)
 {
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
 #if IS_MODULE(CONFIG_IPV6)
 	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
 
 	if (v6_ops)
 		return v6_ops->cookie_v6_check(iph, th, cookie);
-
-	return 0;
-#else
+#elif IS_BUILTIN(CONFIG_IPV6)
 	return __cookie_v6_check(iph, th, cookie);
 #endif
+#endif
+	return 0;
 }
 
 __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index dffb10fdc3e8..61819ed858b1 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -234,8 +234,10 @@ static const struct nf_ipv6_ops ipv6ops = {
 	.route_me_harder	= ip6_route_me_harder,
 	.dev_get_saddr		= ipv6_dev_get_saddr,
 	.route			= __nf_ip6_route,
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
 	.cookie_init_sequence	= __cookie_v6_init_sequence,
 	.cookie_v6_check	= __cookie_v6_check,
+#endif
 #endif
 	.route_input		= ip6_route_input,
 	.fragment		= ip6_fragment,
-- 
cgit v1.2.3


From 43a38c3f318082839d7e613352d4dae7bbdfcdec Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 17 Jun 2019 15:15:04 +0200
Subject: netfilter: fix nf_conntrack_bridge/ipv6 link error

When CONFIG_IPV6 is disabled, the bridge netfilter code
produces a link error:

ERROR: "br_ip6_fragment" [net/bridge/netfilter/nf_conntrack_bridge.ko] undefined!
ERROR: "nf_ct_frag6_gather" [net/bridge/netfilter/nf_conntrack_bridge.ko] undefined!

The problem is that it assumes that whenever IPV6 is not a loadable
module, we can call the functions directly. This is clearly
not true when IPV6 is disabled.

There are two other functions defined like this in linux/netfilter_ipv6.h,
so change them all the same way.

Fixes: 764dd163ac92 ("netfilter: nf_conntrack_bridge: add support for IPv6")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 22e6398bc482..7beb681e1ce5 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -75,8 +75,10 @@ static inline int nf_ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
 		return 1;
 
 	return v6_ops->chk_addr(net, addr, dev, strict);
-#else
+#elif IS_BUILTIN(CONFIG_IPV6)
 	return ipv6_chk_addr(net, addr, dev, strict);
+#else
+	return 1;
 #endif
 }
 
@@ -113,8 +115,10 @@ static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb,
 		return 1;
 
 	return v6_ops->br_defrag(net, skb, user);
-#else
+#elif IS_BUILTIN(CONFIG_IPV6)
 	return nf_ct_frag6_gather(net, skb, user);
+#else
+	return 1;
 #endif
 }
 
@@ -138,8 +142,10 @@ static inline int nf_br_ip6_fragment(struct net *net, struct sock *sk,
 		return 1;
 
 	return v6_ops->br_fragment(net, sk, skb, data, output);
-#else
+#elif IS_BUILTIN(CONFIG_IPV6)
 	return br_ip6_fragment(net, sk, skb, data, output);
+#else
+	return 1;
 #endif
 }
 
@@ -154,8 +160,10 @@ static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb)
 		return -EHOSTUNREACH;
 
 	return v6_ops->route_me_harder(net, skb);
-#else
+#elif IS_BUILTIN(CONFIG_IPV6)
 	return ip6_route_me_harder(net, skb);
+#else
+	return -EHOSTUNREACH;
 #endif
 }
 
-- 
cgit v1.2.3


From dbb5281a1f84b2f93032d4864c211ce8a20811a7 Mon Sep 17 00:00:00 2001
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Thu, 20 Jun 2019 12:19:59 -0400
Subject: netfilter: nf_tables: add support for matching IPv4 options

This is the kernel change for the overall changes with this description:
Add capability to have rules matching IPv4 options. This is developed
mainly to support dropping of IP packets with loose and/or strict source
route options.

Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   2 +
 net/ipv4/ip_options.c                    |   1 +
 net/netfilter/nft_exthdr.c               | 133 +++++++++++++++++++++++++++++++
 3 files changed, 136 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 31a6b8f7ff73..c6c8ec5c7c00 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -730,10 +730,12 @@ enum nft_exthdr_flags {
  *
  * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers
  * @NFT_EXTHDR_OP_TCP: match against tcp options
+ * @NFT_EXTHDR_OP_IPV4: match against ipv4 options
  */
 enum nft_exthdr_op {
 	NFT_EXTHDR_OP_IPV6,
 	NFT_EXTHDR_OP_TCPOPT,
+	NFT_EXTHDR_OP_IPV4,
 	__NFT_EXTHDR_OP_MAX
 };
 #define NFT_EXTHDR_OP_MAX	(__NFT_EXTHDR_OP_MAX - 1)
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 3db31bb9df50..ddaa01ec2bce 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -473,6 +473,7 @@ error:
 		*info = htonl((pp_ptr-iph)<<24);
 	return -EINVAL;
 }
+EXPORT_SYMBOL(__ip_options_compile);
 
 int ip_options_compile(struct net *net,
 		       struct ip_options *opt, struct sk_buff *skb)
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 45c8a6c07783..8032b2937c7f 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -62,6 +62,103 @@ err:
 	regs->verdict.code = NFT_BREAK;
 }
 
+/* find the offset to specified option.
+ *
+ * If target header is found, its offset is set in *offset and return option
+ * number. Otherwise, return negative error.
+ *
+ * If the first fragment doesn't contain the End of Options it is considered
+ * invalid.
+ */
+static int ipv4_find_option(struct net *net, struct sk_buff *skb,
+			    unsigned int *offset, int target)
+{
+	unsigned char optbuf[sizeof(struct ip_options) + 40];
+	struct ip_options *opt = (struct ip_options *)optbuf;
+	struct iphdr *iph, _iph;
+	unsigned int start;
+	bool found = false;
+	__be32 info;
+	int optlen;
+
+	iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+	if (!iph)
+		return -EBADMSG;
+	start = sizeof(struct iphdr);
+
+	optlen = iph->ihl * 4 - (int)sizeof(struct iphdr);
+	if (optlen <= 0)
+		return -ENOENT;
+
+	memset(opt, 0, sizeof(struct ip_options));
+	/* Copy the options since __ip_options_compile() modifies
+	 * the options.
+	 */
+	if (skb_copy_bits(skb, start, opt->__data, optlen))
+		return -EBADMSG;
+	opt->optlen = optlen;
+
+	if (__ip_options_compile(net, opt, NULL, &info))
+		return -EBADMSG;
+
+	switch (target) {
+	case IPOPT_SSRR:
+	case IPOPT_LSRR:
+		if (!opt->srr)
+			break;
+		found = target == IPOPT_SSRR ? opt->is_strictroute :
+					       !opt->is_strictroute;
+		if (found)
+			*offset = opt->srr + start;
+		break;
+	case IPOPT_RR:
+		if (!opt->rr)
+			break;
+		*offset = opt->rr + start;
+		found = true;
+		break;
+	case IPOPT_RA:
+		if (!opt->router_alert)
+			break;
+		*offset = opt->router_alert + start;
+		found = true;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+	return found ? target : -ENOENT;
+}
+
+static void nft_exthdr_ipv4_eval(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt)
+{
+	struct nft_exthdr *priv = nft_expr_priv(expr);
+	u32 *dest = &regs->data[priv->dreg];
+	struct sk_buff *skb = pkt->skb;
+	unsigned int offset;
+	int err;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		goto err;
+
+	err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type);
+	if (priv->flags & NFT_EXTHDR_F_PRESENT) {
+		*dest = (err >= 0);
+		return;
+	} else if (err < 0) {
+		goto err;
+	}
+	offset += priv->offset;
+
+	dest[priv->len / NFT_REG32_SIZE] = 0;
+	if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0)
+		goto err;
+	return;
+err:
+	regs->verdict.code = NFT_BREAK;
+}
+
 static void *
 nft_tcp_header_pointer(const struct nft_pktinfo *pkt,
 		       unsigned int len, void *buffer, unsigned int *tcphdr_len)
@@ -315,6 +412,28 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
 	return nft_validate_register_load(priv->sreg, priv->len);
 }
 
+static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx,
+				const struct nft_expr *expr,
+				const struct nlattr * const tb[])
+{
+	struct nft_exthdr *priv = nft_expr_priv(expr);
+	int err = nft_exthdr_init(ctx, expr, tb);
+
+	if (err < 0)
+		return err;
+
+	switch (priv->type) {
+	case IPOPT_SSRR:
+	case IPOPT_LSRR:
+	case IPOPT_RR:
+	case IPOPT_RA:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+	return 0;
+}
+
 static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv)
 {
 	if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type))
@@ -361,6 +480,14 @@ static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
 	.dump		= nft_exthdr_dump,
 };
 
+static const struct nft_expr_ops nft_exthdr_ipv4_ops = {
+	.type		= &nft_exthdr_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+	.eval		= nft_exthdr_ipv4_eval,
+	.init		= nft_exthdr_ipv4_init,
+	.dump		= nft_exthdr_dump,
+};
+
 static const struct nft_expr_ops nft_exthdr_tcp_ops = {
 	.type		= &nft_exthdr_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
@@ -401,6 +528,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
 		if (tb[NFTA_EXTHDR_DREG])
 			return &nft_exthdr_ipv6_ops;
 		break;
+	case NFT_EXTHDR_OP_IPV4:
+		if (ctx->family != NFPROTO_IPV6) {
+			if (tb[NFTA_EXTHDR_DREG])
+				return &nft_exthdr_ipv4_ops;
+		}
+		break;
 	}
 
 	return ERR_PTR(-EOPNOTSUPP);
-- 
cgit v1.2.3


From 4cfd218855923a07dc02a5bec3d3bb37a118ebc2 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 18 Jun 2019 23:13:48 +0200
Subject: PCI: let pci_disable_link_state propagate errors

Drivers may rely on pci_disable_link_state() having disabled certain
ASPM link states. If OS can't control ASPM then pci_disable_link_state()
turns into a no-op w/o informing the caller. The driver therefore may
falsely assume the respective ASPM link states are disabled.
Let pci_disable_link_state() propagate errors to the caller, enabling
the caller to react accordingly.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/pci/pcie/aspm.c  | 20 +++++++++++---------
 include/linux/pci-aspm.h |  7 ++++---
 2 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index fd4cb75088f9..e44af7f4d37f 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -1062,18 +1062,18 @@ void pcie_aspm_powersave_config_link(struct pci_dev *pdev)
 	up_read(&pci_bus_sem);
 }
 
-static void __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem)
+static int __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem)
 {
 	struct pci_dev *parent = pdev->bus->self;
 	struct pcie_link_state *link;
 
 	if (!pci_is_pcie(pdev))
-		return;
+		return 0;
 
 	if (pdev->has_secondary_link)
 		parent = pdev;
 	if (!parent || !parent->link_state)
-		return;
+		return -EINVAL;
 
 	/*
 	 * A driver requested that ASPM be disabled on this device, but
@@ -1085,7 +1085,7 @@ static void __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem)
 	 */
 	if (aspm_disabled) {
 		pci_warn(pdev, "can't disable ASPM; OS doesn't have ASPM control\n");
-		return;
+		return -EPERM;
 	}
 
 	if (sem)
@@ -1105,11 +1105,13 @@ static void __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem)
 	mutex_unlock(&aspm_lock);
 	if (sem)
 		up_read(&pci_bus_sem);
+
+	return 0;
 }
 
-void pci_disable_link_state_locked(struct pci_dev *pdev, int state)
+int pci_disable_link_state_locked(struct pci_dev *pdev, int state)
 {
-	__pci_disable_link_state(pdev, state, false);
+	return __pci_disable_link_state(pdev, state, false);
 }
 EXPORT_SYMBOL(pci_disable_link_state_locked);
 
@@ -1117,14 +1119,14 @@ EXPORT_SYMBOL(pci_disable_link_state_locked);
  * pci_disable_link_state - Disable device's link state, so the link will
  * never enter specific states.  Note that if the BIOS didn't grant ASPM
  * control to the OS, this does nothing because we can't touch the LNKCTL
- * register.
+ * register. Returns 0 or a negative errno.
  *
  * @pdev: PCI device
  * @state: ASPM link state to disable
  */
-void pci_disable_link_state(struct pci_dev *pdev, int state)
+int pci_disable_link_state(struct pci_dev *pdev, int state)
 {
-	__pci_disable_link_state(pdev, state, true);
+	return __pci_disable_link_state(pdev, state, true);
 }
 EXPORT_SYMBOL(pci_disable_link_state);
 
diff --git a/include/linux/pci-aspm.h b/include/linux/pci-aspm.h
index df28af5cef21..67064145d76e 100644
--- a/include/linux/pci-aspm.h
+++ b/include/linux/pci-aspm.h
@@ -24,11 +24,12 @@
 #define PCIE_LINK_STATE_CLKPM	4
 
 #ifdef CONFIG_PCIEASPM
-void pci_disable_link_state(struct pci_dev *pdev, int state);
-void pci_disable_link_state_locked(struct pci_dev *pdev, int state);
+int pci_disable_link_state(struct pci_dev *pdev, int state);
+int pci_disable_link_state_locked(struct pci_dev *pdev, int state);
 void pcie_no_aspm(void);
 #else
-static inline void pci_disable_link_state(struct pci_dev *pdev, int state) { }
+static inline int pci_disable_link_state(struct pci_dev *pdev, int state)
+{ return 0; }
 static inline void pcie_no_aspm(void) { }
 #endif
 
-- 
cgit v1.2.3


From 438ac88009bcb10f9ced07fbb4b32d5377ee936b Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Wed, 19 Jun 2019 23:46:28 +0200
Subject: net: fastopen: robustness and endianness fixes for SipHash

Some changes to the TCP fastopen code to make it more robust
against future changes in the choice of key/cookie size, etc.

- Instead of keeping the SipHash key in an untyped u8[] buffer
  and casting it to the right type upon use, use the correct
  type directly. This ensures that the key will appear at the
  correct alignment if we ever change the way these data
  structures are allocated. (Currently, they are only allocated
  via kmalloc so they always appear at the correct alignment)

- Use DIV_ROUND_UP when sizing the u64[] array to hold the
  cookie, so it is always of sufficient size, even if
  TCP_FASTOPEN_COOKIE_MAX is no longer a multiple of 8.

- Drop the 'len' parameter from the tcp_fastopen_reset_cipher()
  function, which is no longer used.

- Add endian swabbing when setting the keys and calculating the hash,
  to ensure that cookie values are the same for a given key and
  source/destination address pair regardless of the endianness of
  the server.

Note that none of these are functional changes wrt the current
state of the code, with the exception of the swabbing, which only
affects big endian systems.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h        |  2 +-
 include/net/tcp.h          |  8 ++++----
 net/ipv4/sysctl_net_ipv4.c |  3 +--
 net/ipv4/tcp.c             |  3 +--
 net/ipv4/tcp_fastopen.c    | 35 +++++++++++++++++------------------
 5 files changed, 24 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 2689b0b0b68a..f3a85a7fb4b1 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -58,7 +58,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 
 /* TCP Fast Open Cookie as stored in memory */
 struct tcp_fastopen_cookie {
-	u64	val[TCP_FASTOPEN_COOKIE_MAX / sizeof(u64)];
+	__le64	val[DIV_ROUND_UP(TCP_FASTOPEN_COOKIE_MAX, sizeof(u64))];
 	s8	len;
 	bool	exp;	/* In RFC6994 experimental option format */
 };
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 573c9e9b0d72..9d36cc88d043 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -43,6 +43,7 @@
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
 #include <linux/bpf-cgroup.h>
+#include <linux/siphash.h>
 
 extern struct inet_hashinfo tcp_hashinfo;
 
@@ -1612,8 +1613,7 @@ void tcp_free_fastopen_req(struct tcp_sock *tp);
 void tcp_fastopen_destroy_cipher(struct sock *sk);
 void tcp_fastopen_ctx_destroy(struct net *net);
 int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
-			      void *primary_key, void *backup_key,
-			      unsigned int len);
+			      void *primary_key, void *backup_key);
 void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct request_sock *req,
@@ -1623,14 +1623,14 @@ void tcp_fastopen_init_key_once(struct net *net);
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 			     struct tcp_fastopen_cookie *cookie);
 bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
-#define TCP_FASTOPEN_KEY_LENGTH 16
+#define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t)
 #define TCP_FASTOPEN_KEY_MAX 2
 #define TCP_FASTOPEN_KEY_BUF_LENGTH \
 	(TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX)
 
 /* Fastopen key context */
 struct tcp_fastopen_context {
-	__u8		key[TCP_FASTOPEN_KEY_MAX][TCP_FASTOPEN_KEY_LENGTH];
+	siphash_key_t	key[TCP_FASTOPEN_KEY_MAX];
 	int		num;
 	struct rcu_head	rcu;
 };
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7d802acde040..7d66306b5f39 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -365,8 +365,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 			}
 		}
 		tcp_fastopen_reset_cipher(net, NULL, key,
-					  backup_data ? key + 4 : NULL,
-					  TCP_FASTOPEN_KEY_LENGTH);
+					  backup_data ? key + 4 : NULL);
 	}
 
 bad_key:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index efd7f2b1d1f0..47c217905864 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2822,8 +2822,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
 			backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
 
-		return tcp_fastopen_reset_cipher(net, sk, key, backup_key,
-						 TCP_FASTOPEN_KEY_LENGTH);
+		return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
 	}
 	default:
 		/* fallthru */
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f918599181dd..3fd451271a70 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -7,7 +7,6 @@
 #include <linux/tcp.h>
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
-#include <linux/siphash.h>
 #include <net/inetpeer.h>
 #include <net/tcp.h>
 
@@ -31,7 +30,7 @@ void tcp_fastopen_init_key_once(struct net *net)
 	 * for a valid cookie, so this is an acceptable risk.
 	 */
 	get_random_bytes(key, sizeof(key));
-	tcp_fastopen_reset_cipher(net, NULL, key, NULL, sizeof(key));
+	tcp_fastopen_reset_cipher(net, NULL, key, NULL);
 }
 
 static void tcp_fastopen_ctx_free(struct rcu_head *head)
@@ -68,8 +67,7 @@ void tcp_fastopen_ctx_destroy(struct net *net)
 }
 
 int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
-			      void *primary_key, void *backup_key,
-			      unsigned int len)
+			      void *primary_key, void *backup_key)
 {
 	struct tcp_fastopen_context *ctx, *octx;
 	struct fastopen_queue *q;
@@ -81,9 +79,11 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
 		goto out;
 	}
 
-	memcpy(ctx->key[0], primary_key, len);
+	ctx->key[0].key[0] = get_unaligned_le64(primary_key);
+	ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8);
 	if (backup_key) {
-		memcpy(ctx->key[1], backup_key, len);
+		ctx->key[1].key[0] = get_unaligned_le64(backup_key);
+		ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8);
 		ctx->num = 2;
 	} else {
 		ctx->num = 1;
@@ -110,19 +110,18 @@ out:
 
 static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
 					     struct sk_buff *syn,
-					     const u8 *key,
+					     const siphash_key_t *key,
 					     struct tcp_fastopen_cookie *foc)
 {
-	BUILD_BUG_ON(TCP_FASTOPEN_KEY_LENGTH != sizeof(siphash_key_t));
 	BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64));
 
 	if (req->rsk_ops->family == AF_INET) {
 		const struct iphdr *iph = ip_hdr(syn);
 
-		foc->val[0] = siphash(&iph->saddr,
-				      sizeof(iph->saddr) +
-				      sizeof(iph->daddr),
-				      (const siphash_key_t *)key);
+		foc->val[0] = cpu_to_le64(siphash(&iph->saddr,
+					  sizeof(iph->saddr) +
+					  sizeof(iph->daddr),
+					  key));
 		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
 		return true;
 	}
@@ -130,10 +129,10 @@ static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
 	if (req->rsk_ops->family == AF_INET6) {
 		const struct ipv6hdr *ip6h = ipv6_hdr(syn);
 
-		foc->val[0] = siphash(&ip6h->saddr,
-				      sizeof(ip6h->saddr) +
-				      sizeof(ip6h->daddr),
-				      (const siphash_key_t *)key);
+		foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr,
+					  sizeof(ip6h->saddr) +
+					  sizeof(ip6h->daddr),
+					  key));
 		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
 		return true;
 	}
@@ -154,7 +153,7 @@ static void tcp_fastopen_cookie_gen(struct sock *sk,
 	rcu_read_lock();
 	ctx = tcp_fastopen_get_ctx(sk);
 	if (ctx)
-		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->key[0], foc);
+		__tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc);
 	rcu_read_unlock();
 }
 
@@ -218,7 +217,7 @@ static int tcp_fastopen_cookie_gen_check(struct sock *sk,
 	if (!ctx)
 		goto out;
 	for (i = 0; i < tcp_fastopen_context_len(ctx); i++) {
-		__tcp_fastopen_cookie_gen_cipher(req, syn, ctx->key[i], foc);
+		__tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc);
 		if (tcp_fastopen_cookie_match(foc, orig)) {
 			ret = i + 1;
 			goto out;
-- 
cgit v1.2.3


From 08003d0b63a63bebaccca90e2f1d628dfd66cd4d Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Thu, 20 Jun 2019 10:52:40 -0400
Subject: inet: fix compilation warnings in fqdir_pre_exit()

The linux-next commit "inet: fix various use-after-free in defrags
units" [1] introduced compilation warnings,

./include/net/inet_frag.h:117:1: warning: 'inline' is not at beginning
of declaration [-Wold-style-declaration]
 static void inline fqdir_pre_exit(struct fqdir *fqdir)
 ^~~~~~
In file included from ./include/net/netns/ipv4.h:10,
                 from ./include/net/net_namespace.h:20,
                 from ./include/linux/netdevice.h:38,
                 from ./include/linux/icmpv6.h:13,
                 from ./include/linux/ipv6.h:86,
                 from ./include/net/ipv6.h:12,
                 from ./include/rdma/ib_verbs.h:51,
                 from ./include/linux/mlx5/device.h:37,
                 from ./include/linux/mlx5/driver.h:51,
                 from
drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c:37:

[1] https://lore.kernel.org/netdev/20190618180900.88939-3-edumazet@google.com/

Signed-off-by: Qian Cai <cai@lca.pw>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 46574d996f1d..010f26b31c89 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -114,7 +114,7 @@ void inet_frags_fini(struct inet_frags *);
 
 int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net);
 
-static void inline fqdir_pre_exit(struct fqdir *fqdir)
+static inline void fqdir_pre_exit(struct fqdir *fqdir)
 {
 	fqdir->high_thresh = 0; /* prevent creation of new frags */
 	fqdir->dead = true;
-- 
cgit v1.2.3


From 0e09edcce7ad9c8120eb8462334e1c9e8f3be09a Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Thu, 20 Jun 2019 17:36:37 -0700
Subject: ipv6: introduce RT6_LOOKUP_F_DST_NOREF flag in ip6_pol_route()

This new flag is to instruct the route lookup function to not take
refcnt on the dst entry. The user which does route lookup with this flag
must properly use rcu protection.
ip6_pol_route() is the major route lookup function for both tx and rx
path.
In this function:
Do not take refcnt on dst if RT6_LOOKUP_F_DST_NOREF flag is set, and
directly return the route entry. The caller should be holding rcu lock
when using this flag, and decide whether to take refcnt or not.

One note on the dst cache in the uncached_list:
As uncached_list does not consume refcnt, one refcnt is always returned
back to the caller even if RT6_LOOKUP_F_DST_NOREF flag is set.
Uncached dst is only possible in the output path. So in such call path,
caller MUST check if the dst is in the uncached_list before assuming
that there is no refcnt taken on the returned dst.

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |  1 +
 net/ipv6/route.c        | 73 ++++++++++++++++++++-----------------------------
 2 files changed, 31 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 7375a165fd98..82bced2fc1e3 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -36,6 +36,7 @@ struct route_info {
 #define RT6_LOOKUP_F_SRCPREF_PUBLIC	0x00000010
 #define RT6_LOOKUP_F_SRCPREF_COA	0x00000020
 #define RT6_LOOKUP_F_IGNORE_LINKSTATE	0x00000040
+#define RT6_LOOKUP_F_DST_NOREF		0x00000080
 
 /* We do not (yet ?) support IPv6 jumbograms (RFC 2675)
  * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4c5142a30808..5469e0a9c810 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1391,9 +1391,6 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
 
 	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
 
-	if (pcpu_rt)
-		ip6_hold_safe(NULL, &pcpu_rt);
-
 	return pcpu_rt;
 }
 
@@ -1403,12 +1400,9 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net,
 	struct rt6_info *pcpu_rt, *prev, **p;
 
 	pcpu_rt = ip6_rt_pcpu_alloc(res);
-	if (!pcpu_rt) {
-		dst_hold(&net->ipv6.ip6_null_entry->dst);
-		return net->ipv6.ip6_null_entry;
-	}
+	if (!pcpu_rt)
+		return NULL;
 
-	dst_hold(&pcpu_rt->dst);
 	p = this_cpu_ptr(res->nh->rt6i_pcpu);
 	prev = cmpxchg(p, NULL, pcpu_rt);
 	BUG_ON(prev);
@@ -2189,9 +2183,12 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			       const struct sk_buff *skb, int flags)
 {
 	struct fib6_result res = {};
-	struct rt6_info *rt;
+	struct rt6_info *rt = NULL;
 	int strict = 0;
 
+	WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
+		     !rcu_read_lock_held());
+
 	strict |= flags & RT6_LOOKUP_F_IFACE;
 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
 	if (net->ipv6.devconf_all->forwarding == 0)
@@ -2200,23 +2197,15 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 	rcu_read_lock();
 
 	fib6_table_lookup(net, table, oif, fl6, &res, strict);
-	if (res.f6i == net->ipv6.fib6_null_entry) {
-		rt = net->ipv6.ip6_null_entry;
-		rcu_read_unlock();
-		dst_hold(&rt->dst);
-		return rt;
-	}
+	if (res.f6i == net->ipv6.fib6_null_entry)
+		goto out;
 
 	fib6_select_path(net, &res, fl6, oif, false, skb, strict);
 
 	/*Search through exception table */
 	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
 	if (rt) {
-		if (ip6_hold_safe(net, &rt))
-			dst_use_noref(&rt->dst, jiffies);
-
-		rcu_read_unlock();
-		return rt;
+		goto out;
 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
 			    !res.nh->fib_nh_gw_family)) {
 		/* Create a RTF_CACHE clone which will not be
@@ -2224,40 +2213,38 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		 * the daddr in the skb during the neighbor look-up is different
 		 * from the fl6->daddr used to look-up route here.
 		 */
-		struct rt6_info *uncached_rt;
+		rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
 
-		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
-
-		rcu_read_unlock();
-
-		if (uncached_rt) {
-			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
-			 * No need for another dst_hold()
+		if (rt) {
+			/* 1 refcnt is taken during ip6_rt_cache_alloc().
+			 * As rt6_uncached_list_add() does not consume refcnt,
+			 * this refcnt is always returned to the caller even
+			 * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
 			 */
-			rt6_uncached_list_add(uncached_rt);
+			rt6_uncached_list_add(rt);
 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
-		} else {
-			uncached_rt = net->ipv6.ip6_null_entry;
-			dst_hold(&uncached_rt->dst);
-		}
+			rcu_read_unlock();
 
-		return uncached_rt;
+			return rt;
+		}
 	} else {
 		/* Get a percpu copy */
-
-		struct rt6_info *pcpu_rt;
-
 		local_bh_disable();
-		pcpu_rt = rt6_get_pcpu_route(&res);
+		rt = rt6_get_pcpu_route(&res);
 
-		if (!pcpu_rt)
-			pcpu_rt = rt6_make_pcpu_route(net, &res);
+		if (!rt)
+			rt = rt6_make_pcpu_route(net, &res);
 
 		local_bh_enable();
-		rcu_read_unlock();
-
-		return pcpu_rt;
 	}
+out:
+	if (!rt)
+		rt = net->ipv6.ip6_null_entry;
+	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+		ip6_hold_safe(net, &rt);
+	rcu_read_unlock();
+
+	return rt;
 }
 EXPORT_SYMBOL_GPL(ip6_pol_route);
 
-- 
cgit v1.2.3


From d64a1f574a2957b4bcb06452d36cc1c6bf16e9fc Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Thu, 20 Jun 2019 17:36:39 -0700
Subject: ipv6: honor RT6_LOOKUP_F_DST_NOREF in rule lookup logic

This patch specifically converts the rule lookup logic to honor this
flag and not release refcnt when traversing each rule and calling
lookup() on each routing table.
Similar to previous patch, we also need some special handling of dst
entries in uncached list because there is always 1 refcnt taken for them
even if RT6_LOOKUP_F_DST_NOREF flag is set.

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 10 ++++++++++
 net/ipv6/fib6_rules.c   | 12 +++++++-----
 net/ipv6/ip6_fib.c      |  5 +++--
 3 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 82bced2fc1e3..0709835c01ad 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -94,6 +94,16 @@ static inline struct dst_entry *ip6_route_output(struct net *net,
 	return ip6_route_output_flags(net, sk, fl6, 0);
 }
 
+/* Only conditionally release dst if flags indicates
+ * !RT6_LOOKUP_F_DST_NOREF or dst is in uncached_list.
+ */
+static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
+{
+	if (!(flags & RT6_LOOKUP_F_DST_NOREF) ||
+	    !list_empty(&rt->rt6i_uncached))
+		ip6_rt_put(rt);
+}
+
 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb, int flags);
 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index bcfae13409b5..d22b6c140f23 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -113,14 +113,15 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 		rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags);
 		if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN)
 			return &rt->dst;
-		ip6_rt_put(rt);
+		ip6_rt_put_flags(rt, flags);
 		rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
 		if (rt->dst.error != -EAGAIN)
 			return &rt->dst;
-		ip6_rt_put(rt);
+		ip6_rt_put_flags(rt, flags);
 	}
 
-	dst_hold(&net->ipv6.ip6_null_entry->dst);
+	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+		dst_hold(&net->ipv6.ip6_null_entry->dst);
 	return &net->ipv6.ip6_null_entry->dst;
 }
 
@@ -237,13 +238,14 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 			goto out;
 	}
 again:
-	ip6_rt_put(rt);
+	ip6_rt_put_flags(rt, flags);
 	err = -EAGAIN;
 	rt = NULL;
 	goto out;
 
 discard_pkt:
-	dst_hold(&rt->dst);
+	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+		dst_hold(&rt->dst);
 out:
 	res->rt6 = rt;
 	return err;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1d16a01eccf5..5b1c9b5b9247 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -316,9 +316,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 
 	rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
 	if (rt->dst.error == -EAGAIN) {
-		ip6_rt_put(rt);
+		ip6_rt_put_flags(rt, flags);
 		rt = net->ipv6.ip6_null_entry;
-		dst_hold(&rt->dst);
+		if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+			dst_hold(&rt->dst);
 	}
 
 	return &rt->dst;
-- 
cgit v1.2.3


From 7d9e5f422150ed00de744e02a80734d74cc9704d Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Thu, 20 Jun 2019 17:36:41 -0700
Subject: ipv6: convert major tx path to use RT6_LOOKUP_F_DST_NOREF

For tx path, in most cases, we still have to take refcnt on the dst
because the caller is caching the dst somewhere. But it still is
beneficial to make use of RT6_LOOKUP_F_DST_NOREF flag while doing the
route lookup. This is because this flag prevents manipulating refcnt on
net->ipv6.ip6_null_entry when doing fib6_rule_lookup() to traverse each
routing table. The null_entry is a shared object and constant updates on
it cause false sharing.

We converted the current major lookup function ip6_route_output_flags()
to make use of RT6_LOOKUP_F_DST_NOREF.

Together with the change in the rx path, we see noticeable performance
boost:
I ran synflood tests between 2 hosts under the same switch. Both hosts
have 20G mlx NIC, and 8 tx/rx queues.
Sender sends pure SYN flood with random src IPs and ports using trafgen.
Receiver has a simple TCP listener on the target port.
Both hosts have multiple custom rules:
- For incoming packets, only local table is traversed.
- For outgoing packets, 3 tables are traversed to find the route.
The packet processing rate on the receiver is as follows:
- Before the fix: 3.78Mpps
- After the fix:  5.50Mpps

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c       |  5 +++--
 include/net/ip6_route.h |  4 ++++
 net/ipv6/route.c        | 29 +++++++++++++++++++++++++++--
 net/l3mdev/l3mdev.c     |  7 +++----
 4 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 11b9525dff27..69ef9cce5858 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -1072,12 +1072,14 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
 #if IS_ENABLED(CONFIG_IPV6)
 /* send to link-local or multicast address via interface enslaved to
  * VRF device. Force lookup to VRF table without changing flow struct
+ * Note: Caller to this function must hold rcu_read_lock() and no refcnt
+ * is taken on the dst by this function.
  */
 static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev,
 					      struct flowi6 *fl6)
 {
 	struct net *net = dev_net(dev);
-	int flags = RT6_LOOKUP_F_IFACE;
+	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF;
 	struct dst_entry *dst = NULL;
 	struct rt6_info *rt;
 
@@ -1087,7 +1089,6 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev,
 	 */
 	if (fl6->flowi6_oif == dev->ifindex) {
 		dst = &net->ipv6.ip6_null_entry->dst;
-		dst_hold(dst);
 		return dst;
 	}
 
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 0709835c01ad..89ad7917b98d 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -84,6 +84,10 @@ struct dst_entry *ip6_route_input_lookup(struct net *net,
 					 struct flowi6 *fl6,
 					 const struct sk_buff *skb, int flags);
 
+struct dst_entry *ip6_route_output_flags_noref(struct net *net,
+					       const struct sock *sk,
+					       struct flowi6 *fl6, int flags);
+
 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
 					 struct flowi6 *fl6, int flags);
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 66fc69ef5909..3975ae8e2440 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2415,8 +2415,9 @@ static struct rt6_info *ip6_pol_route_output(struct net *net,
 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
 }
 
-struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
-					 struct flowi6 *fl6, int flags)
+struct dst_entry *ip6_route_output_flags_noref(struct net *net,
+					       const struct sock *sk,
+					       struct flowi6 *fl6, int flags)
 {
 	bool any_src;
 
@@ -2424,6 +2425,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
 	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
 		struct dst_entry *dst;
 
+		/* This function does not take refcnt on the dst */
 		dst = l3mdev_link_scope_lookup(net, fl6);
 		if (dst)
 			return dst;
@@ -2431,6 +2433,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
 
 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
 
+	flags |= RT6_LOOKUP_F_DST_NOREF;
 	any_src = ipv6_addr_any(&fl6->saddr);
 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
 	    (fl6->flowi6_oif && any_src))
@@ -2443,6 +2446,28 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
 
 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
 }
+EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref);
+
+struct dst_entry *ip6_route_output_flags(struct net *net,
+					 const struct sock *sk,
+					 struct flowi6 *fl6,
+					 int flags)
+{
+        struct dst_entry *dst;
+        struct rt6_info *rt6;
+
+        rcu_read_lock();
+        dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
+        rt6 = (struct rt6_info *)dst;
+        /* For dst cached in uncached_list, refcnt is already taken. */
+        if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
+                dst = &net->ipv6.ip6_null_entry->dst;
+                dst_hold(dst);
+        }
+        rcu_read_unlock();
+
+        return dst;
+}
 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
 
 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index cfc9fcb97465..f35899d45a9a 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -118,6 +118,8 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
  *			     local and multicast addresses
  *	@net: network namespace for device index lookup
  *	@fl6: IPv6 flow struct for lookup
+ *	This function does not hold refcnt on the returned dst.
+ *	Caller must hold rcu_read_lock().
  */
 
 struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
@@ -126,9 +128,8 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
 	struct dst_entry *dst = NULL;
 	struct net_device *dev;
 
+	WARN_ON_ONCE(!rcu_read_lock_held());
 	if (fl6->flowi6_oif) {
-		rcu_read_lock();
-
 		dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
 		if (dev && netif_is_l3_slave(dev))
 			dev = netdev_master_upper_dev_get_rcu(dev);
@@ -136,8 +137,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
 		if (dev && netif_is_l3_master(dev) &&
 		    dev->l3mdev_ops->l3mdev_link_scope_lookup)
 			dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6);
-
-		rcu_read_unlock();
 	}
 
 	return dst;
-- 
cgit v1.2.3


From 564c91f7e563256be835f31db97a60908702c9ec Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 21 Jun 2019 17:45:20 +0200
Subject: fib_frontend, ip6_fib: Select routes or exceptions dump from
 RTM_F_CLONED

The following patches add back the ability to dump IPv4 and IPv6 exception
routes, and we need to allow selection of regular routes or exceptions.

Use RTM_F_CLONED as filter to decide whether to dump routes or exceptions:
iproute2 passes it in dump requests (except for IPv6 cache flush requests;
this will be fixed in iproute2) and this used to work as long as
exceptions were stored directly in the FIB, for both IPv4 and IPv6.

Caveat: if strict checking is not requested (that is, if the dump request
doesn't go through ip_valid_fib_dump_req()), we can't filter on protocol,
tables or route types.

In this case, filtering on RTM_F_CLONED would be inconsistent: we would
fix 'ip route list cache' by returning exception routes and at the same
time introduce another bug in case another selector is present, e.g. on
'ip route list cache table main' we would return all exception routes,
without filtering on tables.

Keep this consistent by applying no filters at all, and dumping both
routes and exceptions, if strict checking is not requested. iproute2
currently filters results anyway, and no unwanted results will be
presented to the user. The kernel will just dump more data than needed.

v7: No changes

v6: Rebase onto net-next, no changes

v5: New patch: add dump_routes and dump_exceptions flags in filter and
    simply clear the unwanted one if strict checking is enabled, don't
    ignore NLM_F_MATCH and don't set filter_set if NLM_F_MATCH is set.
    Skip filtering altogether if no strict checking is requested:
    selecting routes or exceptions only would be inconsistent with the
    fact we can't filter on tables.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    | 2 ++
 net/ipv4/fib_frontend.c | 8 +++++++-
 net/ipv6/ip6_fib.c      | 3 ++-
 3 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 7e1e621a56df..4c81846ccce8 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -245,6 +245,8 @@ struct fib_dump_filter {
 	/* filter_set is an optimization that an entry is set */
 	bool			filter_set;
 	bool			dump_all_families;
+	bool			dump_routes;
+	bool			dump_exceptions;
 	unsigned char		protocol;
 	unsigned char		rt_type;
 	unsigned int		flags;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 108191667531..ed7fb5fd885c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -912,10 +912,15 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 		NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
 		return -EINVAL;
 	}
+
 	if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
 		NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
 		return -EINVAL;
 	}
+	if (rtm->rtm_flags & RTM_F_CLONED)
+		filter->dump_routes = false;
+	else
+		filter->dump_exceptions = false;
 
 	filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC);
 	filter->flags    = rtm->rtm_flags;
@@ -962,9 +967,10 @@ EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
 
 static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct fib_dump_filter filter = { .dump_routes = true,
+					  .dump_exceptions = true };
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
-	struct fib_dump_filter filter = {};
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
 	struct fib_table *tb;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 5b1c9b5b9247..083e175e11ef 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -553,9 +553,10 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 
 static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true,
+					 .filter.dump_routes = true };
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
-	struct rt6_rtnl_dump_arg arg = {};
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
 	struct fib6_walker *w;
-- 
cgit v1.2.3


From ee28906fd7a1437ca77a60a99b6b9c6d676220f8 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 21 Jun 2019 17:45:23 +0200
Subject: ipv4: Dump route exceptions if requested

Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.

This implies that the command 'ip route list cache' doesn't return any
result anymore.

If the RTM_F_CLONED flag is passed and strict checking is requested,
retrieve nexthop exception routes and dump them. If no strict checking
is requested, filtering can't be performed consistently: dump everything
in that case.

With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.

A single additional argument is sufficient, even if we traverse logically
nested structures (nexthop objects, hash table buckets, bucket chains): it
doesn't matter if we stop in the middle of any of those, because they are
always traversed the same way. As an example, s_i values in [], s_fa
values in ():

  node (fa) #1 [1]
    nexthop #1
    bucket #1 -> #0 in chain (1)
    bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
    bucket #3 -> #0 in chain (5) -> #1 in chain (6)

    nexthop #2
    bucket #1 -> #0 in chain (7) -> #1 in chain (8)
    bucket #2 -> #0 in chain (9)
  --
  node (fa) #2 [2]
    nexthop #1
    bucket #1 -> #0 in chain (1) -> #1 in chain (2)
    bucket #2 -> #0 in chain (3)

it doesn't matter if we stop at (3), (4), (7) for "node #1", or at (2)
for "node #2": walking flattens all that.

It would even be possible to drop the distinction between the in-tree
(s_i) and in-node (s_fa) counters, but a further improvement might
advise against this. This is only as accurate as the existing tracking
mechanism for leaves: if a partial dump is restarted after exceptions
are removed or expired, we might skip some non-dumped entries.

To improve this, we could attach a 'sernum' attribute (similar to the
one used for IPv6) to nexthop entities, and bump this counter whenever
exceptions change: having a distinction between the two counters would
make this more convenient.

Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:

                    iproute2
kernel         4.14.0   4.15.0   4.19.0   5.0.0   5.1.0
 3.5-rc4         +        +        +        +       +
 4.4
 4.9
 4.14
 4.15
 4.19
 5.0
 5.1
 fixed           +        +        +        +       +

v7:
   - Move loop over nexthop objects to route.c, and pass struct fib_info
     and table ID to it, not a struct fib_alias (suggested by David Ahern)
   - While at it, note that the NULL check on fa->fa_info is redundant,
     and the check on RTNH_F_DEAD is also not consistent with what's done
     with regular route listing: just keep it for nhc_flags
   - Rename entry point function for dumping exceptions to
     fib_dump_info_fnhe(), and rearrange arguments for consistency with
     fib_dump_info()
   - Rename fnhe_dump_buckets() to fnhe_dump_bucket() and make it handle
     one bucket at a time
   - Expand commit message to describe why we can have a single "skip"
     counter for all exceptions stored in bucket chains in nexthop objects
     (suggested by David Ahern)

v6:
   - Rebased onto net-next
   - Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
     avoids need to export rt_fill_info() and to touch exceptions from
     fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
     (suggested by David Ahern)

Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h |  4 +++
 net/ipv4/fib_trie.c | 44 ++++++++++++++++++++++----------
 net/ipv4/route.c    | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index 065b47754f05..cfcd0f5980f9 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -230,6 +230,10 @@ void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric);
 void rt_add_uncached_list(struct rtable *rt);
 void rt_del_uncached_list(struct rtable *rt);
 
+int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
+		       u32 table_id, struct fib_info *fi,
+		       int *fa_index, int fa_start);
+
 static inline void ip_rt_put(struct rtable *rt)
 {
 	/* dst_release() accepts a NULL parameter.
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 90f0fc8c87bd..4400f5051977 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2090,22 +2090,26 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 {
 	unsigned int flags = NLM_F_MULTI;
 	__be32 xkey = htonl(l->key);
+	int i, s_i, i_fa, s_fa, err;
 	struct fib_alias *fa;
-	int i, s_i;
 
-	if (filter->filter_set)
+	if (filter->filter_set ||
+	    !filter->dump_exceptions || !filter->dump_routes)
 		flags |= NLM_F_DUMP_FILTERED;
 
 	s_i = cb->args[4];
+	s_fa = cb->args[5];
 	i = 0;
 
 	/* rcu_read_lock is hold by caller */
 	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
-		int err;
+		struct fib_info *fi = fa->fa_info;
 
 		if (i < s_i)
 			goto next;
 
+		i_fa = 0;
+
 		if (tb->tb_id != fa->tb_id)
 			goto next;
 
@@ -2114,29 +2118,43 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 				goto next;
 
 			if ((filter->protocol &&
-			     fa->fa_info->fib_protocol != filter->protocol))
+			     fi->fib_protocol != filter->protocol))
 				goto next;
 
 			if (filter->dev &&
-			    !fib_info_nh_uses_dev(fa->fa_info, filter->dev))
+			    !fib_info_nh_uses_dev(fi, filter->dev))
 				goto next;
 		}
 
-		err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
-				    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-				    tb->tb_id, fa->fa_type,
-				    xkey, KEYLENGTH - fa->fa_slen,
-				    fa->fa_tos, fa->fa_info, flags);
-		if (err < 0) {
-			cb->args[4] = i;
-			return err;
+		if (filter->dump_routes && !s_fa) {
+			err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
+					    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+					    tb->tb_id, fa->fa_type,
+					    xkey, KEYLENGTH - fa->fa_slen,
+					    fa->fa_tos, fi, flags);
+			if (err < 0)
+				goto stop;
+			i_fa++;
 		}
+
+		if (filter->dump_exceptions) {
+			err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
+						 &i_fa, s_fa);
+			if (err < 0)
+				goto stop;
+		}
+
 next:
 		i++;
 	}
 
 	cb->args[4] = i;
 	return skb->len;
+
+stop:
+	cb->args[4] = i;
+	cb->args[5] = i_fa;
+	return err;
 }
 
 /* rcu_read_lock needs to be hold by caller from readside */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b1628d25e828..6aee412a68bd 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2812,6 +2812,79 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
+			    struct netlink_callback *cb, u32 table_id,
+			    struct fnhe_hash_bucket *bucket, int genid,
+			    int *fa_index, int fa_start)
+{
+	int i;
+
+	for (i = 0; i < FNHE_HASH_SIZE; i++) {
+		struct fib_nh_exception *fnhe;
+
+		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
+		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
+			struct rtable *rt;
+			int err;
+
+			if (*fa_index < fa_start)
+				goto next;
+
+			if (fnhe->fnhe_genid != genid)
+				goto next;
+
+			if (fnhe->fnhe_expires &&
+			    time_after(jiffies, fnhe->fnhe_expires))
+				goto next;
+
+			rt = rcu_dereference(fnhe->fnhe_rth_input);
+			if (!rt)
+				rt = rcu_dereference(fnhe->fnhe_rth_output);
+			if (!rt)
+				goto next;
+
+			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
+					   table_id, NULL, skb,
+					   NETLINK_CB(cb->skb).portid,
+					   cb->nlh->nlmsg_seq);
+			if (err)
+				return err;
+next:
+			(*fa_index)++;
+		}
+	}
+
+	return 0;
+}
+
+int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
+		       u32 table_id, struct fib_info *fi,
+		       int *fa_index, int fa_start)
+{
+	struct net *net = sock_net(cb->skb->sk);
+	int nhsel, genid = fnhe_genid(net);
+
+	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
+		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
+		struct fnhe_hash_bucket *bucket;
+		int err;
+
+		if (nhc->nhc_flags & RTNH_F_DEAD)
+			continue;
+
+		bucket = rcu_dereference(nhc->nhc_exceptions);
+		if (!bucket)
+			continue;
+
+		err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, genid,
+				       fa_index, fa_start);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
 						   u8 ip_proto, __be16 sport,
 						   __be16 dport)
-- 
cgit v1.2.3


From 1e47b4837f3bdaa425727cfe09f5ae3b6c4c41a9 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 21 Jun 2019 17:45:27 +0200
Subject: ipv6: Dump route exceptions if requested

Since commit 2b760fcf5cfb ("ipv6: hook up exception table to store dst
cache"), route exceptions reside in a separate hash table, and won't be
found by walking the FIB, so they won't be dumped to userspace on a
RTM_GETROUTE message.

This causes 'ip -6 route list cache' and 'ip -6 route flush cache' to
have no function anymore:

 # ip -6 route get fc00:3::1
 fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 539sec mtu 1400 pref medium
 # ip -6 route get fc00:4::1
 fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 536sec mtu 1500 pref medium
 # ip -6 route list cache
 # ip -6 route flush cache
 # ip -6 route get fc00:3::1
 fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 520sec mtu 1400 pref medium
 # ip -6 route get fc00:4::1
 fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 519sec mtu 1500 pref medium

because iproute2 lists cached routes using RTM_GETROUTE, and flushes them
by listing all the routes, and deleting them with RTM_DELROUTE one by one.

If cached routes are requested using the RTM_F_CLONED flag together with
strict checking, or if no strict checking is requested (and hence we can't
consistently apply filters), look up exceptions in the hash table
associated with the current fib6_info in rt6_dump_route(), and, if present
and not expired, add them to the dump.

We might be unable to dump all the entries for a given node in a single
message, so keep track of how many entries were handled for the current
node in fib6_walker, and skip that amount in case we start from the same
partially dumped node.

When a partial dump restarts, as the starting node might change when
'sernum' changes, we have no guarantee that we need to skip the same
amount of in-node entries. Therefore, we need two counters, and we need to
zero the in-node counter if the node from which the dump is resumed
differs.

Note that, with the current version of iproute2, this only fixes
'ip -6 route list cache': on a flush command, iproute2 doesn't pass
RTM_F_CLONED and, due to this inconsistency, 'ip -6 route flush cache' is
still unable to fetch the routes to be flushed. This will be addressed in
a patch for iproute2.

To flush cached routes, a procfs entry could be introduced instead: that's
how it works for IPv4. We already have a rt6_flush_exception() function
ready to be wired to it. However, this would not solve the issue for
listing.

Versions of iproute2 and kernel tested:

                    iproute2
kernel             4.14.0   4.15.0   4.19.0   5.0.0   5.1.0    5.1.0, patched
 3.18    list        +        +        +        +       +            +
         flush       +        +        +        +       +            +
 4.4     list        +        +        +        +       +            +
         flush       +        +        +        +       +            +
 4.9     list        +        +        +        +       +            +
         flush       +        +        +        +       +            +
 4.14    list        +        +        +        +       +            +
         flush       +        +        +        +       +            +
 4.15    list
         flush
 4.19    list
         flush
 5.0     list
         flush
 5.1     list
         flush
 with    list        +        +        +        +       +            +
 fix     flush       +        +        +                             +

v7:
  - Explain usage of "skip" counters in commit message (suggested by
    David Ahern)

v6:
  - Rebase onto net-next, use recently introduced nexthop walker
  - Make rt6_nh_dump_exceptions() a separate function (suggested by David
    Ahern)

v5:
  - Use dump_routes and dump_exceptions from filter, ignore NLM_F_MATCH,
    update test results (flushing works with iproute2 < 5.0.0 now)

v4:
  - Split NLM_F_MATCH and strict check handling in separate patches
  - Filter routes using RTM_F_CLONED: if it's not set, only return
    non-cached routes, and if it's set, only return cached routes:
    change requested by David Ahern and Martin Lau. This implies that
    iproute2 needs a separate patch to be able to flush IPv6 cached
    routes. This is not ideal because we can't fix the breakage caused
    by 2b760fcf5cfb entirely in kernel. However, two years have passed
    since then, and this makes it more tolerable

v3:
  - More descriptive comment about expired exceptions in rt6_dump_route()
  - Swap return values of rt6_dump_route() (suggested by Martin Lau)
  - Don't zero skip_in_node in case we don't dump anything in a given pass
    (also suggested by Martin Lau)
  - Remove check on RTM_F_CLONED altogether: in the current UAPI semantic,
    it's just a flag to indicate the route was cloned, not to filter on
    routes

v2: Add tracking of number of entries to be skipped in current node after
    a partial dump. As we restart from the same node, if not all the
    exceptions for a given node fit in a single message, the dump will
    not terminate, as suggested by Martin Lau. This is a concrete
    possibility: setting up a big number of exceptions for the same route
    actually causes the issue (suggested by David Ahern).

Reported-by: Jianlin Shi <jishi@redhat.com>
Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h   |   1 +
 include/net/ip6_route.h |   2 +-
 net/ipv6/ip6_fib.c      |  12 ++++-
 net/ipv6/route.c        | 114 +++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 116 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 87331f2c4af0..4b5656c71abc 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -316,6 +316,7 @@ struct fib6_walker {
 	enum fib6_walk_state state;
 	unsigned int skip;
 	unsigned int count;
+	unsigned int skip_in_node;
 	int (*func)(struct fib6_walker *);
 	void *args;
 };
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 89ad7917b98d..c8bba0c28286 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -197,7 +197,7 @@ struct rt6_rtnl_dump_arg {
 	struct fib_dump_filter filter;
 };
 
-int rt6_dump_route(struct fib6_info *f6i, void *p_arg);
+int rt6_dump_route(struct fib6_info *f6i, void *p_arg, unsigned int skip);
 void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 3e9ce86a819c..0b68839b984b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -464,12 +464,19 @@ static int fib6_dump_node(struct fib6_walker *w)
 	struct fib6_info *rt;
 
 	for_each_fib6_walker_rt(w) {
-		res = rt6_dump_route(rt, w->args);
+		res = rt6_dump_route(rt, w->args, w->skip_in_node);
 		if (res >= 0) {
 			/* Frame is full, suspend walking */
 			w->leaf = rt;
+
+			/* We'll restart from this node, so if some routes were
+			 * already dumped, skip them next time.
+			 */
+			w->skip_in_node += res;
+
 			return 1;
 		}
+		w->skip_in_node = 0;
 
 		/* Multipath routes are dumped in one route with the
 		 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
@@ -521,6 +528,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 	if (cb->args[4] == 0) {
 		w->count = 0;
 		w->skip = 0;
+		w->skip_in_node = 0;
 
 		spin_lock_bh(&table->tb6_lock);
 		res = fib6_walk(net, w);
@@ -536,6 +544,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 			w->state = FWS_INIT;
 			w->node = w->root;
 			w->skip = w->count;
+			w->skip_in_node = 0;
 		} else
 			w->skip = 0;
 
@@ -2094,6 +2103,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
 	c.w.func = fib6_clean_node;
 	c.w.count = 0;
 	c.w.skip = 0;
+	c.w.skip_in_node = 0;
 	c.func = func;
 	c.sernum = sernum;
 	c.arg = arg;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7c86ef046ebb..be5e65c97652 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5522,13 +5522,73 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i,
 	return false;
 }
 
+struct fib6_nh_exception_dump_walker {
+	struct rt6_rtnl_dump_arg *dump;
+	struct fib6_info *rt;
+	unsigned int flags;
+	unsigned int skip;
+	unsigned int count;
+};
+
+static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
+{
+	struct fib6_nh_exception_dump_walker *w = arg;
+	struct rt6_rtnl_dump_arg *dump = w->dump;
+	struct rt6_exception_bucket *bucket;
+	struct rt6_exception *rt6_ex;
+	int i, err;
+
+	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
+	if (!bucket)
+		return 0;
+
+	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+			if (w->skip) {
+				w->skip--;
+				continue;
+			}
+
+			/* Expiration of entries doesn't bump sernum, insertion
+			 * does. Removal is triggered by insertion, so we can
+			 * rely on the fact that if entries change between two
+			 * partial dumps, this node is scanned again completely,
+			 * see rt6_insert_exception() and fib6_dump_table().
+			 *
+			 * Count expired entries we go through as handled
+			 * entries that we'll skip next time, in case of partial
+			 * node dump. Otherwise, if entries expire meanwhile,
+			 * we'll skip the wrong amount.
+			 */
+			if (rt6_check_expired(rt6_ex->rt6i)) {
+				w->count++;
+				continue;
+			}
+
+			err = rt6_fill_node(dump->net, dump->skb, w->rt,
+					    &rt6_ex->rt6i->dst, NULL, NULL, 0,
+					    RTM_NEWROUTE,
+					    NETLINK_CB(dump->cb->skb).portid,
+					    dump->cb->nlh->nlmsg_seq, w->flags);
+			if (err)
+				return err;
+
+			w->count++;
+		}
+		bucket++;
+	}
+
+	return 0;
+}
+
 /* Return -1 if done with node, number of handled routes on partial dump */
-int rt6_dump_route(struct fib6_info *rt, void *p_arg)
+int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
 {
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
 	struct fib_dump_filter *filter = &arg->filter;
 	unsigned int flags = NLM_F_MULTI;
 	struct net *net = arg->net;
+	int count = 0;
 
 	if (rt == net->ipv6.fib6_null_entry)
 		return -1;
@@ -5538,19 +5598,51 @@ int rt6_dump_route(struct fib6_info *rt, void *p_arg)
 		/* success since this is not a prefix route */
 		return -1;
 	}
-	if (filter->filter_set) {
-		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
-		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
-		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
-			return -1;
-		}
+	if (filter->filter_set &&
+	    ((filter->rt_type  && rt->fib6_type != filter->rt_type) ||
+	     (filter->dev      && !fib6_info_uses_dev(rt, filter->dev)) ||
+	     (filter->protocol && rt->fib6_protocol != filter->protocol))) {
+		return -1;
+	}
+
+	if (filter->filter_set ||
+	    !filter->dump_routes || !filter->dump_exceptions) {
 		flags |= NLM_F_DUMP_FILTERED;
 	}
 
-	if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, RTM_NEWROUTE,
-			  NETLINK_CB(arg->cb->skb).portid,
-			  arg->cb->nlh->nlmsg_seq, flags))
-		return 0;
+	if (filter->dump_routes) {
+		if (skip) {
+			skip--;
+		} else {
+			if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
+					  0, RTM_NEWROUTE,
+					  NETLINK_CB(arg->cb->skb).portid,
+					  arg->cb->nlh->nlmsg_seq, flags)) {
+				return 0;
+			}
+			count++;
+		}
+	}
+
+	if (filter->dump_exceptions) {
+		struct fib6_nh_exception_dump_walker w = { .dump = arg,
+							   .rt = rt,
+							   .flags = flags,
+							   .skip = skip,
+							   .count = 0 };
+		int err;
+
+		if (rt->nh) {
+			err = nexthop_for_each_fib6_nh(rt->nh,
+						       rt6_nh_dump_exceptions,
+						       &w);
+		} else {
+			err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
+		}
+
+		if (err)
+			return count += w.count;
+	}
 
 	return -1;
 }
-- 
cgit v1.2.3


From 792c4e9d0bbb53b34bf1c07c2ef25609d746c57d Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Thu, 20 Jun 2019 07:03:47 +0000
Subject: net/mlx5: Convert mkey_table to XArray

The lock protecting the data structure does not need to be an rwlock.  The
only read access to the lock is in an error path, and if that's limiting
your scalability, you have bigger performance problems.

Eliminate mlx5_mkey_table in favour of using the xarray directly.
reg_mr_callback must use GFP_ATOMIC for allocating XArray nodes as it may
be called in interrupt context.

This also fixes a minor bug where SRCU locking was being used on the radix
tree read side, when RCU was needed too.

Signed-off-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c              |  8 ++++----
 drivers/infiniband/hw/mlx5/devx.c            | 18 ++++--------------
 drivers/infiniband/hw/mlx5/mr.c              | 10 +++++-----
 drivers/infiniband/hw/mlx5/odp.c             | 10 +++++-----
 drivers/net/ethernet/mellanox/mlx5/core/mr.c | 27 +++++++++++----------------
 include/linux/mlx5/driver.h                  | 13 ++-----------
 include/linux/mlx5/qp.h                      |  5 -----
 7 files changed, 31 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 2e2e65f00257..0220736b073e 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -522,9 +522,9 @@ repoll:
 	case MLX5_CQE_SIG_ERR:
 		sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64;
 
-		read_lock(&dev->mdev->priv.mkey_table.lock);
-		mmkey = __mlx5_mr_lookup(dev->mdev,
-					 mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
+		xa_lock(&dev->mdev->priv.mkey_table);
+		mmkey = xa_load(&dev->mdev->priv.mkey_table,
+				mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
 		mr = to_mibmr(mmkey);
 		get_sig_err_item(sig_err_cqe, &mr->sig->err_item);
 		mr->sig->sig_err_exists = true;
@@ -537,7 +537,7 @@ repoll:
 			     mr->sig->err_item.expected,
 			     mr->sig->err_item.actual);
 
-		read_unlock(&dev->mdev->priv.mkey_table.lock);
+		xa_unlock(&dev->mdev->priv.mkey_table);
 		goto repoll;
 	}
 
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 80b42d069328..931f587dfb8f 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1043,13 +1043,10 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj,
 				     struct mlx5_ib_dev *dev,
 				     void *in, void *out)
 {
-	struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table;
 	struct mlx5_ib_devx_mr *devx_mr = &obj->devx_mr;
-	unsigned long flags;
 	struct mlx5_core_mkey *mkey;
 	void *mkc;
 	u8 key;
-	int err;
 
 	mkey = &devx_mr->mmkey;
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
@@ -1062,11 +1059,8 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj,
 	mkey->pd = MLX5_GET(mkc, mkc, pd);
 	devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size);
 
-	write_lock_irqsave(&table->lock, flags);
-	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key),
-				mkey);
-	write_unlock_irqrestore(&table->lock, flags);
-	return err;
+	return xa_err(xa_store(&dev->mdev->priv.mkey_table,
+			       mlx5_base_mkey(mkey->key), mkey, GFP_KERNEL));
 }
 
 static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
@@ -1117,12 +1111,8 @@ static void devx_free_indirect_mkey(struct rcu_head *rcu)
  */
 static void devx_cleanup_mkey(struct devx_obj *obj)
 {
-	struct mlx5_mkey_table *table = &obj->mdev->priv.mkey_table;
-	unsigned long flags;
-
-	write_lock_irqsave(&table->lock, flags);
-	radix_tree_delete(&table->tree, mlx5_base_mkey(obj->devx_mr.mmkey.key));
-	write_unlock_irqrestore(&table->lock, flags);
+	xa_erase(&obj->mdev->priv.mkey_table,
+		 mlx5_base_mkey(obj->devx_mr.mmkey.key));
 }
 
 static int devx_obj_cleanup(struct ib_uobject *uobject,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 5f09699fab98..83b452d977d4 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -130,7 +130,7 @@ static void reg_mr_callback(int status, struct mlx5_async_work *context)
 	struct mlx5_cache_ent *ent = &cache->ent[c];
 	u8 key;
 	unsigned long flags;
-	struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table;
+	struct xarray *mkeys = &dev->mdev->priv.mkey_table;
 	int err;
 
 	spin_lock_irqsave(&ent->lock, flags);
@@ -158,12 +158,12 @@ static void reg_mr_callback(int status, struct mlx5_async_work *context)
 	ent->size++;
 	spin_unlock_irqrestore(&ent->lock, flags);
 
-	write_lock_irqsave(&table->lock, flags);
-	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmkey.key),
-				&mr->mmkey);
+	xa_lock_irqsave(mkeys, flags);
+	err = xa_err(__xa_store(mkeys, mlx5_base_mkey(mr->mmkey.key),
+				&mr->mmkey, GFP_ATOMIC));
+	xa_unlock_irqrestore(mkeys, flags);
 	if (err)
 		pr_err("Error inserting to mkey tree. 0x%x\n", -err);
-	write_unlock_irqrestore(&table->lock, flags);
 
 	if (!completion_done(&ent->compl))
 		complete(&ent->compl);
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 12ccee1eb047..c594489eb2d7 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -768,7 +768,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 	bcnt -= *bytes_committed;
 
 next_mr:
-	mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key));
+	mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(key));
 	if (!mkey_is_eq(mmkey, key)) {
 		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
 		ret = -EFAULT;
@@ -1686,8 +1686,8 @@ static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev,
 		struct mlx5_core_mkey *mmkey;
 		struct mlx5_ib_mr *mr;
 
-		mmkey = __mlx5_mr_lookup(dev->mdev,
-					 mlx5_base_mkey(sg_list[i].lkey));
+		mmkey = xa_load(&dev->mdev->priv.mkey_table,
+				mlx5_base_mkey(sg_list[i].lkey));
 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
 		atomic_dec(&mr->num_pending_prefetch);
 	}
@@ -1706,8 +1706,8 @@ static bool num_pending_prefetch_inc(struct ib_pd *pd,
 		struct mlx5_core_mkey *mmkey;
 		struct mlx5_ib_mr *mr;
 
-		mmkey = __mlx5_mr_lookup(dev->mdev,
-					 mlx5_base_mkey(sg_list[i].lkey));
+		mmkey = xa_load(&dev->mdev->priv.mkey_table,
+				mlx5_base_mkey(sg_list[i].lkey));
 		if (!mmkey || mmkey->key != sg_list[i].lkey) {
 			ret = false;
 			break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
index ea744d8466ea..9231b39d18b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
@@ -38,15 +38,12 @@
 
 void mlx5_init_mkey_table(struct mlx5_core_dev *dev)
 {
-	struct mlx5_mkey_table *table = &dev->priv.mkey_table;
-
-	memset(table, 0, sizeof(*table));
-	rwlock_init(&table->lock);
-	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+	xa_init_flags(&dev->priv.mkey_table, XA_FLAGS_LOCK_IRQ);
 }
 
 void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev)
 {
+	WARN_ON(!xa_empty(&dev->priv.mkey_table));
 }
 
 int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev,
@@ -56,8 +53,8 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev,
 			     mlx5_async_cbk_t callback,
 			     struct mlx5_async_work *context)
 {
-	struct mlx5_mkey_table *table = &dev->priv.mkey_table;
 	u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {0};
+	struct xarray *mkeys = &dev->priv.mkey_table;
 	u32 mkey_index;
 	void *mkc;
 	int err;
@@ -88,12 +85,10 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev,
 	mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n",
 		      mkey_index, key, mkey->key);
 
-	/* connect to mkey tree */
-	write_lock_irq(&table->lock);
-	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key), mkey);
-	write_unlock_irq(&table->lock);
+	err = xa_err(xa_store_irq(mkeys, mlx5_base_mkey(mkey->key), mkey,
+				  GFP_KERNEL));
 	if (err) {
-		mlx5_core_warn(dev, "failed radix tree insert of mkey 0x%x, %d\n",
+		mlx5_core_warn(dev, "failed xarray insert of mkey 0x%x, %d\n",
 			       mlx5_base_mkey(mkey->key), err);
 		mlx5_core_destroy_mkey(dev, mkey);
 	}
@@ -114,17 +109,17 @@ EXPORT_SYMBOL(mlx5_core_create_mkey);
 int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev,
 			   struct mlx5_core_mkey *mkey)
 {
-	struct mlx5_mkey_table *table = &dev->priv.mkey_table;
 	u32 out[MLX5_ST_SZ_DW(destroy_mkey_out)] = {0};
 	u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)]   = {0};
+	struct xarray *mkeys = &dev->priv.mkey_table;
 	struct mlx5_core_mkey *deleted_mkey;
 	unsigned long flags;
 
-	write_lock_irqsave(&table->lock, flags);
-	deleted_mkey = radix_tree_delete(&table->tree, mlx5_base_mkey(mkey->key));
-	write_unlock_irqrestore(&table->lock, flags);
+	xa_lock_irqsave(mkeys, flags);
+	deleted_mkey = __xa_erase(mkeys, mlx5_base_mkey(mkey->key));
+	xa_unlock_irqrestore(mkeys, flags);
 	if (!deleted_mkey) {
-		mlx5_core_dbg(dev, "failed radix tree delete of mkey 0x%x\n",
+		mlx5_core_dbg(dev, "failed xarray delete of mkey 0x%x\n",
 			      mlx5_base_mkey(mkey->key));
 		return -ENOENT;
 	}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d8ab633406c2..87f77ded78d4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -41,7 +41,7 @@
 #include <linux/semaphore.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
 #include <linux/workqueue.h>
 #include <linux/mempool.h>
 #include <linux/interrupt.h>
@@ -452,13 +452,6 @@ struct mlx5_qp_table {
 	struct radix_tree_root	tree;
 };
 
-struct mlx5_mkey_table {
-	/* protect radix tree
-	 */
-	rwlock_t		lock;
-	struct radix_tree_root	tree;
-};
-
 struct mlx5_vf_context {
 	int	enabled;
 	u64	port_guid;
@@ -546,9 +539,7 @@ struct mlx5_priv {
 	struct dentry	       *cmdif_debugfs;
 	/* end: qp staff */
 
-	/* start: mkey staff */
-	struct mlx5_mkey_table	mkey_table;
-	/* end: mkey staff */
+	struct xarray           mkey_table;
 
 	/* start: alloc staff */
 	/* protect buffer alocation according to numa node */
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 3ba4edbd17a6..d1f353c64797 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -551,11 +551,6 @@ static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u
 	return radix_tree_lookup(&dev->priv.qp_table.tree, qpn);
 }
 
-static inline struct mlx5_core_mkey *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u32 key)
-{
-	return radix_tree_lookup(&dev->priv.mkey_table.tree, key);
-}
-
 int mlx5_core_create_dct(struct mlx5_core_dev *dev,
 			 struct mlx5_core_dct *qp,
 			 u32 *in, int inlen,
-- 
cgit v1.2.3


From 5db7c8b9f9fc2aeec671ae3ca6375752c162e0e7 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Tue, 18 Jun 2019 23:07:36 +0300
Subject: ipvs: fix tinfo memory leak in start_sync_thread

syzkaller reports a memory leak in start_sync_thread [1]

As Eric points out, a kthread may start and stop before the
threadfn function is called, so there is no chance for the
data (tinfo in our case) to be released in the thread.

Fix this by releasing tinfo in the controlling code instead.

[1]
BUG: memory leak
unreferenced object 0xffff8881206bf700 (size 32):
 comm "syz-executor761", pid 7268, jiffies 4294943441 (age 20.470s)
 hex dump (first 32 bytes):
   00 40 7c 09 81 88 ff ff 80 45 b8 21 81 88 ff ff  .@|......E.!....
   00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
 backtrace:
   [<0000000057619e23>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline]
   [<0000000057619e23>] slab_post_alloc_hook mm/slab.h:439 [inline]
   [<0000000057619e23>] slab_alloc mm/slab.c:3326 [inline]
   [<0000000057619e23>] kmem_cache_alloc_trace+0x13d/0x280 mm/slab.c:3553
   [<0000000086ce5479>] kmalloc include/linux/slab.h:547 [inline]
   [<0000000086ce5479>] start_sync_thread+0x5d2/0xe10 net/netfilter/ipvs/ip_vs_sync.c:1862
   [<000000001a9229cc>] do_ip_vs_set_ctl+0x4c5/0x780 net/netfilter/ipvs/ip_vs_ctl.c:2402
   [<00000000ece457c8>] nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
   [<00000000ece457c8>] nf_setsockopt+0x4c/0x80 net/netfilter/nf_sockopt.c:115
   [<00000000942f62d4>] ip_setsockopt net/ipv4/ip_sockglue.c:1258 [inline]
   [<00000000942f62d4>] ip_setsockopt+0x9b/0xb0 net/ipv4/ip_sockglue.c:1238
   [<00000000a56a8ffd>] udp_setsockopt+0x4e/0x90 net/ipv4/udp.c:2616
   [<00000000fa895401>] sock_common_setsockopt+0x38/0x50 net/core/sock.c:3130
   [<0000000095eef4cf>] __sys_setsockopt+0x98/0x120 net/socket.c:2078
   [<000000009747cf88>] __do_sys_setsockopt net/socket.c:2089 [inline]
   [<000000009747cf88>] __se_sys_setsockopt net/socket.c:2086 [inline]
   [<000000009747cf88>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2086
   [<00000000ded8ba80>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301
   [<00000000893b4ac8>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

Reported-by: syzbot+7e2e50c8adfccd2e5041@syzkaller.appspotmail.com
Suggested-by: Eric Biggers <ebiggers@kernel.org>
Fixes: 998e7a76804b ("ipvs: Use kthread_run() instead of doing a double-fork via kernel_thread()")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h             |   6 +-
 net/netfilter/ipvs/ip_vs_ctl.c  |   4 --
 net/netfilter/ipvs/ip_vs_sync.c | 134 +++++++++++++++++++++-------------------
 3 files changed, 76 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 2ac40135b576..b36a1df93e7c 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -808,11 +808,12 @@ struct ipvs_master_sync_state {
 	struct ip_vs_sync_buff	*sync_buff;
 	unsigned long		sync_queue_len;
 	unsigned int		sync_queue_delay;
-	struct task_struct	*master_thread;
 	struct delayed_work	master_wakeup_work;
 	struct netns_ipvs	*ipvs;
 };
 
+struct ip_vs_sync_thread_data;
+
 /* How much time to keep dests in trash */
 #define IP_VS_DEST_TRASH_PERIOD		(120 * HZ)
 
@@ -943,7 +944,8 @@ struct netns_ipvs {
 	spinlock_t		sync_lock;
 	struct ipvs_master_sync_state *ms;
 	spinlock_t		sync_buff_lock;
-	struct task_struct	**backup_threads;
+	struct ip_vs_sync_thread_data *master_tinfo;
+	struct ip_vs_sync_thread_data *backup_tinfo;
 	int			threads_mask;
 	volatile int		sync_state;
 	struct mutex		sync_mutex;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 776c87ed4813..741d91aa4a8d 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2396,9 +2396,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 			cfg.syncid = dm->syncid;
 			ret = start_sync_thread(ipvs, &cfg, dm->state);
 		} else {
-			mutex_lock(&ipvs->sync_mutex);
 			ret = stop_sync_thread(ipvs, dm->state);
-			mutex_unlock(&ipvs->sync_mutex);
 		}
 		goto out_dec;
 	}
@@ -3515,10 +3513,8 @@ static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
 	if (!attrs[IPVS_DAEMON_ATTR_STATE])
 		return -EINVAL;
 
-	mutex_lock(&ipvs->sync_mutex);
 	ret = stop_sync_thread(ipvs,
 			       nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
-	mutex_unlock(&ipvs->sync_mutex);
 	return ret;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 2526be6b3d90..a4a78c4b06de 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -195,6 +195,7 @@ union ip_vs_sync_conn {
 #define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))
 
 struct ip_vs_sync_thread_data {
+	struct task_struct *task;
 	struct netns_ipvs *ipvs;
 	struct socket *sock;
 	char *buf;
@@ -374,8 +375,11 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs,
 					      max(IPVS_SYNC_SEND_DELAY, 1));
 		ms->sync_queue_len++;
 		list_add_tail(&sb->list, &ms->sync_queue);
-		if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
-			wake_up_process(ms->master_thread);
+		if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) {
+			int id = (int)(ms - ipvs->ms);
+
+			wake_up_process(ipvs->master_tinfo[id].task);
+		}
 	} else
 		ip_vs_sync_buff_release(sb);
 	spin_unlock(&ipvs->sync_lock);
@@ -1636,8 +1640,10 @@ static void master_wakeup_work_handler(struct work_struct *work)
 	spin_lock_bh(&ipvs->sync_lock);
 	if (ms->sync_queue_len &&
 	    ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
+		int id = (int)(ms - ipvs->ms);
+
 		ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
-		wake_up_process(ms->master_thread);
+		wake_up_process(ipvs->master_tinfo[id].task);
 	}
 	spin_unlock_bh(&ipvs->sync_lock);
 }
@@ -1703,10 +1709,6 @@ done:
 	if (sb)
 		ip_vs_sync_buff_release(sb);
 
-	/* release the sending multicast socket */
-	sock_release(tinfo->sock);
-	kfree(tinfo);
-
 	return 0;
 }
 
@@ -1740,11 +1742,6 @@ static int sync_thread_backup(void *data)
 		}
 	}
 
-	/* release the sending multicast socket */
-	sock_release(tinfo->sock);
-	kfree(tinfo->buf);
-	kfree(tinfo);
-
 	return 0;
 }
 
@@ -1752,8 +1749,8 @@ static int sync_thread_backup(void *data)
 int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 		      int state)
 {
-	struct ip_vs_sync_thread_data *tinfo = NULL;
-	struct task_struct **array = NULL, *task;
+	struct ip_vs_sync_thread_data *ti = NULL, *tinfo;
+	struct task_struct *task;
 	struct net_device *dev;
 	char *name;
 	int (*threadfn)(void *data);
@@ -1822,7 +1819,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 		threadfn = sync_thread_master;
 	} else if (state == IP_VS_STATE_BACKUP) {
 		result = -EEXIST;
-		if (ipvs->backup_threads)
+		if (ipvs->backup_tinfo)
 			goto out_early;
 
 		ipvs->bcfg = *c;
@@ -1849,28 +1846,22 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 					  master_wakeup_work_handler);
 			ms->ipvs = ipvs;
 		}
-	} else {
-		array = kcalloc(count, sizeof(struct task_struct *),
-				GFP_KERNEL);
-		result = -ENOMEM;
-		if (!array)
-			goto out;
 	}
+	result = -ENOMEM;
+	ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data),
+		     GFP_KERNEL);
+	if (!ti)
+		goto out;
 
 	for (id = 0; id < count; id++) {
-		result = -ENOMEM;
-		tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
-		if (!tinfo)
-			goto out;
+		tinfo = &ti[id];
 		tinfo->ipvs = ipvs;
-		tinfo->sock = NULL;
 		if (state == IP_VS_STATE_BACKUP) {
+			result = -ENOMEM;
 			tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
 					     GFP_KERNEL);
 			if (!tinfo->buf)
 				goto out;
-		} else {
-			tinfo->buf = NULL;
 		}
 		tinfo->id = id;
 		if (state == IP_VS_STATE_MASTER)
@@ -1885,17 +1876,15 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 			result = PTR_ERR(task);
 			goto out;
 		}
-		tinfo = NULL;
-		if (state == IP_VS_STATE_MASTER)
-			ipvs->ms[id].master_thread = task;
-		else
-			array[id] = task;
+		tinfo->task = task;
 	}
 
 	/* mark as active */
 
-	if (state == IP_VS_STATE_BACKUP)
-		ipvs->backup_threads = array;
+	if (state == IP_VS_STATE_MASTER)
+		ipvs->master_tinfo = ti;
+	else
+		ipvs->backup_tinfo = ti;
 	spin_lock_bh(&ipvs->sync_buff_lock);
 	ipvs->sync_state |= state;
 	spin_unlock_bh(&ipvs->sync_buff_lock);
@@ -1910,29 +1899,31 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 
 out:
 	/* We do not need RTNL lock anymore, release it here so that
-	 * sock_release below and in the kthreads can use rtnl_lock
-	 * to leave the mcast group.
+	 * sock_release below can use rtnl_lock to leave the mcast group.
 	 */
 	rtnl_unlock();
-	count = id;
-	while (count-- > 0) {
-		if (state == IP_VS_STATE_MASTER)
-			kthread_stop(ipvs->ms[count].master_thread);
-		else
-			kthread_stop(array[count]);
+	id = min(id, count - 1);
+	if (ti) {
+		for (tinfo = ti + id; tinfo >= ti; tinfo--) {
+			if (tinfo->task)
+				kthread_stop(tinfo->task);
+		}
 	}
 	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
 		kfree(ipvs->ms);
 		ipvs->ms = NULL;
 	}
 	mutex_unlock(&ipvs->sync_mutex);
-	if (tinfo) {
-		if (tinfo->sock)
-			sock_release(tinfo->sock);
-		kfree(tinfo->buf);
-		kfree(tinfo);
+
+	/* No more mutexes, release socks */
+	if (ti) {
+		for (tinfo = ti + id; tinfo >= ti; tinfo--) {
+			if (tinfo->sock)
+				sock_release(tinfo->sock);
+			kfree(tinfo->buf);
+		}
+		kfree(ti);
 	}
-	kfree(array);
 	return result;
 
 out_early:
@@ -1944,15 +1935,18 @@ out_early:
 
 int stop_sync_thread(struct netns_ipvs *ipvs, int state)
 {
-	struct task_struct **array;
+	struct ip_vs_sync_thread_data *ti, *tinfo;
 	int id;
 	int retc = -EINVAL;
 
 	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
 
+	mutex_lock(&ipvs->sync_mutex);
 	if (state == IP_VS_STATE_MASTER) {
+		retc = -ESRCH;
 		if (!ipvs->ms)
-			return -ESRCH;
+			goto err;
+		ti = ipvs->master_tinfo;
 
 		/*
 		 * The lock synchronizes with sb_queue_tail(), so that we don't
@@ -1971,38 +1965,56 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state)
 			struct ipvs_master_sync_state *ms = &ipvs->ms[id];
 			int ret;
 
+			tinfo = &ti[id];
 			pr_info("stopping master sync thread %d ...\n",
-				task_pid_nr(ms->master_thread));
+				task_pid_nr(tinfo->task));
 			cancel_delayed_work_sync(&ms->master_wakeup_work);
-			ret = kthread_stop(ms->master_thread);
+			ret = kthread_stop(tinfo->task);
 			if (retc >= 0)
 				retc = ret;
 		}
 		kfree(ipvs->ms);
 		ipvs->ms = NULL;
+		ipvs->master_tinfo = NULL;
 	} else if (state == IP_VS_STATE_BACKUP) {
-		if (!ipvs->backup_threads)
-			return -ESRCH;
+		retc = -ESRCH;
+		if (!ipvs->backup_tinfo)
+			goto err;
+		ti = ipvs->backup_tinfo;
 
 		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
-		array = ipvs->backup_threads;
 		retc = 0;
 		for (id = ipvs->threads_mask; id >= 0; id--) {
 			int ret;
 
+			tinfo = &ti[id];
 			pr_info("stopping backup sync thread %d ...\n",
-				task_pid_nr(array[id]));
-			ret = kthread_stop(array[id]);
+				task_pid_nr(tinfo->task));
+			ret = kthread_stop(tinfo->task);
 			if (retc >= 0)
 				retc = ret;
 		}
-		kfree(array);
-		ipvs->backup_threads = NULL;
+		ipvs->backup_tinfo = NULL;
+	} else {
+		goto err;
 	}
+	id = ipvs->threads_mask;
+	mutex_unlock(&ipvs->sync_mutex);
+
+	/* No more mutexes, release socks */
+	for (tinfo = ti + id; tinfo >= ti; tinfo--) {
+		if (tinfo->sock)
+			sock_release(tinfo->sock);
+		kfree(tinfo->buf);
+	}
+	kfree(ti);
 
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
+	return retc;
 
+err:
+	mutex_unlock(&ipvs->sync_mutex);
 	return retc;
 }
 
@@ -2021,7 +2033,6 @@ void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
 {
 	int retc;
 
-	mutex_lock(&ipvs->sync_mutex);
 	retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
 	if (retc && retc != -ESRCH)
 		pr_err("Failed to stop Master Daemon\n");
@@ -2029,5 +2040,4 @@ void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
 	retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
 	if (retc && retc != -ESRCH)
 		pr_err("Failed to stop Backup Daemon\n");
-	mutex_unlock(&ipvs->sync_mutex);
 }
-- 
cgit v1.2.3


From e7d4798960b3ebcd243ae6a59e04d4fe6518c96c Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <toshiaki.makita1@gmail.com>
Date: Thu, 13 Jun 2019 18:39:58 +0900
Subject: xdp: Add tracepoint for bulk XDP_TX

This is introduced for admins to check what is happening on XDP_TX when
bulk XDP_TX is in use, which will be first introduced in veth in the next
commit.

v3:
- Add act field to be in line with other XDP tracepoints.

Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/trace/events/xdp.h | 29 +++++++++++++++++++++++++++++
 kernel/bpf/core.c          |  1 +
 2 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index bb5e380e2ef3..81e708c4b513 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -50,6 +50,35 @@ TRACE_EVENT(xdp_exception,
 		  __entry->ifindex)
 );
 
+TRACE_EVENT(xdp_bulk_tx,
+
+	TP_PROTO(const struct net_device *dev,
+		 int sent, int drops, int err),
+
+	TP_ARGS(dev, sent, drops, err),
+
+	TP_STRUCT__entry(
+		__field(int, ifindex)
+		__field(u32, act)
+		__field(int, drops)
+		__field(int, sent)
+		__field(int, err)
+	),
+
+	TP_fast_assign(
+		__entry->ifindex	= dev->ifindex;
+		__entry->act		= XDP_TX;
+		__entry->drops		= drops;
+		__entry->sent		= sent;
+		__entry->err		= err;
+	),
+
+	TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d",
+		  __entry->ifindex,
+		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+		  __entry->sent, __entry->drops, __entry->err)
+);
+
 DECLARE_EVENT_CLASS(xdp_redirect_template,
 
 	TP_PROTO(const struct net_device *dev,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ad3be85f1411..561ed07d3007 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2101,3 +2101,4 @@ EXPORT_SYMBOL(bpf_stats_enabled_key);
 #include <linux/bpf_trace.h>
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
-- 
cgit v1.2.3


From 0e58983de0d89f6ee75daab1b0ea918cfcf6ddbf Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Sun, 4 Nov 2018 19:07:02 +0200
Subject: linux/dim: Move logic to dim.h

In preparation for supporting more implementations of the DIM
algorithm, I'm moving what would become common logic to a common
library. Downstream DIM implementations will use the common lib
for their implementation.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 MAINTAINERS             |   1 +
 include/linux/dim.h     | 153 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/net_dim.h | 148 +---------------------------------------------
 3 files changed, 156 insertions(+), 146 deletions(-)
 create mode 100644 include/linux/dim.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 429c6c624861..5d4b852d9d39 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5589,6 +5589,7 @@ DYNAMIC INTERRUPT MODERATION
 M:	Tal Gilboa <talgi@mellanox.com>
 S:	Maintained
 F:	include/linux/net_dim.h
+F:	include/linux/dim.h
 
 DZ DECSTATION DZ11 SERIAL DRIVER
 M:	"Maciej W. Rozycki" <macro@linux-mips.org>
diff --git a/include/linux/dim.h b/include/linux/dim.h
new file mode 100644
index 000000000000..67d7ca40f3dd
--- /dev/null
+++ b/include/linux/dim.h
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef DIM_H
+#define DIM_H
+
+#include <linux/module.h>
+
+#define NET_DIM_NEVENTS 64
+
+/* more than 10% difference */
+#define IS_SIGNIFICANT_DIFF(val, ref) \
+	(((100UL * abs((val) - (ref))) / (ref)) > 10)
+#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \
+& (BIT_ULL(bits) - 1))
+
+struct net_dim_cq_moder {
+	u16 usec;
+	u16 pkts;
+	u8 cq_period_mode;
+};
+
+struct net_dim_sample {
+	ktime_t time;
+	u32 pkt_ctr;
+	u32 byte_ctr;
+	u16 event_ctr;
+};
+
+struct net_dim_stats {
+	int ppms; /* packets per msec */
+	int bpms; /* bytes per msec */
+	int epms; /* events per msec */
+};
+
+struct net_dim { /* Dynamic Interrupt Moderation */
+	u8 state;
+	struct net_dim_stats prev_stats;
+	struct net_dim_sample start_sample;
+	struct work_struct work;
+	u8 profile_ix;
+	u8 mode;
+	u8 tune_state;
+	u8 steps_right;
+	u8 steps_left;
+	u8 tired;
+};
+
+enum {
+	NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
+	NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
+	NET_DIM_CQ_PERIOD_NUM_MODES
+};
+
+enum {
+	NET_DIM_START_MEASURE,
+	NET_DIM_MEASURE_IN_PROGRESS,
+	NET_DIM_APPLY_NEW_PROFILE,
+};
+
+enum {
+	NET_DIM_PARKING_ON_TOP,
+	NET_DIM_PARKING_TIRED,
+	NET_DIM_GOING_RIGHT,
+	NET_DIM_GOING_LEFT,
+};
+
+enum {
+	NET_DIM_STATS_WORSE,
+	NET_DIM_STATS_SAME,
+	NET_DIM_STATS_BETTER,
+};
+
+enum {
+	NET_DIM_STEPPED,
+	NET_DIM_TOO_TIRED,
+	NET_DIM_ON_EDGE,
+};
+
+static inline bool net_dim_on_top(struct net_dim *net_dim)
+{
+	switch (net_dim->tune_state) {
+	case NET_DIM_PARKING_ON_TOP:
+	case NET_DIM_PARKING_TIRED:
+		return true;
+	case NET_DIM_GOING_RIGHT:
+		return (net_dim->steps_left > 1) && (net_dim->steps_right == 1);
+	default: /* NET_DIM_GOING_LEFT */
+		return (net_dim->steps_right > 1) && (net_dim->steps_left == 1);
+	}
+}
+
+static inline void net_dim_turn(struct net_dim *net_dim)
+{
+	switch (net_dim->tune_state) {
+	case NET_DIM_PARKING_ON_TOP:
+	case NET_DIM_PARKING_TIRED:
+		break;
+	case NET_DIM_GOING_RIGHT:
+		net_dim->tune_state = NET_DIM_GOING_LEFT;
+		net_dim->steps_left = 0;
+		break;
+	case NET_DIM_GOING_LEFT:
+		net_dim->tune_state = NET_DIM_GOING_RIGHT;
+		net_dim->steps_right = 0;
+		break;
+	}
+}
+
+static inline void net_dim_park_on_top(struct net_dim *net_dim)
+{
+	net_dim->steps_right  = 0;
+	net_dim->steps_left   = 0;
+	net_dim->tired        = 0;
+	net_dim->tune_state   = NET_DIM_PARKING_ON_TOP;
+}
+
+static inline void net_dim_park_tired(struct net_dim *net_dim)
+{
+	net_dim->steps_right  = 0;
+	net_dim->steps_left   = 0;
+	net_dim->tune_state   = NET_DIM_PARKING_TIRED;
+}
+
+static inline void
+net_dim_sample(u16 event_ctr, u64 packets, u64 bytes, struct net_dim_sample *s)
+{
+	s->time	     = ktime_get();
+	s->pkt_ctr   = packets;
+	s->byte_ctr  = bytes;
+	s->event_ctr = event_ctr;
+}
+
+static inline void
+net_dim_calc_stats(struct net_dim_sample *start, struct net_dim_sample *end,
+		   struct net_dim_stats *curr_stats)
+{
+	/* u32 holds up to 71 minutes, should be enough */
+	u32 delta_us = ktime_us_delta(end->time, start->time);
+	u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
+	u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
+			     start->byte_ctr);
+
+	if (!delta_us)
+		return;
+
+	curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
+	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
+	curr_stats->epms = DIV_ROUND_UP(NET_DIM_NEVENTS * USEC_PER_MSEC,
+					delta_us);
+}
+
+#endif /* DIM_H */
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index fd458389f7d1..373cda74b167 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -35,73 +35,10 @@
 #define NET_DIM_H
 
 #include <linux/module.h>
-
-struct net_dim_cq_moder {
-	u16 usec;
-	u16 pkts;
-	u8 cq_period_mode;
-};
-
-struct net_dim_sample {
-	ktime_t time;
-	u32     pkt_ctr;
-	u32     byte_ctr;
-	u16     event_ctr;
-};
-
-struct net_dim_stats {
-	int ppms; /* packets per msec */
-	int bpms; /* bytes per msec */
-	int epms; /* events per msec */
-};
-
-struct net_dim { /* Adaptive Moderation */
-	u8                                      state;
-	struct net_dim_stats                    prev_stats;
-	struct net_dim_sample                   start_sample;
-	struct work_struct                      work;
-	u8                                      profile_ix;
-	u8                                      mode;
-	u8                                      tune_state;
-	u8                                      steps_right;
-	u8                                      steps_left;
-	u8                                      tired;
-};
-
-enum {
-	NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
-	NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
-	NET_DIM_CQ_PERIOD_NUM_MODES
-};
-
-/* Adaptive moderation logic */
-enum {
-	NET_DIM_START_MEASURE,
-	NET_DIM_MEASURE_IN_PROGRESS,
-	NET_DIM_APPLY_NEW_PROFILE,
-};
-
-enum {
-	NET_DIM_PARKING_ON_TOP,
-	NET_DIM_PARKING_TIRED,
-	NET_DIM_GOING_RIGHT,
-	NET_DIM_GOING_LEFT,
-};
-
-enum {
-	NET_DIM_STATS_WORSE,
-	NET_DIM_STATS_SAME,
-	NET_DIM_STATS_BETTER,
-};
-
-enum {
-	NET_DIM_STEPPED,
-	NET_DIM_TOO_TIRED,
-	NET_DIM_ON_EDGE,
-};
+#include <linux/dim.h>
 
 #define NET_DIM_PARAMS_NUM_PROFILES 5
-/* Adaptive moderation profiles */
+/* Netdev dynamic interrupt moderation profiles */
 #define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256
 #define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128
 #define NET_DIM_DEF_PROFILE_CQE 1
@@ -188,36 +125,6 @@ net_dim_get_def_tx_moderation(u8 cq_period_mode)
 	return net_dim_get_tx_moderation(cq_period_mode, profile_ix);
 }
 
-static inline bool net_dim_on_top(struct net_dim *dim)
-{
-	switch (dim->tune_state) {
-	case NET_DIM_PARKING_ON_TOP:
-	case NET_DIM_PARKING_TIRED:
-		return true;
-	case NET_DIM_GOING_RIGHT:
-		return (dim->steps_left > 1) && (dim->steps_right == 1);
-	default: /* NET_DIM_GOING_LEFT */
-		return (dim->steps_right > 1) && (dim->steps_left == 1);
-	}
-}
-
-static inline void net_dim_turn(struct net_dim *dim)
-{
-	switch (dim->tune_state) {
-	case NET_DIM_PARKING_ON_TOP:
-	case NET_DIM_PARKING_TIRED:
-		break;
-	case NET_DIM_GOING_RIGHT:
-		dim->tune_state = NET_DIM_GOING_LEFT;
-		dim->steps_left = 0;
-		break;
-	case NET_DIM_GOING_LEFT:
-		dim->tune_state = NET_DIM_GOING_RIGHT;
-		dim->steps_right = 0;
-		break;
-	}
-}
-
 static inline int net_dim_step(struct net_dim *dim)
 {
 	if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2))
@@ -245,21 +152,6 @@ static inline int net_dim_step(struct net_dim *dim)
 	return NET_DIM_STEPPED;
 }
 
-static inline void net_dim_park_on_top(struct net_dim *dim)
-{
-	dim->steps_right  = 0;
-	dim->steps_left   = 0;
-	dim->tired        = 0;
-	dim->tune_state   = NET_DIM_PARKING_ON_TOP;
-}
-
-static inline void net_dim_park_tired(struct net_dim *dim)
-{
-	dim->steps_right  = 0;
-	dim->steps_left   = 0;
-	dim->tune_state   = NET_DIM_PARKING_TIRED;
-}
-
 static inline void net_dim_exit_parking(struct net_dim *dim)
 {
 	dim->tune_state = dim->profile_ix ? NET_DIM_GOING_LEFT :
@@ -267,9 +159,6 @@ static inline void net_dim_exit_parking(struct net_dim *dim)
 	net_dim_step(dim);
 }
 
-#define IS_SIGNIFICANT_DIFF(val, ref) \
-	(((100UL * abs((val) - (ref))) / (ref)) > 10) /* more than 10% difference */
-
 static inline int net_dim_stats_compare(struct net_dim_stats *curr,
 					struct net_dim_stats *prev)
 {
@@ -351,39 +240,6 @@ static inline bool net_dim_decision(struct net_dim_stats *curr_stats,
 	return dim->profile_ix != prev_ix;
 }
 
-static inline void net_dim_sample(u16 event_ctr,
-				  u64 packets,
-				  u64 bytes,
-				  struct net_dim_sample *s)
-{
-	s->time	     = ktime_get();
-	s->pkt_ctr   = packets;
-	s->byte_ctr  = bytes;
-	s->event_ctr = event_ctr;
-}
-
-#define NET_DIM_NEVENTS 64
-#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) & (BIT_ULL(bits) - 1))
-
-static inline void net_dim_calc_stats(struct net_dim_sample *start,
-				      struct net_dim_sample *end,
-				      struct net_dim_stats *curr_stats)
-{
-	/* u32 holds up to 71 minutes, should be enough */
-	u32 delta_us = ktime_us_delta(end->time, start->time);
-	u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
-	u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
-			     start->byte_ctr);
-
-	if (!delta_us)
-		return;
-
-	curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
-	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
-	curr_stats->epms = DIV_ROUND_UP(NET_DIM_NEVENTS * USEC_PER_MSEC,
-					delta_us);
-}
-
 static inline void net_dim(struct net_dim *dim,
 			   struct net_dim_sample end_sample)
 {
-- 
cgit v1.2.3


From 449986ea92412727e8c553eaa5c8d3ed884253c4 Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Mon, 5 Nov 2018 11:57:10 +0200
Subject: linux/dim: Remove "net" prefix from internal DIM members

Only rename functions and structs which aren't used by external code.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/dim.h     | 86 ++++++++++++++++++++++++------------------------
 include/linux/net_dim.h | 87 ++++++++++++++++++++++++-------------------------
 2 files changed, 86 insertions(+), 87 deletions(-)

(limited to 'include')

diff --git a/include/linux/dim.h b/include/linux/dim.h
index 67d7ca40f3dd..6ee991681d62 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -6,7 +6,7 @@
 
 #include <linux/module.h>
 
-#define NET_DIM_NEVENTS 64
+#define DIM_NEVENTS 64
 
 /* more than 10% difference */
 #define IS_SIGNIFICANT_DIFF(val, ref) \
@@ -27,7 +27,7 @@ struct net_dim_sample {
 	u16 event_ctr;
 };
 
-struct net_dim_stats {
+struct dim_stats {
 	int ppms; /* packets per msec */
 	int bpms; /* bytes per msec */
 	int epms; /* events per msec */
@@ -35,7 +35,7 @@ struct net_dim_stats {
 
 struct net_dim { /* Dynamic Interrupt Moderation */
 	u8 state;
-	struct net_dim_stats prev_stats;
+	struct dim_stats prev_stats;
 	struct net_dim_sample start_sample;
 	struct work_struct work;
 	u8 profile_ix;
@@ -59,67 +59,67 @@ enum {
 };
 
 enum {
-	NET_DIM_PARKING_ON_TOP,
-	NET_DIM_PARKING_TIRED,
-	NET_DIM_GOING_RIGHT,
-	NET_DIM_GOING_LEFT,
+	DIM_PARKING_ON_TOP,
+	DIM_PARKING_TIRED,
+	DIM_GOING_RIGHT,
+	DIM_GOING_LEFT,
 };
 
 enum {
-	NET_DIM_STATS_WORSE,
-	NET_DIM_STATS_SAME,
-	NET_DIM_STATS_BETTER,
+	DIM_STATS_WORSE,
+	DIM_STATS_SAME,
+	DIM_STATS_BETTER,
 };
 
 enum {
-	NET_DIM_STEPPED,
-	NET_DIM_TOO_TIRED,
-	NET_DIM_ON_EDGE,
+	DIM_STEPPED,
+	DIM_TOO_TIRED,
+	DIM_ON_EDGE,
 };
 
-static inline bool net_dim_on_top(struct net_dim *net_dim)
+static inline bool dim_on_top(struct net_dim *dim)
 {
-	switch (net_dim->tune_state) {
-	case NET_DIM_PARKING_ON_TOP:
-	case NET_DIM_PARKING_TIRED:
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
 		return true;
-	case NET_DIM_GOING_RIGHT:
-		return (net_dim->steps_left > 1) && (net_dim->steps_right == 1);
-	default: /* NET_DIM_GOING_LEFT */
-		return (net_dim->steps_right > 1) && (net_dim->steps_left == 1);
+	case DIM_GOING_RIGHT:
+		return (dim->steps_left > 1) && (dim->steps_right == 1);
+	default: /* DIM_GOING_LEFT */
+		return (dim->steps_right > 1) && (dim->steps_left == 1);
 	}
 }
 
-static inline void net_dim_turn(struct net_dim *net_dim)
+static inline void dim_turn(struct net_dim *dim)
 {
-	switch (net_dim->tune_state) {
-	case NET_DIM_PARKING_ON_TOP:
-	case NET_DIM_PARKING_TIRED:
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
 		break;
-	case NET_DIM_GOING_RIGHT:
-		net_dim->tune_state = NET_DIM_GOING_LEFT;
-		net_dim->steps_left = 0;
+	case DIM_GOING_RIGHT:
+		dim->tune_state = DIM_GOING_LEFT;
+		dim->steps_left = 0;
 		break;
-	case NET_DIM_GOING_LEFT:
-		net_dim->tune_state = NET_DIM_GOING_RIGHT;
-		net_dim->steps_right = 0;
+	case DIM_GOING_LEFT:
+		dim->tune_state = DIM_GOING_RIGHT;
+		dim->steps_right = 0;
 		break;
 	}
 }
 
-static inline void net_dim_park_on_top(struct net_dim *net_dim)
+static inline void dim_park_on_top(struct net_dim *dim)
 {
-	net_dim->steps_right  = 0;
-	net_dim->steps_left   = 0;
-	net_dim->tired        = 0;
-	net_dim->tune_state   = NET_DIM_PARKING_ON_TOP;
+	dim->steps_right  = 0;
+	dim->steps_left   = 0;
+	dim->tired        = 0;
+	dim->tune_state   = DIM_PARKING_ON_TOP;
 }
 
-static inline void net_dim_park_tired(struct net_dim *net_dim)
+static inline void dim_park_tired(struct net_dim *dim)
 {
-	net_dim->steps_right  = 0;
-	net_dim->steps_left   = 0;
-	net_dim->tune_state   = NET_DIM_PARKING_TIRED;
+	dim->steps_right  = 0;
+	dim->steps_left   = 0;
+	dim->tune_state   = DIM_PARKING_TIRED;
 }
 
 static inline void
@@ -132,8 +132,8 @@ net_dim_sample(u16 event_ctr, u64 packets, u64 bytes, struct net_dim_sample *s)
 }
 
 static inline void
-net_dim_calc_stats(struct net_dim_sample *start, struct net_dim_sample *end,
-		   struct net_dim_stats *curr_stats)
+dim_calc_stats(struct net_dim_sample *start, struct net_dim_sample *end,
+	       struct dim_stats *curr_stats)
 {
 	/* u32 holds up to 71 minutes, should be enough */
 	u32 delta_us = ktime_us_delta(end->time, start->time);
@@ -146,7 +146,7 @@ net_dim_calc_stats(struct net_dim_sample *start, struct net_dim_sample *end,
 
 	curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
 	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
-	curr_stats->epms = DIV_ROUND_UP(NET_DIM_NEVENTS * USEC_PER_MSEC,
+	curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC,
 					delta_us);
 }
 
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index 373cda74b167..f89fa4fdfb46 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -128,67 +128,67 @@ net_dim_get_def_tx_moderation(u8 cq_period_mode)
 static inline int net_dim_step(struct net_dim *dim)
 {
 	if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2))
-		return NET_DIM_TOO_TIRED;
+		return DIM_TOO_TIRED;
 
 	switch (dim->tune_state) {
-	case NET_DIM_PARKING_ON_TOP:
-	case NET_DIM_PARKING_TIRED:
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
 		break;
-	case NET_DIM_GOING_RIGHT:
+	case DIM_GOING_RIGHT:
 		if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1))
-			return NET_DIM_ON_EDGE;
+			return DIM_ON_EDGE;
 		dim->profile_ix++;
 		dim->steps_right++;
 		break;
-	case NET_DIM_GOING_LEFT:
+	case DIM_GOING_LEFT:
 		if (dim->profile_ix == 0)
-			return NET_DIM_ON_EDGE;
+			return DIM_ON_EDGE;
 		dim->profile_ix--;
 		dim->steps_left++;
 		break;
 	}
 
 	dim->tired++;
-	return NET_DIM_STEPPED;
+	return DIM_STEPPED;
 }
 
 static inline void net_dim_exit_parking(struct net_dim *dim)
 {
-	dim->tune_state = dim->profile_ix ? NET_DIM_GOING_LEFT :
-					  NET_DIM_GOING_RIGHT;
+	dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT :
+					  DIM_GOING_RIGHT;
 	net_dim_step(dim);
 }
 
-static inline int net_dim_stats_compare(struct net_dim_stats *curr,
-					struct net_dim_stats *prev)
+static inline int net_dim_stats_compare(struct dim_stats *curr,
+					struct dim_stats *prev)
 {
 	if (!prev->bpms)
-		return curr->bpms ? NET_DIM_STATS_BETTER :
-				    NET_DIM_STATS_SAME;
+		return curr->bpms ? DIM_STATS_BETTER :
+				    DIM_STATS_SAME;
 
 	if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms))
-		return (curr->bpms > prev->bpms) ? NET_DIM_STATS_BETTER :
-						   NET_DIM_STATS_WORSE;
+		return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER :
+						   DIM_STATS_WORSE;
 
 	if (!prev->ppms)
-		return curr->ppms ? NET_DIM_STATS_BETTER :
-				    NET_DIM_STATS_SAME;
+		return curr->ppms ? DIM_STATS_BETTER :
+				    DIM_STATS_SAME;
 
 	if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms))
-		return (curr->ppms > prev->ppms) ? NET_DIM_STATS_BETTER :
-						   NET_DIM_STATS_WORSE;
+		return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER :
+						   DIM_STATS_WORSE;
 
 	if (!prev->epms)
-		return NET_DIM_STATS_SAME;
+		return DIM_STATS_SAME;
 
 	if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms))
-		return (curr->epms < prev->epms) ? NET_DIM_STATS_BETTER :
-						   NET_DIM_STATS_WORSE;
+		return (curr->epms < prev->epms) ? DIM_STATS_BETTER :
+						   DIM_STATS_WORSE;
 
-	return NET_DIM_STATS_SAME;
+	return DIM_STATS_SAME;
 }
 
-static inline bool net_dim_decision(struct net_dim_stats *curr_stats,
+static inline bool net_dim_decision(struct dim_stats *curr_stats,
 				    struct net_dim *dim)
 {
 	int prev_state = dim->tune_state;
@@ -197,44 +197,44 @@ static inline bool net_dim_decision(struct net_dim_stats *curr_stats,
 	int step_res;
 
 	switch (dim->tune_state) {
-	case NET_DIM_PARKING_ON_TOP:
+	case DIM_PARKING_ON_TOP:
 		stats_res = net_dim_stats_compare(curr_stats, &dim->prev_stats);
-		if (stats_res != NET_DIM_STATS_SAME)
+		if (stats_res != DIM_STATS_SAME)
 			net_dim_exit_parking(dim);
 		break;
 
-	case NET_DIM_PARKING_TIRED:
+	case DIM_PARKING_TIRED:
 		dim->tired--;
 		if (!dim->tired)
 			net_dim_exit_parking(dim);
 		break;
 
-	case NET_DIM_GOING_RIGHT:
-	case NET_DIM_GOING_LEFT:
+	case DIM_GOING_RIGHT:
+	case DIM_GOING_LEFT:
 		stats_res = net_dim_stats_compare(curr_stats, &dim->prev_stats);
-		if (stats_res != NET_DIM_STATS_BETTER)
-			net_dim_turn(dim);
+		if (stats_res != DIM_STATS_BETTER)
+			dim_turn(dim);
 
-		if (net_dim_on_top(dim)) {
-			net_dim_park_on_top(dim);
+		if (dim_on_top(dim)) {
+			dim_park_on_top(dim);
 			break;
 		}
 
 		step_res = net_dim_step(dim);
 		switch (step_res) {
-		case NET_DIM_ON_EDGE:
-			net_dim_park_on_top(dim);
+		case DIM_ON_EDGE:
+			dim_park_on_top(dim);
 			break;
-		case NET_DIM_TOO_TIRED:
-			net_dim_park_tired(dim);
+		case DIM_TOO_TIRED:
+			dim_park_tired(dim);
 			break;
 		}
 
 		break;
 	}
 
-	if ((prev_state      != NET_DIM_PARKING_ON_TOP) ||
-	    (dim->tune_state != NET_DIM_PARKING_ON_TOP))
+	if (prev_state != DIM_PARKING_ON_TOP ||
+	    dim->tune_state != DIM_PARKING_ON_TOP)
 		dim->prev_stats = *curr_stats;
 
 	return dim->profile_ix != prev_ix;
@@ -243,7 +243,7 @@ static inline bool net_dim_decision(struct net_dim_stats *curr_stats,
 static inline void net_dim(struct net_dim *dim,
 			   struct net_dim_sample end_sample)
 {
-	struct net_dim_stats curr_stats;
+	struct dim_stats curr_stats;
 	u16 nevents;
 
 	switch (dim->state) {
@@ -251,10 +251,9 @@ static inline void net_dim(struct net_dim *dim,
 		nevents = BIT_GAP(BITS_PER_TYPE(u16),
 				  end_sample.event_ctr,
 				  dim->start_sample.event_ctr);
-		if (nevents < NET_DIM_NEVENTS)
+		if (nevents < DIM_NEVENTS)
 			break;
-		net_dim_calc_stats(&dim->start_sample, &end_sample,
-				   &curr_stats);
+		dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats);
 		if (net_dim_decision(&curr_stats, dim)) {
 			dim->state = NET_DIM_APPLY_NEW_PROFILE;
 			schedule_work(&dim->work);
-- 
cgit v1.2.3


From c002bd529d719858d4cc233431c88c9efa844053 Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Mon, 5 Nov 2018 12:07:52 +0200
Subject: linux/dim: Rename externally exposed macros

Renamed macros in use by external drivers.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/broadcom/bcmsysport.c        |  4 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c     |  2 +-
 drivers/net/ethernet/broadcom/genet/bcmgenet.c    |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +++++-----
 include/linux/dim.h                               | 12 ++++++------
 include/linux/net_dim.h                           | 18 +++++++++---------
 8 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index c623896e3ccb..b5e2f9d2cb71 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -1099,7 +1099,7 @@ static void bcm_sysport_dim_work(struct work_struct *work)
 			net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	bcm_sysport_set_rx_coalesce(priv, cur_profile.usec, cur_profile.pkts);
-	dim->state = NET_DIM_START_MEASURE;
+	dim->state = DIM_START_MEASURE;
 }
 
 /* RX and misc interrupt routine */
@@ -1440,7 +1440,7 @@ static void bcm_sysport_init_dim(struct bcm_sysport_priv *priv,
 	struct bcm_sysport_net_dim *dim = &priv->dim;
 
 	INIT_WORK(&dim->dim.work, cb);
-	dim->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	dim->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 	dim->event_ctr = 0;
 	dim->packets = 0;
 	dim->bytes = 0;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 8314c00d7537..49de873043c0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7810,7 +7810,7 @@ static void bnxt_enable_napi(struct bnxt *bp)
 
 		if (bp->bnapi[i]->rx_ring) {
 			INIT_WORK(&cpr->dim.work, bnxt_dim_work);
-			cpr->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+			cpr->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 		}
 		napi_enable(&bp->bnapi[i]->napi);
 	}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
index afa97c8bb081..16a4588709d1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
@@ -28,5 +28,5 @@ void bnxt_dim_work(struct work_struct *work)
 	cpr->rx_ring_coal.coal_bufs = cur_moder.pkts;
 
 	bnxt_hwrm_set_ring_coal(bnapi->bp, bnapi);
-	dim->state = NET_DIM_START_MEASURE;
+	dim->state = DIM_START_MEASURE;
 }
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 374b9ff05c88..5286a46ecfb0 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -1928,7 +1928,7 @@ static void bcmgenet_dim_work(struct work_struct *work)
 			net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	bcmgenet_set_rx_coalesce(ring, cur_profile.usec, cur_profile.pkts);
-	dim->state = NET_DIM_START_MEASURE;
+	dim->state = DIM_START_MEASURE;
 }
 
 /* Assign skb to RX DMA descriptor. */
@@ -2085,7 +2085,7 @@ static void bcmgenet_init_dim(struct bcmgenet_rx_ring *ring,
 	struct bcmgenet_net_dim *dim = &ring->dim;
 
 	INIT_WORK(&dim->dim.work, cb);
-	dim->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	dim->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 	dim->event_ctr = 0;
 	dim->packets = 0;
 	dim->bytes = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
index d67adf70a97b..a80303add7c0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -38,7 +38,7 @@ mlx5e_complete_dim_work(struct net_dim *dim, struct net_dim_cq_moder moder,
 			struct mlx5_core_dev *mdev, struct mlx5_core_cq *mcq)
 {
 	mlx5_core_modify_cq_moderation(mdev, mcq, moder.usec, moder.pkts);
-	dim->state = NET_DIM_START_MEASURE;
+	dim->state = DIM_START_MEASURE;
 }
 
 void mlx5e_rx_dim_work(struct work_struct *work)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 457cc39423f2..5b89e992e482 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -584,11 +584,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	switch (params->rx_cq_moderation.cq_period_mode) {
 	case MLX5_CQ_PERIOD_MODE_START_FROM_CQE:
-		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE;
+		rq->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_CQE;
 		break;
 	case MLX5_CQ_PERIOD_MODE_START_FROM_EQE:
 	default:
-		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+		rq->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 	}
 
 	rq->page_cache.head = 0;
@@ -2151,7 +2151,7 @@ static void mlx5e_build_ico_cq_param(struct mlx5e_priv *priv,
 
 	mlx5e_build_common_cq_param(priv, param);
 
-	param->cq_period_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	param->cq_period_mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 }
 
 static void mlx5e_build_icosq_param(struct mlx5e_priv *priv,
@@ -4440,8 +4440,8 @@ static struct net_dim_cq_moder mlx5e_get_def_rx_moderation(u8 cq_period_mode)
 static u8 mlx5_to_net_dim_cq_period_mode(u8 cq_period_mode)
 {
 	return cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE ?
-		NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE :
-		NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+		DIM_CQ_PERIOD_MODE_START_FROM_CQE :
+		DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 }
 
 void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
diff --git a/include/linux/dim.h b/include/linux/dim.h
index 6ee991681d62..989dbbdf9d45 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -47,15 +47,15 @@ struct net_dim { /* Dynamic Interrupt Moderation */
 };
 
 enum {
-	NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
-	NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
-	NET_DIM_CQ_PERIOD_NUM_MODES
+	DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
+	DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
+	DIM_CQ_PERIOD_NUM_MODES
 };
 
 enum {
-	NET_DIM_START_MEASURE,
-	NET_DIM_MEASURE_IN_PROGRESS,
-	NET_DIM_APPLY_NEW_PROFILE,
+	DIM_START_MEASURE,
+	DIM_MEASURE_IN_PROGRESS,
+	DIM_APPLY_NEW_PROFILE,
 };
 
 enum {
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index f89fa4fdfb46..e0c97f824dd0 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -78,13 +78,13 @@
 }
 
 static const struct net_dim_cq_moder
-rx_profile[NET_DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
+rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
 	NET_DIM_RX_EQE_PROFILES,
 	NET_DIM_RX_CQE_PROFILES,
 };
 
 static const struct net_dim_cq_moder
-tx_profile[NET_DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
+tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
 	NET_DIM_TX_EQE_PROFILES,
 	NET_DIM_TX_CQE_PROFILES,
 };
@@ -101,7 +101,7 @@ net_dim_get_rx_moderation(u8 cq_period_mode, int ix)
 static inline struct net_dim_cq_moder
 net_dim_get_def_rx_moderation(u8 cq_period_mode)
 {
-	u8 profile_ix = cq_period_mode == NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
+	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
 			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
 
 	return net_dim_get_rx_moderation(cq_period_mode, profile_ix);
@@ -119,7 +119,7 @@ net_dim_get_tx_moderation(u8 cq_period_mode, int ix)
 static inline struct net_dim_cq_moder
 net_dim_get_def_tx_moderation(u8 cq_period_mode)
 {
-	u8 profile_ix = cq_period_mode == NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
+	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
 			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
 
 	return net_dim_get_tx_moderation(cq_period_mode, profile_ix);
@@ -247,7 +247,7 @@ static inline void net_dim(struct net_dim *dim,
 	u16 nevents;
 
 	switch (dim->state) {
-	case NET_DIM_MEASURE_IN_PROGRESS:
+	case DIM_MEASURE_IN_PROGRESS:
 		nevents = BIT_GAP(BITS_PER_TYPE(u16),
 				  end_sample.event_ctr,
 				  dim->start_sample.event_ctr);
@@ -255,17 +255,17 @@ static inline void net_dim(struct net_dim *dim,
 			break;
 		dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats);
 		if (net_dim_decision(&curr_stats, dim)) {
-			dim->state = NET_DIM_APPLY_NEW_PROFILE;
+			dim->state = DIM_APPLY_NEW_PROFILE;
 			schedule_work(&dim->work);
 			break;
 		}
 		/* fall through */
-	case NET_DIM_START_MEASURE:
+	case DIM_START_MEASURE:
 		net_dim_sample(end_sample.event_ctr, end_sample.pkt_ctr, end_sample.byte_ctr,
 			       &dim->start_sample);
-		dim->state = NET_DIM_MEASURE_IN_PROGRESS;
+		dim->state = DIM_MEASURE_IN_PROGRESS;
 		break;
-	case NET_DIM_APPLY_NEW_PROFILE:
+	case DIM_APPLY_NEW_PROFILE:
 		break;
 	}
 }
-- 
cgit v1.2.3


From e5b6ab02d7aa4118c9a36491633812dcc442acbe Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Mon, 14 Jan 2019 15:32:49 +0200
Subject: linux/dim: Rename net_dim_sample() to net_dim_update_sample()

Rename the function to avoid confusion with the similarly named struct.
This is in preparation for removing the 'net' prefix from dim members.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/broadcom/bcmsysport.c        | 4 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 8 ++++----
 drivers/net/ethernet/broadcom/genet/bcmgenet.c    | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 6 ++----
 include/linux/dim.h                               | 3 ++-
 include/linux/net_dim.h                           | 4 ++--
 6 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index b5e2f9d2cb71..faaf8ade15e5 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -1019,8 +1019,8 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
 	}
 
 	if (priv->dim.use_dim) {
-		net_dim_sample(priv->dim.event_ctr, priv->dim.packets,
-			       priv->dim.bytes, &dim_sample);
+		net_dim_update_sample(priv->dim.event_ctr, priv->dim.packets,
+				      priv->dim.bytes, &dim_sample);
 		net_dim(&priv->dim.dim, dim_sample);
 	}
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 49de873043c0..eaec949c367a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2130,10 +2130,10 @@ static int bnxt_poll(struct napi_struct *napi, int budget)
 	if (bp->flags & BNXT_FLAG_DIM) {
 		struct net_dim_sample dim_sample;
 
-		net_dim_sample(cpr->event_ctr,
-			       cpr->rx_packets,
-			       cpr->rx_bytes,
-			       &dim_sample);
+		net_dim_update_sample(cpr->event_ctr,
+				      cpr->rx_packets,
+				      cpr->rx_bytes,
+				      &dim_sample);
 		net_dim(&cpr->dim, dim_sample);
 	}
 	return work_done;
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 5286a46ecfb0..297ae786ffed 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -1909,8 +1909,8 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 	}
 
 	if (ring->dim.use_dim) {
-		net_dim_sample(ring->dim.event_ctr, ring->dim.packets,
-			       ring->dim.bytes, &dim_sample);
+		net_dim_update_sample(ring->dim.event_ctr, ring->dim.packets,
+				      ring->dim.bytes, &dim_sample);
 		net_dim(&ring->dim.dim, dim_sample);
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index f9862bf75491..07432e6428cf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -53,8 +53,7 @@ static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq)
 	if (unlikely(!test_bit(MLX5E_SQ_STATE_AM, &sq->state)))
 		return;
 
-	net_dim_sample(sq->cq.event_ctr, stats->packets, stats->bytes,
-		       &dim_sample);
+	net_dim_update_sample(sq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
 	net_dim(&sq->dim, dim_sample);
 }
 
@@ -66,8 +65,7 @@ static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq)
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_AM, &rq->state)))
 		return;
 
-	net_dim_sample(rq->cq.event_ctr, stats->packets, stats->bytes,
-		       &dim_sample);
+	net_dim_update_sample(rq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
 	net_dim(&rq->dim, dim_sample);
 }
 
diff --git a/include/linux/dim.h b/include/linux/dim.h
index 989dbbdf9d45..f0f20ed25497 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -123,7 +123,8 @@ static inline void dim_park_tired(struct net_dim *dim)
 }
 
 static inline void
-net_dim_sample(u16 event_ctr, u64 packets, u64 bytes, struct net_dim_sample *s)
+net_dim_update_sample(u16 event_ctr, u64 packets, u64 bytes,
+		      struct net_dim_sample *s)
 {
 	s->time	     = ktime_get();
 	s->pkt_ctr   = packets;
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index e0c97f824dd0..d4b40adc7fa1 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -261,8 +261,8 @@ static inline void net_dim(struct net_dim *dim,
 		}
 		/* fall through */
 	case DIM_START_MEASURE:
-		net_dim_sample(end_sample.event_ctr, end_sample.pkt_ctr, end_sample.byte_ctr,
-			       &dim->start_sample);
+		net_dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr,
+				      end_sample.byte_ctr, &dim->start_sample);
 		dim->state = DIM_MEASURE_IN_PROGRESS;
 		break;
 	case DIM_APPLY_NEW_PROFILE:
-- 
cgit v1.2.3


From 8960b38932bee8db0bc9c4d8c135f21df6cdd297 Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Thu, 31 Jan 2019 16:44:48 +0200
Subject: linux/dim: Rename externally used net_dim members

Removed 'net' prefix from functions and structs used by external drivers.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/broadcom/bcmsysport.c         | 16 ++++++------
 drivers/net/ethernet/broadcom/bcmsysport.h         |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c          | 10 ++++----
 drivers/net/ethernet/broadcom/bnxt/bnxt.h          |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c  |  4 +--
 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c      |  5 ++--
 drivers/net/ethernet/broadcom/genet/bcmgenet.c     | 14 +++++-----
 drivers/net/ethernet/broadcom/genet/bcmgenet.h     |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  8 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c   | 10 ++++----
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  4 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 12 ++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |  8 +++---
 include/linux/dim.h                                | 21 ++++++++-------
 include/linux/net_dim.h                            | 30 +++++++++++-----------
 15 files changed, 73 insertions(+), 75 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index faaf8ade15e5..c1247b2948ff 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -612,7 +612,7 @@ static int bcm_sysport_set_coalesce(struct net_device *dev,
 				    struct ethtool_coalesce *ec)
 {
 	struct bcm_sysport_priv *priv = netdev_priv(dev);
-	struct net_dim_cq_moder moder;
+	struct dim_cq_moder moder;
 	u32 usecs, pkts;
 	unsigned int i;
 
@@ -995,7 +995,7 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
 {
 	struct bcm_sysport_priv *priv =
 		container_of(napi, struct bcm_sysport_priv, napi);
-	struct net_dim_sample dim_sample;
+	struct dim_sample dim_sample;
 	unsigned int work_done = 0;
 
 	work_done = bcm_sysport_desc_rx(priv, budget);
@@ -1019,8 +1019,8 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
 	}
 
 	if (priv->dim.use_dim) {
-		net_dim_update_sample(priv->dim.event_ctr, priv->dim.packets,
-				      priv->dim.bytes, &dim_sample);
+		dim_update_sample(priv->dim.event_ctr, priv->dim.packets,
+				  priv->dim.bytes, &dim_sample);
 		net_dim(&priv->dim.dim, dim_sample);
 	}
 
@@ -1090,13 +1090,13 @@ static void bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv)
 
 static void bcm_sysport_dim_work(struct work_struct *work)
 {
-	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct dim *dim = container_of(work, struct dim, work);
 	struct bcm_sysport_net_dim *ndim =
 			container_of(dim, struct bcm_sysport_net_dim, dim);
 	struct bcm_sysport_priv *priv =
 			container_of(ndim, struct bcm_sysport_priv, dim);
-	struct net_dim_cq_moder cur_profile =
-			net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
+	struct dim_cq_moder cur_profile = net_dim_get_rx_moderation(dim->mode,
+								    dim->profile_ix);
 
 	bcm_sysport_set_rx_coalesce(priv, cur_profile.usec, cur_profile.pkts);
 	dim->state = DIM_START_MEASURE;
@@ -1449,7 +1449,7 @@ static void bcm_sysport_init_dim(struct bcm_sysport_priv *priv,
 static void bcm_sysport_init_rx_coalesce(struct bcm_sysport_priv *priv)
 {
 	struct bcm_sysport_net_dim *dim = &priv->dim;
-	struct net_dim_cq_moder moder;
+	struct dim_cq_moder moder;
 	u32 usecs, pkts;
 
 	usecs = priv->rx_coalesce_usecs;
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h
index 6f3141c86436..cbe6d559d964 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.h
+++ b/drivers/net/ethernet/broadcom/bcmsysport.h
@@ -705,7 +705,7 @@ struct bcm_sysport_net_dim {
 	u16			event_ctr;
 	unsigned long		packets;
 	unsigned long		bytes;
-	struct net_dim		dim;
+	struct dim		dim;
 };
 
 /* Software view of the TX ring */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index eaec949c367a..c54668004600 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2128,12 +2128,12 @@ static int bnxt_poll(struct napi_struct *napi, int budget)
 		}
 	}
 	if (bp->flags & BNXT_FLAG_DIM) {
-		struct net_dim_sample dim_sample;
+		struct dim_sample dim_sample;
 
-		net_dim_update_sample(cpr->event_ctr,
-				      cpr->rx_packets,
-				      cpr->rx_bytes,
-				      &dim_sample);
+		dim_update_sample(cpr->event_ctr,
+				  cpr->rx_packets,
+				  cpr->rx_bytes,
+				  &dim_sample);
 		net_dim(&cpr->dim, dim_sample);
 	}
 	return work_done;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index eca36dd6b751..a552c5539cc9 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -809,7 +809,7 @@ struct bnxt_cp_ring_info {
 	u64			rx_bytes;
 	u64			event_ctr;
 
-	struct net_dim		dim;
+	struct dim		dim;
 
 	union {
 		struct tx_cmp	*cp_desc_ring[MAX_CP_PAGES];
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
index 94e208e9789f..3d1d53fbb135 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
@@ -21,7 +21,7 @@ static ssize_t debugfs_dim_read(struct file *filep,
 				char __user *buffer,
 				size_t count, loff_t *ppos)
 {
-	struct net_dim *dim = filep->private_data;
+	struct dim *dim = filep->private_data;
 	int len;
 	char *buf;
 
@@ -61,7 +61,7 @@ static const struct file_operations debugfs_dim_fops = {
 	.read = debugfs_dim_read,
 };
 
-static struct dentry *debugfs_dim_ring_init(struct net_dim *dim, int ring_idx,
+static struct dentry *debugfs_dim_ring_init(struct dim *dim, int ring_idx,
 					    struct dentry *dd)
 {
 	static char qname[16];
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
index 16a4588709d1..11605f9fa61e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
@@ -13,15 +13,14 @@
 
 void bnxt_dim_work(struct work_struct *work)
 {
-	struct net_dim *dim = container_of(work, struct net_dim,
-					   work);
+	struct dim *dim = container_of(work, struct dim, work);
 	struct bnxt_cp_ring_info *cpr = container_of(dim,
 						     struct bnxt_cp_ring_info,
 						     dim);
 	struct bnxt_napi *bnapi = container_of(cpr,
 					       struct bnxt_napi,
 					       cp_ring);
-	struct net_dim_cq_moder cur_moder =
+	struct dim_cq_moder cur_moder =
 		net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	cpr->rx_ring_coal.coal_ticks = cur_moder.usec;
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 297ae786ffed..b7f8f4f1088f 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -643,7 +643,7 @@ static void bcmgenet_set_rx_coalesce(struct bcmgenet_rx_ring *ring,
 static void bcmgenet_set_ring_rx_coalesce(struct bcmgenet_rx_ring *ring,
 					  struct ethtool_coalesce *ec)
 {
-	struct net_dim_cq_moder moder;
+	struct dim_cq_moder moder;
 	u32 usecs, pkts;
 
 	ring->rx_coalesce_usecs = ec->rx_coalesce_usecs;
@@ -1898,7 +1898,7 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 {
 	struct bcmgenet_rx_ring *ring = container_of(napi,
 			struct bcmgenet_rx_ring, napi);
-	struct net_dim_sample dim_sample;
+	struct dim_sample dim_sample;
 	unsigned int work_done;
 
 	work_done = bcmgenet_desc_rx(ring, budget);
@@ -1909,8 +1909,8 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 	}
 
 	if (ring->dim.use_dim) {
-		net_dim_update_sample(ring->dim.event_ctr, ring->dim.packets,
-				      ring->dim.bytes, &dim_sample);
+		dim_update_sample(ring->dim.event_ctr, ring->dim.packets,
+				  ring->dim.bytes, &dim_sample);
 		net_dim(&ring->dim.dim, dim_sample);
 	}
 
@@ -1919,12 +1919,12 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 
 static void bcmgenet_dim_work(struct work_struct *work)
 {
-	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct dim *dim = container_of(work, struct dim, work);
 	struct bcmgenet_net_dim *ndim =
 			container_of(dim, struct bcmgenet_net_dim, dim);
 	struct bcmgenet_rx_ring *ring =
 			container_of(ndim, struct bcmgenet_rx_ring, dim);
-	struct net_dim_cq_moder cur_profile =
+	struct dim_cq_moder cur_profile =
 			net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	bcmgenet_set_rx_coalesce(ring, cur_profile.usec, cur_profile.pkts);
@@ -2094,7 +2094,7 @@ static void bcmgenet_init_dim(struct bcmgenet_rx_ring *ring,
 static void bcmgenet_init_rx_coalesce(struct bcmgenet_rx_ring *ring)
 {
 	struct bcmgenet_net_dim *dim = &ring->dim;
-	struct net_dim_cq_moder moder;
+	struct dim_cq_moder moder;
 	u32 usecs, pkts;
 
 	usecs = ring->rx_coalesce_usecs;
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index 14b49612aa86..6e418d9c3706 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -581,7 +581,7 @@ struct bcmgenet_net_dim {
 	u16		event_ctr;
 	unsigned long	packets;
 	unsigned long	bytes;
-	struct net_dim	dim;
+	struct dim	dim;
 };
 
 struct bcmgenet_rx_ring {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3a183d690e23..11efd6e4bdc3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -238,9 +238,9 @@ struct mlx5e_params {
 	u16 num_channels;
 	u8  num_tc;
 	bool rx_cqe_compress_def;
-	struct net_dim_cq_moder rx_cq_moderation;
-	struct net_dim_cq_moder tx_cq_moderation;
 	bool tunneled_offload_en;
+	struct dim_cq_moder rx_cq_moderation;
+	struct dim_cq_moder tx_cq_moderation;
 	bool lro_en;
 	u8  tx_min_inline_mode;
 	bool vlan_strip_disable;
@@ -356,7 +356,7 @@ struct mlx5e_txqsq {
 	/* dirtied @completion */
 	u16                        cc;
 	u32                        dma_fifo_cc;
-	struct net_dim             dim; /* Adaptive Moderation */
+	struct dim                 dim; /* Adaptive Moderation */
 
 	/* dirtied @xmit */
 	u16                        pc ____cacheline_aligned_in_smp;
@@ -595,7 +595,7 @@ struct mlx5e_rq {
 	int                    ix;
 	unsigned int           hw_mtu;
 
-	struct net_dim         dim; /* Dynamic Interrupt Moderation */
+	struct dim         dim; /* Dynamic Interrupt Moderation */
 
 	/* XDP */
 	struct bpf_prog       *xdp_prog;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
index a80303add7c0..ba3c1be9f2d3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -34,7 +34,7 @@
 #include "en.h"
 
 static void
-mlx5e_complete_dim_work(struct net_dim *dim, struct net_dim_cq_moder moder,
+mlx5e_complete_dim_work(struct dim *dim, struct dim_cq_moder moder,
 			struct mlx5_core_dev *mdev, struct mlx5_core_cq *mcq)
 {
 	mlx5_core_modify_cq_moderation(mdev, mcq, moder.usec, moder.pkts);
@@ -43,9 +43,9 @@ mlx5e_complete_dim_work(struct net_dim *dim, struct net_dim_cq_moder moder,
 
 void mlx5e_rx_dim_work(struct work_struct *work)
 {
-	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct dim *dim = container_of(work, struct dim, work);
 	struct mlx5e_rq *rq = container_of(dim, struct mlx5e_rq, dim);
-	struct net_dim_cq_moder cur_moder =
+	struct dim_cq_moder cur_moder =
 		net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	mlx5e_complete_dim_work(dim, cur_moder, rq->mdev, &rq->cq.mcq);
@@ -53,9 +53,9 @@ void mlx5e_rx_dim_work(struct work_struct *work)
 
 void mlx5e_tx_dim_work(struct work_struct *work)
 {
-	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct dim *dim = container_of(work, struct dim, work);
 	struct mlx5e_txqsq *sq = container_of(dim, struct mlx5e_txqsq, dim);
-	struct net_dim_cq_moder cur_moder =
+	struct dim_cq_moder cur_moder =
 		net_dim_get_tx_moderation(dim->mode, dim->profile_ix);
 
 	mlx5e_complete_dim_work(dim, cur_moder, sq->cq.mdev, &sq->cq.mcq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index dd764e0471f2..c853b657739c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -466,7 +466,7 @@ static int mlx5e_set_channels(struct net_device *dev,
 int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv,
 			       struct ethtool_coalesce *coal)
 {
-	struct net_dim_cq_moder *rx_moder, *tx_moder;
+	struct dim_cq_moder *rx_moder, *tx_moder;
 
 	if (!MLX5_CAP_GEN(priv->mdev, cq_moderation))
 		return -EOPNOTSUPP;
@@ -521,7 +521,7 @@ mlx5e_set_priv_channels_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesc
 int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
 			       struct ethtool_coalesce *coal)
 {
-	struct net_dim_cq_moder *rx_moder, *tx_moder;
+	struct dim_cq_moder *rx_moder, *tx_moder;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_channels new_channels = {};
 	int err = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 5b89e992e482..9705101c0235 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1569,7 +1569,7 @@ static void mlx5e_destroy_cq(struct mlx5e_cq *cq)
 }
 
 static int mlx5e_open_cq(struct mlx5e_channel *c,
-			 struct net_dim_cq_moder moder,
+			 struct dim_cq_moder moder,
 			 struct mlx5e_cq_param *param,
 			 struct mlx5e_cq *cq)
 {
@@ -1774,7 +1774,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 			      struct mlx5e_channel **cp)
 {
 	int cpu = cpumask_first(mlx5_comp_irq_get_affinity_mask(priv->mdev, ix));
-	struct net_dim_cq_moder icocq_moder = {0, 0};
+	struct dim_cq_moder icocq_moder = {0, 0};
 	struct net_device *netdev = priv->netdev;
 	struct mlx5e_channel *c;
 	unsigned int irq;
@@ -4411,9 +4411,9 @@ static bool slow_pci_heuristic(struct mlx5_core_dev *mdev)
 		link_speed > MLX5E_SLOW_PCI_RATIO * pci_bw;
 }
 
-static struct net_dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
+static struct dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
 {
-	struct net_dim_cq_moder moder;
+	struct dim_cq_moder moder;
 
 	moder.cq_period_mode = cq_period_mode;
 	moder.pkts = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
@@ -4424,9 +4424,9 @@ static struct net_dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
 	return moder;
 }
 
-static struct net_dim_cq_moder mlx5e_get_def_rx_moderation(u8 cq_period_mode)
+static struct dim_cq_moder mlx5e_get_def_rx_moderation(u8 cq_period_mode)
 {
-	struct net_dim_cq_moder moder;
+	struct dim_cq_moder moder;
 
 	moder.cq_period_mode = cq_period_mode;
 	moder.pkts = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 07432e6428cf..e6c434efbd46 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -48,24 +48,24 @@ static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
 static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq)
 {
 	struct mlx5e_sq_stats *stats = sq->stats;
-	struct net_dim_sample dim_sample;
+	struct dim_sample dim_sample;
 
 	if (unlikely(!test_bit(MLX5E_SQ_STATE_AM, &sq->state)))
 		return;
 
-	net_dim_update_sample(sq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
+	dim_update_sample(sq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
 	net_dim(&sq->dim, dim_sample);
 }
 
 static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq)
 {
 	struct mlx5e_rq_stats *stats = rq->stats;
-	struct net_dim_sample dim_sample;
+	struct dim_sample dim_sample;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_AM, &rq->state)))
 		return;
 
-	net_dim_update_sample(rq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
+	dim_update_sample(rq->cq.event_ctr, stats->packets, stats->bytes, &dim_sample);
 	net_dim(&rq->dim, dim_sample);
 }
 
diff --git a/include/linux/dim.h b/include/linux/dim.h
index f0f20ed25497..60e5074a7cc0 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -14,13 +14,13 @@
 #define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \
 & (BIT_ULL(bits) - 1))
 
-struct net_dim_cq_moder {
+struct dim_cq_moder {
 	u16 usec;
 	u16 pkts;
 	u8 cq_period_mode;
 };
 
-struct net_dim_sample {
+struct dim_sample {
 	ktime_t time;
 	u32 pkt_ctr;
 	u32 byte_ctr;
@@ -33,10 +33,10 @@ struct dim_stats {
 	int epms; /* events per msec */
 };
 
-struct net_dim { /* Dynamic Interrupt Moderation */
+struct dim { /* Dynamic Interrupt Moderation */
 	u8 state;
 	struct dim_stats prev_stats;
-	struct net_dim_sample start_sample;
+	struct dim_sample start_sample;
 	struct work_struct work;
 	u8 profile_ix;
 	u8 mode;
@@ -77,7 +77,7 @@ enum {
 	DIM_ON_EDGE,
 };
 
-static inline bool dim_on_top(struct net_dim *dim)
+static inline bool dim_on_top(struct dim *dim)
 {
 	switch (dim->tune_state) {
 	case DIM_PARKING_ON_TOP:
@@ -90,7 +90,7 @@ static inline bool dim_on_top(struct net_dim *dim)
 	}
 }
 
-static inline void dim_turn(struct net_dim *dim)
+static inline void dim_turn(struct dim *dim)
 {
 	switch (dim->tune_state) {
 	case DIM_PARKING_ON_TOP:
@@ -107,7 +107,7 @@ static inline void dim_turn(struct net_dim *dim)
 	}
 }
 
-static inline void dim_park_on_top(struct net_dim *dim)
+static inline void dim_park_on_top(struct dim *dim)
 {
 	dim->steps_right  = 0;
 	dim->steps_left   = 0;
@@ -115,7 +115,7 @@ static inline void dim_park_on_top(struct net_dim *dim)
 	dim->tune_state   = DIM_PARKING_ON_TOP;
 }
 
-static inline void dim_park_tired(struct net_dim *dim)
+static inline void dim_park_tired(struct dim *dim)
 {
 	dim->steps_right  = 0;
 	dim->steps_left   = 0;
@@ -123,8 +123,7 @@ static inline void dim_park_tired(struct net_dim *dim)
 }
 
 static inline void
-net_dim_update_sample(u16 event_ctr, u64 packets, u64 bytes,
-		      struct net_dim_sample *s)
+dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s)
 {
 	s->time	     = ktime_get();
 	s->pkt_ctr   = packets;
@@ -133,7 +132,7 @@ net_dim_update_sample(u16 event_ctr, u64 packets, u64 bytes,
 }
 
 static inline void
-dim_calc_stats(struct net_dim_sample *start, struct net_dim_sample *end,
+dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
 	       struct dim_stats *curr_stats)
 {
 	/* u32 holds up to 71 minutes, should be enough */
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index d4b40adc7fa1..4e009ec193ef 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -77,28 +77,28 @@
 	{64, 32}   \
 }
 
-static const struct net_dim_cq_moder
+static const struct dim_cq_moder
 rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
 	NET_DIM_RX_EQE_PROFILES,
 	NET_DIM_RX_CQE_PROFILES,
 };
 
-static const struct net_dim_cq_moder
+static const struct dim_cq_moder
 tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
 	NET_DIM_TX_EQE_PROFILES,
 	NET_DIM_TX_CQE_PROFILES,
 };
 
-static inline struct net_dim_cq_moder
+static inline struct dim_cq_moder
 net_dim_get_rx_moderation(u8 cq_period_mode, int ix)
 {
-	struct net_dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix];
+	struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix];
 
 	cq_moder.cq_period_mode = cq_period_mode;
 	return cq_moder;
 }
 
-static inline struct net_dim_cq_moder
+static inline struct dim_cq_moder
 net_dim_get_def_rx_moderation(u8 cq_period_mode)
 {
 	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
@@ -107,16 +107,16 @@ net_dim_get_def_rx_moderation(u8 cq_period_mode)
 	return net_dim_get_rx_moderation(cq_period_mode, profile_ix);
 }
 
-static inline struct net_dim_cq_moder
+static inline struct dim_cq_moder
 net_dim_get_tx_moderation(u8 cq_period_mode, int ix)
 {
-	struct net_dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix];
+	struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix];
 
 	cq_moder.cq_period_mode = cq_period_mode;
 	return cq_moder;
 }
 
-static inline struct net_dim_cq_moder
+static inline struct dim_cq_moder
 net_dim_get_def_tx_moderation(u8 cq_period_mode)
 {
 	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
@@ -125,7 +125,7 @@ net_dim_get_def_tx_moderation(u8 cq_period_mode)
 	return net_dim_get_tx_moderation(cq_period_mode, profile_ix);
 }
 
-static inline int net_dim_step(struct net_dim *dim)
+static inline int net_dim_step(struct dim *dim)
 {
 	if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2))
 		return DIM_TOO_TIRED;
@@ -152,7 +152,7 @@ static inline int net_dim_step(struct net_dim *dim)
 	return DIM_STEPPED;
 }
 
-static inline void net_dim_exit_parking(struct net_dim *dim)
+static inline void net_dim_exit_parking(struct dim *dim)
 {
 	dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT :
 					  DIM_GOING_RIGHT;
@@ -189,7 +189,7 @@ static inline int net_dim_stats_compare(struct dim_stats *curr,
 }
 
 static inline bool net_dim_decision(struct dim_stats *curr_stats,
-				    struct net_dim *dim)
+				    struct dim *dim)
 {
 	int prev_state = dim->tune_state;
 	int prev_ix = dim->profile_ix;
@@ -240,8 +240,8 @@ static inline bool net_dim_decision(struct dim_stats *curr_stats,
 	return dim->profile_ix != prev_ix;
 }
 
-static inline void net_dim(struct net_dim *dim,
-			   struct net_dim_sample end_sample)
+static inline void net_dim(struct dim *dim,
+			   struct dim_sample end_sample)
 {
 	struct dim_stats curr_stats;
 	u16 nevents;
@@ -261,8 +261,8 @@ static inline void net_dim(struct net_dim *dim,
 		}
 		/* fall through */
 	case DIM_START_MEASURE:
-		net_dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr,
-				      end_sample.byte_ctr, &dim->start_sample);
+		dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr,
+				  end_sample.byte_ctr, &dim->start_sample);
 		dim->state = DIM_MEASURE_IN_PROGRESS;
 		break;
 	case DIM_APPLY_NEW_PROFILE:
-- 
cgit v1.2.3


From 4f75da3666c0c572967729a2401ac650be5581b6 Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Thu, 10 Jan 2019 17:33:17 +0200
Subject: linux/dim: Move implementation to .c files

Moved all logic from dim.h and net_dim.h to dim.c and net_dim.c.
This is both more structurally appealing and would allow to only
expose externally used functions.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 MAINTAINERS                                       |   2 +-
 drivers/net/ethernet/broadcom/Kconfig             |   1 +
 drivers/net/ethernet/broadcom/bcmsysport.h        |   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c |   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c     |   2 +-
 drivers/net/ethernet/broadcom/genet/bcmgenet.h    |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig   |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c  |   2 +-
 include/linux/dim.h                               | 319 +++++++++++++++++-----
 include/linux/net_dim.h                           | 273 ------------------
 lib/Kconfig                                       |   8 +
 lib/Makefile                                      |   1 +
 lib/dim/Makefile                                  |   9 +
 lib/dim/dim.c                                     |  74 +++++
 lib/dim/net_dim.c                                 | 190 +++++++++++++
 17 files changed, 547 insertions(+), 345 deletions(-)
 delete mode 100644 include/linux/net_dim.h
 create mode 100644 lib/dim/Makefile
 create mode 100644 lib/dim/dim.c
 create mode 100644 lib/dim/net_dim.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 5d4b852d9d39..f78dd16195e3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5588,8 +5588,8 @@ F:	include/linux/dynamic_debug.h
 DYNAMIC INTERRUPT MODERATION
 M:	Tal Gilboa <talgi@mellanox.com>
 S:	Maintained
-F:	include/linux/net_dim.h
 F:	include/linux/dim.h
+F:	lib/dim/
 
 DZ DECSTATION DZ11 SERIAL DRIVER
 M:	"Maciej W. Rozycki" <macro@linux-mips.org>
diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
index b123509d385f..2e4a8c7237ef 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -8,6 +8,7 @@ config NET_VENDOR_BROADCOM
 	default y
 	depends on (SSB_POSSIBLE && HAS_DMA) || PCI || BCM63XX || \
 		   SIBYTE_SB1xxx_SOC
+	select DIMLIB
 	---help---
 	  If you have a network (Ethernet) chipset belonging to this class,
 	  say Y.
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h
index cbe6d559d964..f6677a02d811 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.h
+++ b/drivers/net/ethernet/broadcom/bcmsysport.h
@@ -14,7 +14,7 @@
 #include <linux/bitmap.h>
 #include <linux/ethtool.h>
 #include <linux/if_vlan.h>
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 
 /* Receive/transmit descriptor format */
 #define DESC_ADDR_HI_STATUS_LEN	0x00
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index a552c5539cc9..54c01705f3bd 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -23,7 +23,7 @@
 #include <net/devlink.h>
 #include <net/dst_metadata.h>
 #include <net/xdp.h>
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 
 struct tx_bd {
 	__le32 tx_bd_len_flags_type;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
index 3d1d53fbb135..61393f351a77 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
@@ -11,7 +11,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include "bnxt_hsi.h"
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 #include "bnxt.h"
 #include "bnxt_debugfs.h"
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
index 11605f9fa61e..6f6576dc417a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
@@ -7,7 +7,7 @@
  * the Free Software Foundation.
  */
 
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 #include "bnxt_hsi.h"
 #include "bnxt.h"
 
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index 6e418d9c3706..b2f05e47dc65 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -16,7 +16,7 @@
 #include <linux/mii.h>
 #include <linux/if_vlan.h>
 #include <linux/phy.h>
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 
 /* total number of Buffer Descriptors, same for Rx/Tx */
 #define TOTAL_DESC				256
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 2391e3cfb56b..7845aa5bf6be 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -34,6 +34,7 @@ config MLX5_CORE_EN
 	depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE
 	depends on IPV6=y || IPV6=n || MLX5_CORE=m
 	select PAGE_POOL
+	select DIMLIB
 	default n
 	---help---
 	  Ethernet support in Mellanox Technologies ConnectX-4 NIC.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 11efd6e4bdc3..abf42d3aabe9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -48,7 +48,7 @@
 #include <linux/rhashtable.h>
 #include <net/switchdev.h>
 #include <net/xdp.h>
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 #include <linux/bits.h>
 #include "wq.h"
 #include "mlx5_core.h"
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
index ba3c1be9f2d3..ca9cfbf57d8f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -30,7 +30,7 @@
  * SOFTWARE.
  */
 
-#include <linux/net_dim.h>
+#include <linux/dim.h>
 #include "en.h"
 
 static void
diff --git a/include/linux/dim.h b/include/linux/dim.h
index 60e5074a7cc0..f48ede3e0322 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -6,20 +6,49 @@
 
 #include <linux/module.h>
 
+/**
+ * Number of events between DIM iterations.
+ * Causes a moderation of the algorithm run.
+ */
 #define DIM_NEVENTS 64
 
-/* more than 10% difference */
+/**
+ * Check whether a difference between values justifies taking an action.
+ * We consider 10% difference as significant.
+ */
 #define IS_SIGNIFICANT_DIFF(val, ref) \
 	(((100UL * abs((val) - (ref))) / (ref)) > 10)
+
+/**
+ * Calculate the gap between two values.
+ * Take wrap-around and variable size into consideration.
+ */
 #define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \
-& (BIT_ULL(bits) - 1))
+		& (BIT_ULL(bits) - 1))
 
+/**
+ * Structure for CQ moderation values.
+ * Used for communications between DIM and its consumer.
+ *
+ * @usec: CQ timer suggestion (by DIM)
+ * @pkts: CQ packet counter suggestion (by DIM)
+ * @cq_period_mode: CQ period count mode (from CQE/EQE)
+ */
 struct dim_cq_moder {
 	u16 usec;
 	u16 pkts;
 	u8 cq_period_mode;
 };
 
+/**
+ * Structure for DIM sample data.
+ * Used for communications between DIM and its consumer.
+ *
+ * @time: Sample timestamp
+ * @pkt_ctr: Number of packets
+ * @byte_ctr: Number of bytes
+ * @event_ctr: Number of events
+ */
 struct dim_sample {
 	ktime_t time;
 	u32 pkt_ctr;
@@ -27,13 +56,36 @@ struct dim_sample {
 	u16 event_ctr;
 };
 
+/**
+ * Structure for DIM stats.
+ * Used for holding current measured rates.
+ *
+ * @ppms: Packets per msec
+ * @bpms: Bytes per msec
+ * @epms: Events per msec
+ */
 struct dim_stats {
-	int ppms; /* packets per msec */
-	int bpms; /* bytes per msec */
-	int epms; /* events per msec */
+	int ppms;
+	int bpms;
+	int epms;
 };
 
-struct dim { /* Dynamic Interrupt Moderation */
+/**
+ * Main structure for dynamic interrupt moderation (DIM).
+ * Used for holding all information about a specific DIM instance.
+ *
+ * @state: Algorithm state (see below)
+ * @prev_stats: Measured rates from previous iteration (for comparison)
+ * @start_sample: Sampled data at start of current iteration
+ * @work: Work to perform on action required
+ * @profile_ix: Current moderation profile
+ * @mode: CQ period count mode
+ * @tune_state: Algorithm tuning state (see below)
+ * @steps_right: Number of steps taken towards higher moderation
+ * @steps_left: Number of steps taken towards lower moderation
+ * @tired: Parking depth counter
+ */
+struct dim {
 	u8 state;
 	struct dim_stats prev_stats;
 	struct dim_sample start_sample;
@@ -46,18 +98,49 @@ struct dim { /* Dynamic Interrupt Moderation */
 	u8 tired;
 };
 
+/**
+ * enum dim_cq_period_mode
+ *
+ * These are the modes for CQ period count.
+ *
+ * @DIM_CQ_PERIOD_MODE_START_FROM_EQE: Start counting from EQE
+ * @DIM_CQ_PERIOD_MODE_START_FROM_CQE: Start counting from CQE (implies timer reset)
+ * @DIM_CQ_PERIOD_NUM_MODES: Number of modes
+ */
 enum {
 	DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
 	DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
 	DIM_CQ_PERIOD_NUM_MODES
 };
 
+/**
+ * enum dim_state
+ *
+ * These are the DIM algorithm states.
+ * These will determine if the algorithm is in a valid state to start an iteration.
+ *
+ * @DIM_START_MEASURE: This is the first iteration (also after applying a new profile)
+ * @DIM_MEASURE_IN_PROGRESS: Algorithm is already in progress - check if
+ * need to perform an action
+ * @DIM_APPLY_NEW_PROFILE: DIM consumer is currently applying a profile - no need to measure
+ */
 enum {
 	DIM_START_MEASURE,
 	DIM_MEASURE_IN_PROGRESS,
 	DIM_APPLY_NEW_PROFILE,
 };
 
+/**
+ * enum dim_tune_state
+ *
+ * These are the DIM algorithm tune states.
+ * These will determine which action the algorithm should perform.
+ *
+ * @DIM_PARKING_ON_TOP: Algorithm found a local top point - exit on significant difference
+ * @DIM_PARKING_TIRED: Algorithm found a deep top point - don't exit if tired > 0
+ * @DIM_GOING_RIGHT: Algorithm is currently trying higher moderation levels
+ * @DIM_GOING_LEFT: Algorithm is currently trying lower moderation levels
+ */
 enum {
 	DIM_PARKING_ON_TOP,
 	DIM_PARKING_TIRED,
@@ -65,63 +148,95 @@ enum {
 	DIM_GOING_LEFT,
 };
 
+/**
+ * enum dim_stats_state
+ *
+ * These are the DIM algorithm statistics states.
+ * These will determine the verdict of current iteration.
+ *
+ * @DIM_STATS_WORSE: Current iteration shows worse performance than before
+ * @DIM_STATS_SAME: Current iteration shows same performance as before
+ * @DIM_STATS_BETTER: Current iteration shows better performance than before
+ */
 enum {
 	DIM_STATS_WORSE,
 	DIM_STATS_SAME,
 	DIM_STATS_BETTER,
 };
 
+/**
+ * enum dim_step_result
+ *
+ * These are the DIM algorithm step results.
+ * These describe the result of a step.
+ *
+ * @DIM_STEPPED: Performed a regular step
+ * @DIM_TOO_TIRED: Same kind of step was done multiple times - should go to
+ * tired parking
+ * @DIM_ON_EDGE: Stepped to the most left/right profile
+ */
 enum {
 	DIM_STEPPED,
 	DIM_TOO_TIRED,
 	DIM_ON_EDGE,
 };
 
-static inline bool dim_on_top(struct dim *dim)
-{
-	switch (dim->tune_state) {
-	case DIM_PARKING_ON_TOP:
-	case DIM_PARKING_TIRED:
-		return true;
-	case DIM_GOING_RIGHT:
-		return (dim->steps_left > 1) && (dim->steps_right == 1);
-	default: /* DIM_GOING_LEFT */
-		return (dim->steps_right > 1) && (dim->steps_left == 1);
-	}
-}
+/**
+ *	dim_on_top - check if current state is a good place to stop (top location)
+ *	@dim: DIM context
+ *
+ * Check if current profile is a good place to park at.
+ * This will result in reducing the DIM checks frequency as we assume we
+ * probably shouldn't change profiles, unless the traffic pattern changes.
+ */
+bool dim_on_top(struct dim *dim);
 
-static inline void dim_turn(struct dim *dim)
-{
-	switch (dim->tune_state) {
-	case DIM_PARKING_ON_TOP:
-	case DIM_PARKING_TIRED:
-		break;
-	case DIM_GOING_RIGHT:
-		dim->tune_state = DIM_GOING_LEFT;
-		dim->steps_left = 0;
-		break;
-	case DIM_GOING_LEFT:
-		dim->tune_state = DIM_GOING_RIGHT;
-		dim->steps_right = 0;
-		break;
-	}
-}
+/**
+ *	dim_turn - change profile altering direction
+ *	@dim: DIM context
+ *
+ * Go left if we were going right and vice-versa.
+ * Do nothing if currently parking.
+ */
+void dim_turn(struct dim *dim);
 
-static inline void dim_park_on_top(struct dim *dim)
-{
-	dim->steps_right  = 0;
-	dim->steps_left   = 0;
-	dim->tired        = 0;
-	dim->tune_state   = DIM_PARKING_ON_TOP;
-}
+/**
+ *	dim_park_on_top - enter a parking state on a top location
+ *	@dim: DIM context
+ *
+ * Enter parking state.
+ * Clear all movement history.
+ */
+void dim_park_on_top(struct dim *dim);
 
-static inline void dim_park_tired(struct dim *dim)
-{
-	dim->steps_right  = 0;
-	dim->steps_left   = 0;
-	dim->tune_state   = DIM_PARKING_TIRED;
-}
+/**
+ *	dim_park_tired - enter a tired parking state
+ *	@dim: DIM context
+ *
+ * Enter parking state.
+ * Clear all movement history and cause DIM checks frequency to reduce.
+ */
+void dim_park_tired(struct dim *dim);
+
+/**
+ *	dim_calc_stats - calculate the difference between two samples
+ *	@start: start sample
+ *	@end: end sample
+ *	@curr_stats: delta between samples
+ *
+ * Calculate the delta between two samples (in data rates).
+ * Takes into consideration counter wrap-around.
+ */
+void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
+		    struct dim_stats *curr_stats);
 
+/**
+ *	dim_update_sample - set a sample's fields with given values
+ *	@event_ctr: number of events to set
+ *	@packets: number of packets to set
+ *	@bytes: number of bytes to set
+ *	@s: DIM sample
+ */
 static inline void
 dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s)
 {
@@ -131,23 +246,99 @@ dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s)
 	s->event_ctr = event_ctr;
 }
 
-static inline void
-dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
-	       struct dim_stats *curr_stats)
-{
-	/* u32 holds up to 71 minutes, should be enough */
-	u32 delta_us = ktime_us_delta(end->time, start->time);
-	u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
-	u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
-			     start->byte_ctr);
-
-	if (!delta_us)
-		return;
-
-	curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
-	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
-	curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC,
-					delta_us);
+/* Net DIM */
+
+/*
+ * Net DIM profiles:
+ *        There is a different set of profiles for each CQ period mode.
+ *        There is a different set of profiles for RX/TX CQs.
+ *        Each profile set must contain NET_DIM_PARAMS_NUM_PROFILES profiles.
+ */
+#define NET_DIM_PARAMS_NUM_PROFILES 5
+#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256
+#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128
+#define NET_DIM_DEF_PROFILE_CQE 1
+#define NET_DIM_DEF_PROFILE_EQE 1
+
+#define NET_DIM_RX_EQE_PROFILES { \
+	{1,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+	{8,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+	{64,  NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+	{128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
+	{256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
 }
 
+#define NET_DIM_RX_CQE_PROFILES { \
+	{2,  256},             \
+	{8,  128},             \
+	{16, 64},              \
+	{32, 64},              \
+	{64, 64}               \
+}
+
+#define NET_DIM_TX_EQE_PROFILES { \
+	{1,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{8,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{32,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{64,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}   \
+}
+
+#define NET_DIM_TX_CQE_PROFILES { \
+	{5,  128},  \
+	{8,  64},  \
+	{16, 32},  \
+	{32, 32},  \
+	{64, 32}   \
+}
+
+static const struct dim_cq_moder
+rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
+	NET_DIM_RX_EQE_PROFILES,
+	NET_DIM_RX_CQE_PROFILES,
+};
+
+static const struct dim_cq_moder
+tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
+	NET_DIM_TX_EQE_PROFILES,
+	NET_DIM_TX_CQE_PROFILES,
+};
+
+/**
+ *	net_dim_get_rx_moderation - provide a CQ moderation object for the given RX profile
+ *	@cq_period_mode: CQ period mode
+ *	@ix: Profile index
+ */
+struct dim_cq_moder net_dim_get_rx_moderation(u8 cq_period_mode, int ix);
+
+/**
+ *	net_dim_get_def_rx_moderation - provide the default RX moderation
+ *	@cq_period_mode: CQ period mode
+ */
+struct dim_cq_moder net_dim_get_def_rx_moderation(u8 cq_period_mode);
+
+/**
+ *	net_dim_get_tx_moderation - provide a CQ moderation object for the given TX profile
+ *	@cq_period_mode: CQ period mode
+ *	@ix: Profile index
+ */
+struct dim_cq_moder net_dim_get_tx_moderation(u8 cq_period_mode, int ix);
+
+/**
+ *	net_dim_get_def_tx_moderation - provide the default TX moderation
+ *	@cq_period_mode: CQ period mode
+ */
+struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode);
+
+/**
+ *	net_dim - main DIM algorithm entry point
+ *	@dim: DIM instance information
+ *	@end_sample: Current data measurement
+ *
+ * Called by the consumer.
+ * This is the main logic of the algorithm, where data is processed in order to decide on next
+ * required action.
+ */
+void net_dim(struct dim *dim, struct dim_sample end_sample);
+
 #endif /* DIM_H */
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
deleted file mode 100644
index 4e009ec193ef..000000000000
--- a/include/linux/net_dim.h
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
- * Copyright (c) 2017-2018, Broadcom Limited. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef NET_DIM_H
-#define NET_DIM_H
-
-#include <linux/module.h>
-#include <linux/dim.h>
-
-#define NET_DIM_PARAMS_NUM_PROFILES 5
-/* Netdev dynamic interrupt moderation profiles */
-#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256
-#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128
-#define NET_DIM_DEF_PROFILE_CQE 1
-#define NET_DIM_DEF_PROFILE_EQE 1
-
-/* All profiles sizes must be NET_PARAMS_DIM_NUM_PROFILES */
-#define NET_DIM_RX_EQE_PROFILES { \
-	{1,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-	{8,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-	{64,  NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-	{128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-	{256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
-}
-
-#define NET_DIM_RX_CQE_PROFILES { \
-	{2,  256},             \
-	{8,  128},             \
-	{16, 64},              \
-	{32, 64},              \
-	{64, 64}               \
-}
-
-#define NET_DIM_TX_EQE_PROFILES { \
-	{1,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
-	{8,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
-	{32,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
-	{64,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
-	{128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}   \
-}
-
-#define NET_DIM_TX_CQE_PROFILES { \
-	{5,  128},  \
-	{8,  64},  \
-	{16, 32},  \
-	{32, 32},  \
-	{64, 32}   \
-}
-
-static const struct dim_cq_moder
-rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
-	NET_DIM_RX_EQE_PROFILES,
-	NET_DIM_RX_CQE_PROFILES,
-};
-
-static const struct dim_cq_moder
-tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
-	NET_DIM_TX_EQE_PROFILES,
-	NET_DIM_TX_CQE_PROFILES,
-};
-
-static inline struct dim_cq_moder
-net_dim_get_rx_moderation(u8 cq_period_mode, int ix)
-{
-	struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix];
-
-	cq_moder.cq_period_mode = cq_period_mode;
-	return cq_moder;
-}
-
-static inline struct dim_cq_moder
-net_dim_get_def_rx_moderation(u8 cq_period_mode)
-{
-	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
-			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
-
-	return net_dim_get_rx_moderation(cq_period_mode, profile_ix);
-}
-
-static inline struct dim_cq_moder
-net_dim_get_tx_moderation(u8 cq_period_mode, int ix)
-{
-	struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix];
-
-	cq_moder.cq_period_mode = cq_period_mode;
-	return cq_moder;
-}
-
-static inline struct dim_cq_moder
-net_dim_get_def_tx_moderation(u8 cq_period_mode)
-{
-	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
-			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
-
-	return net_dim_get_tx_moderation(cq_period_mode, profile_ix);
-}
-
-static inline int net_dim_step(struct dim *dim)
-{
-	if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2))
-		return DIM_TOO_TIRED;
-
-	switch (dim->tune_state) {
-	case DIM_PARKING_ON_TOP:
-	case DIM_PARKING_TIRED:
-		break;
-	case DIM_GOING_RIGHT:
-		if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1))
-			return DIM_ON_EDGE;
-		dim->profile_ix++;
-		dim->steps_right++;
-		break;
-	case DIM_GOING_LEFT:
-		if (dim->profile_ix == 0)
-			return DIM_ON_EDGE;
-		dim->profile_ix--;
-		dim->steps_left++;
-		break;
-	}
-
-	dim->tired++;
-	return DIM_STEPPED;
-}
-
-static inline void net_dim_exit_parking(struct dim *dim)
-{
-	dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT :
-					  DIM_GOING_RIGHT;
-	net_dim_step(dim);
-}
-
-static inline int net_dim_stats_compare(struct dim_stats *curr,
-					struct dim_stats *prev)
-{
-	if (!prev->bpms)
-		return curr->bpms ? DIM_STATS_BETTER :
-				    DIM_STATS_SAME;
-
-	if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms))
-		return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER :
-						   DIM_STATS_WORSE;
-
-	if (!prev->ppms)
-		return curr->ppms ? DIM_STATS_BETTER :
-				    DIM_STATS_SAME;
-
-	if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms))
-		return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER :
-						   DIM_STATS_WORSE;
-
-	if (!prev->epms)
-		return DIM_STATS_SAME;
-
-	if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms))
-		return (curr->epms < prev->epms) ? DIM_STATS_BETTER :
-						   DIM_STATS_WORSE;
-
-	return DIM_STATS_SAME;
-}
-
-static inline bool net_dim_decision(struct dim_stats *curr_stats,
-				    struct dim *dim)
-{
-	int prev_state = dim->tune_state;
-	int prev_ix = dim->profile_ix;
-	int stats_res;
-	int step_res;
-
-	switch (dim->tune_state) {
-	case DIM_PARKING_ON_TOP:
-		stats_res = net_dim_stats_compare(curr_stats, &dim->prev_stats);
-		if (stats_res != DIM_STATS_SAME)
-			net_dim_exit_parking(dim);
-		break;
-
-	case DIM_PARKING_TIRED:
-		dim->tired--;
-		if (!dim->tired)
-			net_dim_exit_parking(dim);
-		break;
-
-	case DIM_GOING_RIGHT:
-	case DIM_GOING_LEFT:
-		stats_res = net_dim_stats_compare(curr_stats, &dim->prev_stats);
-		if (stats_res != DIM_STATS_BETTER)
-			dim_turn(dim);
-
-		if (dim_on_top(dim)) {
-			dim_park_on_top(dim);
-			break;
-		}
-
-		step_res = net_dim_step(dim);
-		switch (step_res) {
-		case DIM_ON_EDGE:
-			dim_park_on_top(dim);
-			break;
-		case DIM_TOO_TIRED:
-			dim_park_tired(dim);
-			break;
-		}
-
-		break;
-	}
-
-	if (prev_state != DIM_PARKING_ON_TOP ||
-	    dim->tune_state != DIM_PARKING_ON_TOP)
-		dim->prev_stats = *curr_stats;
-
-	return dim->profile_ix != prev_ix;
-}
-
-static inline void net_dim(struct dim *dim,
-			   struct dim_sample end_sample)
-{
-	struct dim_stats curr_stats;
-	u16 nevents;
-
-	switch (dim->state) {
-	case DIM_MEASURE_IN_PROGRESS:
-		nevents = BIT_GAP(BITS_PER_TYPE(u16),
-				  end_sample.event_ctr,
-				  dim->start_sample.event_ctr);
-		if (nevents < DIM_NEVENTS)
-			break;
-		dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats);
-		if (net_dim_decision(&curr_stats, dim)) {
-			dim->state = DIM_APPLY_NEW_PROFILE;
-			schedule_work(&dim->work);
-			break;
-		}
-		/* fall through */
-	case DIM_START_MEASURE:
-		dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr,
-				  end_sample.byte_ctr, &dim->start_sample);
-		dim->state = DIM_MEASURE_IN_PROGRESS;
-		break;
-	case DIM_APPLY_NEW_PROFILE:
-		break;
-	}
-}
-
-#endif /* NET_DIM_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 90623a0e1942..78ddb9526b62 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -562,6 +562,14 @@ config SIGNATURE
 	  Digital signature verification. Currently only RSA is supported.
 	  Implementation is done using GnuPG MPI library
 
+config DIMLIB
+	bool "DIM library"
+	default y
+	help
+	  Dynamic Interrupt Moderation library.
+	  Implements an algorithm for dynamically changing CQ moderation values
+	  according to run time performance.
+
 #
 # libfdt files, only selected if needed.
 #
diff --git a/lib/Makefile b/lib/Makefile
index fb7697031a79..dcb558c7554d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -202,6 +202,7 @@ obj-$(CONFIG_GLOB) += glob.o
 obj-$(CONFIG_GLOB_SELFTEST) += globtest.o
 
 obj-$(CONFIG_MPILIB) += mpi/
+obj-$(CONFIG_DIMLIB) += dim/
 obj-$(CONFIG_SIGNATURE) += digsig.o
 
 lib-$(CONFIG_CLZ_TAB) += clz_tab.o
diff --git a/lib/dim/Makefile b/lib/dim/Makefile
new file mode 100644
index 000000000000..160afe288df0
--- /dev/null
+++ b/lib/dim/Makefile
@@ -0,0 +1,9 @@
+#
+# DIM Dynamic Interrupt Moderation library
+#
+
+obj-$(CONFIG_DIMLIB) = net_dim.o
+
+net_dim-y = \
+	dim.o		\
+	net_dim.o
diff --git a/lib/dim/dim.c b/lib/dim/dim.c
new file mode 100644
index 000000000000..17d5236759bd
--- /dev/null
+++ b/lib/dim/dim.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <linux/dim.h>
+
+bool dim_on_top(struct dim *dim)
+{
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
+		return true;
+	case DIM_GOING_RIGHT:
+		return (dim->steps_left > 1) && (dim->steps_right == 1);
+	default: /* DIM_GOING_LEFT */
+		return (dim->steps_right > 1) && (dim->steps_left == 1);
+	}
+}
+EXPORT_SYMBOL(dim_on_top);
+
+void dim_turn(struct dim *dim)
+{
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
+		break;
+	case DIM_GOING_RIGHT:
+		dim->tune_state = DIM_GOING_LEFT;
+		dim->steps_left = 0;
+		break;
+	case DIM_GOING_LEFT:
+		dim->tune_state = DIM_GOING_RIGHT;
+		dim->steps_right = 0;
+		break;
+	}
+}
+EXPORT_SYMBOL(dim_turn);
+
+void dim_park_on_top(struct dim *dim)
+{
+	dim->steps_right  = 0;
+	dim->steps_left   = 0;
+	dim->tired        = 0;
+	dim->tune_state   = DIM_PARKING_ON_TOP;
+}
+EXPORT_SYMBOL(dim_park_on_top);
+
+void dim_park_tired(struct dim *dim)
+{
+	dim->steps_right  = 0;
+	dim->steps_left   = 0;
+	dim->tune_state   = DIM_PARKING_TIRED;
+}
+EXPORT_SYMBOL(dim_park_tired);
+
+void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
+		    struct dim_stats *curr_stats)
+{
+	/* u32 holds up to 71 minutes, should be enough */
+	u32 delta_us = ktime_us_delta(end->time, start->time);
+	u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
+	u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
+			     start->byte_ctr);
+
+	if (!delta_us)
+		return;
+
+	curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
+	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
+	curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC,
+					delta_us);
+}
+EXPORT_SYMBOL(dim_calc_stats);
diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c
new file mode 100644
index 000000000000..5bcc902c5388
--- /dev/null
+++ b/lib/dim/net_dim.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <linux/dim.h>
+
+struct dim_cq_moder
+net_dim_get_rx_moderation(u8 cq_period_mode, int ix)
+{
+	struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix];
+
+	cq_moder.cq_period_mode = cq_period_mode;
+	return cq_moder;
+}
+EXPORT_SYMBOL(net_dim_get_rx_moderation);
+
+struct dim_cq_moder
+net_dim_get_def_rx_moderation(u8 cq_period_mode)
+{
+	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
+			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
+
+	return net_dim_get_rx_moderation(cq_period_mode, profile_ix);
+}
+EXPORT_SYMBOL(net_dim_get_def_rx_moderation);
+
+struct dim_cq_moder
+net_dim_get_tx_moderation(u8 cq_period_mode, int ix)
+{
+	struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix];
+
+	cq_moder.cq_period_mode = cq_period_mode;
+	return cq_moder;
+}
+EXPORT_SYMBOL(net_dim_get_tx_moderation);
+
+struct dim_cq_moder
+net_dim_get_def_tx_moderation(u8 cq_period_mode)
+{
+	u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
+			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
+
+	return net_dim_get_tx_moderation(cq_period_mode, profile_ix);
+}
+EXPORT_SYMBOL(net_dim_get_def_tx_moderation);
+
+static int net_dim_step(struct dim *dim)
+{
+	if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2))
+		return DIM_TOO_TIRED;
+
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
+		break;
+	case DIM_GOING_RIGHT:
+		if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1))
+			return DIM_ON_EDGE;
+		dim->profile_ix++;
+		dim->steps_right++;
+		break;
+	case DIM_GOING_LEFT:
+		if (dim->profile_ix == 0)
+			return DIM_ON_EDGE;
+		dim->profile_ix--;
+		dim->steps_left++;
+		break;
+	}
+
+	dim->tired++;
+	return DIM_STEPPED;
+}
+
+static void net_dim_exit_parking(struct dim *dim)
+{
+	dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT : DIM_GOING_RIGHT;
+	net_dim_step(dim);
+}
+
+static int net_dim_stats_compare(struct dim_stats *curr,
+				 struct dim_stats *prev)
+{
+	if (!prev->bpms)
+		return curr->bpms ? DIM_STATS_BETTER : DIM_STATS_SAME;
+
+	if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms))
+		return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER :
+						   DIM_STATS_WORSE;
+
+	if (!prev->ppms)
+		return curr->ppms ? DIM_STATS_BETTER :
+				    DIM_STATS_SAME;
+
+	if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms))
+		return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER :
+						   DIM_STATS_WORSE;
+
+	if (!prev->epms)
+		return DIM_STATS_SAME;
+
+	if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms))
+		return (curr->epms < prev->epms) ? DIM_STATS_BETTER :
+						   DIM_STATS_WORSE;
+
+	return DIM_STATS_SAME;
+}
+
+static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim)
+{
+	int prev_state = dim->tune_state;
+	int prev_ix = dim->profile_ix;
+	int stats_res;
+	int step_res;
+
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+		stats_res = net_dim_stats_compare(curr_stats,
+						  &dim->prev_stats);
+		if (stats_res != DIM_STATS_SAME)
+			net_dim_exit_parking(dim);
+		break;
+
+	case DIM_PARKING_TIRED:
+		dim->tired--;
+		if (!dim->tired)
+			net_dim_exit_parking(dim);
+		break;
+
+	case DIM_GOING_RIGHT:
+	case DIM_GOING_LEFT:
+		stats_res = net_dim_stats_compare(curr_stats,
+						  &dim->prev_stats);
+		if (stats_res != DIM_STATS_BETTER)
+			dim_turn(dim);
+
+		if (dim_on_top(dim)) {
+			dim_park_on_top(dim);
+			break;
+		}
+
+		step_res = net_dim_step(dim);
+		switch (step_res) {
+		case DIM_ON_EDGE:
+			dim_park_on_top(dim);
+			break;
+		case DIM_TOO_TIRED:
+			dim_park_tired(dim);
+			break;
+		}
+
+		break;
+	}
+
+	if (prev_state != DIM_PARKING_ON_TOP ||
+	    dim->tune_state != DIM_PARKING_ON_TOP)
+		dim->prev_stats = *curr_stats;
+
+	return dim->profile_ix != prev_ix;
+}
+
+void net_dim(struct dim *dim, struct dim_sample end_sample)
+{
+	struct dim_stats curr_stats;
+	u16 nevents;
+
+	switch (dim->state) {
+	case DIM_MEASURE_IN_PROGRESS:
+		nevents = BIT_GAP(BITS_PER_TYPE(u16),
+				  end_sample.event_ctr,
+				  dim->start_sample.event_ctr);
+		if (nevents < DIM_NEVENTS)
+			break;
+		dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats);
+		if (net_dim_decision(&curr_stats, dim)) {
+			dim->state = DIM_APPLY_NEW_PROFILE;
+			schedule_work(&dim->work);
+			break;
+		}
+		/* fall through */
+	case DIM_START_MEASURE:
+		dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr,
+				  end_sample.byte_ctr, &dim->start_sample);
+		dim->state = DIM_MEASURE_IN_PROGRESS;
+		break;
+	case DIM_APPLY_NEW_PROFILE:
+		break;
+	}
+}
+EXPORT_SYMBOL(net_dim);
-- 
cgit v1.2.3


From 398c2b05bbee21cc172dfff017c0351d4d14e04c Mon Sep 17 00:00:00 2001
From: Yamin Friedman <yaminf@mellanox.com>
Date: Thu, 22 Nov 2018 09:51:17 +0200
Subject: linux/dim: Add completions count to dim_sample

Added a measurement of completions per msec to allow for completion-based
DIM algorithms.

In order to use dynamic interrupt moderation with RDMA we need to have a
different measurement than packets per second. This change is meant to
prepare for adding a new DIM method.

All drivers that use net_dim and thus do not need a completion count will
have the completions set to 0.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/dim.h | 28 +++++++++++++++++++++++++---
 lib/dim/dim.c       |  9 +++++++++
 2 files changed, 34 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/dim.h b/include/linux/dim.h
index f48ede3e0322..aa9bdd47a648 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -37,6 +37,7 @@
 struct dim_cq_moder {
 	u16 usec;
 	u16 pkts;
+	u16 comps;
 	u8 cq_period_mode;
 };
 
@@ -54,6 +55,7 @@ struct dim_sample {
 	u32 pkt_ctr;
 	u32 byte_ctr;
 	u16 event_ctr;
+	u32 comp_ctr;
 };
 
 /**
@@ -65,9 +67,11 @@ struct dim_sample {
  * @epms: Events per msec
  */
 struct dim_stats {
-	int ppms;
-	int bpms;
-	int epms;
+	int ppms; /* packets per msec */
+	int bpms; /* bytes per msec */
+	int epms; /* events per msec */
+	int cpms; /* completions per msec */
+	int cpe_ratio; /* ratio of completions to events */
 };
 
 /**
@@ -89,6 +93,7 @@ struct dim {
 	u8 state;
 	struct dim_stats prev_stats;
 	struct dim_sample start_sample;
+	struct dim_sample measuring_sample;
 	struct work_struct work;
 	u8 profile_ix;
 	u8 mode;
@@ -246,6 +251,23 @@ dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s)
 	s->event_ctr = event_ctr;
 }
 
+/**
+ *	dim_update_sample_with_comps - set a sample's fields with given
+ *	values including the completion parameter
+ *	@event_ctr: number of events to set
+ *	@packets: number of packets to set
+ *	@bytes: number of bytes to set
+ *	@comps: number of completions to set
+ *	@s: DIM sample
+ */
+static inline void
+dim_update_sample_with_comps(u16 event_ctr, u64 packets, u64 bytes, u64 comps,
+			     struct dim_sample *s)
+{
+	dim_update_sample(event_ctr, packets, bytes, s);
+	s->comp_ctr = comps;
+}
+
 /* Net DIM */
 
 /*
diff --git a/lib/dim/dim.c b/lib/dim/dim.c
index 17d5236759bd..439d641ec796 100644
--- a/lib/dim/dim.c
+++ b/lib/dim/dim.c
@@ -62,6 +62,8 @@ void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
 	u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
 	u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
 			     start->byte_ctr);
+	u32 ncomps = BIT_GAP(BITS_PER_TYPE(u32), end->comp_ctr,
+			     start->comp_ctr);
 
 	if (!delta_us)
 		return;
@@ -70,5 +72,12 @@ void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
 	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
 	curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC,
 					delta_us);
+	curr_stats->cpms = DIV_ROUND_UP(ncomps * USEC_PER_MSEC, delta_us);
+	if (curr_stats->epms != 0)
+		curr_stats->cpe_ratio =
+				(curr_stats->cpms * 100) / curr_stats->epms;
+	else
+		curr_stats->cpe_ratio = 0;
+
 }
 EXPORT_SYMBOL(dim_calc_stats);
-- 
cgit v1.2.3


From 4ae4916b56435d1d5066616120f9ff907bd96b86 Mon Sep 17 00:00:00 2001
From: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Date: Tue, 28 May 2019 10:59:15 -0700
Subject: i40e: fix 'Unknown bps' in dmesg for 2.5Gb/5Gb speeds

This patch fixes 'NIC Link is Up, Unknown bps' message in dmesg
for 2.5Gb/5Gb speeds. This problem is fixed by adding constants
for VIRTCHNL_LINK_SPEED_2_5GB and VIRTCHNL_LINK_SPEED_5GB cases
in the i40e_virtchnl_link_speed() function.

Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_prototype.h | 4 ++++
 include/linux/avf/virtchnl.h                     | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index 882627073dce..eac88bcc6c06 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -350,6 +350,10 @@ i40e_virtchnl_link_speed(enum i40e_aq_link_speed link_speed)
 		return VIRTCHNL_LINK_SPEED_100MB;
 	case I40E_LINK_SPEED_1GB:
 		return VIRTCHNL_LINK_SPEED_1GB;
+	case I40E_LINK_SPEED_2_5GB:
+		return VIRTCHNL_LINK_SPEED_2_5GB;
+	case I40E_LINK_SPEED_5GB:
+		return VIRTCHNL_LINK_SPEED_5GB;
 	case I40E_LINK_SPEED_10GB:
 		return VIRTCHNL_LINK_SPEED_10GB;
 	case I40E_LINK_SPEED_40GB:
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 191621ff7594..ca956b672ac0 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -61,12 +61,14 @@ enum virtchnl_status_code {
 #define VIRTCHNL_ERR_PARAM VIRTCHNL_STATUS_ERR_PARAM
 #define VIRTCHNL_STATUS_NOT_SUPPORTED VIRTCHNL_STATUS_ERR_NOT_SUPPORTED
 
+#define VIRTCHNL_LINK_SPEED_2_5GB_SHIFT		0x0
 #define VIRTCHNL_LINK_SPEED_100MB_SHIFT		0x1
 #define VIRTCHNL_LINK_SPEED_1000MB_SHIFT	0x2
 #define VIRTCHNL_LINK_SPEED_10GB_SHIFT		0x3
 #define VIRTCHNL_LINK_SPEED_40GB_SHIFT		0x4
 #define VIRTCHNL_LINK_SPEED_20GB_SHIFT		0x5
 #define VIRTCHNL_LINK_SPEED_25GB_SHIFT		0x6
+#define VIRTCHNL_LINK_SPEED_5GB_SHIFT		0x7
 
 enum virtchnl_link_speed {
 	VIRTCHNL_LINK_SPEED_UNKNOWN	= 0,
@@ -76,6 +78,8 @@ enum virtchnl_link_speed {
 	VIRTCHNL_LINK_SPEED_40GB	= BIT(VIRTCHNL_LINK_SPEED_40GB_SHIFT),
 	VIRTCHNL_LINK_SPEED_20GB	= BIT(VIRTCHNL_LINK_SPEED_20GB_SHIFT),
 	VIRTCHNL_LINK_SPEED_25GB	= BIT(VIRTCHNL_LINK_SPEED_25GB_SHIFT),
+	VIRTCHNL_LINK_SPEED_2_5GB	= BIT(VIRTCHNL_LINK_SPEED_2_5GB_SHIFT),
+	VIRTCHNL_LINK_SPEED_5GB		= BIT(VIRTCHNL_LINK_SPEED_5GB_SHIFT),
 };
 
 /* for hsplit_0 field of Rx HMC context */
-- 
cgit v1.2.3


From 65c0f2c1663649217455a73d48b1c303f133180a Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@mellanox.com>
Date: Tue, 25 Jun 2019 17:47:50 +0000
Subject: net/mlx5: Introduce vport metadata matching bits and enum constants

When a dual-port VHCA sends a RoCE packet on its non-native port, and
the packet arrives at its affiliated vport FDB, a mismatch might occur
on the rules that match the packet source vport. So we replace the
match on source port with the match on metadata that was configured in
ingress ACL, and that metadata will be passed further also to the NIC
RX table of the eswitch manager.

Introduce vport metadata matching bits and enum constants as a pre-step
towards metadata matching.
    o metadata type C registers in the misc parameters 2 fields.
    o esw_uplink_ingress_acl bit in esw cap. If it is set, the device supports
      ingress ACL for the uplink vport.
    o fdb_to_vport_reg_* bits in flow table cap and esw vport context, to
      support propagating the metadata to the nic rx through the loopback
      path.
    o flow_source in flow context, to indicate the known origin of packets.
    o enum constants, to support the above bits.

Signed-off-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Eli Britstein <elibr@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 56 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index e3c154b573a2..d4409654f760 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -528,7 +528,21 @@ struct mlx5_ifc_fte_match_set_misc2_bits {
 
 	struct mlx5_ifc_fte_match_mpls_bits outer_first_mpls_over_udp;
 
-	u8         reserved_at_80[0x100];
+	u8         metadata_reg_c_7[0x20];
+
+	u8         metadata_reg_c_6[0x20];
+
+	u8         metadata_reg_c_5[0x20];
+
+	u8         metadata_reg_c_4[0x20];
+
+	u8         metadata_reg_c_3[0x20];
+
+	u8         metadata_reg_c_2[0x20];
+
+	u8         metadata_reg_c_1[0x20];
+
+	u8         metadata_reg_c_0[0x20];
 
 	u8         metadata_reg_a[0x20];
 
@@ -636,8 +650,22 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
 	u8         reserved_at_e00[0x7200];
 };
 
+enum {
+	MLX5_FDB_TO_VPORT_REG_C_0 = 0x01,
+	MLX5_FDB_TO_VPORT_REG_C_1 = 0x02,
+	MLX5_FDB_TO_VPORT_REG_C_2 = 0x04,
+	MLX5_FDB_TO_VPORT_REG_C_3 = 0x08,
+	MLX5_FDB_TO_VPORT_REG_C_4 = 0x10,
+	MLX5_FDB_TO_VPORT_REG_C_5 = 0x20,
+	MLX5_FDB_TO_VPORT_REG_C_6 = 0x40,
+	MLX5_FDB_TO_VPORT_REG_C_7 = 0x80,
+};
+
 struct mlx5_ifc_flow_table_eswitch_cap_bits {
-	u8      reserved_at_0[0x1a];
+	u8      fdb_to_vport_reg_c_id[0x8];
+	u8      reserved_at_8[0xf];
+	u8      flow_source[0x1];
+	u8      reserved_at_18[0x2];
 	u8      multi_fdb_encap[0x1];
 	u8      reserved_at_1b[0x1];
 	u8      fdb_multi_path_to_table[0x1];
@@ -665,7 +693,9 @@ struct mlx5_ifc_e_switch_cap_bits {
 	u8         vport_svlan_insert[0x1];
 	u8         vport_cvlan_insert_if_not_exist[0x1];
 	u8         vport_cvlan_insert_overwrite[0x1];
-	u8         reserved_at_5[0x14];
+	u8         reserved_at_5[0x3];
+	u8         esw_uplink_ingress_acl[0x1];
+	u8         reserved_at_9[0x10];
 	u8         esw_functions_changed[0x1];
 	u8         reserved_at_1a[0x1];
 	u8         ecpf_vport_exists[0x1];
@@ -2555,6 +2585,12 @@ enum {
 	MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2 = 0x800,
 };
 
+enum {
+	MLX5_FLOW_CONTEXT_FLOW_SOURCE_ANY_VPORT         = 0x0,
+	MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK            = 0x1,
+	MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT       = 0x2,
+};
+
 struct mlx5_ifc_vlan_bits {
 	u8         ethtype[0x10];
 	u8         prio[0x3];
@@ -2574,7 +2610,9 @@ struct mlx5_ifc_flow_context_bits {
 	u8         action[0x10];
 
 	u8         extended_destination[0x1];
-	u8         reserved_at_80[0x7];
+	u8         reserved_at_81[0x1];
+	u8         flow_source[0x2];
+	u8         reserved_at_84[0x4];
 	u8         destination_list_size[0x18];
 
 	u8         reserved_at_a0[0x8];
@@ -3099,12 +3137,14 @@ struct mlx5_ifc_hca_vport_context_bits {
 };
 
 struct mlx5_ifc_esw_vport_context_bits {
-	u8         reserved_at_0[0x3];
+	u8         fdb_to_vport_reg_c[0x1];
+	u8         reserved_at_1[0x2];
 	u8         vport_svlan_strip[0x1];
 	u8         vport_cvlan_strip[0x1];
 	u8         vport_svlan_insert[0x1];
 	u8         vport_cvlan_insert[0x2];
-	u8         reserved_at_8[0x18];
+	u8         fdb_to_vport_reg_c_id[0x8];
+	u8         reserved_at_10[0x10];
 
 	u8         reserved_at_20[0x20];
 
@@ -4985,7 +5025,8 @@ struct mlx5_ifc_modify_esw_vport_context_out_bits {
 };
 
 struct mlx5_ifc_esw_vport_context_fields_select_bits {
-	u8         reserved_at_0[0x1c];
+	u8         reserved_at_0[0x1b];
+	u8         fdb_to_vport_reg_c_id[0x1];
 	u8         vport_cvlan_insert[0x1];
 	u8         vport_svlan_insert[0x1];
 	u8         vport_cvlan_strip[0x1];
@@ -5182,6 +5223,7 @@ enum {
 	MLX5_ACTION_IN_FIELD_OUT_DIPV4         = 0x16,
 	MLX5_ACTION_IN_FIELD_OUT_FIRST_VID     = 0x17,
 	MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT = 0x47,
+	MLX5_ACTION_IN_FIELD_METADATA_REG_C_0  = 0x51,
 };
 
 struct mlx5_ifc_alloc_modify_header_context_out_bits {
-- 
cgit v1.2.3


From bb0ee7dcc4ecd6af39823b80ae3995ddc119c373 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@mellanox.com>
Date: Tue, 25 Jun 2019 17:47:58 +0000
Subject: net/mlx5: Add flow context for flow tag

Refactor the flow data structures, add new flow_context and move
flow_tag into it, as flow_tag doesn't belong to the rule action.

Signed-off-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/flow.c                  | 13 +++++----
 drivers/infiniband/hw/mlx5/main.c                  | 30 ++++++++++++-------
 drivers/infiniband/hw/mlx5/mlx5_ib.h               |  1 +
 .../mellanox/mlx5/core/diag/fs_tracepoint.h        |  2 +-
 .../ethernet/mellanox/mlx5/core/en_fs_ethtool.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  7 +++--
 .../net/ethernet/mellanox/mlx5/core/fpga/ipsec.c   |  8 +++--
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c   |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 34 +++++++++++-----------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |  1 +
 include/linux/mlx5/fs.h                            | 15 +++++++---
 11 files changed, 71 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c
index 1fc302d41a53..b8841355fcd5 100644
--- a/drivers/infiniband/hw/mlx5/flow.c
+++ b/drivers/infiniband/hw/mlx5/flow.c
@@ -65,11 +65,12 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = {
 static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
 	struct uverbs_attr_bundle *attrs)
 {
-	struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG};
+	struct mlx5_flow_context flow_context = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG};
 	struct mlx5_ib_flow_handler *flow_handler;
 	struct mlx5_ib_flow_matcher *fs_matcher;
 	struct ib_uobject **arr_flow_actions;
 	struct ib_uflow_resources *uflow_res;
+	struct mlx5_flow_act flow_act = {};
 	void *devx_obj;
 	int dest_id, dest_type;
 	void *cmd_in;
@@ -172,17 +173,19 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
 				   arr_flow_actions[i]->object);
 	}
 
-	ret = uverbs_copy_from(&flow_act.flow_tag, attrs,
+	ret = uverbs_copy_from(&flow_context.flow_tag, attrs,
 			       MLX5_IB_ATTR_CREATE_FLOW_TAG);
 	if (!ret) {
-		if (flow_act.flow_tag >= BIT(24)) {
+		if (flow_context.flow_tag >= BIT(24)) {
 			ret = -EINVAL;
 			goto err_out;
 		}
-		flow_act.flags |= FLOW_ACT_HAS_TAG;
+		flow_context.flags |= FLOW_CONTEXT_HAS_TAG;
 	}
 
-	flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act,
+	flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher,
+					       &flow_context,
+					       &flow_act,
 					       counter_id,
 					       cmd_in, inlen,
 					       dest_id, dest_type);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index abac70ad5c7c..be4c9a687df7 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2666,11 +2666,15 @@ int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
 	}
 }
 
-static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
-			   u32 *match_v, const union ib_flow_spec *ib_spec,
+static int parse_flow_attr(struct mlx5_core_dev *mdev,
+			   struct mlx5_flow_spec *spec,
+			   const union ib_flow_spec *ib_spec,
 			   const struct ib_flow_attr *flow_attr,
 			   struct mlx5_flow_act *action, u32 prev_type)
 {
+	struct mlx5_flow_context *flow_context = &spec->flow_context;
+	u32 *match_c = spec->match_criteria;
+	u32 *match_v = spec->match_value;
 	void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
 					   misc_parameters);
 	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
@@ -2989,8 +2993,8 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
 		if (ib_spec->flow_tag.tag_id >= BIT(24))
 			return -EINVAL;
 
-		action->flow_tag = ib_spec->flow_tag.tag_id;
-		action->flags |= FLOW_ACT_HAS_TAG;
+		flow_context->flow_tag = ib_spec->flow_tag.tag_id;
+		flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
 		break;
 	case IB_FLOW_SPEC_ACTION_DROP:
 		if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
@@ -3084,7 +3088,8 @@ is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
 		return VALID_SPEC_NA;
 
 	return is_crypto && is_ipsec &&
-		(!egress || (!is_drop && !(flow_act->flags & FLOW_ACT_HAS_TAG))) ?
+		(!egress || (!is_drop &&
+			     !(spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG))) ?
 		VALID_SPEC_VALID : VALID_SPEC_INVALID;
 }
 
@@ -3473,7 +3478,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
 {
 	struct mlx5_flow_table	*ft = ft_prio->flow_table;
 	struct mlx5_ib_flow_handler *handler;
-	struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG};
+	struct mlx5_flow_act flow_act = {};
 	struct mlx5_flow_spec *spec;
 	struct mlx5_flow_destination dest_arr[2] = {};
 	struct mlx5_flow_destination *rule_dst = dest_arr;
@@ -3504,8 +3509,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
 	}
 
 	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
-		err = parse_flow_attr(dev->mdev, spec->match_criteria,
-				      spec->match_value,
+		err = parse_flow_attr(dev->mdev, spec,
 				      ib_flow, flow_attr, &flow_act,
 				      prev_type);
 		if (err < 0)
@@ -3572,11 +3576,11 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
 					MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
 	}
 
-	if ((flow_act.flags & FLOW_ACT_HAS_TAG)  &&
+	if ((spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG)  &&
 	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
 	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
 		mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
-			     flow_act.flow_tag, flow_attr->type);
+			     spec->flow_context.flow_tag, flow_attr->type);
 		err = -EINVAL;
 		goto free;
 	}
@@ -3947,6 +3951,7 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev,
 		      struct mlx5_ib_flow_prio *ft_prio,
 		      struct mlx5_flow_destination *dst,
 		      struct mlx5_ib_flow_matcher  *fs_matcher,
+		      struct mlx5_flow_context *flow_context,
 		      struct mlx5_flow_act *flow_act,
 		      void *cmd_in, int inlen,
 		      int dst_num)
@@ -3969,6 +3974,7 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev,
 	memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params,
 	       fs_matcher->mask_len);
 	spec->match_criteria_enable = fs_matcher->match_criteria_enable;
+	spec->flow_context = *flow_context;
 
 	handler->rule = mlx5_add_flow_rules(ft, spec,
 					    flow_act, dst, dst_num);
@@ -4033,6 +4039,7 @@ static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher,
 struct mlx5_ib_flow_handler *
 mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
 			struct mlx5_ib_flow_matcher *fs_matcher,
+			struct mlx5_flow_context *flow_context,
 			struct mlx5_flow_act *flow_act,
 			u32 counter_id,
 			void *cmd_in, int inlen, int dest_id,
@@ -4085,7 +4092,8 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
 		dst_num++;
 	}
 
-	handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, flow_act,
+	handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher,
+					flow_context, flow_act,
 					cmd_in, inlen, dst_num);
 
 	if (IS_ERR(handler)) {
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index a043af7ee366..1c205c2bd486 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1317,6 +1317,7 @@ extern const struct uapi_definition mlx5_ib_devx_defs[];
 extern const struct uapi_definition mlx5_ib_flow_defs[];
 struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add(
 	struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher,
+	struct mlx5_flow_context *flow_context,
 	struct mlx5_flow_act *flow_act, u32 counter_id,
 	void *cmd_in, int inlen, int dest_id, int dest_type);
 bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
index a4cf123e3f17..9ec46edf22a6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
@@ -204,7 +204,7 @@ TRACE_EVENT(mlx5_fs_set_fte,
 			   __entry->index = fte->index;
 			   __entry->action = fte->action.action;
 			   __entry->mask_enable = __entry->fg->mask.match_criteria_enable;
-			   __entry->flow_tag = fte->action.flow_tag;
+			   __entry->flow_tag = fte->flow_context.flow_tag;
 			   memcpy(__entry->mask_outer,
 				  MLX5_ADDR_OF(fte_match_param,
 					       &__entry->fg->mask.match_criteria,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index 4421c10f58ae..839662644ed3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -426,7 +426,7 @@ add_ethtool_flow_rule(struct mlx5e_priv *priv,
 	}
 
 	spec->match_criteria_enable = (!outer_header_zero(spec->match_criteria));
-	flow_act.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
+	spec->flow_context.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
 	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dst, dst ? 1 : 0);
 	if (IS_ERR(rule)) {
 		err = PTR_ERR(rule);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 122f457091a2..8ff1ca46d8d3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -716,19 +716,22 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 		      struct mlx5e_tc_flow *flow,
 		      struct netlink_ext_ack *extack)
 {
+	struct mlx5_flow_context *flow_context = &parse_attr->spec.flow_context;
 	struct mlx5_nic_flow_attr *attr = flow->nic_attr;
 	struct mlx5_core_dev *dev = priv->mdev;
 	struct mlx5_flow_destination dest[2] = {};
 	struct mlx5_flow_act flow_act = {
 		.action = attr->action,
-		.flow_tag = attr->flow_tag,
 		.reformat_id = 0,
-		.flags    = FLOW_ACT_HAS_TAG | FLOW_ACT_NO_APPEND,
+		.flags    = FLOW_ACT_NO_APPEND,
 	};
 	struct mlx5_fc *counter = NULL;
 	bool table_created = false;
 	int err, dest_ix = 0;
 
+	flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
+	flow_context->flow_tag = attr->flow_tag;
+
 	if (flow->flags & MLX5E_TC_FLOW_HAIRPIN) {
 		err = mlx5e_hairpin_flow_add(priv, flow, parse_attr, extack);
 		if (err) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
index 52c47d3dd5a5..c76da309506b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
@@ -636,7 +636,8 @@ static bool mlx5_is_fpga_egress_ipsec_rule(struct mlx5_core_dev *dev,
 					   u8 match_criteria_enable,
 					   const u32 *match_c,
 					   const u32 *match_v,
-					   struct mlx5_flow_act *flow_act)
+					   struct mlx5_flow_act *flow_act,
+					   struct mlx5_flow_context *flow_context)
 {
 	const void *outer_c = MLX5_ADDR_OF(fte_match_param, match_c,
 					   outer_headers);
@@ -655,7 +656,7 @@ static bool mlx5_is_fpga_egress_ipsec_rule(struct mlx5_core_dev *dev,
 	    (match_criteria_enable &
 	     ~(MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS)) ||
 	    (flow_act->action & ~(MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | MLX5_FLOW_CONTEXT_ACTION_ALLOW)) ||
-	     (flow_act->flags & FLOW_ACT_HAS_TAG))
+	     (flow_context->flags & FLOW_CONTEXT_HAS_TAG))
 		return false;
 
 	return true;
@@ -767,7 +768,8 @@ mlx5_fpga_ipsec_fs_create_sa_ctx(struct mlx5_core_dev *mdev,
 					    fg->mask.match_criteria_enable,
 					    fg->mask.match_criteria,
 					    fte->val,
-					    &fte->action))
+					    &fte->action,
+					    &fte->flow_context))
 		return ERR_PTR(-EINVAL);
 	else if (!mlx5_is_fpga_ipsec_rule(mdev,
 					  fg->mask.match_criteria_enable,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 4f1d402926f1..fb1335a433ae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -396,7 +396,8 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 	in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context);
 	MLX5_SET(flow_context, in_flow_context, group_id, group_id);
 
-	MLX5_SET(flow_context, in_flow_context, flow_tag, fte->action.flow_tag);
+	MLX5_SET(flow_context, in_flow_context, flow_tag,
+		 fte->flow_context.flow_tag);
 	MLX5_SET(flow_context, in_flow_context, extended_destination,
 		 extended_dest);
 	if (extended_dest) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index fb5b61727ee7..9f5544ac6b8a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -584,7 +584,7 @@ err_ida_remove:
 }
 
 static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft,
-				u32 *match_value,
+				struct mlx5_flow_spec *spec,
 				struct mlx5_flow_act *flow_act)
 {
 	struct mlx5_flow_steering *steering = get_steering(&ft->node);
@@ -594,9 +594,10 @@ static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft,
 	if (!fte)
 		return ERR_PTR(-ENOMEM);
 
-	memcpy(fte->val, match_value, sizeof(fte->val));
+	memcpy(fte->val, &spec->match_value, sizeof(fte->val));
 	fte->node.type =  FS_TYPE_FLOW_ENTRY;
 	fte->action = *flow_act;
+	fte->flow_context = spec->flow_context;
 
 	tree_init_node(&fte->node, NULL, del_sw_fte);
 
@@ -1428,7 +1429,9 @@ static bool check_conflicting_actions(u32 action1, u32 action2)
 	return false;
 }
 
-static int check_conflicting_ftes(struct fs_fte *fte, const struct mlx5_flow_act *flow_act)
+static int check_conflicting_ftes(struct fs_fte *fte,
+				  const struct mlx5_flow_context *flow_context,
+				  const struct mlx5_flow_act *flow_act)
 {
 	if (check_conflicting_actions(flow_act->action, fte->action.action)) {
 		mlx5_core_warn(get_dev(&fte->node),
@@ -1436,12 +1439,12 @@ static int check_conflicting_ftes(struct fs_fte *fte, const struct mlx5_flow_act
 		return -EEXIST;
 	}
 
-	if ((flow_act->flags & FLOW_ACT_HAS_TAG) &&
-	    fte->action.flow_tag != flow_act->flow_tag) {
+	if ((flow_context->flags & FLOW_CONTEXT_HAS_TAG) &&
+	    fte->flow_context.flow_tag != flow_context->flow_tag) {
 		mlx5_core_warn(get_dev(&fte->node),
 			       "FTE flow tag %u already exists with different flow tag %u\n",
-			       fte->action.flow_tag,
-			       flow_act->flow_tag);
+			       fte->flow_context.flow_tag,
+			       flow_context->flow_tag);
 		return -EEXIST;
 	}
 
@@ -1449,7 +1452,7 @@ static int check_conflicting_ftes(struct fs_fte *fte, const struct mlx5_flow_act
 }
 
 static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
-					    u32 *match_value,
+					    struct mlx5_flow_spec *spec,
 					    struct mlx5_flow_act *flow_act,
 					    struct mlx5_flow_destination *dest,
 					    int dest_num,
@@ -1460,7 +1463,7 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 	int i;
 	int ret;
 
-	ret = check_conflicting_ftes(fte, flow_act);
+	ret = check_conflicting_ftes(fte, &spec->flow_context, flow_act);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -1635,7 +1638,7 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft,
 	u64  version;
 	int err;
 
-	fte = alloc_fte(ft, spec->match_value, flow_act);
+	fte = alloc_fte(ft, spec, flow_act);
 	if (IS_ERR(fte))
 		return  ERR_PTR(-ENOMEM);
 
@@ -1651,8 +1654,7 @@ search_again_locked:
 		fte_tmp = lookup_fte_locked(g, spec->match_value, take_write);
 		if (!fte_tmp)
 			continue;
-		rule = add_rule_fg(g, spec->match_value,
-				   flow_act, dest, dest_num, fte_tmp);
+		rule = add_rule_fg(g, spec, flow_act, dest, dest_num, fte_tmp);
 		up_write_ref_node(&fte_tmp->node, false);
 		tree_put_node(&fte_tmp->node, false);
 		kmem_cache_free(steering->ftes_cache, fte);
@@ -1699,8 +1701,7 @@ skip_search:
 
 		nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
 		up_write_ref_node(&g->node, false);
-		rule = add_rule_fg(g, spec->match_value,
-				   flow_act, dest, dest_num, fte);
+		rule = add_rule_fg(g, spec, flow_act, dest, dest_num, fte);
 		up_write_ref_node(&fte->node, false);
 		tree_put_node(&fte->node, false);
 		return rule;
@@ -1786,7 +1787,7 @@ search_again_locked:
 	if (err)
 		goto err_release_fg;
 
-	fte = alloc_fte(ft, spec->match_value, flow_act);
+	fte = alloc_fte(ft, spec, flow_act);
 	if (IS_ERR(fte)) {
 		err = PTR_ERR(fte);
 		goto err_release_fg;
@@ -1800,8 +1801,7 @@ search_again_locked:
 
 	nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
 	up_write_ref_node(&g->node, false);
-	rule = add_rule_fg(g, spec->match_value, flow_act, dest,
-			   dest_num, fte);
+	rule = add_rule_fg(g, spec, flow_act, dest, dest_num, fte);
 	up_write_ref_node(&fte->node, false);
 	tree_put_node(&fte->node, false);
 	tree_put_node(&g->node, false);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index a08c3d09a50f..c48c382f926f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -170,6 +170,7 @@ struct fs_fte {
 	u32				val[MLX5_ST_SZ_DW_MATCH_PARAM];
 	u32				dests_size;
 	u32				index;
+	struct mlx5_flow_context	flow_context;
 	struct mlx5_flow_act		action;
 	enum fs_fte_status		status;
 	struct mlx5_fc			*counter;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 2ddaa97f2179..9bf49ce218fa 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -88,10 +88,20 @@ struct mlx5_flow_group;
 struct mlx5_flow_namespace;
 struct mlx5_flow_handle;
 
+enum {
+	FLOW_CONTEXT_HAS_TAG = BIT(0),
+};
+
+struct mlx5_flow_context {
+	u32 flags;
+	u32 flow_tag;
+};
+
 struct mlx5_flow_spec {
 	u8   match_criteria_enable;
 	u32  match_criteria[MLX5_ST_SZ_DW(fte_match_param)];
 	u32  match_value[MLX5_ST_SZ_DW(fte_match_param)];
+	struct mlx5_flow_context flow_context;
 };
 
 enum {
@@ -173,13 +183,11 @@ struct mlx5_fs_vlan {
 #define MLX5_FS_VLAN_DEPTH	2
 
 enum {
-	FLOW_ACT_HAS_TAG   = BIT(0),
-	FLOW_ACT_NO_APPEND = BIT(1),
+	FLOW_ACT_NO_APPEND = BIT(0),
 };
 
 struct mlx5_flow_act {
 	u32 action;
-	u32 flow_tag;
 	u32 reformat_id;
 	u32 modify_id;
 	uintptr_t esp_id;
@@ -190,7 +198,6 @@ struct mlx5_flow_act {
 
 #define MLX5_DECLARE_FLOW_ACT(name) \
 	struct mlx5_flow_act name = { .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,\
-				      .flow_tag = MLX5_FS_DEFAULT_FLOW_TAG, \
 				      .reformat_id = 0, \
 				      .modify_id = 0, \
 				      .flags =  0, }
-- 
cgit v1.2.3


From 7445cfb1169cebf8f79763acf65f85d850850461 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@mellanox.com>
Date: Tue, 25 Jun 2019 17:48:00 +0000
Subject: net/mlx5: E-Switch, Tag packet with vport number in VF vports and
 uplink ingress ACLs

When a dual-port VHCA sends a RoCE packet on its non-native port, and the
packet arrives at its affiliated vport FDB, a mismatch might occur on the
rules that match the packet source vport, because in this case the source
vport is not represented by a single VHCA only. So we change to match on
metadata instead of the source vport.
To do that, a rule is created in all vports and uplink ingress ACLs, to
save the source vport number and vhca id in the packet's metadata in order
to match on it later.
The metadata register used is the first of the 32-bit type C registers. It
can be used for matching and header modify operations. The upper 16 bits
of this register hold the vhca id, and the lower 16 bits hold the vport
number.
This change is not for dual-port RoCE only. If HW and FW allow, the vport
metadata matching is enabled by default.

Signed-off-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Eli Britstein <elibr@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |   9 ++
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 180 ++++++++++++++++-----
 include/linux/mlx5/eswitch.h                       |  17 ++
 4 files changed, 172 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index a42a23e505df..1235fd84ae3a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1168,6 +1168,8 @@ void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw,
 
 	vport->ingress.drop_rule = NULL;
 	vport->ingress.allow_rule = NULL;
+
+	esw_vport_del_ingress_acl_modify_metadata(esw, vport);
 }
 
 void esw_vport_disable_ingress_acl(struct mlx5_eswitch *esw,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 99dc25630629..51e71b824abf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -68,6 +68,8 @@ struct vport_ingress {
 	struct mlx5_flow_group *allow_spoofchk_only_grp;
 	struct mlx5_flow_group *allow_untagged_only_grp;
 	struct mlx5_flow_group *drop_grp;
+	int modify_metadata_id;
+	struct mlx5_flow_handle  *modify_metadata_rule;
 	struct mlx5_flow_handle  *allow_rule;
 	struct mlx5_flow_handle  *drop_rule;
 	struct mlx5_fc           *drop_counter;
@@ -196,6 +198,10 @@ struct mlx5_esw_functions {
 	u16			num_vfs;
 };
 
+enum {
+	MLX5_ESWITCH_VPORT_MATCH_METADATA = BIT(0),
+};
+
 struct mlx5_eswitch {
 	struct mlx5_core_dev    *dev;
 	struct mlx5_nb          nb;
@@ -203,6 +209,7 @@ struct mlx5_eswitch {
 	struct hlist_head       mc_table[MLX5_L2_ADDR_HASH_SIZE];
 	struct workqueue_struct *work_queue;
 	struct mlx5_vport       *vports;
+	u32 flags;
 	int                     total_vports;
 	int                     enabled_vports;
 	/* Synchronize between vport change events
@@ -240,6 +247,8 @@ void esw_vport_disable_egress_acl(struct mlx5_eswitch *esw,
 				  struct mlx5_vport *vport);
 void esw_vport_disable_ingress_acl(struct mlx5_eswitch *esw,
 				   struct mlx5_vport *vport);
+void esw_vport_del_ingress_acl_modify_metadata(struct mlx5_eswitch *esw,
+					       struct mlx5_vport *vport);
 
 /* E-Switch API */
 int mlx5_eswitch_init(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index c1c42c1370b8..4bcbc872cd08 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1555,32 +1555,16 @@ static void esw_offloads_devcom_cleanup(struct mlx5_eswitch *esw)
 static int esw_vport_ingress_prio_tag_config(struct mlx5_eswitch *esw,
 					     struct mlx5_vport *vport)
 {
-	struct mlx5_core_dev *dev = esw->dev;
 	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_spec *spec;
 	int err = 0;
 
 	/* For prio tag mode, there is only 1 FTEs:
-	 * 1) Untagged packets - push prio tag VLAN, allow
+	 * 1) Untagged packets - push prio tag VLAN and modify metadata if
+	 * required, allow
 	 * Unmatched traffic is allowed by default
 	 */
 
-	if (!MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support))
-		return -EOPNOTSUPP;
-
-	esw_vport_cleanup_ingress_rules(esw, vport);
-
-	err = esw_vport_enable_ingress_acl(esw, vport);
-	if (err) {
-		mlx5_core_warn(esw->dev,
-			       "failed to enable prio tag ingress acl (%d) on vport[%d]\n",
-			       err, vport->vport);
-		return err;
-	}
-
-	esw_debug(esw->dev,
-		  "vport[%d] configure ingress rules\n", vport->vport);
-
 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
 	if (!spec) {
 		err = -ENOMEM;
@@ -1596,6 +1580,12 @@ static int esw_vport_ingress_prio_tag_config(struct mlx5_eswitch *esw,
 	flow_act.vlan[0].ethtype = ETH_P_8021Q;
 	flow_act.vlan[0].vid = 0;
 	flow_act.vlan[0].prio = 0;
+
+	if (vport->ingress.modify_metadata_rule) {
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+		flow_act.modify_id = vport->ingress.modify_metadata_id;
+	}
+
 	vport->ingress.allow_rule =
 		mlx5_add_flow_rules(vport->ingress.acl, spec,
 				    &flow_act, NULL, 0);
@@ -1616,6 +1606,58 @@ out_no_mem:
 	return err;
 }
 
+static int esw_vport_add_ingress_acl_modify_metadata(struct mlx5_eswitch *esw,
+						     struct mlx5_vport *vport)
+{
+	u8 action[MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)] = {};
+	struct mlx5_flow_act flow_act = {};
+	struct mlx5_flow_spec spec = {};
+	int err = 0;
+
+	MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET);
+	MLX5_SET(set_action_in, action, field, MLX5_ACTION_IN_FIELD_METADATA_REG_C_0);
+	MLX5_SET(set_action_in, action, data,
+		 mlx5_eswitch_get_vport_metadata_for_match(esw, vport->vport));
+
+	err = mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS,
+				       1, action, &vport->ingress.modify_metadata_id);
+	if (err) {
+		esw_warn(esw->dev,
+			 "failed to alloc modify header for vport %d ingress acl (%d)\n",
+			 vport->vport, err);
+		return err;
+	}
+
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | MLX5_FLOW_CONTEXT_ACTION_ALLOW;
+	flow_act.modify_id = vport->ingress.modify_metadata_id;
+	vport->ingress.modify_metadata_rule = mlx5_add_flow_rules(vport->ingress.acl,
+								  &spec, &flow_act, NULL, 0);
+	if (IS_ERR(vport->ingress.modify_metadata_rule)) {
+		err = PTR_ERR(vport->ingress.modify_metadata_rule);
+		esw_warn(esw->dev,
+			 "failed to add setting metadata rule for vport %d ingress acl, err(%d)\n",
+			 vport->vport, err);
+		vport->ingress.modify_metadata_rule = NULL;
+		goto out;
+	}
+
+out:
+	if (err)
+		mlx5_modify_header_dealloc(esw->dev, vport->ingress.modify_metadata_id);
+	return err;
+}
+
+void esw_vport_del_ingress_acl_modify_metadata(struct mlx5_eswitch *esw,
+					       struct mlx5_vport *vport)
+{
+	if (vport->ingress.modify_metadata_rule) {
+		mlx5_del_flow_rules(vport->ingress.modify_metadata_rule);
+		mlx5_modify_header_dealloc(esw->dev, vport->ingress.modify_metadata_id);
+
+		vport->ingress.modify_metadata_rule = NULL;
+	}
+}
+
 static int esw_vport_egress_prio_tag_config(struct mlx5_eswitch *esw,
 					    struct mlx5_vport *vport)
 {
@@ -1623,6 +1665,9 @@ static int esw_vport_egress_prio_tag_config(struct mlx5_eswitch *esw,
 	struct mlx5_flow_spec *spec;
 	int err = 0;
 
+	if (!MLX5_CAP_GEN(esw->dev, prio_tag_required))
+		return 0;
+
 	/* For prio tag mode, there is only 1 FTEs:
 	 * 1) prio tag packets - pop the prio tag VLAN, allow
 	 * Unmatched traffic is allowed by default
@@ -1676,27 +1721,75 @@ out_no_mem:
 	return err;
 }
 
-static int esw_prio_tag_acls_config(struct mlx5_eswitch *esw, int nvports)
+static int esw_vport_ingress_common_config(struct mlx5_eswitch *esw,
+					   struct mlx5_vport *vport)
 {
-	struct mlx5_vport *vport = NULL;
-	int i, j;
 	int err;
 
-	mlx5_esw_for_each_vf_vport(esw, i, vport, nvports) {
+	if (!mlx5_eswitch_vport_match_metadata_enabled(esw) &&
+	    !MLX5_CAP_GEN(esw->dev, prio_tag_required))
+		return 0;
+
+	esw_vport_cleanup_ingress_rules(esw, vport);
+
+	err = esw_vport_enable_ingress_acl(esw, vport);
+	if (err) {
+		esw_warn(esw->dev,
+			 "failed to enable ingress acl (%d) on vport[%d]\n",
+			 err, vport->vport);
+		return err;
+	}
+
+	esw_debug(esw->dev,
+		  "vport[%d] configure ingress rules\n", vport->vport);
+
+	if (mlx5_eswitch_vport_match_metadata_enabled(esw)) {
+		err = esw_vport_add_ingress_acl_modify_metadata(esw, vport);
+		if (err)
+			goto out;
+	}
+
+	if (MLX5_CAP_GEN(esw->dev, prio_tag_required) &&
+	    mlx5_eswitch_is_vf_vport(esw, vport->vport)) {
 		err = esw_vport_ingress_prio_tag_config(esw, vport);
 		if (err)
-			goto err_ingress;
-		err = esw_vport_egress_prio_tag_config(esw, vport);
+			goto out;
+	}
+
+out:
+	if (err)
+		esw_vport_disable_ingress_acl(esw, vport);
+	return err;
+}
+
+static int esw_create_offloads_acl_tables(struct mlx5_eswitch *esw)
+{
+	struct mlx5_vport *vport;
+	int i, j;
+	int err;
+
+	mlx5_esw_for_all_vports(esw, i, vport) {
+		err = esw_vport_ingress_common_config(esw, vport);
 		if (err)
-			goto err_egress;
+			goto err_ingress;
+
+		if (mlx5_eswitch_is_vf_vport(esw, vport->vport)) {
+			err = esw_vport_egress_prio_tag_config(esw, vport);
+			if (err)
+				goto err_egress;
+		}
 	}
 
+	if (mlx5_eswitch_vport_match_metadata_enabled(esw))
+		esw_info(esw->dev, "Use metadata reg_c as source vport to match\n");
+
 	return 0;
 
 err_egress:
 	esw_vport_disable_ingress_acl(esw, vport);
 err_ingress:
-	mlx5_esw_for_each_vf_vport_reverse(esw, j, vport, i - 1) {
+	for (j = MLX5_VPORT_PF; j < i; j++) {
+		vport = &esw->vports[j];
 		esw_vport_disable_egress_acl(esw, vport);
 		esw_vport_disable_ingress_acl(esw, vport);
 	}
@@ -1704,15 +1797,17 @@ err_ingress:
 	return err;
 }
 
-static void esw_prio_tag_acls_cleanup(struct mlx5_eswitch *esw)
+static void esw_destroy_offloads_acl_tables(struct mlx5_eswitch *esw)
 {
 	struct mlx5_vport *vport;
 	int i;
 
-	mlx5_esw_for_each_vf_vport(esw, i, vport, esw->nvports) {
+	mlx5_esw_for_all_vports(esw, i, vport) {
 		esw_vport_disable_egress_acl(esw, vport);
 		esw_vport_disable_ingress_acl(esw, vport);
 	}
+
+	esw->flags &= ~MLX5_ESWITCH_VPORT_MATCH_METADATA;
 }
 
 static int esw_offloads_steering_init(struct mlx5_eswitch *esw, int nvports)
@@ -1722,15 +1817,13 @@ static int esw_offloads_steering_init(struct mlx5_eswitch *esw, int nvports)
 	memset(&esw->fdb_table.offloads, 0, sizeof(struct offloads_fdb));
 	mutex_init(&esw->fdb_table.offloads.fdb_prio_lock);
 
-	if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) {
-		err = esw_prio_tag_acls_config(esw, nvports);
-		if (err)
-			return err;
-	}
+	err = esw_create_offloads_acl_tables(esw);
+	if (err)
+		return err;
 
 	err = esw_create_offloads_fdb_tables(esw, nvports);
 	if (err)
-		return err;
+		goto create_fdb_err;
 
 	err = esw_create_offloads_table(esw, nvports);
 	if (err)
@@ -1748,6 +1841,9 @@ create_fg_err:
 create_ft_err:
 	esw_destroy_offloads_fdb_tables(esw);
 
+create_fdb_err:
+	esw_destroy_offloads_acl_tables(esw);
+
 	return err;
 }
 
@@ -1756,8 +1852,7 @@ static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw)
 	esw_destroy_vport_rx_group(esw);
 	esw_destroy_offloads_table(esw);
 	esw_destroy_offloads_fdb_tables(esw);
-	if (MLX5_CAP_GEN(esw->dev, prio_tag_required))
-		esw_prio_tag_acls_cleanup(esw);
+	esw_destroy_offloads_acl_tables(esw);
 }
 
 static void esw_functions_changed_event_handler(struct work_struct *work)
@@ -2296,3 +2391,16 @@ bool mlx5_eswitch_is_vf_vport(const struct mlx5_eswitch *esw, u16 vport_num)
 	return vport_num >= MLX5_VPORT_FIRST_VF &&
 	       vport_num <= esw->dev->priv.sriov.max_vfs;
 }
+
+bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw)
+{
+	return !!(esw->flags & MLX5_ESWITCH_VPORT_MATCH_METADATA);
+}
+EXPORT_SYMBOL(mlx5_eswitch_vport_match_metadata_enabled);
+
+u32 mlx5_eswitch_get_vport_metadata_for_match(const struct mlx5_eswitch *esw,
+					      u16 vport_num)
+{
+	return ((MLX5_CAP_GEN(esw->dev, vhca_id) & 0xffff) << 16) | vport_num;
+}
+EXPORT_SYMBOL(mlx5_eswitch_get_vport_metadata_for_match);
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 174eec0871d9..aece3ae1902d 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -67,11 +67,28 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw,
 #ifdef CONFIG_MLX5_ESWITCH
 enum devlink_eswitch_encap_mode
 mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev);
+
+bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw);
+u32 mlx5_eswitch_get_vport_metadata_for_match(const struct mlx5_eswitch *esw,
+					      u16 vport_num);
 #else  /* CONFIG_MLX5_ESWITCH */
 static inline enum devlink_eswitch_encap_mode
 mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev)
 {
 	return DEVLINK_ESWITCH_ENCAP_MODE_NONE;
 }
+
+static inline bool
+mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw)
+{
+	return false;
+};
+
+static inline u32
+mlx5_eswitch_get_vport_metadata_for_match(const struct mlx5_eswitch *esw,
+					  int vport_num)
+{
+	return 0;
+};
 #endif /* CONFIG_MLX5_ESWITCH */
 #endif
-- 
cgit v1.2.3


From 8d212ff057f8b81ed6ed418874b54ded3bf97ad4 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@mellanox.com>
Date: Tue, 25 Jun 2019 17:48:02 +0000
Subject: net/mlx5e: Specifying known origin of packets matching the flow

In vport metadata matching, the source port number is replaced by metadata.
Since FW has no idea what the metadata contains, a syndrome will occur.
Specify a known origin to avoid the syndrome.
However, there is no functional change because ANY_VPORT (0) is filled
in flow_source, the same default value as before, as a pre-step towards
metadata matching for fast path.
Two other values can be filled in flow_source: with 0x1, packets matching
this rule are from the uplink, while with 0x2 they are from other local
vports.

Signed-off-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h | 2 ++
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c             | 3 +++
 include/linux/mlx5/fs.h                                      | 1 +
 3 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
index 9ec46edf22a6..ddf1b87f1bc0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
@@ -187,6 +187,7 @@ TRACE_EVENT(mlx5_fs_set_fte,
 		__field(u32, index)
 		__field(u32, action)
 		__field(u32, flow_tag)
+		__field(u32, flow_source)
 		__field(u8,  mask_enable)
 		__field(int, new_fte)
 		__array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
@@ -205,6 +206,7 @@ TRACE_EVENT(mlx5_fs_set_fte,
 			   __entry->action = fte->action.action;
 			   __entry->mask_enable = __entry->fg->mask.match_criteria_enable;
 			   __entry->flow_tag = fte->flow_context.flow_tag;
+			   __entry->flow_source = fte->flow_context.flow_source;
 			   memcpy(__entry->mask_outer,
 				  MLX5_ADDR_OF(fte_match_param,
 					       &__entry->fg->mask.match_criteria,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index fb1335a433ae..7ac1249eadc3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -398,6 +398,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 
 	MLX5_SET(flow_context, in_flow_context, flow_tag,
 		 fte->flow_context.flow_tag);
+	MLX5_SET(flow_context, in_flow_context, flow_source,
+		 fte->flow_context.flow_source);
+
 	MLX5_SET(flow_context, in_flow_context, extended_destination,
 		 extended_dest);
 	if (extended_dest) {
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 9bf49ce218fa..dc7e7aa53a13 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -95,6 +95,7 @@ enum {
 struct mlx5_flow_context {
 	u32 flags;
 	u32 flow_tag;
+	u32 flow_source;
 };
 
 struct mlx5_flow_spec {
-- 
cgit v1.2.3


From 96125bf9985a75db00496dd2bc9249b777d2b19b Mon Sep 17 00:00:00 2001
From: Dave Taht <dave.taht@gmail.com>
Date: Sat, 22 Jun 2019 10:07:34 -0700
Subject: Allow 0.0.0.0/8 as a valid address range
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The longstanding prohibition against using 0.0.0.0/8 dates back
to two issues with the early internet.

There was an interoperability problem with BSD 4.2 in 1984, fixed in
BSD 4.3 in 1986. BSD 4.2 has long since been retired.

Secondly, addresses of the form 0.x.y.z were initially defined only as
a source address in an ICMP datagram, indicating "node number x.y.z on
this IPv4 network", by nodes that know their address on their local
network, but do not yet know their network prefix, in RFC0792 (page
19).  This usage of 0.x.y.z was later repealed in RFC1122 (section
3.2.2.7), because the original ICMP-based mechanism for learning the
network prefix was unworkable on many networks such as Ethernet (which
have longer addresses that would not fit into the 24 "node number"
bits).  Modern networks use reverse ARP (RFC0903) or BOOTP (RFC0951)
or DHCP (RFC2131) to find their full 32-bit address and CIDR netmask
(and other parameters such as default gateways). 0.x.y.z has had
16,777,215 addresses in 0.0.0.0/8 space left unused and reserved for
future use, since 1989.

This patch allows for these 16m new IPv4 addresses to appear within
a box or on the wire. Layer 2 switches don't care.

0.0.0.0/32 is still prohibited, of course.

Signed-off-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: John Gilmore <gnu@toad.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/in.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/in.h b/include/linux/in.h
index 4d2fedfb753a..1873ef642605 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -63,7 +63,7 @@ static inline bool ipv4_is_all_snoopers(__be32 addr)
 
 static inline bool ipv4_is_zeronet(__be32 addr)
 {
-	return (addr & htonl(0xff000000)) == htonl(0x00000000);
+	return (addr == 0);
 }
 
 /* Special-Use IPv4 Addresses (RFC3330) */
-- 
cgit v1.2.3


From 61caf3d109f5411a7f5b433f1eb73ead7e0789fa Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Tue, 11 Jun 2019 22:58:40 +0200
Subject: batman-adv: mcast: detect, distribute and maintain multicast router
 presence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To be able to apply our group aware multicast optimizations to packets
with a scope greater than link-local we need to not only keep track of
multicast listeners but also multicast routers.

With this patch a node detects the presence of multicast routers on
its segment by checking if
/proc/sys/net/ipv{4,6}/conf/<bat0|br0(bat)>/mc_forwarding is set for one
thing. This option is enabled by multicast routing daemons and needed
for the kernel's multicast routing tables to receive and route packets.

For another thing if a bridge is configured on top of bat0 then the
presence of an IPv6 multicast router behind this bridge is currently
detected by checking for an IPv6 multicast "All Routers Address"
(ff02::2). This should later be replaced by querying the bridge, which
performs proper, RFC4286 compliant Multicast Router Discovery (our
simplified approach includes more hosts than necessary, most notably
not just multicast routers but also unicast ones and is not applicable
for IPv4).

If no multicast router is detected then this is signalized via the new
BATADV_MCAST_WANT_NO_RTR4 and BATADV_MCAST_WANT_NO_RTR6
multicast tvlv flags.

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batadv_packet.h |   8 +
 net/batman-adv/multicast.c         | 412 ++++++++++++++++++++++++++++++++-----
 net/batman-adv/originator.c        |   4 +-
 net/batman-adv/types.h             |  29 +++
 4 files changed, 399 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h
index 4ebc2135e950..2a15f01c2243 100644
--- a/include/uapi/linux/batadv_packet.h
+++ b/include/uapi/linux/batadv_packet.h
@@ -107,12 +107,20 @@ enum batadv_icmp_packettype {
  * @BATADV_MCAST_WANT_ALL_UNSNOOPABLES: we want all packets destined for
  *  224.0.0.0/24 or ff02::1
  * @BATADV_MCAST_WANT_ALL_IPV4: we want all IPv4 multicast packets
+ *  (both link-local and routable ones)
  * @BATADV_MCAST_WANT_ALL_IPV6: we want all IPv6 multicast packets
+ *  (both link-local and routable ones)
+ * @BATADV_MCAST_WANT_NO_RTR4: we have no IPv4 multicast router and therefore
+ * only need routable IPv4 multicast packets we signed up for explicitly
+ * @BATADV_MCAST_WANT_NO_RTR6: we have no IPv6 multicast router and therefore
+ * only need routable IPv6 multicast packets we signed up for explicitly
  */
 enum batadv_mcast_flags {
 	BATADV_MCAST_WANT_ALL_UNSNOOPABLES	= 1UL << 0,
 	BATADV_MCAST_WANT_ALL_IPV4		= 1UL << 1,
 	BATADV_MCAST_WANT_ALL_IPV6		= 1UL << 2,
+	BATADV_MCAST_WANT_NO_RTR4		= 1UL << 3,
+	BATADV_MCAST_WANT_NO_RTR6		= 1UL << 4,
 };
 
 /* tt data subtypes */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index d4e7474022e3..80d5f3c892cb 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -73,26 +73,200 @@ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_mcast_has_bridge() - check whether the soft-iface is bridged
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_mcast_get_bridge() - get the bridge on top of the softif if it exists
+ * @soft_iface: netdev struct of the mesh interface
  *
- * Checks whether there is a bridge on top of our soft interface.
+ * If the given soft interface has a bridge on top then the refcount
+ * of the according net device is increased.
  *
- * Return: true if there is a bridge, false otherwise.
+ * Return: NULL if no such bridge exists. Otherwise the net device of the
+ * bridge.
  */
-static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv)
+static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface)
 {
-	struct net_device *upper = bat_priv->soft_iface;
+	struct net_device *upper = soft_iface;
 
 	rcu_read_lock();
 	do {
 		upper = netdev_master_upper_dev_get_rcu(upper);
 	} while (upper && !(upper->priv_flags & IFF_EBRIDGE));
+
+	if (upper)
+		dev_hold(upper);
 	rcu_read_unlock();
 
 	return upper;
 }
 
+/**
+ * batadv_mcast_mla_rtr_flags_softif_get_ipv4() - get mcast router flags from
+ *  node for IPv4
+ * @dev: the interface to check
+ *
+ * Checks the presence of an IPv4 multicast router on this node.
+ *
+ * Caller needs to hold rcu read lock.
+ *
+ * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise.
+ */
+static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv4(struct net_device *dev)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+	if (in_dev && IN_DEV_MFORWARD(in_dev))
+		return BATADV_NO_FLAGS;
+	else
+		return BATADV_MCAST_WANT_NO_RTR4;
+}
+
+/**
+ * batadv_mcast_mla_rtr_flags_softif_get_ipv6() - get mcast router flags from
+ *  node for IPv6
+ * @dev: the interface to check
+ *
+ * Checks the presence of an IPv6 multicast router on this node.
+ *
+ * Caller needs to hold rcu read lock.
+ *
+ * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise.
+ */
+#if IS_ENABLED(CONFIG_IPV6_MROUTE)
+static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev)
+{
+	struct inet6_dev *in6_dev = __in6_dev_get(dev);
+
+	if (in6_dev && in6_dev->cnf.mc_forwarding)
+		return BATADV_NO_FLAGS;
+	else
+		return BATADV_MCAST_WANT_NO_RTR6;
+}
+#else
+static inline u8
+batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev)
+{
+	return BATADV_MCAST_WANT_NO_RTR6;
+}
+#endif
+
+/**
+ * batadv_mcast_mla_rtr_flags_softif_get() - get mcast router flags from node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @bridge: bridge interface on top of the soft_iface if present,
+ *  otherwise pass NULL
+ *
+ * Checks the presence of IPv4 and IPv6 multicast routers on this
+ * node.
+ *
+ * Return:
+ *	BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present
+ *	BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present
+ *	BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present
+ *	The former two OR'd: no multicast router is present
+ */
+static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv,
+						struct net_device *bridge)
+{
+	struct net_device *dev = bridge ? bridge : bat_priv->soft_iface;
+	u8 flags = BATADV_NO_FLAGS;
+
+	rcu_read_lock();
+
+	flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv4(dev);
+	flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv6(dev);
+
+	rcu_read_unlock();
+
+	return flags;
+}
+
+/**
+ * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge
+ * @bat_priv: the bat priv with all the soft interface information
+ * @bridge: bridge interface on top of the soft_iface if present,
+ *  otherwise pass NULL
+ *
+ * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge.
+ *
+ * Return:
+ *	BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present
+ *	BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present
+ *	BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present
+ *	The former two OR'd: no multicast router is present
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv,
+						struct net_device *bridge)
+{
+	struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list);
+	struct net_device *dev = bat_priv->soft_iface;
+	struct br_ip_list *br_ip_entry, *tmp;
+	u8 flags = BATADV_MCAST_WANT_NO_RTR6;
+	int ret;
+
+	if (!bridge)
+		return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6;
+
+	/* TODO: ask the bridge if a multicast router is present (the bridge
+	 * is capable of performing proper RFC4286 multicast router
+	 * discovery) instead of searching for a ff02::2 listener here
+	 */
+	ret = br_multicast_list_adjacent(dev, &bridge_mcast_list);
+	if (ret < 0)
+		return BATADV_NO_FLAGS;
+
+	list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) {
+		/* the bridge snooping does not maintain IPv4 link-local
+		 * addresses - therefore we won't find any IPv4 multicast router
+		 * address here, only IPv6 ones
+		 */
+		if (br_ip_entry->addr.proto == htons(ETH_P_IPV6) &&
+		    ipv6_addr_is_ll_all_routers(&br_ip_entry->addr.u.ip6))
+			flags &= ~BATADV_MCAST_WANT_NO_RTR6;
+
+		list_del(&br_ip_entry->list);
+		kfree(br_ip_entry);
+	}
+
+	return flags;
+}
+#else
+static inline u8
+batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv,
+				      struct net_device *bridge)
+{
+	if (bridge)
+		return BATADV_NO_FLAGS;
+	else
+		return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6;
+}
+#endif
+
+/**
+ * batadv_mcast_mla_rtr_flags_get() - get multicast router flags
+ * @bat_priv: the bat priv with all the soft interface information
+ * @bridge: bridge interface on top of the soft_iface if present,
+ *  otherwise pass NULL
+ *
+ * Checks the presence of IPv4 and IPv6 multicast routers on this
+ * node or behind its bridge.
+ *
+ * Return:
+ *	BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present
+ *	BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present
+ *	BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present
+ *	The former two OR'd: no multicast router is present
+ */
+static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv,
+					 struct net_device *bridge)
+{
+	u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6;
+
+	flags &= batadv_mcast_mla_rtr_flags_softif_get(bat_priv, bridge);
+	flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge);
+
+	return flags;
+}
+
 /**
  * batadv_mcast_mla_flags_get() - get the new multicast flags
  * @bat_priv: the bat priv with all the soft interface information
@@ -106,13 +280,20 @@ batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv)
 	struct net_device *dev = bat_priv->soft_iface;
 	struct batadv_mcast_querier_state *qr4, *qr6;
 	struct batadv_mcast_mla_flags mla_flags;
+	struct net_device *bridge;
+
+	bridge = batadv_mcast_get_bridge(dev);
 
 	memset(&mla_flags, 0, sizeof(mla_flags));
 	mla_flags.enabled = 1;
+	mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv,
+							       bridge);
 
-	if (!batadv_mcast_has_bridge(bat_priv))
+	if (!bridge)
 		return mla_flags;
 
+	dev_put(bridge);
+
 	mla_flags.bridged = 1;
 	qr4 = &mla_flags.querier_ipv4;
 	qr6 = &mla_flags.querier_ipv6;
@@ -137,41 +318,19 @@ batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv)
 	 * In both cases, we will signalize other batman nodes that
 	 * we need all multicast traffic of the according protocol.
 	 */
-	if (!qr4->exists || qr4->shadowing)
+	if (!qr4->exists || qr4->shadowing) {
 		mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV4;
+		mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR4;
+	}
 
-	if (!qr6->exists || qr6->shadowing)
+	if (!qr6->exists || qr6->shadowing) {
 		mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV6;
+		mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR6;
+	}
 
 	return mla_flags;
 }
 
-/**
- * batadv_mcast_get_bridge() - get the bridge on top of the softif if it exists
- * @soft_iface: netdev struct of the mesh interface
- *
- * If the given soft interface has a bridge on top then the refcount
- * of the according net device is increased.
- *
- * Return: NULL if no such bridge exists. Otherwise the net device of the
- * bridge.
- */
-static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface)
-{
-	struct net_device *upper = soft_iface;
-
-	rcu_read_lock();
-	do {
-		upper = netdev_master_upper_dev_get_rcu(upper);
-	} while (upper && !(upper->priv_flags & IFF_EBRIDGE));
-
-	if (upper)
-		dev_hold(upper);
-	rcu_read_unlock();
-
-	return upper;
-}
-
 /**
  * batadv_mcast_mla_is_duplicate() - check whether an address is in a list
  * @mcast_addr: the multicast address to check
@@ -234,6 +393,10 @@ batadv_mcast_mla_softif_get_ipv4(struct net_device *dev,
 		    ipv4_is_local_multicast(pmc->multiaddr))
 			continue;
 
+		if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) &&
+		    !ipv4_is_local_multicast(pmc->multiaddr))
+			continue;
+
 		ip_eth_mc_map(pmc->multiaddr, mcast_addr);
 
 		if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
@@ -301,6 +464,11 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev,
 		    ipv6_addr_is_ll_all_nodes(&pmc6->mca_addr))
 			continue;
 
+		if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) &&
+		    IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) >
+		    IPV6_ADDR_SCOPE_LINKLOCAL)
+			continue;
+
 		ipv6_eth_mc_map(&pmc6->mca_addr, mcast_addr);
 
 		if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
@@ -442,6 +610,10 @@ static int batadv_mcast_mla_bridge_get(struct net_device *dev,
 			if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
 			    ipv4_is_local_multicast(br_ip_entry->addr.u.ip4))
 				continue;
+
+			if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) &&
+			    !ipv4_is_local_multicast(br_ip_entry->addr.u.ip4))
+				continue;
 		}
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -452,6 +624,11 @@ static int batadv_mcast_mla_bridge_get(struct net_device *dev,
 			if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
 			    ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.u.ip6))
 				continue;
+
+			if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) &&
+			    IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.u.ip6) >
+			    IPV6_ADDR_SCOPE_LINKLOCAL)
+				continue;
 		}
 #endif
 
@@ -662,19 +839,23 @@ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags)
 {
 	bool old_enabled = bat_priv->mcast.mla_flags.enabled;
 	u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags;
-	char str_old_flags[] = "[...]";
+	char str_old_flags[] = "[.... . ]";
 
-	sprintf(str_old_flags, "[%c%c%c]",
+	sprintf(str_old_flags, "[%c%c%c%s%s]",
 		(old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
 		(old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
-		(old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.');
+		(old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.',
+		!(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ",
+		!(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ");
 
 	batadv_dbg(BATADV_DBG_MCAST, bat_priv,
-		   "Changing multicast flags from '%s' to '[%c%c%c]'\n",
+		   "Changing multicast flags from '%s' to '[%c%c%c%s%s]'\n",
 		   old_enabled ? str_old_flags : "<undefined>",
 		   (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
 		   (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
-		   (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.');
+		   (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.',
+		   !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ",
+		   !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ");
 }
 
 /**
@@ -1466,6 +1647,127 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv,
 	}
 }
 
+/**
+ * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has
+ * toggled then this method updates counter and list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
+ */
+static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv,
+					  struct batadv_orig_node *orig,
+					  u8 mcast_flags)
+{
+	struct hlist_node *node = &orig->mcast_want_all_rtr4_node;
+	struct hlist_head *head = &bat_priv->mcast.want_all_rtr4_list;
+
+	lockdep_assert_held(&orig->mcast_handler_lock);
+
+	/* switched from flag set to unset */
+	if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR4) &&
+	    orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4) {
+		atomic_inc(&bat_priv->mcast.num_want_all_rtr4);
+
+		spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+		/* flag checks above + mcast_handler_lock prevents this */
+		WARN_ON(!hlist_unhashed(node));
+
+		hlist_add_head_rcu(node, head);
+		spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+	/* switched from flag unset to set */
+	} else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR4 &&
+		   !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4)) {
+		atomic_dec(&bat_priv->mcast.num_want_all_rtr4);
+
+		spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+		/* flag checks above + mcast_handler_lock prevents this */
+		WARN_ON(hlist_unhashed(node));
+
+		hlist_del_init_rcu(node);
+		spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+	}
+}
+
+/**
+ * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has
+ * toggled then this method updates counter and list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
+ */
+static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv,
+					  struct batadv_orig_node *orig,
+					  u8 mcast_flags)
+{
+	struct hlist_node *node = &orig->mcast_want_all_rtr6_node;
+	struct hlist_head *head = &bat_priv->mcast.want_all_rtr6_list;
+
+	lockdep_assert_held(&orig->mcast_handler_lock);
+
+	/* switched from flag set to unset */
+	if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR6) &&
+	    orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6) {
+		atomic_inc(&bat_priv->mcast.num_want_all_rtr6);
+
+		spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+		/* flag checks above + mcast_handler_lock prevents this */
+		WARN_ON(!hlist_unhashed(node));
+
+		hlist_add_head_rcu(node, head);
+		spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+	/* switched from flag unset to set */
+	} else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR6 &&
+		   !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6)) {
+		atomic_dec(&bat_priv->mcast.num_want_all_rtr6);
+
+		spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+		/* flag checks above + mcast_handler_lock prevents this */
+		WARN_ON(hlist_unhashed(node));
+
+		hlist_del_init_rcu(node);
+		spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+	}
+}
+
+/**
+ * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV
+ * @enabled: whether the originator has multicast TVLV support enabled
+ * @tvlv_value: tvlv buffer containing the multicast flags
+ * @tvlv_value_len: tvlv buffer length
+ *
+ * Return: multicast flags for the given tvlv buffer
+ */
+static u8
+batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len)
+{
+	u8 mcast_flags = BATADV_NO_FLAGS;
+
+	if (enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags))
+		mcast_flags = *(u8 *)tvlv_value;
+
+	if (!enabled) {
+		mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4;
+		mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6;
+	}
+
+	/* remove redundant flags to avoid sending duplicate packets later */
+	if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)
+		mcast_flags |= BATADV_MCAST_WANT_NO_RTR4;
+
+	if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)
+		mcast_flags |= BATADV_MCAST_WANT_NO_RTR6;
+
+	return mcast_flags;
+}
+
 /**
  * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container
  * @bat_priv: the bat priv with all the soft interface information
@@ -1481,16 +1783,10 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
 					  u16 tvlv_value_len)
 {
 	bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
-	u8 mcast_flags = BATADV_NO_FLAGS;
-
-	if (orig_mcast_enabled && tvlv_value &&
-	    tvlv_value_len >= sizeof(mcast_flags))
-		mcast_flags = *(u8 *)tvlv_value;
+	u8 mcast_flags;
 
-	if (!orig_mcast_enabled) {
-		mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4;
-		mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6;
-	}
+	mcast_flags = batadv_mcast_tvlv_flags_get(orig_mcast_enabled,
+						  tvlv_value, tvlv_value_len);
 
 	spin_lock_bh(&orig->mcast_handler_lock);
 
@@ -1507,6 +1803,8 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
 	batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags);
 	batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags);
 	batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags);
+	batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags);
+	batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags);
 
 	orig->mcast_flags = mcast_flags;
 	spin_unlock_bh(&orig->mcast_handler_lock);
@@ -1556,10 +1854,12 @@ static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv,
 		shadowing6 = '?';
 	}
 
-	seq_printf(seq, "Multicast flags (own flags: [%c%c%c])\n",
+	seq_printf(seq, "Multicast flags (own flags: [%c%c%c%s%s])\n",
 		   (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
 		   (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
-		   (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.');
+		   (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.',
+		   !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ",
+		   !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ");
 	seq_printf(seq, "* Bridged [U]\t\t\t\t%c\n", bridged ? 'U' : '.');
 	seq_printf(seq, "* No IGMP/MLD Querier [4/6]:\t\t%c/%c\n",
 		   querier4, querier6);
@@ -1613,13 +1913,17 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
 
 			flags = orig_node->mcast_flags;
 
-			seq_printf(seq, "%pM [%c%c%c]\n", orig_node->orig,
+			seq_printf(seq, "%pM [%c%c%c%s%s]\n", orig_node->orig,
 				   (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)
 				   ? 'U' : '.',
 				   (flags & BATADV_MCAST_WANT_ALL_IPV4)
 				   ? '4' : '.',
 				   (flags & BATADV_MCAST_WANT_ALL_IPV6)
-				   ? '6' : '.');
+				   ? '6' : '.',
+				   !(flags & BATADV_MCAST_WANT_NO_RTR4)
+				   ? "R4" : ". ",
+				   !(flags & BATADV_MCAST_WANT_NO_RTR6)
+				   ? "R6" : ". ");
 		}
 		rcu_read_unlock();
 	}
@@ -1893,6 +2197,8 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig)
 	batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS);
 	batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS);
 	batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS);
+	batadv_mcast_want_rtr4_update(bat_priv, orig, BATADV_NO_FLAGS);
+	batadv_mcast_want_rtr6_update(bat_priv, orig, BATADV_NO_FLAGS);
 
 	spin_unlock_bh(&orig->mcast_handler_lock);
 }
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 45db798a7297..38613487fb1b 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -27,6 +27,7 @@
 #include <linux/stddef.h>
 #include <linux/workqueue.h>
 #include <net/sock.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
@@ -1043,7 +1044,8 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
 	orig_node->bcast_seqno_reset = reset_time;
 
 #ifdef CONFIG_BATMAN_ADV_MCAST
-	orig_node->mcast_flags = BATADV_NO_FLAGS;
+	orig_node->mcast_flags = BATADV_MCAST_WANT_NO_RTR4;
+	orig_node->mcast_flags |= BATADV_MCAST_WANT_NO_RTR6;
 	INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node);
 	INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node);
 	INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 72f65b3769d0..c2996296b953 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -404,6 +404,17 @@ struct batadv_orig_node {
 	 *  list
 	 */
 	struct hlist_node mcast_want_all_ipv6_node;
+
+	/**
+	 * @mcast_want_all_rtr4_node: a list node for the mcast.want_all_rtr4
+	 *  list
+	 */
+	struct hlist_node mcast_want_all_rtr4_node;
+	/**
+	 * @mcast_want_all_rtr6_node: a list node for the mcast.want_all_rtr6
+	 *  list
+	 */
+	struct hlist_node mcast_want_all_rtr6_node;
 #endif
 
 	/** @capabilities: announced capabilities of this originator */
@@ -1218,6 +1229,18 @@ struct batadv_priv_mcast {
 	 */
 	struct hlist_head want_all_ipv6_list;
 
+	/**
+	 * @want_all_rtr4_list: a list of orig_nodes wanting all routable IPv4
+	 *  multicast traffic
+	 */
+	struct hlist_head want_all_rtr4_list;
+
+	/**
+	 * @want_all_rtr6_list: a list of orig_nodes wanting all routable IPv6
+	 *  multicast traffic
+	 */
+	struct hlist_head want_all_rtr6_list;
+
 	/**
 	 * @mla_flags: flags for the querier, bridge and tvlv state
 	 */
@@ -1240,6 +1263,12 @@ struct batadv_priv_mcast {
 	/** @num_want_all_ipv6: counter for items in want_all_ipv6_list */
 	atomic_t num_want_all_ipv6;
 
+	/** @num_want_all_rtr4: counter for items in want_all_rtr4_list */
+	atomic_t num_want_all_rtr4;
+
+	/** @num_want_all_rtr6: counter for items in want_all_rtr6_list */
+	atomic_t num_want_all_rtr6;
+
 	/**
 	 * @want_lists_lock: lock for protecting modifications to mcasts
 	 *  want_all_{unsnoopables,ipv4,ipv6}_list (traversals are rcu-locked)
-- 
cgit v1.2.3


From d57d76428ae9abca51fb89f9326da9d4b1cf8270 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@mellanox.com>
Date: Wed, 26 Jun 2019 17:35:24 +0300
Subject: xsk: Add API to check for available entries in FQ
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a function that checks whether the Fill Ring has the specified
number of descriptors available. It will be useful for mlx5e, which wants
to check in advance whether it can allocate a bulk of RX descriptors,
to get the best performance.

Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Acked-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h | 21 +++++++++++++++++++++
 net/xdp/xsk.c          |  6 ++++++
 net/xdp/xsk_queue.h    | 14 ++++++++++++++
 3 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index ae0f368a62bb..b6f5ebae43a1 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -77,6 +77,7 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 void xsk_flush(struct xdp_sock *xs);
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
 /* Used from netdev driver */
+bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt);
 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
 void xsk_umem_discard_addr(struct xdp_umem *umem);
 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
@@ -99,6 +100,16 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
 }
 
 /* Reuse-queue aware version of FILL queue helpers */
+static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
+{
+	struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
+
+	if (rq->length >= cnt)
+		return true;
+
+	return xsk_umem_has_addrs(umem, cnt - rq->length);
+}
+
 static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
 {
 	struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
@@ -146,6 +157,11 @@ static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 	return false;
 }
 
+static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
+{
+	return false;
+}
+
 static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
 {
 	return NULL;
@@ -200,6 +216,11 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
 	return 0;
 }
 
+static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
+{
+	return false;
+}
+
 static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
 {
 	return NULL;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index a14e8864e4fa..b68a380f50b3 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -37,6 +37,12 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 		READ_ONCE(xs->umem->fq);
 }
 
+bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
+{
+	return xskq_has_addrs(umem->fq, cnt);
+}
+EXPORT_SYMBOL(xsk_umem_has_addrs);
+
 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
 {
 	return xskq_peek_addr(umem->fq, addr);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 88b9ae24658d..12b49784a6d5 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -117,6 +117,20 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
 	return q->nentries - (producer - q->cons_tail);
 }
 
+static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt)
+{
+	u32 entries = q->prod_tail - q->cons_tail;
+
+	if (entries >= cnt)
+		return true;
+
+	/* Refresh the local pointer. */
+	q->prod_tail = READ_ONCE(q->ring->producer);
+	entries = q->prod_tail - q->cons_tail;
+
+	return entries >= cnt;
+}
+
 /* UMEM queue */
 
 static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
-- 
cgit v1.2.3


From 2640d3c8123223e0a205b2a25a446df6f072b3ea Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@mellanox.com>
Date: Wed, 26 Jun 2019 17:35:25 +0300
Subject: xsk: Add getsockopt XDP_OPTIONS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make it possible for the application to determine whether the AF_XDP
socket is running in zero-copy mode. To achieve this, add a new
getsockopt option XDP_OPTIONS that returns flags. The only flag
supported for now is the zero-copy mode indicator.

Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Acked-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/if_xdp.h       |  8 ++++++++
 net/xdp/xsk.c                     | 20 ++++++++++++++++++++
 tools/include/uapi/linux/if_xdp.h |  8 ++++++++
 3 files changed, 36 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index caed8b1614ff..faaa5ca2a117 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -46,6 +46,7 @@ struct xdp_mmap_offsets {
 #define XDP_UMEM_FILL_RING		5
 #define XDP_UMEM_COMPLETION_RING	6
 #define XDP_STATISTICS			7
+#define XDP_OPTIONS			8
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
@@ -60,6 +61,13 @@ struct xdp_statistics {
 	__u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
 };
 
+struct xdp_options {
+	__u32 flags;
+};
+
+/* Flags for the flags field of struct xdp_options */
+#define XDP_OPTIONS_ZEROCOPY (1 << 0)
+
 /* Pgoff for mmaping the rings */
 #define XDP_PGOFF_RX_RING			  0
 #define XDP_PGOFF_TX_RING		 0x80000000
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index b68a380f50b3..35ca531ac74e 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -650,6 +650,26 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
 
 		return 0;
 	}
+	case XDP_OPTIONS:
+	{
+		struct xdp_options opts = {};
+
+		if (len < sizeof(opts))
+			return -EINVAL;
+
+		mutex_lock(&xs->mutex);
+		if (xs->zc)
+			opts.flags |= XDP_OPTIONS_ZEROCOPY;
+		mutex_unlock(&xs->mutex);
+
+		len = sizeof(opts);
+		if (copy_to_user(optval, &opts, len))
+			return -EFAULT;
+		if (put_user(len, optlen))
+			return -EFAULT;
+
+		return 0;
+	}
 	default:
 		break;
 	}
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index caed8b1614ff..faaa5ca2a117 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -46,6 +46,7 @@ struct xdp_mmap_offsets {
 #define XDP_UMEM_FILL_RING		5
 #define XDP_UMEM_COMPLETION_RING	6
 #define XDP_STATISTICS			7
+#define XDP_OPTIONS			8
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
@@ -60,6 +61,13 @@ struct xdp_statistics {
 	__u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
 };
 
+struct xdp_options {
+	__u32 flags;
+};
+
+/* Flags for the flags field of struct xdp_options */
+#define XDP_OPTIONS_ZEROCOPY (1 << 0)
+
 /* Pgoff for mmaping the rings */
 #define XDP_PGOFF_RX_RING			  0
 #define XDP_PGOFF_TX_RING		 0x80000000
-- 
cgit v1.2.3


From 4bce4e5cb65587f805655ec6808a20af2036627a Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@mellanox.com>
Date: Wed, 26 Jun 2019 17:35:28 +0300
Subject: xsk: Return the whole xdp_desc from xsk_umem_consume_tx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some drivers want to access the data transmitted in order to implement
acceleration features of the NICs. It is also useful in AF_XDP TX flow.

Change the xsk_umem_consume_tx API to return the whole xdp_desc, that
contains the data pointer, length and DMA address, instead of only the
latter two. Adapt the implementation of i40e and ixgbe to this change.

Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Cc: Björn Töpel <bjorn.topel@intel.com>
Cc: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/intel/i40e/i40e_xsk.c   | 12 +++++++-----
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 15 +++++++++------
 include/net/xdp_sock.h                       |  6 +++---
 net/xdp/xsk.c                                | 10 +++-------
 4 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 557c565c26fc..32bad014d76c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -641,8 +641,8 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
 	struct i40e_tx_desc *tx_desc = NULL;
 	struct i40e_tx_buffer *tx_bi;
 	bool work_done = true;
+	struct xdp_desc desc;
 	dma_addr_t dma;
-	u32 len;
 
 	while (budget-- > 0) {
 		if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
@@ -651,21 +651,23 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
 			break;
 		}
 
-		if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &dma, &len))
+		if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
 			break;
 
-		dma_sync_single_for_device(xdp_ring->dev, dma, len,
+		dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
+
+		dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
 					   DMA_BIDIRECTIONAL);
 
 		tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
-		tx_bi->bytecount = len;
+		tx_bi->bytecount = desc.len;
 
 		tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
 		tx_desc->buffer_addr = cpu_to_le64(dma);
 		tx_desc->cmd_type_offset_bsz =
 			build_ctob(I40E_TX_DESC_CMD_ICRC
 				   | I40E_TX_DESC_CMD_EOP,
-				   0, len, 0);
+				   0, desc.len, 0);
 
 		xdp_ring->next_to_use++;
 		if (xdp_ring->next_to_use == xdp_ring->count)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index 6af55bb3bef3..6b609553329f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -571,8 +571,9 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
 	union ixgbe_adv_tx_desc *tx_desc = NULL;
 	struct ixgbe_tx_buffer *tx_bi;
 	bool work_done = true;
-	u32 len, cmd_type;
+	struct xdp_desc desc;
 	dma_addr_t dma;
+	u32 cmd_type;
 
 	while (budget-- > 0) {
 		if (unlikely(!ixgbe_desc_unused(xdp_ring)) ||
@@ -581,14 +582,16 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
 			break;
 		}
 
-		if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &dma, &len))
+		if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
 			break;
 
-		dma_sync_single_for_device(xdp_ring->dev, dma, len,
+		dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
+
+		dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
 					   DMA_BIDIRECTIONAL);
 
 		tx_bi = &xdp_ring->tx_buffer_info[xdp_ring->next_to_use];
-		tx_bi->bytecount = len;
+		tx_bi->bytecount = desc.len;
 		tx_bi->xdpf = NULL;
 		tx_bi->gso_segs = 1;
 
@@ -599,10 +602,10 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
 		cmd_type = IXGBE_ADVTXD_DTYP_DATA |
 			   IXGBE_ADVTXD_DCMD_DEXT |
 			   IXGBE_ADVTXD_DCMD_IFCS;
-		cmd_type |= len | IXGBE_TXD_CMD;
+		cmd_type |= desc.len | IXGBE_TXD_CMD;
 		tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
 		tx_desc->read.olinfo_status =
-			cpu_to_le32(len << IXGBE_ADVTXD_PAYLEN_SHIFT);
+			cpu_to_le32(desc.len << IXGBE_ADVTXD_PAYLEN_SHIFT);
 
 		xdp_ring->next_to_use++;
 		if (xdp_ring->next_to_use == xdp_ring->count)
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index b6f5ebae43a1..057b159ff8b9 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -81,7 +81,7 @@ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt);
 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
 void xsk_umem_discard_addr(struct xdp_umem *umem);
 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
-bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
+bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
 void xsk_umem_consume_tx_done(struct xdp_umem *umem);
 struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
 struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
@@ -175,8 +175,8 @@ static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
 {
 }
 
-static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma,
-				       u32 *len)
+static inline bool xsk_umem_consume_tx(struct xdp_umem *umem,
+				       struct xdp_desc *desc)
 {
 	return false;
 }
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 35ca531ac74e..74417a851ed5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -172,22 +172,18 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
 }
 EXPORT_SYMBOL(xsk_umem_consume_tx_done);
 
-bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
+bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
 {
-	struct xdp_desc desc;
 	struct xdp_sock *xs;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
-		if (!xskq_peek_desc(xs->tx, &desc))
+		if (!xskq_peek_desc(xs->tx, desc))
 			continue;
 
-		if (xskq_produce_addr_lazy(umem->cq, desc.addr))
+		if (xskq_produce_addr_lazy(umem->cq, desc->addr))
 			goto out;
 
-		*dma = xdp_umem_get_dma(umem, desc.addr);
-		*len = desc.len;
-
 		xskq_discard_desc(xs->tx);
 		rcu_read_unlock();
 		return true;
-- 
cgit v1.2.3


From 0d01da6afc5402f60325c5da31b22f7d56689b49 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 27 Jun 2019 13:38:47 -0700
Subject: bpf: implement getsockopt and setsockopt hooks

Implement new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and
BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks.

BPF_CGROUP_SETSOCKOPT can modify user setsockopt arguments before
passing them down to the kernel or bypass kernel completely.
BPF_CGROUP_GETSOCKOPT can inspect/modify getsockopt arguments that
kernel returns.
Both hooks reuse existing PTR_TO_PACKET{,_END} infrastructure.

The buffer memory is pre-allocated (because I don't think there is
a precedent for working with __user memory from bpf). This might be
slow to do for each {s,g}etsockopt call, that's why I've added
__cgroup_bpf_prog_array_is_empty that exits early if there is nothing
attached to a cgroup. Note, however, that there is a race between
__cgroup_bpf_prog_array_is_empty and BPF_PROG_RUN_ARRAY where cgroup
program layout might have changed; this should not be a problem
because in general there is a race between multiple calls to
{s,g}etsockopt and user adding/removing bpf progs from a cgroup.

The return code of the BPF program is handled as follows:
* 0: EPERM
* 1: success, continue with next BPF program in the cgroup chain

v9:
* allow overwriting setsockopt arguments (Alexei Starovoitov):
  * use set_fs (same as kernel_setsockopt)
  * buffer is always kzalloc'd (no small on-stack buffer)

v8:
* use s32 for optlen (Andrii Nakryiko)

v7:
* return only 0 or 1 (Alexei Starovoitov)
* always run all progs (Alexei Starovoitov)
* use optval=0 as kernel bypass in setsockopt (Alexei Starovoitov)
  (decided to use optval=-1 instead, optval=0 might be a valid input)
* call getsockopt hook after kernel handlers (Alexei Starovoitov)

v6:
* rework cgroup chaining; stop as soon as bpf program returns
  0 or 2; see patch with the documentation for the details
* drop Andrii's and Martin's Acked-by (not sure they are comfortable
  with the new state of things)

v5:
* skip copy_to_user() and put_user() when ret == 0 (Martin Lau)

v4:
* don't export bpf_sk_fullsock helper (Martin Lau)
* size != sizeof(__u64) for uapi pointers (Martin Lau)
* offsetof instead of bpf_ctx_range when checking ctx access (Martin Lau)

v3:
* typos in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY comments (Andrii Nakryiko)
* reverse christmas tree in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY (Andrii
  Nakryiko)
* use __bpf_md_ptr instead of __u32 for optval{,_end} (Martin Lau)
* use BPF_FIELD_SIZEOF() for consistency (Martin Lau)
* new CG_SOCKOPT_ACCESS macro to wrap repeated parts

v2:
* moved bpf_sockopt_kern fields around to remove a hole (Martin Lau)
* aligned bpf_sockopt_kern->buf to 8 bytes (Martin Lau)
* bpf_prog_array_is_empty instead of bpf_prog_array_length (Martin Lau)
* added [0,2] return code check to verifier (Martin Lau)
* dropped unused buf[64] from the stack (Martin Lau)
* use PTR_TO_SOCKET for bpf_sockopt->sk (Martin Lau)
* dropped bpf_target_off from ctx rewrites (Martin Lau)
* use return code for kernel bypass (Martin Lau & Andrii Nakryiko)

Cc: Andrii Nakryiko <andriin@fb.com>
Cc: Martin Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup.h |  45 ++++++
 include/linux/bpf.h        |   2 +
 include/linux/bpf_types.h  |   1 +
 include/linux/filter.h     |  10 ++
 include/uapi/linux/bpf.h   |  14 ++
 kernel/bpf/cgroup.c        | 333 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/core.c          |   9 ++
 kernel/bpf/syscall.c       |  19 +++
 kernel/bpf/verifier.c      |   8 ++
 net/core/filter.c          |   2 +-
 net/socket.c               |  30 ++++
 11 files changed, 472 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index bd79ae32909a..169fd25f6bc2 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -124,6 +124,14 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   loff_t *ppos, void **new_buf,
 				   enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
+				       int *optname, char __user *optval,
+				       int *optlen, char **kernel_optval);
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+				       int optname, char __user *optval,
+				       int __user *optlen, int max_optlen,
+				       int retval);
+
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
 {
@@ -286,6 +294,38 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen,   \
+				       kernel_optval)			       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled)						       \
+		__ret = __cgroup_bpf_run_filter_setsockopt(sock, level,	       \
+							   optname, optval,    \
+							   optlen,	       \
+							   kernel_optval);     \
+	__ret;								       \
+})
+
+#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen)			       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled)						       \
+		get_user(__ret, optlen);				       \
+	__ret;								       \
+})
+
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen,   \
+				       max_optlen, retval)		       \
+({									       \
+	int __ret = retval;						       \
+	if (cgroup_bpf_enabled)						       \
+		__ret = __cgroup_bpf_run_filter_getsockopt(sock, level,	       \
+							   optname, optval,    \
+							   optlen, max_optlen, \
+							   retval);	       \
+	__ret;								       \
+})
+
 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 			   enum bpf_prog_type ptype, struct bpf_prog *prog);
 int cgroup_bpf_prog_detach(const union bpf_attr *attr,
@@ -357,6 +397,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; })
+#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
+				       optlen, max_optlen, retval) ({ retval; })
+#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
+				       kernel_optval) ({ 0; })
 
 #define for_each_cgroup_storage_type(stype) for (; false; )
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a62e7889b0b6..18f4cc2c6acd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -518,6 +518,7 @@ struct bpf_prog_array {
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array *progs);
 int bpf_prog_array_length(struct bpf_prog_array *progs);
+bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
 int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
 				__u32 __user *prog_ids, u32 cnt);
 
@@ -1051,6 +1052,7 @@ extern const struct bpf_func_proto bpf_spin_unlock_proto;
 extern const struct bpf_func_proto bpf_get_local_storage_proto;
 extern const struct bpf_func_proto bpf_strtol_proto;
 extern const struct bpf_func_proto bpf_strtoul_proto;
+extern const struct bpf_func_proto bpf_tcp_sock_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5a9975678d6f..eec5aeeeaf92 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -30,6 +30,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable)
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt)
 #endif
 #ifdef CONFIG_BPF_LIRC_MODE2
 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 43b45d6db36d..340f7d648974 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1199,4 +1199,14 @@ struct bpf_sysctl_kern {
 	u64 tmp_reg;
 };
 
+struct bpf_sockopt_kern {
+	struct sock	*sk;
+	u8		*optval;
+	u8		*optval_end;
+	s32		level;
+	s32		optname;
+	s32		optlen;
+	s32		retval;
+};
+
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b077507efa3f..a396b516a2b2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -170,6 +170,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_FLOW_DISSECTOR,
 	BPF_PROG_TYPE_CGROUP_SYSCTL,
 	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+	BPF_PROG_TYPE_CGROUP_SOCKOPT,
 };
 
 enum bpf_attach_type {
@@ -194,6 +195,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_SYSCTL,
 	BPF_CGROUP_UDP4_RECVMSG,
 	BPF_CGROUP_UDP6_RECVMSG,
+	BPF_CGROUP_GETSOCKOPT,
+	BPF_CGROUP_SETSOCKOPT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -3541,4 +3544,15 @@ struct bpf_sysctl {
 				 */
 };
 
+struct bpf_sockopt {
+	__bpf_md_ptr(struct bpf_sock *, sk);
+	__bpf_md_ptr(void *, optval);
+	__bpf_md_ptr(void *, optval_end);
+
+	__s32	level;
+	__s32	optname;
+	__s32	optlen;
+	__s32	retval;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 077ed3a19848..76fa0076f20d 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -15,6 +15,7 @@
 #include <linux/bpf.h>
 #include <linux/bpf-cgroup.h>
 #include <net/sock.h>
+#include <net/bpf_sk_storage.h>
 
 #include "../cgroup/cgroup-internal.h"
 
@@ -938,6 +939,188 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
 
+static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
+					     enum bpf_attach_type attach_type)
+{
+	struct bpf_prog_array *prog_array;
+	bool empty;
+
+	rcu_read_lock();
+	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
+	empty = bpf_prog_array_is_empty(prog_array);
+	rcu_read_unlock();
+
+	return empty;
+}
+
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+{
+	if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+		return -EINVAL;
+
+	ctx->optval = kzalloc(max_optlen, GFP_USER);
+	if (!ctx->optval)
+		return -ENOMEM;
+
+	ctx->optval_end = ctx->optval + max_optlen;
+	ctx->optlen = max_optlen;
+
+	return 0;
+}
+
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+{
+	kfree(ctx->optval);
+}
+
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
+				       int *optname, char __user *optval,
+				       int *optlen, char **kernel_optval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = *level,
+		.optname = *optname,
+	};
+	int ret;
+
+	/* Opportunistic check to see whether we have any BPF program
+	 * attached to the hook so we don't waste time allocating
+	 * memory and locking the socket.
+	 */
+	if (!cgroup_bpf_enabled ||
+	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+		return 0;
+
+	ret = sockopt_alloc_buf(&ctx, *optlen);
+	if (ret)
+		return ret;
+
+	if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	lock_sock(sk);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	release_sock(sk);
+
+	if (!ret) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (ctx.optlen == -1) {
+		/* optlen set to -1, bypass kernel */
+		ret = 1;
+	} else if (ctx.optlen > *optlen || ctx.optlen < -1) {
+		/* optlen is out of bounds */
+		ret = -EFAULT;
+	} else {
+		/* optlen within bounds, run kernel handler */
+		ret = 0;
+
+		/* export any potential modifications */
+		*level = ctx.level;
+		*optname = ctx.optname;
+		*optlen = ctx.optlen;
+		*kernel_optval = ctx.optval;
+	}
+
+out:
+	if (ret)
+		sockopt_free_buf(&ctx);
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
+
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+				       int optname, char __user *optval,
+				       int __user *optlen, int max_optlen,
+				       int retval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = level,
+		.optname = optname,
+		.retval = retval,
+	};
+	int ret;
+
+	/* Opportunistic check to see whether we have any BPF program
+	 * attached to the hook so we don't waste time allocating
+	 * memory and locking the socket.
+	 */
+	if (!cgroup_bpf_enabled ||
+	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+		return retval;
+
+	ret = sockopt_alloc_buf(&ctx, max_optlen);
+	if (ret)
+		return ret;
+
+	if (!retval) {
+		/* If kernel getsockopt finished successfully,
+		 * copy whatever was returned to the user back
+		 * into our temporary buffer. Set optlen to the
+		 * one that kernel returned as well to let
+		 * BPF programs inspect the value.
+		 */
+
+		if (get_user(ctx.optlen, optlen)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		if (ctx.optlen > max_optlen)
+			ctx.optlen = max_optlen;
+
+		if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	lock_sock(sk);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	release_sock(sk);
+
+	if (!ret) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (ctx.optlen > max_optlen) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* BPF programs only allowed to set retval to 0, not some
+	 * arbitrary value.
+	 */
+	if (ctx.retval != 0 && ctx.retval != retval) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+	    put_user(ctx.optlen, optlen)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = ctx.retval;
+
+out:
+	sockopt_free_buf(&ctx);
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
+
 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
 			      size_t *lenp)
 {
@@ -1198,3 +1381,153 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
 
 const struct bpf_prog_ops cg_sysctl_prog_ops = {
 };
+
+static const struct bpf_func_proto *
+cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_INET
+	case BPF_FUNC_tcp_sock:
+		return &bpf_tcp_sock_proto;
+#endif
+	default:
+		return cgroup_base_func_proto(func_id, prog);
+	}
+}
+
+static bool cg_sockopt_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       const struct bpf_prog *prog,
+				       struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
+
+	if (off < 0 || off >= sizeof(struct bpf_sockopt))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct bpf_sockopt, retval):
+			if (size != size_default)
+				return false;
+			return prog->expected_attach_type ==
+				BPF_CGROUP_GETSOCKOPT;
+		case offsetof(struct bpf_sockopt, optname):
+			/* fallthrough */
+		case offsetof(struct bpf_sockopt, level):
+			if (size != size_default)
+				return false;
+			return prog->expected_attach_type ==
+				BPF_CGROUP_SETSOCKOPT;
+		case offsetof(struct bpf_sockopt, optlen):
+			return size == size_default;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case offsetof(struct bpf_sockopt, sk):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_SOCKET;
+		break;
+	case offsetof(struct bpf_sockopt, optval):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct bpf_sockopt, optval_end):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	case offsetof(struct bpf_sockopt, retval):
+		if (size != size_default)
+			return false;
+		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+	default:
+		if (size != size_default)
+			return false;
+		break;
+	}
+	return true;
+}
+
+#define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
+	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
+	  si->dst_reg, si->src_reg,					\
+	  offsetof(struct bpf_sockopt_kern, F))
+
+static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
+					 const struct bpf_insn *si,
+					 struct bpf_insn *insn_buf,
+					 struct bpf_prog *prog,
+					 u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct bpf_sockopt, sk):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+		break;
+	case offsetof(struct bpf_sockopt, level):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+		break;
+	case offsetof(struct bpf_sockopt, optname):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+		break;
+	case offsetof(struct bpf_sockopt, optlen):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+		break;
+	case offsetof(struct bpf_sockopt, retval):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+		break;
+	case offsetof(struct bpf_sockopt, optval):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+		break;
+	case offsetof(struct bpf_sockopt, optval_end):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
+				   bool direct_write,
+				   const struct bpf_prog *prog)
+{
+	/* Nothing to do for sockopt argument. The data is kzalloc'ated.
+	 */
+	return 0;
+}
+
+const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
+	.get_func_proto		= cg_sockopt_func_proto,
+	.is_valid_access	= cg_sockopt_is_valid_access,
+	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
+	.gen_prologue		= cg_sockopt_get_prologue,
+};
+
+const struct bpf_prog_ops cg_sockopt_prog_ops = {
+};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 561ed07d3007..e2c1b43728da 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1809,6 +1809,15 @@ int bpf_prog_array_length(struct bpf_prog_array *array)
 	return cnt;
 }
 
+bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
+{
+	struct bpf_prog_array_item *item;
+
+	for (item = array->items; item->prog; item++)
+		if (item->prog != &dummy_bpf_prog.prog)
+			return false;
+	return true;
+}
 
 static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
 				     u32 *prog_ids,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7713cf39795a..b0f545e07425 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1590,6 +1590,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		default:
 			return -EINVAL;
 		}
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+		switch (expected_attach_type) {
+		case BPF_CGROUP_SETSOCKOPT:
+		case BPF_CGROUP_GETSOCKOPT:
+			return 0;
+		default:
+			return -EINVAL;
+		}
 	default:
 		return 0;
 	}
@@ -1840,6 +1848,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	switch (prog->type) {
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
 	case BPF_PROG_TYPE_CGROUP_SKB:
 		return prog->enforce_expected_attach_type &&
@@ -1912,6 +1921,10 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SYSCTL:
 		ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
 		break;
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1995,6 +2008,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_SYSCTL:
 		ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
 		break;
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -2031,6 +2048,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 	case BPF_CGROUP_SYSCTL:
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
 		break;
 	case BPF_LIRC_MODE2:
 		return lirc_prog_query(attr, uattr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0e079b2298f8..6b5623d320f9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2215,6 +2215,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 
 		env->seen_direct_write = true;
 		return true;
+
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+		if (t == BPF_WRITE)
+			env->seen_direct_write = true;
+
+		return true;
+
 	default:
 		return false;
 	}
@@ -6066,6 +6073,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 		break;
 	default:
 		return 0;
diff --git a/net/core/filter.c b/net/core/filter.c
index 2014d76e0d2a..dc8534be12fc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5651,7 +5651,7 @@ BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
 	return (unsigned long)NULL;
 }
 
-static const struct bpf_func_proto bpf_tcp_sock_proto = {
+const struct bpf_func_proto bpf_tcp_sock_proto = {
 	.func		= bpf_tcp_sock,
 	.gpl_only	= false,
 	.ret_type	= RET_PTR_TO_TCP_SOCK_OR_NULL,
diff --git a/net/socket.c b/net/socket.c
index 963df5dbdd54..0ddfbfb761d9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2051,6 +2051,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
 static int __sys_setsockopt(int fd, int level, int optname,
 			    char __user *optval, int optlen)
 {
+	mm_segment_t oldfs = get_fs();
+	char *kernel_optval = NULL;
 	int err, fput_needed;
 	struct socket *sock;
 
@@ -2063,6 +2065,22 @@ static int __sys_setsockopt(int fd, int level, int optname,
 		if (err)
 			goto out_put;
 
+		err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level,
+						     &optname, optval, &optlen,
+						     &kernel_optval);
+
+		if (err < 0) {
+			goto out_put;
+		} else if (err > 0) {
+			err = 0;
+			goto out_put;
+		}
+
+		if (kernel_optval) {
+			set_fs(KERNEL_DS);
+			optval = (char __user __force *)kernel_optval;
+		}
+
 		if (level == SOL_SOCKET)
 			err =
 			    sock_setsockopt(sock, level, optname, optval,
@@ -2071,6 +2089,11 @@ static int __sys_setsockopt(int fd, int level, int optname,
 			err =
 			    sock->ops->setsockopt(sock, level, optname, optval,
 						  optlen);
+
+		if (kernel_optval) {
+			set_fs(oldfs);
+			kfree(kernel_optval);
+		}
 out_put:
 		fput_light(sock->file, fput_needed);
 	}
@@ -2093,6 +2116,7 @@ static int __sys_getsockopt(int fd, int level, int optname,
 {
 	int err, fput_needed;
 	struct socket *sock;
+	int max_optlen;
 
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (sock != NULL) {
@@ -2100,6 +2124,8 @@ static int __sys_getsockopt(int fd, int level, int optname,
 		if (err)
 			goto out_put;
 
+		max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+
 		if (level == SOL_SOCKET)
 			err =
 			    sock_getsockopt(sock, level, optname, optval,
@@ -2108,6 +2134,10 @@ static int __sys_getsockopt(int fd, int level, int optname,
 			err =
 			    sock->ops->getsockopt(sock, level, optname, optval,
 						  optlen);
+
+		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
+						     optval, optlen,
+						     max_optlen, err);
 out_put:
 		fput_light(sock->file, fput_needed);
 	}
-- 
cgit v1.2.3


From d2ce8d6bfcfed014fd281e06c9b1d4638ddf3f1e Mon Sep 17 00:00:00 2001
From: Jiunn Chang <c0d1n61at3@gmail.com>
Date: Thu, 27 Jun 2019 00:04:26 -0500
Subject: nl80211: Fix undefined behavior in bit shift

Shifting signed 32-bit value by 31 bits is undefined.  Changing most
significant bit to unsigned.

Signed-off-by: Jiunn Chang <c0d1n61at3@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 6f09d1500960..fa7ebbc6ff27 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -5314,7 +5314,7 @@ enum nl80211_feature_flags {
 	NL80211_FEATURE_TDLS_CHANNEL_SWITCH		= 1 << 28,
 	NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR		= 1 << 29,
 	NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR	= 1 << 30,
-	NL80211_FEATURE_ND_RANDOM_MAC_ADDR		= 1 << 31,
+	NL80211_FEATURE_ND_RANDOM_MAC_ADDR		= 1U << 31,
 };
 
 /**
-- 
cgit v1.2.3


From 720f22fed81bc6fd1765db7014651b6718887bea Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Mon, 24 Jun 2019 23:13:35 +0100
Subject: net: sched: refactor reinsert action

The TC_ACT_REINSERT return type was added as an in-kernel only option to
allow a packet ingress or egress redirect. This is used to avoid
unnecessary skb clones in situations where they are not required. If a TC
hook returns this code then the packet is 'reinserted' and no skb consume
is carried out as no clone took place.

This return type is only used in act_mirred. Rather than have the reinsert
called from the main datapath, call it directly in act_mirred. Instead of
returning TC_ACT_REINSERT, change the type to the new TC_ACT_CONSUMED
which tells the caller that the packet has been stolen by another process
and that no consume call is required.

Moving all redirect calls to the act_mirred code is in preparation for
tracking recursion created by act_mirred.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h     | 2 +-
 include/net/sch_generic.h | 2 +-
 net/core/dev.c            | 4 +---
 net/sched/act_mirred.c    | 3 ++-
 4 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 720f2b32fc2f..1a7596ba0dbe 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -10,7 +10,7 @@
 #include <net/net_namespace.h>
 
 /* TC action not accessible from user space */
-#define TC_ACT_REINSERT		(TC_ACT_VALUE_MAX + 1)
+#define TC_ACT_CONSUMED		(TC_ACT_VALUE_MAX + 1)
 
 /* Basic packet classifier frontend definitions. */
 
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 21f434f3ac9e..855167bbc372 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -279,7 +279,7 @@ struct tcf_result {
 		};
 		const struct tcf_proto *goto_tp;
 
-		/* used by the TC_ACT_REINSERT action */
+		/* used in the skb_tc_reinsert function */
 		struct {
 			bool		ingress;
 			struct gnet_stats_queue *qstats;
diff --git a/net/core/dev.c b/net/core/dev.c
index d6edd218babd..58529318b3a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4689,9 +4689,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 		__skb_push(skb, skb->mac_len);
 		skb_do_redirect(skb);
 		return NULL;
-	case TC_ACT_REINSERT:
-		/* this does not scrub the packet, and updates stats on error */
-		skb_tc_reinsert(skb, &cl_res);
+	case TC_ACT_CONSUMED:
 		return NULL;
 	default:
 		break;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 58e7573dded4..8c1d73661cc4 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -277,7 +277,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 		if (use_reinsert) {
 			res->ingress = want_ingress;
 			res->qstats = this_cpu_ptr(m->common.cpu_qstats);
-			return TC_ACT_REINSERT;
+			skb_tc_reinsert(skb, res);
+			return TC_ACT_CONSUMED;
 		}
 	}
 
-- 
cgit v1.2.3


From 32e454efbb2279b0fa5874abb0944a9d42080ad1 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 25 Jun 2019 10:44:33 +0100
Subject: net: phylink: further documentation clarifications

Clarify the validate() behaviour in a few cases which weren't mentioned
in the documentation, but which are necessary for users to get the
correct behaviour.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 2d2e55dfea94..5b130140fb8f 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -93,12 +93,19 @@ struct phylink_mac_ops {
  * Note that the PHY may be able to transform from one connection
  * technology to another, so, eg, don't clear 1000BaseX just
  * because the MAC is unable to BaseX mode. This is more about
- * clearing unsupported speeds and duplex settings.
+ * clearing unsupported speeds and duplex settings. The port modes
+ * should not be cleared; phylink_set_port_modes() will help with this.
  *
  * If the @state->interface mode is %PHY_INTERFACE_MODE_1000BASEX
  * or %PHY_INTERFACE_MODE_2500BASEX, select the appropriate mode
  * based on @state->advertising and/or @state->speed and update
- * @state->interface accordingly.
+ * @state->interface accordingly. See phylink_helper_basex_speed().
+ *
+ * When @state->interface is %PHY_INTERFACE_MODE_NA, phylink expects the
+ * MAC driver to return all supported link modes.
+ *
+ * If the @state->interface mode is not supported, then the @supported
+ * mask must be cleared.
  */
 void validate(struct net_device *ndev, unsigned long *supported,
 	      struct phylink_link_state *state);
-- 
cgit v1.2.3


From 9903c8dc734265689d5770ff28c84a7228fe5890 Mon Sep 17 00:00:00 2001
From: Vedang Patel <vedang.patel@intel.com>
Date: Tue, 25 Jun 2019 15:07:13 -0700
Subject: etf: Don't use BIT() in UAPI headers.

The BIT() macro isn't exported as part of the UAPI interface, so the
compile test that ensures UAPI headers are self-contained fails. Use
_BITUL() instead.

Signed-off-by: Vedang Patel <vedang.patel@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 8b2f993cbb77..f88c4e0bd9e5 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -988,8 +988,8 @@ struct tc_etf_qopt {
 	__s32 delta;
 	__s32 clockid;
 	__u32 flags;
-#define TC_ETF_DEADLINE_MODE_ON	BIT(0)
-#define TC_ETF_OFFLOAD_ON	BIT(1)
+#define TC_ETF_DEADLINE_MODE_ON	_BITUL(0)
+#define TC_ETF_OFFLOAD_ON	_BITUL(1)
 };
 
 enum {
-- 
cgit v1.2.3


From d14d2b20680f02fa739c2cbbb59e3629e487f359 Mon Sep 17 00:00:00 2001
From: Vedang Patel <vedang.patel@intel.com>
Date: Tue, 25 Jun 2019 15:07:14 -0700
Subject: etf: Add skip_sock_check

Currently, etf expects a socket with the SO_TXTIME option set for each
packet it encounters, and it drops all other packets. However, future
commits will add functionality where the tstamp value is set by another
qdisc. Also, some packets which are generated from within the kernel
(e.g. ICMP packets) do not have any socket associated with them.

So, this commit adds support for skip_sock_check. When this option is set,
etf will skip checking for a socket and other associated options for all
skbs.

Signed-off-by: Vedang Patel <vedang.patel@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  1 +
 net/sched/sch_etf.c            | 10 ++++++++++
 2 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index f88c4e0bd9e5..127ac6d2888c 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -990,6 +990,7 @@ struct tc_etf_qopt {
 	__u32 flags;
 #define TC_ETF_DEADLINE_MODE_ON	_BITUL(0)
 #define TC_ETF_OFFLOAD_ON	_BITUL(1)
+#define TC_ETF_SKIP_SOCK_CHECK	_BITUL(2)
 };
 
 enum {
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index db0c2ba1d156..cebfb65d8556 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -22,10 +22,12 @@
 
 #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
 #define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON)
+#define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK)
 
 struct etf_sched_data {
 	bool offload;
 	bool deadline_mode;
+	bool skip_sock_check;
 	int clockid;
 	int queue;
 	s32 delta; /* in ns */
@@ -77,6 +79,9 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
 	struct sock *sk = nskb->sk;
 	ktime_t now;
 
+	if (q->skip_sock_check)
+		goto skip;
+
 	if (!sk)
 		return false;
 
@@ -92,6 +97,7 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
 	if (sk->sk_txtime_deadline_mode != q->deadline_mode)
 		return false;
 
+skip:
 	now = q->get_time();
 	if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
 		return false;
@@ -385,6 +391,7 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt,
 	q->clockid = qopt->clockid;
 	q->offload = OFFLOAD_IS_ON(qopt);
 	q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
+	q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt);
 
 	switch (q->clockid) {
 	case CLOCK_REALTIME:
@@ -473,6 +480,9 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (q->deadline_mode)
 		opt.flags |= TC_ETF_DEADLINE_MODE_ON;
 
+	if (q->skip_sock_check)
+		opt.flags |= TC_ETF_SKIP_SOCK_CHECK;
+
 	if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From 4cfd5779bd6efe8c76b4494aec63a063be0d2ff2 Mon Sep 17 00:00:00 2001
From: Vedang Patel <vedang.patel@intel.com>
Date: Tue, 25 Jun 2019 15:07:17 -0700
Subject: taprio: Add support for txtime-assist mode

Currently, we are seeing non-critical packets being transmitted outside of
their timeslice. We can confirm that the packets are being dequeued at the
right time, so the delay is induced on the hardware side. The most likely
reason is that the hardware queues are starving the lower priority queues.

In order to improve the performance of taprio, we will be making use of the
txtime feature provided by the ETF qdisc. For all the packets which do not
have the SO_TXTIME option set, taprio will set the transmit timestamp (set
in skb->tstamp) in this mode. TAPrio Qdisc will ensure that the transmit
time for the packet is set to when the gate is open. If SO_TXTIME is set,
the TAPrio qdisc will validate whether the timestamp (in skb->tstamp)
occurs when the gate corresponding to skb's traffic class is open.

Following two parameters added to support this mode:
- flags: used to enable txtime-assist mode. Will also be used to enable
  other modes (like hardware offloading) later.
- txtime-delay: This indicates the minimum time it will take for the packet
  to hit the wire. This is useful in determining whether we can transmit
  the packet in the remaining time if the gate corresponding to the
  packet is currently open.

An example configuration for enabling txtime-assist:

tc qdisc replace dev eth0 parent root handle 100 taprio \
      num_tc 3 \
      map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \
      queues 1@0 1@0 1@0 \
      base-time 1558653424279842568 \
      sched-entry S 01 300000 \
      sched-entry S 02 300000 \
      sched-entry S 04 400000 \
      flags 0x1 \
      txtime-delay 40000 \
      clockid CLOCK_TAI

tc qdisc replace dev $IFACE parent 100:1 etf skip_sock_check \
      offload delta 200000 clockid CLOCK_TAI

Note that all the traffic classes are mapped to the same queue.  This is
only possible in taprio when txtime-assist is enabled. Also, note that the
ETF Qdisc is enabled with offload mode set.

In this mode, if the packet's traffic class is open and the complete packet
can be transmitted, taprio will try to transmit the packet immediately.
This will be done by setting skb->tstamp to current_time + the time delta
indicated in the txtime-delay parameter. This parameter indicates the time
taken (in software) for packet to reach the network adapter.

If the packet cannot be transmitted in the current interval or if the
gate for the packet's traffic class is not currently open, skb->tstamp is
set to the next available timestamp value. This is tracked in the
next_txtime field of struct sched_entry.

The behaviour w.r.t admin and oper schedules is not changed from what is
present in software mode.

The transmit time is already known in advance. So, we do not need the HR
timers to advance the schedule and wakeup the dequeue side of taprio.  So,
HR timer won't be run when this mode is enabled.

Signed-off-by: Vedang Patel <vedang.patel@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |   4 +
 net/sched/sch_taprio.c         | 341 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 328 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 127ac6d2888c..390efb54b2e0 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1159,6 +1159,8 @@ enum {
  *       [TCA_TAPRIO_ATTR_SCHED_ENTRY_INTERVAL]
  */
 
+#define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST 0x1
+
 enum {
 	TCA_TAPRIO_ATTR_UNSPEC,
 	TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */
@@ -1170,6 +1172,8 @@ enum {
 	TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */
 	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */
 	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */
+	TCA_TAPRIO_ATTR_FLAGS, /* u32 */
+	TCA_TAPRIO_ATTR_TXTIME_DELAY, /* s32 */
 	__TCA_TAPRIO_ATTR_MAX,
 };
 
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 6ef0cc03fdb9..078230e44471 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -21,12 +21,16 @@
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
 #include <net/sch_generic.h>
+#include <net/sock.h>
 
 static LIST_HEAD(taprio_list);
 static DEFINE_SPINLOCK(taprio_list_lock);
 
 #define TAPRIO_ALL_GATES_OPEN -1
 
+#define FLAGS_VALID(flags) (!((flags) & ~TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST))
+#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
+
 struct sched_entry {
 	struct list_head list;
 
@@ -35,6 +39,7 @@ struct sched_entry {
 	 * packet leaves after this time.
 	 */
 	ktime_t close_time;
+	ktime_t next_txtime;
 	atomic_t budget;
 	int index;
 	u32 gate_mask;
@@ -55,6 +60,7 @@ struct sched_gate_list {
 struct taprio_sched {
 	struct Qdisc **qdiscs;
 	struct Qdisc *root;
+	u32 flags;
 	int clockid;
 	atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
 				    * speeds it's sub-nanoseconds per byte
@@ -68,6 +74,7 @@ struct taprio_sched {
 	ktime_t (*get_time)(void);
 	struct hrtimer advance_timer;
 	struct list_head taprio_list;
+	int txtime_delay;
 };
 
 static ktime_t sched_base_time(const struct sched_gate_list *sched)
@@ -108,6 +115,227 @@ static void switch_schedules(struct taprio_sched *q,
 	*admin = NULL;
 }
 
+/* Get how much time has been already elapsed in the current cycle. */
+static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time)
+{
+	ktime_t time_since_sched_start;
+	s32 time_elapsed;
+
+	time_since_sched_start = ktime_sub(time, sched->base_time);
+	div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed);
+
+	return time_elapsed;
+}
+
+static ktime_t get_interval_end_time(struct sched_gate_list *sched,
+				     struct sched_gate_list *admin,
+				     struct sched_entry *entry,
+				     ktime_t intv_start)
+{
+	s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start);
+	ktime_t intv_end, cycle_ext_end, cycle_end;
+
+	cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed);
+	intv_end = ktime_add_ns(intv_start, entry->interval);
+	cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension);
+
+	if (ktime_before(intv_end, cycle_end))
+		return intv_end;
+	else if (admin && admin != sched &&
+		 ktime_after(admin->base_time, cycle_end) &&
+		 ktime_before(admin->base_time, cycle_ext_end))
+		return admin->base_time;
+	else
+		return cycle_end;
+}
+
+static int length_to_duration(struct taprio_sched *q, int len)
+{
+	return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
+}
+
+/* Returns the entry corresponding to next available interval. If
+ * validate_interval is set, it only validates whether the timestamp occurs
+ * when the gate corresponding to the skb's traffic class is open.
+ */
+static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb,
+						  struct Qdisc *sch,
+						  struct sched_gate_list *sched,
+						  struct sched_gate_list *admin,
+						  ktime_t time,
+						  ktime_t *interval_start,
+						  ktime_t *interval_end,
+						  bool validate_interval)
+{
+	ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time;
+	ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time;
+	struct sched_entry *entry = NULL, *entry_found = NULL;
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	bool entry_available = false;
+	s32 cycle_elapsed;
+	int tc, n;
+
+	tc = netdev_get_prio_tc_map(dev, skb->priority);
+	packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb));
+
+	*interval_start = 0;
+	*interval_end = 0;
+
+	if (!sched)
+		return NULL;
+
+	cycle = sched->cycle_time;
+	cycle_elapsed = get_cycle_time_elapsed(sched, time);
+	curr_intv_end = ktime_sub_ns(time, cycle_elapsed);
+	cycle_end = ktime_add_ns(curr_intv_end, cycle);
+
+	list_for_each_entry(entry, &sched->entries, list) {
+		curr_intv_start = curr_intv_end;
+		curr_intv_end = get_interval_end_time(sched, admin, entry,
+						      curr_intv_start);
+
+		if (ktime_after(curr_intv_start, cycle_end))
+			break;
+
+		if (!(entry->gate_mask & BIT(tc)) ||
+		    packet_transmit_time > entry->interval)
+			continue;
+
+		txtime = entry->next_txtime;
+
+		if (ktime_before(txtime, time) || validate_interval) {
+			transmit_end_time = ktime_add_ns(time, packet_transmit_time);
+			if ((ktime_before(curr_intv_start, time) &&
+			     ktime_before(transmit_end_time, curr_intv_end)) ||
+			    (ktime_after(curr_intv_start, time) && !validate_interval)) {
+				entry_found = entry;
+				*interval_start = curr_intv_start;
+				*interval_end = curr_intv_end;
+				break;
+			} else if (!entry_available && !validate_interval) {
+				/* Here, we are just trying to find out the
+				 * first available interval in the next cycle.
+				 */
+				entry_available = 1;
+				entry_found = entry;
+				*interval_start = ktime_add_ns(curr_intv_start, cycle);
+				*interval_end = ktime_add_ns(curr_intv_end, cycle);
+			}
+		} else if (ktime_before(txtime, earliest_txtime) &&
+			   !entry_available) {
+			earliest_txtime = txtime;
+			entry_found = entry;
+			n = div_s64(ktime_sub(txtime, curr_intv_start), cycle);
+			*interval_start = ktime_add(curr_intv_start, n * cycle);
+			*interval_end = ktime_add(curr_intv_end, n * cycle);
+		}
+	}
+
+	return entry_found;
+}
+
+static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct sched_gate_list *sched, *admin;
+	ktime_t interval_start, interval_end;
+	struct sched_entry *entry;
+
+	rcu_read_lock();
+	sched = rcu_dereference(q->oper_sched);
+	admin = rcu_dereference(q->admin_sched);
+
+	entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp,
+				       &interval_start, &interval_end, true);
+	rcu_read_unlock();
+
+	return entry;
+}
+
+/* There are a few scenarios where we will have to modify the txtime from
+ * what is read from next_txtime in sched_entry. They are:
+ * 1. If txtime is in the past,
+ *    a. The gate for the traffic class is currently open and packet can be
+ *       transmitted before it closes, schedule the packet right away.
+ *    b. If the gate corresponding to the traffic class is going to open later
+ *       in the cycle, set the txtime of packet to the interval start.
+ * 2. If txtime is in the future, there are packets corresponding to the
+ *    current traffic class waiting to be transmitted. So, the following
+ *    possibilities exist:
+ *    a. We can transmit the packet before the window containing the txtime
+ *       closes.
+ *    b. The window might close before the transmission can be completed
+ *       successfully. So, schedule the packet in the next open window.
+ */
+static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch)
+{
+	ktime_t transmit_end_time, interval_end, interval_start;
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct sched_gate_list *sched, *admin;
+	ktime_t minimum_time, now, txtime;
+	int len, packet_transmit_time;
+	struct sched_entry *entry;
+	bool sched_changed;
+
+	now = q->get_time();
+	minimum_time = ktime_add_ns(now, q->txtime_delay);
+
+	rcu_read_lock();
+	admin = rcu_dereference(q->admin_sched);
+	sched = rcu_dereference(q->oper_sched);
+	if (admin && ktime_after(minimum_time, admin->base_time))
+		switch_schedules(q, &admin, &sched);
+
+	/* Until the schedule starts, all the queues are open */
+	if (!sched || ktime_before(minimum_time, sched->base_time)) {
+		txtime = minimum_time;
+		goto done;
+	}
+
+	len = qdisc_pkt_len(skb);
+	packet_transmit_time = length_to_duration(q, len);
+
+	do {
+		sched_changed = 0;
+
+		entry = find_entry_to_transmit(skb, sch, sched, admin,
+					       minimum_time,
+					       &interval_start, &interval_end,
+					       false);
+		if (!entry) {
+			txtime = 0;
+			goto done;
+		}
+
+		txtime = entry->next_txtime;
+		txtime = max_t(ktime_t, txtime, minimum_time);
+		txtime = max_t(ktime_t, txtime, interval_start);
+
+		if (admin && admin != sched &&
+		    ktime_after(txtime, admin->base_time)) {
+			sched = admin;
+			sched_changed = 1;
+			continue;
+		}
+
+		transmit_end_time = ktime_add(txtime, packet_transmit_time);
+		minimum_time = transmit_end_time;
+
+		/* Update the txtime of current entry to the next time it's
+		 * interval starts.
+		 */
+		if (ktime_after(transmit_end_time, interval_end))
+			entry->next_txtime = ktime_add(interval_start, sched->cycle_time);
+	} while (sched_changed || ktime_after(transmit_end_time, interval_end));
+
+	entry->next_txtime = transmit_end_time;
+
+done:
+	rcu_read_unlock();
+	return txtime;
+}
+
 static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 			  struct sk_buff **to_free)
 {
@@ -121,6 +349,15 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (unlikely(!child))
 		return qdisc_drop(skb, sch, to_free);
 
+	if (skb->sk && sock_flag(skb->sk, SOCK_TXTIME)) {
+		if (!is_valid_interval(skb, sch))
+			return qdisc_drop(skb, sch, to_free);
+	} else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+		skb->tstamp = get_packet_txtime(skb, sch);
+		if (!skb->tstamp)
+			return qdisc_drop(skb, sch, to_free);
+	}
+
 	qdisc_qstats_backlog_inc(sch, skb);
 	sch->q.qlen++;
 
@@ -156,6 +393,9 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
 		if (!skb)
 			continue;
 
+		if (TXTIME_ASSIST_IS_ENABLED(q->flags))
+			return skb;
+
 		prio = skb->priority;
 		tc = netdev_get_prio_tc_map(dev, prio);
 
@@ -168,11 +408,6 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
 	return NULL;
 }
 
-static int length_to_duration(struct taprio_sched *q, int len)
-{
-	return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
-}
-
 static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
 {
 	atomic_set(&entry->budget,
@@ -216,6 +451,13 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
 		if (unlikely(!child))
 			continue;
 
+		if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+			skb = child->ops->dequeue(child);
+			if (!skb)
+				continue;
+			goto skb_found;
+		}
+
 		skb = child->ops->peek(child);
 		if (!skb)
 			continue;
@@ -246,6 +488,7 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
 		if (unlikely(!skb))
 			goto done;
 
+skb_found:
 		qdisc_bstats_update(sch, skb);
 		qdisc_qstats_backlog_dec(sch, skb);
 		sch->q.qlen--;
@@ -522,7 +765,8 @@ static int parse_taprio_schedule(struct nlattr **tb,
 
 static int taprio_parse_mqprio_opt(struct net_device *dev,
 				   struct tc_mqprio_qopt *qopt,
-				   struct netlink_ext_ack *extack)
+				   struct netlink_ext_ack *extack,
+				   u32 taprio_flags)
 {
 	int i, j;
 
@@ -570,6 +814,9 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
 			return -EINVAL;
 		}
 
+		if (TXTIME_ASSIST_IS_ENABLED(taprio_flags))
+			continue;
+
 		/* Verify that the offset and counts do not overlap */
 		for (j = i + 1; j < qopt->num_tc; j++) {
 			if (last > qopt->offset[j]) {
@@ -700,6 +947,18 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
 	return NOTIFY_DONE;
 }
 
+static void setup_txtime(struct taprio_sched *q,
+			 struct sched_gate_list *sched, ktime_t base)
+{
+	struct sched_entry *entry;
+	u32 interval = 0;
+
+	list_for_each_entry(entry, &sched->entries, list) {
+		entry->next_txtime = ktime_add_ns(base, interval);
+		interval += entry->interval;
+	}
+}
+
 static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 			 struct netlink_ext_ack *extack)
 {
@@ -708,6 +967,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 	struct taprio_sched *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
 	struct tc_mqprio_qopt *mqprio = NULL;
+	u32 taprio_flags = 0;
 	int i, err, clockid;
 	unsigned long flags;
 	ktime_t start;
@@ -720,7 +980,21 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 	if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
 		mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
 
-	err = taprio_parse_mqprio_opt(dev, mqprio, extack);
+	if (tb[TCA_TAPRIO_ATTR_FLAGS]) {
+		taprio_flags = nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]);
+
+		if (q->flags != 0 && q->flags != taprio_flags) {
+			NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported");
+			return -EOPNOTSUPP;
+		} else if (!FLAGS_VALID(taprio_flags)) {
+			NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid");
+			return -EINVAL;
+		}
+
+		q->flags = taprio_flags;
+	}
+
+	err = taprio_parse_mqprio_opt(dev, mqprio, extack, taprio_flags);
 	if (err < 0)
 		return err;
 
@@ -779,7 +1053,18 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 	/* Protects against enqueue()/dequeue() */
 	spin_lock_bh(qdisc_lock(sch));
 
-	if (!hrtimer_active(&q->advance_timer)) {
+	if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) {
+		if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+			NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled");
+			err = -EINVAL;
+			goto unlock;
+		}
+
+		q->txtime_delay = nla_get_s32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
+	}
+
+	if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) &&
+	    !hrtimer_active(&q->advance_timer)) {
 		hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
 		q->advance_timer.function = advance_sched;
 	}
@@ -822,20 +1107,35 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 		goto unlock;
 	}
 
-	setup_first_close_time(q, new_admin, start);
+	if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) {
+		setup_txtime(q, new_admin, start);
 
-	/* Protects against advance_sched() */
-	spin_lock_irqsave(&q->current_entry_lock, flags);
+		if (!oper) {
+			rcu_assign_pointer(q->oper_sched, new_admin);
+			err = 0;
+			new_admin = NULL;
+			goto unlock;
+		}
 
-	taprio_start_sched(sch, start, new_admin);
+		rcu_assign_pointer(q->admin_sched, new_admin);
+		if (admin)
+			call_rcu(&admin->rcu, taprio_free_sched_cb);
+	} else {
+		setup_first_close_time(q, new_admin, start);
 
-	rcu_assign_pointer(q->admin_sched, new_admin);
-	if (admin)
-		call_rcu(&admin->rcu, taprio_free_sched_cb);
-	new_admin = NULL;
+		/* Protects against advance_sched() */
+		spin_lock_irqsave(&q->current_entry_lock, flags);
+
+		taprio_start_sched(sch, start, new_admin);
 
-	spin_unlock_irqrestore(&q->current_entry_lock, flags);
+		rcu_assign_pointer(q->admin_sched, new_admin);
+		if (admin)
+			call_rcu(&admin->rcu, taprio_free_sched_cb);
 
+		spin_unlock_irqrestore(&q->current_entry_lock, flags);
+	}
+
+	new_admin = NULL;
 	err = 0;
 
 unlock:
@@ -1073,6 +1373,13 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
 		goto options_error;
 
+	if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags))
+		goto options_error;
+
+	if (q->txtime_delay &&
+	    nla_put_s32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
+		goto options_error;
+
 	if (oper && dump_schedule(skb, oper))
 		goto options_error;
 
-- 
cgit v1.2.3


From 5233794b179136d597b84188c1285148f07012e6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 18 Jun 2019 13:15:06 +0200
Subject: net/mlx5e: reduce stack usage in mlx5_eswitch_termtbl_create

Putting an empty 'mlx5_flow_spec' structure on the stack is a bit
wasteful and causes a warning on 32-bit architectures when building
with clang -fsanitize-coverage:

drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c: In function 'mlx5_eswitch_termtbl_create':
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c:90:1: error: the frame size of 1032 bytes is larger than 1024 bytes [-Werror=frame-larger-than=]

Since the structure is never written to, we can statically allocate
it to avoid the stack usage. To be on the safe side, mark all
subsequent function arguments that we pass it into as 'const'
as well.

Fixes: 10caabdaad5a ("net/mlx5e: Use termination table for VLAN push actions")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Acked-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../mellanox/mlx5/core/eswitch_offloads_termtbl.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c    | 20 ++++++++++----------
 include/linux/mlx5/fs.h                              |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c
index cb7d8ebe2c95..1d55a324a17e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c
@@ -49,8 +49,8 @@ mlx5_eswitch_termtbl_create(struct mlx5_core_dev *dev,
 			    struct mlx5_termtbl_handle *tt,
 			    struct mlx5_flow_act *flow_act)
 {
+	static const struct mlx5_flow_spec spec = {};
 	struct mlx5_flow_namespace *root_ns;
-	struct mlx5_flow_spec spec = {};
 	int prio, flags;
 	int err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 585e7adcbf99..a68a51c5011a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -584,7 +584,7 @@ err_ida_remove:
 }
 
 static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft,
-				struct mlx5_flow_spec *spec,
+				const struct mlx5_flow_spec *spec,
 				struct mlx5_flow_act *flow_act)
 {
 	struct mlx5_flow_steering *steering = get_steering(&ft->node);
@@ -613,7 +613,7 @@ static void dealloc_flow_group(struct mlx5_flow_steering *steering,
 
 static struct mlx5_flow_group *alloc_flow_group(struct mlx5_flow_steering *steering,
 						u8 match_criteria_enable,
-						void *match_criteria,
+						const void *match_criteria,
 						int start_index,
 						int end_index)
 {
@@ -643,7 +643,7 @@ static struct mlx5_flow_group *alloc_flow_group(struct mlx5_flow_steering *steer
 
 static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *ft,
 						       u8 match_criteria_enable,
-						       void *match_criteria,
+						       const void *match_criteria,
 						       int start_index,
 						       int end_index,
 						       struct list_head *prev)
@@ -1286,7 +1286,7 @@ free_handle:
 }
 
 static struct mlx5_flow_group *alloc_auto_flow_group(struct mlx5_flow_table  *ft,
-						     struct mlx5_flow_spec *spec)
+						     const struct mlx5_flow_spec *spec)
 {
 	struct list_head *prev = &ft->node.children;
 	struct mlx5_flow_group *fg;
@@ -1454,7 +1454,7 @@ static int check_conflicting_ftes(struct fs_fte *fte,
 }
 
 static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
-					    struct mlx5_flow_spec *spec,
+					    const struct mlx5_flow_spec *spec,
 					    struct mlx5_flow_act *flow_act,
 					    struct mlx5_flow_destination *dest,
 					    int dest_num,
@@ -1539,7 +1539,7 @@ static void free_match_list(struct match_list_head *head)
 
 static int build_match_list(struct match_list_head *match_head,
 			    struct mlx5_flow_table *ft,
-			    struct mlx5_flow_spec *spec)
+			    const struct mlx5_flow_spec *spec)
 {
 	struct rhlist_head *tmp, *list;
 	struct mlx5_flow_group *g;
@@ -1592,7 +1592,7 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 
 static struct fs_fte *
 lookup_fte_locked(struct mlx5_flow_group *g,
-		  u32 *match_value,
+		  const u32 *match_value,
 		  bool take_write)
 {
 	struct fs_fte *fte_tmp;
@@ -1625,7 +1625,7 @@ out:
 static struct mlx5_flow_handle *
 try_add_to_existing_fg(struct mlx5_flow_table *ft,
 		       struct list_head *match_head,
-		       struct mlx5_flow_spec *spec,
+		       const struct mlx5_flow_spec *spec,
 		       struct mlx5_flow_act *flow_act,
 		       struct mlx5_flow_destination *dest,
 		       int dest_num,
@@ -1716,7 +1716,7 @@ out:
 
 static struct mlx5_flow_handle *
 _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
-		     struct mlx5_flow_spec *spec,
+		     const struct mlx5_flow_spec *spec,
 		     struct mlx5_flow_act *flow_act,
 		     struct mlx5_flow_destination *dest,
 		     int dest_num)
@@ -1823,7 +1823,7 @@ static bool fwd_next_prio_supported(struct mlx5_flow_table *ft)
 
 struct mlx5_flow_handle *
 mlx5_add_flow_rules(struct mlx5_flow_table *ft,
-		    struct mlx5_flow_spec *spec,
+		    const struct mlx5_flow_spec *spec,
 		    struct mlx5_flow_act *flow_act,
 		    struct mlx5_flow_destination *dest,
 		    int num_dest)
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index dc7e7aa53a13..04a569568eac 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -208,7 +208,7 @@ struct mlx5_flow_act {
  */
 struct mlx5_flow_handle *
 mlx5_add_flow_rules(struct mlx5_flow_table *ft,
-		    struct mlx5_flow_spec *spec,
+		    const struct mlx5_flow_spec *spec,
 		    struct mlx5_flow_act *flow_act,
 		    struct mlx5_flow_destination *dest,
 		    int num_dest);
-- 
cgit v1.2.3


From c8af5cd75e2411d5a5aacf115f59a5ff6b87f3fa Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Fri, 28 Jun 2019 11:12:34 +0200
Subject: xskmap: Move non-standard list manipulation to helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a helper in list.h for the non-standard way of clearing a list that is
used in xskmap. This makes it easier to reuse it in the other map types,
and also makes sure this usage is not forgotten in any list refactorings in
the future.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/list.h | 14 ++++++++++++++
 kernel/bpf/xskmap.c  |  3 +--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/list.h b/include/linux/list.h
index e951228db4b2..85c92555e31f 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -106,6 +106,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next)
 	WRITE_ONCE(prev->next, next);
 }
 
+/*
+ * Delete a list entry and clear the 'prev' pointer.
+ *
+ * This is a special-purpose list clearing method used in the networking code
+ * for lists allocated as per-cpu, where we don't want to incur the extra
+ * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this
+ * needs to check the node 'prev' pointer instead of calling list_empty().
+ */
+static inline void __list_del_clearprev(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->prev = NULL;
+}
+
 /**
  * list_del - deletes entry from list.
  * @entry: the element to delete from the list.
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index ef7338cebd18..9bb96ace9fa1 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -145,8 +145,7 @@ void __xsk_map_flush(struct bpf_map *map)
 
 	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
 		xsk_flush(xs);
-		__list_del(xs->flush_node.prev, xs->flush_node.next);
-		xs->flush_node.prev = NULL;
+		__list_del_clearprev(&xs->flush_node);
 	}
 }
 
-- 
cgit v1.2.3


From 4b55cf290dc6bd3a9e5da26d1ad60e77aa88c8cf Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Fri, 28 Jun 2019 11:12:34 +0200
Subject: devmap: Rename ifindex member in bpf_redirect_info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bpf_redirect_info struct has an 'ifindex' member which was named back
when the redirects could only target egress interfaces. Now that we can
also redirect to sockets and CPUs, this is a bit misleading, so rename the
member to tgt_index.

Reorder the struct members so we can have 'tgt_index' and 'tgt_value' next
to each other in a subsequent patch.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h |  2 +-
 net/core/filter.c      | 26 +++++++++++++-------------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 340f7d648974..92bd192f7786 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -578,8 +578,8 @@ struct bpf_skb_data_end {
 };
 
 struct bpf_redirect_info {
-	u32 ifindex;
 	u32 flags;
+	u32 tgt_index;
 	struct bpf_map *map;
 	struct bpf_map *map_to_flush;
 	u32 kern_flags;
diff --git a/net/core/filter.c b/net/core/filter.c
index 1e5fd37e9ab5..b4a062379bb9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2158,8 +2158,8 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return TC_ACT_SHOT;
 
-	ri->ifindex = ifindex;
 	ri->flags = flags;
+	ri->tgt_index = ifindex;
 
 	return TC_ACT_REDIRECT;
 }
@@ -2169,8 +2169,8 @@ int skb_do_redirect(struct sk_buff *skb)
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct net_device *dev;
 
-	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
-	ri->ifindex = 0;
+	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
+	ri->tgt_index = 0;
 	if (unlikely(!dev)) {
 		kfree_skb(skb);
 		return -EINVAL;
@@ -3488,11 +3488,11 @@ xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
 		     struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
 {
 	struct net_device *fwd;
-	u32 index = ri->ifindex;
+	u32 index = ri->tgt_index;
 	int err;
 
 	fwd = dev_get_by_index_rcu(dev_net(dev), index);
-	ri->ifindex = 0;
+	ri->tgt_index = 0;
 	if (unlikely(!fwd)) {
 		err = -EINVAL;
 		goto err;
@@ -3604,11 +3604,11 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 			       struct bpf_prog *xdp_prog, struct bpf_map *map,
 			       struct bpf_redirect_info *ri)
 {
-	u32 index = ri->ifindex;
+	u32 index = ri->tgt_index;
 	void *fwd = NULL;
 	int err;
 
-	ri->ifindex = 0;
+	ri->tgt_index = 0;
 	WRITE_ONCE(ri->map, NULL);
 
 	fwd = __xdp_map_lookup_elem(map, index);
@@ -3651,11 +3651,11 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 				       struct bpf_map *map)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-	u32 index = ri->ifindex;
+	u32 index = ri->tgt_index;
 	void *fwd = NULL;
 	int err = 0;
 
-	ri->ifindex = 0;
+	ri->tgt_index = 0;
 	WRITE_ONCE(ri->map, NULL);
 
 	fwd = __xdp_map_lookup_elem(map, index);
@@ -3695,14 +3695,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct bpf_map *map = READ_ONCE(ri->map);
-	u32 index = ri->ifindex;
+	u32 index = ri->tgt_index;
 	struct net_device *fwd;
 	int err = 0;
 
 	if (map)
 		return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
 						   map);
-	ri->ifindex = 0;
+	ri->tgt_index = 0;
 	fwd = dev_get_by_index_rcu(dev_net(dev), index);
 	if (unlikely(!fwd)) {
 		err = -EINVAL;
@@ -3730,8 +3730,8 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
 	if (unlikely(flags))
 		return XDP_ABORTED;
 
-	ri->ifindex = ifindex;
 	ri->flags = flags;
+	ri->tgt_index = ifindex;
 	WRITE_ONCE(ri->map, NULL);
 
 	return XDP_REDIRECT;
@@ -3753,8 +3753,8 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
 	if (unlikely(flags))
 		return XDP_ABORTED;
 
-	ri->ifindex = ifindex;
 	ri->flags = flags;
+	ri->tgt_index = ifindex;
 	WRITE_ONCE(ri->map, map);
 
 	return XDP_REDIRECT;
-- 
cgit v1.2.3


From 43e74c0267a35d6f5127218054b2d80c7fe801f5 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Fri, 28 Jun 2019 11:12:34 +0200
Subject: bpf_xdp_redirect_map: Perform map lookup in eBPF helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bpf_redirect_map() helper used by XDP programs doesn't return any
indication of whether it can successfully redirect to the map index it was
given. Instead, BPF programs have to track this themselves, leading to
programs using duplicate maps to track which entries are populated in the
devmap.

This patch fixes this by moving the map lookup into the bpf_redirect_map()
helper, which makes it possible to return failure to the eBPF program. The
lower bits of the flags argument are used as the return code, which means
that existing users who pass a '0' flag argument will get XDP_ABORTED.

With this, a BPF program can check the return code from the helper call and
react by, for instance, substituting a different redirect. This works for
any type of map used for redirect.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h     |  1 +
 include/trace/events/xdp.h |  5 ++---
 include/uapi/linux/bpf.h   |  7 +++++--
 net/core/filter.c          | 32 ++++++++++++++++++--------------
 4 files changed, 26 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 92bd192f7786..1fe53e78c7e3 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -580,6 +580,7 @@ struct bpf_skb_data_end {
 struct bpf_redirect_info {
 	u32 flags;
 	u32 tgt_index;
+	void *tgt_value;
 	struct bpf_map *map;
 	struct bpf_map *map_to_flush;
 	u32 kern_flags;
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 81e708c4b513..68899fdc985b 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -175,9 +175,8 @@ struct _bpf_dtab_netdev {
 #endif /* __DEVMAP_OBJ_TYPE */
 
 #define devmap_ifindex(fwd, map)				\
-	(!fwd ? 0 :						\
-	 ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
-	  ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))
+	((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
+	  ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
 	 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),	\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a396b516a2b2..cffea1826a1f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1571,8 +1571,11 @@ union bpf_attr {
  * 		but this is only implemented for native XDP (with driver
  * 		support) as of this writing).
  *
- * 		All values for *flags* are reserved for future usage, and must
- * 		be left at zero.
+ * 		The lower two bits of *flags* are used as the return code if
+ * 		the map lookup fails. This is so that the return value can be
+ * 		one of the XDP program return codes up to XDP_TX, as chosen by
+ * 		the caller. Any higher bits in the *flags* argument must be
+ * 		unset.
  *
  * 		When used to redirect packets to net devices, this helper
  * 		provides a high performance increase over **bpf_redirect**\ ().
diff --git a/net/core/filter.c b/net/core/filter.c
index b4a062379bb9..4836264f82ee 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3605,17 +3605,13 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 			       struct bpf_redirect_info *ri)
 {
 	u32 index = ri->tgt_index;
-	void *fwd = NULL;
+	void *fwd = ri->tgt_value;
 	int err;
 
 	ri->tgt_index = 0;
+	ri->tgt_value = NULL;
 	WRITE_ONCE(ri->map, NULL);
 
-	fwd = __xdp_map_lookup_elem(map, index);
-	if (unlikely(!fwd)) {
-		err = -EINVAL;
-		goto err;
-	}
 	if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
 		xdp_do_flush_map();
 
@@ -3652,18 +3648,13 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	u32 index = ri->tgt_index;
-	void *fwd = NULL;
+	void *fwd = ri->tgt_value;
 	int err = 0;
 
 	ri->tgt_index = 0;
+	ri->tgt_value = NULL;
 	WRITE_ONCE(ri->map, NULL);
 
-	fwd = __xdp_map_lookup_elem(map, index);
-	if (unlikely(!fwd)) {
-		err = -EINVAL;
-		goto err;
-	}
-
 	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
 		struct bpf_dtab_netdev *dst = fwd;
 
@@ -3732,6 +3723,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
 
 	ri->flags = flags;
 	ri->tgt_index = ifindex;
+	ri->tgt_value = NULL;
 	WRITE_ONCE(ri->map, NULL);
 
 	return XDP_REDIRECT;
@@ -3750,9 +3742,21 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
-	if (unlikely(flags))
+	/* Lower bits of the flags are used as return code on lookup failure */
+	if (unlikely(flags > XDP_TX))
 		return XDP_ABORTED;
 
+	ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
+	if (unlikely(!ri->tgt_value)) {
+		/* If the lookup fails we want to clear out the state in the
+		 * redirect_info struct completely, so that if an eBPF program
+		 * performs multiple lookups, the last one always takes
+		 * precedence.
+		 */
+		WRITE_ONCE(ri->map, NULL);
+		return flags;
+	}
+
 	ri->flags = flags;
 	ri->tgt_index = ifindex;
 	WRITE_ONCE(ri->map, map);
-- 
cgit v1.2.3


From 0472301a28f6cf53a6bc5783e48a2d0bbff4682f Mon Sep 17 00:00:00 2001
From: Baruch Siach <baruch@tkos.co.il>
Date: Fri, 28 Jun 2019 07:08:45 +0300
Subject: bpf: fix uapi bpf_prog_info fields alignment

Merge commit 1c8c5a9d38f60 ("Merge
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next") undid the
fix from commit 36f9814a494 ("bpf: fix uapi hole for 32 bit compat
applications") by taking the gpl_compatible 1-bit field definition from
commit b85fab0e67b162 ("bpf: Add gpl_compatible flag to struct
bpf_prog_info") as is. That breaks architectures with 16-bit alignment
like m68k. Add 31-bit pad after gpl_compatible to restore alignment of
following fields.

Thanks to Dmitry V. Levin for his analysis of this bug history.

Signed-off-by: Baruch Siach <baruch@tkos.co.il>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h       | 1 +
 tools/include/uapi/linux/bpf.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a8b823c30b43..29a5bc3d5c66 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3143,6 +3143,7 @@ struct bpf_prog_info {
 	char name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
 	__u32 gpl_compatible:1;
+	__u32 :31; /* alignment pad */
 	__u64 netns_dev;
 	__u64 netns_ino;
 	__u32 nr_jited_ksyms;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a8b823c30b43..29a5bc3d5c66 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3143,6 +3143,7 @@ struct bpf_prog_info {
 	char name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
 	__u32 gpl_compatible:1;
+	__u32 :31; /* alignment pad */
 	__u64 netns_dev;
 	__u64 netns_ino;
 	__u32 nr_jited_ksyms;
-- 
cgit v1.2.3


From b60a77386b1d4868f72f6353d35dabe5fbe981f2 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 26 Jun 2019 20:40:45 +0200
Subject: net: make skb_dst_force return true when dst is refcounted

netfilter did not expect that skb_dst_force() can cause skb to lose its
dst entry.

I got a bug report with a skb->dst NULL dereference in netfilter
output path.  The backtrace contains nf_reinject(), so the dst might have
been cleared when skb got queued to userspace.

Other users were fixed via
if (skb_dst(skb)) {
	skb_dst_force(skb);
	if (!skb_dst(skb))
		goto handle_err;
}

But I think it's preferable to make the 'dst might be cleared' part
of the function explicit.

In the netfilter case, an skb with a null dst is expected when queueing
in the prerouting hook, so drop the skb for the other hooks.

v2:
 v1 of this patch returned true in case skb had no dst entry.
 Eric said:
   Say if we have two skb_dst_force() calls for some reason
   on the same skb, only the first one will return false.

 This now returns false even when skb had no dst, as per Eric's
 suggestion, so callers might need to check skb_dst() first before
 skb_dst_force().

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h        | 5 ++++-
 net/netfilter/nf_queue.c | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index 12b31c602cb0..f8206d3fed2f 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -302,8 +302,9 @@ static inline bool dst_hold_safe(struct dst_entry *dst)
  * @skb: buffer
  *
  * If dst is not yet refcounted and not destroyed, grab a ref on it.
+ * Returns true if dst is refcounted.
  */
-static inline void skb_dst_force(struct sk_buff *skb)
+static inline bool skb_dst_force(struct sk_buff *skb)
 {
 	if (skb_dst_is_noref(skb)) {
 		struct dst_entry *dst = skb_dst(skb);
@@ -314,6 +315,8 @@ static inline void skb_dst_force(struct sk_buff *skb)
 
 		skb->_skb_refdst = (unsigned long)dst;
 	}
+
+	return skb->_skb_refdst != 0UL;
 }
 
 
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index b5b2be55ca82..2c440015ff0c 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -190,6 +190,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 		goto err;
 	}
 
+	if (!skb_dst_force(skb) && state->hook != NF_INET_PRE_ROUTING) {
+		status = -ENETDOWN;
+		goto err;
+	}
+
 	*entry = (struct nf_queue_entry) {
 		.skb	= skb,
 		.state	= *state,
@@ -198,7 +203,6 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 	};
 
 	nf_queue_entry_get_refs(entry);
-	skb_dst_force(skb);
 
 	switch (entry->state.pf) {
 	case AF_INET:
-- 
cgit v1.2.3


From 79293f49677e2e703ef0d0efc9919319adacb3fb Mon Sep 17 00:00:00 2001
From: Jiunn Chang <c0d1n61at3@gmail.com>
Date: Wed, 26 Jun 2019 22:25:30 -0500
Subject: packet: Fix undefined behavior in bit shift

Shifting a signed 32-bit value by 31 bits is undefined.  Make the
constant unsigned so that setting the most significant bit is well defined.

Changes included in v2:
  - use subsystem specific subject lines
  - CC required mailing lists

Signed-off-by: Jiunn Chang <c0d1n61at3@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_packet.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index 467b654bd4c7..3d884d68eb30 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -123,7 +123,7 @@ struct tpacket_auxdata {
 /* Rx and Tx ring - header status */
 #define TP_STATUS_TS_SOFTWARE		(1 << 29)
 #define TP_STATUS_TS_SYS_HARDWARE	(1 << 30) /* deprecated, never set */
-#define TP_STATUS_TS_RAW_HARDWARE	(1 << 31)
+#define TP_STATUS_TS_RAW_HARDWARE	(1U << 31)
 
 /* Rx ring - feature request bits */
 #define TP_FT_REQ_FILL_RXHASH	0x1
-- 
cgit v1.2.3


From c7b37c769d2a5e711106a3c793140a4f46768e04 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 24 Jun 2019 22:04:48 +0200
Subject: xfrm: remove get_mtu indirection from xfrm_type

esp4_get_mtu and esp6_get_mtu are exactly the same, the only difference
is a single sizeof() (ipv4 vs. ipv6 header).

Merge both into xfrm_state_mtu() and remove the indirection.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  4 +---
 net/ipv4/esp4.c        | 27 +--------------------------
 net/ipv6/esp6.c        | 20 +-------------------
 net/xfrm/xfrm_device.c |  5 ++---
 net/xfrm/xfrm_state.c  | 34 +++++++++++++++++++++++++++++-----
 5 files changed, 34 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 56b31676e330..b22db30c3d88 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -404,8 +404,6 @@ struct xfrm_type {
 	int			(*reject)(struct xfrm_state *, struct sk_buff *,
 					  const struct flowi *);
 	int			(*hdr_offset)(struct xfrm_state *, struct sk_buff *, u8 **);
-	/* Estimate maximal size of result of transformation of a dgram */
-	u32			(*get_mtu)(struct xfrm_state *, int size);
 };
 
 int xfrm_register_type(const struct xfrm_type *type, unsigned short family);
@@ -1546,7 +1544,7 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si);
 void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si);
 u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq);
 int xfrm_init_replay(struct xfrm_state *x);
-int xfrm_state_mtu(struct xfrm_state *x, int mtu);
+u32 xfrm_state_mtu(struct xfrm_state *x, int mtu);
 int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload);
 int xfrm_init_state(struct xfrm_state *x);
 int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index c06562aded11..5c967764041f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -33,8 +33,6 @@ struct esp_output_extra {
 
 #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
 
-static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
-
 /*
  * Allocate an AEAD request structure with extra space for SG and IV.
  *
@@ -506,7 +504,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
 		u32 padto;
 
-		padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+		padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
 		if (skb->len < padto)
 			esp.tfclen = padto - skb->len;
 	}
@@ -788,28 +786,6 @@ out:
 	return err;
 }
 
-static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
-{
-	struct crypto_aead *aead = x->data;
-	u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-	unsigned int net_adj;
-
-	switch (x->props.mode) {
-	case XFRM_MODE_TRANSPORT:
-	case XFRM_MODE_BEET:
-		net_adj = sizeof(struct iphdr);
-		break;
-	case XFRM_MODE_TUNNEL:
-		net_adj = 0;
-		break;
-	default:
-		BUG();
-	}
-
-	return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
-		 net_adj) & ~(blksize - 1)) + net_adj - 2;
-}
-
 static int esp4_err(struct sk_buff *skb, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
@@ -1035,7 +1011,6 @@ static const struct xfrm_type esp_type =
 	.flags		= XFRM_TYPE_REPLAY_PROT,
 	.init_state	= esp_init_state,
 	.destructor	= esp_destroy,
-	.get_mtu	= esp4_get_mtu,
 	.input		= esp_input,
 	.output		= esp_output,
 };
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index b6c6b3e08836..a3b403ba8f8f 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -41,8 +41,6 @@ struct esp_skb_cb {
 
 #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
 
-static u32 esp6_get_mtu(struct xfrm_state *x, int mtu);
-
 /*
  * Allocate an AEAD request structure with extra space for SG and IV.
  *
@@ -447,7 +445,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
 		u32 padto;
 
-		padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached));
+		padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
 		if (skb->len < padto)
 			esp.tfclen = padto - skb->len;
 	}
@@ -687,21 +685,6 @@ out:
 	return ret;
 }
 
-static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
-{
-	struct crypto_aead *aead = x->data;
-	u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-	unsigned int net_adj;
-
-	if (x->props.mode != XFRM_MODE_TUNNEL)
-		net_adj = sizeof(struct ipv6hdr);
-	else
-		net_adj = 0;
-
-	return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
-		 net_adj) & ~(blksize - 1)) + net_adj - 2;
-}
-
 static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		    u8 type, u8 code, int offset, __be32 info)
 {
@@ -919,7 +902,6 @@ static const struct xfrm_type esp6_type = {
 	.flags		= XFRM_TYPE_REPLAY_PROT,
 	.init_state	= esp6_init_state,
 	.destructor	= esp6_destroy,
-	.get_mtu	= esp6_get_mtu,
 	.input		= esp6_input,
 	.output		= esp6_output,
 	.hdr_offset	= xfrm6_find_1stfragopt,
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index b24cd86a02c3..f10a70388f72 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -275,9 +275,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 		return false;
 
 	if ((!dev || (dev == xfrm_dst_path(dst)->dev)) &&
-	    (!xdst->child->xfrm && x->type->get_mtu)) {
-		mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
-
+	    (!xdst->child->xfrm)) {
+		mtu = xfrm_state_mtu(x, xdst->child_mtu_cached);
 		if (skb->len <= mtu)
 			goto ok;
 
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index fd51737f9f17..c6f3c4a1bd99 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -27,6 +27,8 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 
+#include <crypto/aead.h>
+
 #include "xfrm_hash.h"
 
 #define xfrm_state_deref_prot(table, net) \
@@ -2403,16 +2405,38 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x)
 }
 EXPORT_SYMBOL(xfrm_state_delete_tunnel);
 
-int xfrm_state_mtu(struct xfrm_state *x, int mtu)
+u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
 {
 	const struct xfrm_type *type = READ_ONCE(x->type);
+	struct crypto_aead *aead;
+	u32 blksize, net_adj = 0;
+
+	if (x->km.state != XFRM_STATE_VALID ||
+	    !type || type->proto != IPPROTO_ESP)
+		return mtu - x->props.header_len;
+
+	aead = x->data;
+	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
 
-	if (x->km.state == XFRM_STATE_VALID &&
-	    type && type->get_mtu)
-		return type->get_mtu(x, mtu);
+	switch (x->props.mode) {
+	case XFRM_MODE_TRANSPORT:
+	case XFRM_MODE_BEET:
+		if (x->props.family == AF_INET)
+			net_adj = sizeof(struct iphdr);
+		else if (x->props.family == AF_INET6)
+			net_adj = sizeof(struct ipv6hdr);
+		break;
+	case XFRM_MODE_TUNNEL:
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
 
-	return mtu - x->props.header_len;
+	return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
+		 net_adj) & ~(blksize - 1)) + net_adj - 2;
 }
+EXPORT_SYMBOL_GPL(xfrm_state_mtu);
 
 int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
 {
-- 
cgit v1.2.3


From 40f6a2cb9cfc5da713f745b23bcc2c6761e5eb5e Mon Sep 17 00:00:00 2001
From: Vandana BN <bnvandana@gmail.com>
Date: Mon, 1 Jul 2019 17:25:39 +0530
Subject: net: dst.h: Fix shifting signed 32-bit value by 31 bits problem

Fix DST_FEATURE_ECN_CA to use the "U" suffix to avoid shifting a signed
32-bit value by 31 bits.

Signed-off-by: Vandana BN <bnvandana@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index f8206d3fed2f..fe62fe2eb781 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -183,7 +183,7 @@ static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
 }
 
 /* Kernel-internal feature bits that are unallocated in user space. */
-#define DST_FEATURE_ECN_CA	(1 << 31)
+#define DST_FEATURE_ECN_CA	(1U << 31)
 
 #define DST_FEATURE_MASK	(DST_FEATURE_ECN_CA)
 #define DST_FEATURE_ECN_MASK	(DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN)
-- 
cgit v1.2.3


From a346abe051bd2bd0d5d0140b2da9ec95639acad7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 1 Jul 2019 06:39:36 -0700
Subject: ipv6: icmp: allow flowlabel reflection in echo replies

Extend flowlabel_reflect bitmask to allow conditional
reflection of incoming flowlabels in echo replies.

Note this has precedence against auto flowlabels.

Add flowlabel_reflect enum to replace hard coded
values.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 4 +++-
 include/net/ipv6.h                     | 7 +++++++
 net/ipv6/af_inet6.c                    | 2 +-
 net/ipv6/icmp.c                        | 3 +++
 net/ipv6/sysctl_net_ipv6.c             | 4 ++--
 net/ipv6/tcp_ipv6.c                    | 2 +-
 6 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e0d8a96e2c67..f0e6d1f53485 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1452,7 +1452,7 @@ flowlabel_reflect - INTEGER
 	environments. See RFC 7690 and:
 	https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01
 
-	This is a mask of two bits.
+	This is a bitmask.
 	1: enabled for established flows
 
 	Note that this prevents automatic flowlabel changes, as done
@@ -1463,6 +1463,8 @@ flowlabel_reflect - INTEGER
 	If set, a RST packet sent in response to a SYN packet on a closed
 	port will reflect the incoming flow label.
 
+	4: enabled for ICMPv6 echo reply messages.
+
 	Default: 0
 
 fib_multipath_hash_policy - INTEGER
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index b41f6a0fa903..8eca5fb30376 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -301,6 +301,13 @@ struct ipv6_txoptions {
 	/* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */
 };
 
+/* flowlabel_reflect sysctl values */
+enum flowlabel_reflect {
+	FLOWLABEL_REFLECT_ESTABLISHED		= 1,
+	FLOWLABEL_REFLECT_TCP_RESET		= 2,
+	FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES	= 4,
+};
+
 struct ip6_flowlabel {
 	struct ip6_flowlabel __rcu *next;
 	__be32			label;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 7382a927d1eb..8369af32cef6 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -208,7 +208,7 @@ lookup_protocol:
 	np->mc_loop	= 1;
 	np->mc_all	= 1;
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
-	np->repflow	= net->ipv6.sysctl.flowlabel_reflect & 1;
+	np->repflow	= net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED;
 	sk->sk_ipv6only	= net->ipv6.sysctl.bindv6only;
 
 	/* Init the ipv4 part of the socket since we can have sockets
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 12906301ec7b..62c997201970 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -703,6 +703,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY;
 
 	memset(&fl6, 0, sizeof(fl6));
+	if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES)
+		fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb));
+
 	fl6.flowi6_proto = IPPROTO_ICMPV6;
 	fl6.daddr = ipv6_hdr(skb)->saddr;
 	if (saddr)
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 6d86fac472e7..8b3fe81783ed 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -23,7 +23,7 @@
 
 static int zero;
 static int one = 1;
-static int three = 3;
+static int flowlabel_reflect_max = 0x7;
 static int auto_flowlabels_min;
 static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
 
@@ -116,7 +116,7 @@ static struct ctl_table ipv6_table_template[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 		.extra1		= &zero,
-		.extra2		= &three,
+		.extra2		= &flowlabel_reflect_max,
 	},
 	{
 		.procname	= "max_dst_opts_number",
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 408d9ec26971..4f3f99b39820 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -989,7 +989,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 		if (sk->sk_state == TCP_TIME_WAIT)
 			label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
 	} else {
-		if (net->ipv6.sysctl.flowlabel_reflect & 2)
+		if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET)
 			label = ip6_flowlabel(ipv6h);
 	}
 
-- 
cgit v1.2.3


From 88405680ec57c35f5886dbb81b3f6f638f74f40d Mon Sep 17 00:00:00 2001
From: Vandana BN <bnvandana@gmail.com>
Date: Mon, 1 Jul 2019 19:46:10 +0530
Subject: net:gue.h:Fix shifting signed 32-bit value by 31 bits problem

Fix GUE_PFLAG_REMCSUM to use the "U" suffix to avoid the problem of
shifting a signed 32-bit value by 31 bits.

Signed-off-by: Vandana BN <bnvandana@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/gue.h b/include/net/gue.h
index fdad41469b65..3a6595bfa641 100644
--- a/include/net/gue.h
+++ b/include/net/gue.h
@@ -60,7 +60,7 @@ struct guehdr {
 
 /* Private flags in the private option extension */
 
-#define GUE_PFLAG_REMCSUM	htonl(1 << 31)
+#define GUE_PFLAG_REMCSUM	htonl(1U << 31)
 #define GUE_PLEN_REMCSUM	4
 
 #define GUE_PFLAGS_ALL	(GUE_PFLAG_REMCSUM)
-- 
cgit v1.2.3


From 1759d322f4bad2f82c376856363b725cac12e61d Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Fri, 28 Jun 2019 22:35:48 +0000
Subject: net/mlx5: Add hardware definitions for sub functions

Update mlx5 device interface data structures for:
1. New command definitions for allocating, deallocating SF
2. Query SF partition
3. Eswitch SF fields
4. HCA CAP SF fields
5. Extend Eswitch functions command for SF

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Vu Pham <vuhuong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 99 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 96 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index d4409654f760..db00effaa83a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -106,6 +106,9 @@ enum {
 	MLX5_CMD_OP_QUERY_ISSI                    = 0x10a,
 	MLX5_CMD_OP_SET_ISSI                      = 0x10b,
 	MLX5_CMD_OP_SET_DRIVER_VERSION            = 0x10d,
+	MLX5_CMD_OP_QUERY_SF_PARTITION            = 0x111,
+	MLX5_CMD_OP_ALLOC_SF                      = 0x113,
+	MLX5_CMD_OP_DEALLOC_SF                    = 0x114,
 	MLX5_CMD_OP_CREATE_MKEY                   = 0x200,
 	MLX5_CMD_OP_QUERY_MKEY                    = 0x201,
 	MLX5_CMD_OP_DESTROY_MKEY                  = 0x202,
@@ -713,7 +716,11 @@ struct mlx5_ifc_e_switch_cap_bits {
 	u8         reserved_2b[0x6];
 	u8         max_encap_header_size[0xa];
 
-	u8         reserved_40[0x7c0];
+	u8         reserved_at_40[0xb];
+	u8         log_max_esw_sf[0x5];
+	u8         esw_sf_base_id[0x10];
+
+	u8         reserved_at_60[0x7a0];
 
 };
 
@@ -1330,13 +1337,24 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_640[0x10];
 	u8         num_q_monitor_counters[0x10];
 
-	u8         reserved_at_660[0x40];
+	u8         reserved_at_660[0x20];
+
+	u8         sf[0x1];
+	u8         sf_set_partition[0x1];
+	u8         reserved_at_682[0x1];
+	u8         log_max_sf[0x5];
+	u8         reserved_at_688[0x8];
+	u8         log_min_sf_size[0x8];
+	u8         max_num_sf_partitions[0x8];
 
 	u8         uctx_cap[0x20];
 
 	u8         reserved_at_6c0[0x4];
 	u8         flex_parser_id_geneve_tlv_option_0[0x4];
-	u8         reserved_at_6c8[0x138];
+	u8	   reserved_at_6c8[0x28];
+	u8	   sf_base_id[0x10];
+
+	u8	   reserved_at_700[0x100];
 };
 
 enum mlx5_flow_destination_type {
@@ -9786,6 +9804,81 @@ struct mlx5_ifc_query_esw_functions_out_bits {
 	struct mlx5_ifc_host_params_context_bits host_params_context;
 
 	u8         reserved_at_280[0x180];
+	u8         host_sf_enable[0][0x40];
+};
+
+struct mlx5_ifc_sf_partition_bits {
+	u8         reserved_at_0[0x10];
+	u8         log_num_sf[0x8];
+	u8         log_sf_bar_size[0x8];
+};
+
+struct mlx5_ifc_query_sf_partitions_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x18];
+	u8         num_sf_partitions[0x8];
+
+	u8         reserved_at_60[0x20];
+
+	struct mlx5_ifc_sf_partition_bits sf_partition[0];
+};
+
+struct mlx5_ifc_query_sf_partitions_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_sf_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_sf_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x10];
+	u8         function_id[0x10];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_alloc_sf_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_alloc_sf_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x10];
+	u8         function_id[0x10];
+
+	u8         reserved_at_60[0x20];
 };
 
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3


From a82e0b5bdac29d9719d3ca2df01494a7947351aa Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayag@mellanox.com>
Date: Fri, 28 Jun 2019 22:35:50 +0000
Subject: net/mlx5: Added MCQI and MCQS registers' description to ifc

Given a fw component index, the MCQI register allows us to query
this component's information (e.g. its version and capabilities).

Given a fw component index, the MCQS register allows us to query the
status of a fw component, including its type and state
(e.g. PRESENT/IN_USE).
It can be used to find the index of a component of a specific type, by
sequentially increasing the component index, and querying each time the
type of the returned component.
If max component index is reached, 'last_index_flag' is set by the HCA.

These registers' description was added to query the running and pending
fw version of the HCA.

Signed-off-by: Shay Agroskin <shayag@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 59 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 58 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 87f77ded78d4..2ff624a91e3d 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -138,6 +138,7 @@ enum {
 	MLX5_REG_MTPPS		 = 0x9053,
 	MLX5_REG_MTPPSE		 = 0x9054,
 	MLX5_REG_MPEGC		 = 0x9056,
+	MLX5_REG_MCQS		 = 0x9060,
 	MLX5_REG_MCQI		 = 0x9061,
 	MLX5_REG_MCC		 = 0x9062,
 	MLX5_REG_MCDA		 = 0x9063,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index db00effaa83a..e2a77b5152a8 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -8542,7 +8542,7 @@ struct mlx5_ifc_mcam_access_reg_bits {
 	u8         mcda[0x1];
 	u8         mcc[0x1];
 	u8         mcqi[0x1];
-	u8         reserved_at_1f[0x1];
+	u8         mcqs[0x1];
 
 	u8         regs_95_to_87[0x9];
 	u8         mpegc[0x1];
@@ -9034,6 +9034,24 @@ struct mlx5_ifc_mtppse_reg_bits {
 	u8         reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_mcqs_reg_bits {
+	u8         last_index_flag[0x1];
+	u8         reserved_at_1[0x7];
+	u8         fw_device[0x8];
+	u8         component_index[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         identifier[0x10];
+
+	u8         reserved_at_40[0x17];
+	u8         component_status[0x5];
+	u8         component_update_state[0x4];
+
+	u8         last_update_state_changer_type[0x4];
+	u8         last_update_state_changer_host_id[0x4];
+	u8         reserved_at_68[0x18];
+};
+
 struct mlx5_ifc_mcqi_cap_bits {
 	u8         supported_info_bitmask[0x20];
 
@@ -9054,6 +9072,43 @@ struct mlx5_ifc_mcqi_cap_bits {
 	u8         reserved_at_86[0x1a];
 };
 
+struct mlx5_ifc_mcqi_version_bits {
+	u8         reserved_at_0[0x2];
+	u8         build_time_valid[0x1];
+	u8         user_defined_time_valid[0x1];
+	u8         reserved_at_4[0x14];
+	u8         version_string_length[0x8];
+
+	u8         version[0x20];
+
+	u8         build_time[0x40];
+
+	u8         user_defined_time[0x40];
+
+	u8         build_tool_version[0x20];
+
+	u8         reserved_at_e0[0x20];
+
+	u8         version_string[92][0x8];
+};
+
+struct mlx5_ifc_mcqi_activation_method_bits {
+	u8         pending_server_ac_power_cycle[0x1];
+	u8         pending_server_dc_power_cycle[0x1];
+	u8         pending_server_reboot[0x1];
+	u8         pending_fw_reset[0x1];
+	u8         auto_activate[0x1];
+	u8         all_hosts_sync[0x1];
+	u8         device_hw_reset[0x1];
+	u8         reserved_at_7[0x19];
+};
+
+union mlx5_ifc_mcqi_reg_data_bits {
+	struct mlx5_ifc_mcqi_cap_bits               mcqi_caps;
+	struct mlx5_ifc_mcqi_version_bits           mcqi_version;
+	struct mlx5_ifc_mcqi_activation_method_bits mcqi_activation_mathod;
+};
+
 struct mlx5_ifc_mcqi_reg_bits {
 	u8         read_pending_component[0x1];
 	u8         reserved_at_1[0xf];
@@ -9071,7 +9126,7 @@ struct mlx5_ifc_mcqi_reg_bits {
 	u8         reserved_at_a0[0x10];
 	u8         data_size[0x10];
 
-	u8         data[0][0x20];
+	union mlx5_ifc_mcqi_reg_data_bits data[0];
 };
 
 struct mlx5_ifc_mcc_reg_bits {
-- 
cgit v1.2.3


From 2f69e591e4531d3192841a4eb2bd9b512f5a8b66 Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Fri, 28 Jun 2019 22:35:53 +0000
Subject: {IB, net}/mlx5: E-Switch, Use index of rep for vport to IB port
 mapping

In the single IB device mode, the mapping between vport number and
rep relies on a counter. However, for dynamic vport allocation, it is
desirable to keep a consistent mapping between eswitch vport and IB port.

Hence, simplify code to remove the free running counter and instead
use the available vport index during load/unload sequence from the
eswitch.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Suggested-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/ib_rep.c                        | 4 ++--
 drivers/infiniband/hw/mlx5/mlx5_ib.h                       | 1 -
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 1 +
 include/linux/mlx5/eswitch.h                               | 2 ++
 4 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 22e651cb5534..1de16a93fc64 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -14,7 +14,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	int vport_index;
 
 	ibdev = mlx5_ib_get_uplink_ibdev(dev->priv.eswitch);
-	vport_index = ibdev->free_port++;
+	vport_index = rep->vport_index;
 
 	ibdev->port[vport_index].rep = rep;
 	write_lock(&ibdev->port[vport_index].roce.netdev_lock);
@@ -50,7 +50,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	}
 
 	ibdev->is_rep = true;
-	vport_index = ibdev->free_port++;
+	vport_index = rep->vport_index;
 	ibdev->port[vport_index].rep = rep;
 	ibdev->port[vport_index].roce.netdev =
 		mlx5_ib_get_rep_netdev(dev->priv.eswitch, rep->vport);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 1c205c2bd486..ee73dc122d28 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -978,7 +978,6 @@ struct mlx5_ib_dev {
 	u16			devx_whitelist_uid;
 	struct mlx5_srq_table   srq_table;
 	struct mlx5_async_ctx   async_ctx;
-	int			free_port;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index bc639a846714..24af2744453b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1411,6 +1411,7 @@ int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 
 	mlx5_esw_for_all_reps(esw, vport_index, rep) {
 		rep->vport = mlx5_eswitch_index_to_vport_num(esw, vport_index);
+		rep->vport_index = vport_index;
 		ether_addr_copy(rep->hw_id, hw_id);
 
 		for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++)
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index aece3ae1902d..36cb641188b0 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -46,6 +46,8 @@ struct mlx5_eswitch_rep {
 	u16		       vport;
 	u8		       hw_id[ETH_ALEN];
 	u16		       vlan;
+	/* Only IB rep is using vport_index */
+	u16		       vport_index;
 	u32		       vlan_refcount;
 };
 
-- 
cgit v1.2.3


From 386e75af995c3aec475a2185b919bf46af396bfc Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Fri, 28 Jun 2019 22:35:58 +0000
Subject: net/mlx5: Rename mlx5_pci_dev_type to mlx5_coredev_type

Rename mlx5_pci_dev_type to mlx5_coredev_type to distinguish different mlx5
device types.

mlx5_coredev_type represents mlx5_core_dev instance type. Hence keep
mlx5_coredev_type in mlx5_core_dev structure.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Vu Pham <vuhuong@mellanox.com>
Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c |  5 +++--
 include/linux/mlx5/driver.h                    | 11 ++++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index bfc8c6faedc2..e5f9df7f7e34 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -731,8 +731,6 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev,
 	struct mlx5_priv *priv = &dev->priv;
 	int err = 0;
 
-	priv->pci_dev_data = id->driver_data;
-
 	pci_set_drvdata(dev->pdev, dev);
 
 	dev->bar_addr = pci_resource_start(pdev, 0);
@@ -1320,6 +1318,9 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	dev->device = &pdev->dev;
 	dev->pdev = pdev;
 
+	dev->coredev_type = id->driver_data & MLX5_PCI_DEV_IS_VF ?
+			 MLX5_COREDEV_VF : MLX5_COREDEV_PF;
+
 	err = mlx5_mdev_init(dev, prof_sel);
 	if (err)
 		goto mdev_init_err;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2ff624a91e3d..155b8cbe1cc9 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -182,6 +182,11 @@ enum port_state_policy {
 	MLX5_POLICY_INVALID	= 0xffffffff
 };
 
+enum mlx5_coredev_type {
+	MLX5_COREDEV_PF,
+	MLX5_COREDEV_VF
+};
+
 struct mlx5_field_desc {
 	struct dentry	       *dent;
 	int			i;
@@ -567,7 +572,6 @@ struct mlx5_priv {
 	struct mlx5_core_sriov	sriov;
 	struct mlx5_lag		*lag;
 	struct mlx5_devcom	*devcom;
-	unsigned long		pci_dev_data;
 	struct mlx5_core_roce	roce;
 	struct mlx5_fc_stats		fc_stats;
 	struct mlx5_rl_table            rl_table;
@@ -646,6 +650,7 @@ struct mlx5_vxlan;
 
 struct mlx5_core_dev {
 	struct device *device;
+	enum mlx5_coredev_type coredev_type;
 	struct pci_dev	       *pdev;
 	/* sync pci state */
 	struct mutex		pci_status_mutex;
@@ -1079,9 +1084,9 @@ enum {
 	MLX5_PCI_DEV_IS_VF		= 1 << 0,
 };
 
-static inline int mlx5_core_is_pf(struct mlx5_core_dev *dev)
+static inline bool mlx5_core_is_pf(struct mlx5_core_dev *dev)
 {
-	return !(dev->priv.pci_dev_data & MLX5_PCI_DEV_IS_VF);
+	return dev->coredev_type == MLX5_COREDEV_PF;
 }
 
 static inline bool mlx5_core_is_ecpf(struct mlx5_core_dev *dev)
-- 
cgit v1.2.3


From 5ccf2770e83bf8739f0a7c8bed9186d7e5d2ecbc Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Fri, 28 Jun 2019 22:36:04 +0000
Subject: net/mlx5: Don't handle VF func change if host PF is disabled

When ECPF eswitch manager is at offloads mode, it monitors functions
changed event from host PF side and acts according to the number of
VFs enabled/disabled.

As ECPF and host PF work in two independent hosts, it's possible that
host PF OS reboots but ECPF system is still kept on and continues
monitoring events from host PF. When kernel from host PF side is
booting, PCI iov driver does sriov_init and compute_max_vf_buses by
iterating over all valid num of VFs. This triggers FLR and generates
functions changed events, even though host PF HCA is not enabled at
this time. However, ECPF is not aware of this information, and still
handles these events as usual. The ECPF system will see a massive number
of reps being created and then destroyed immediately after creation.

To eliminate this noise, a bit is added to host parameter context to
indicate host PF is disabled. ECPF will not handle the VF changed
event if this bit is set.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 5 ++++-
 include/linux/mlx5/mlx5_ifc.h                              | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 24af2744453b..105c21069c0c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2026,6 +2026,7 @@ static void esw_functions_changed_event_handler(struct work_struct *work)
 	u32 out[MLX5_ST_SZ_DW(query_esw_functions_out)] = {};
 	struct mlx5_host_work *host_work;
 	struct mlx5_eswitch *esw;
+	bool host_pf_disabled;
 	u16 num_vfs = 0;
 	int err;
 
@@ -2035,7 +2036,9 @@ static void esw_functions_changed_event_handler(struct work_struct *work)
 	err = mlx5_esw_query_functions(esw->dev, out, sizeof(out));
 	num_vfs = MLX5_GET(query_esw_functions_out, out,
 			   host_params_context.host_num_of_vfs);
-	if (err || num_vfs == esw->esw_funcs.num_vfs)
+	host_pf_disabled = MLX5_GET(query_esw_functions_out, out,
+				    host_params_context.host_pf_disabled);
+	if (err || host_pf_disabled || num_vfs == esw->esw_funcs.num_vfs)
 		goto out;
 
 	/* Number of VFs can only change from "0 to x" or "x to 0". */
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index e2a77b5152a8..031db53e94ce 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9823,7 +9823,8 @@ struct mlx5_ifc_mtrc_ctrl_bits {
 
 struct mlx5_ifc_host_params_context_bits {
 	u8         host_number[0x8];
-	u8         reserved_at_8[0x8];
+	u8         reserved_at_8[0x7];
+	u8         host_pf_disabled[0x1];
 	u8         host_num_of_vfs[0x10];
 
 	u8         host_total_vfs[0x10];
-- 
cgit v1.2.3


From d886aba677a0a75ad7fdb06e08418b481e09b036 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Fri, 28 Jun 2019 22:36:06 +0000
Subject: net/mlx5: Reduce dependency on enabled_vfs counter and num_vfs

While enabling SR-IOV, the PCI core already checks whether SR-IOV is
already enabled and, if so, returns a failure error code.
Hence, remove the duplicate check from the mlx5_core driver.

While at it, make mlx5_device_disable_sriov() perform cleanup of VFs in
the reverse order of mlx5_device_enable_sriov().

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 22 ++++------------------
 include/linux/mlx5/driver.h                     |  1 -
 2 files changed, 4 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
index 2eecb831c499..9d9ff4511306 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
@@ -74,13 +74,6 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs)
 	int err;
 	int vf;
 
-	if (sriov->enabled_vfs) {
-		mlx5_core_warn(dev,
-			       "failed to enable SRIOV on device, already enabled with %d vfs\n",
-			       sriov->enabled_vfs);
-		return -EBUSY;
-	}
-
 	if (!MLX5_ESWITCH_MANAGER(dev))
 		goto enable_vfs_hca;
 
@@ -99,7 +92,6 @@ enable_vfs_hca:
 			continue;
 		}
 		sriov->vfs_ctx[vf].enabled = 1;
-		sriov->enabled_vfs++;
 		if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) {
 			err = sriov_restore_guids(dev, vf);
 			if (err) {
@@ -118,13 +110,11 @@ enable_vfs_hca:
 static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
+	int num_vfs = pci_num_vf(dev->pdev);
 	int err;
 	int vf;
 
-	if (!sriov->enabled_vfs)
-		goto out;
-
-	for (vf = 0; vf < sriov->num_vfs; vf++) {
+	for (vf = num_vfs - 1; vf >= 0; vf--) {
 		if (!sriov->vfs_ctx[vf].enabled)
 			continue;
 		err = mlx5_core_disable_hca(dev, vf + 1);
@@ -133,10 +123,8 @@ static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev)
 			continue;
 		}
 		sriov->vfs_ctx[vf].enabled = 0;
-		sriov->enabled_vfs--;
 	}
 
-out:
 	if (MLX5_ESWITCH_MANAGER(dev))
 		mlx5_eswitch_disable_sriov(dev->priv.eswitch);
 
@@ -191,13 +179,11 @@ int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs)
 
 int mlx5_sriov_attach(struct mlx5_core_dev *dev)
 {
-	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
-
-	if (!mlx5_core_is_pf(dev) || !sriov->num_vfs)
+	if (!mlx5_core_is_pf(dev) || !pci_num_vf(dev->pdev))
 		return 0;
 
 	/* If sriov VFs exist in PCI level, enable them in device level */
-	return mlx5_device_enable_sriov(dev, sriov->num_vfs);
+	return mlx5_device_enable_sriov(dev, pci_num_vf(dev->pdev));
 }
 
 void mlx5_sriov_detach(struct mlx5_core_dev *dev)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 155b8cbe1cc9..7658a4908431 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -468,7 +468,6 @@ struct mlx5_vf_context {
 struct mlx5_core_sriov {
 	struct mlx5_vf_context	*vfs_ctx;
 	int			num_vfs;
-	int			enabled_vfs;
 	u16			max_vfs;
 };
 
-- 
cgit v1.2.3


From e1d974d03e590cf8370d4820e8b467ee700925c3 Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Fri, 28 Jun 2019 22:36:13 +0000
Subject: net/mlx5: Handle host PF vport mac/guid for ECPF

When ECPF is eswitch manager, it has the privilege to query and
configure the mac and node guid of host PF.

Although the vport number of the host PF is 0, the vport command should
be issued with other_vport set in this case, because the command is
issued by the ECPF vport (0xfffe).

Add a specific function to query own vport mac. Low level functions
are used by vport manager to query/modify any vport mac and node guid.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  2 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  2 +-
 .../net/ethernet/mellanox/mlx5/core/fpga/conn.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/rdma.c     |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/vport.c    | 28 ++++++++++++++--------
 include/linux/mlx5/vport.h                         |  3 ++-
 9 files changed, 27 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index 554672edf8c3..8dd31b5c740c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -680,7 +680,7 @@ static void mlx5e_dcbnl_getpermhwaddr(struct net_device *netdev,
 
 	memset(perm_addr, 0xff, MAX_ADDR_LEN);
 
-	mlx5_query_nic_vport_mac_address(priv->mdev, 0, perm_addr);
+	mlx5_query_mac_address(priv->mdev, perm_addr);
 }
 
 static void mlx5e_dcbnl_setpgtccfgtx(struct net_device *netdev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 457cc39423f2..bc9150f18116 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4581,7 +4581,7 @@ static void mlx5e_set_netdev_dev_addr(struct net_device *netdev)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 
-	mlx5_query_nic_vport_mac_address(priv->mdev, 0, netdev->dev_addr);
+	mlx5_query_mac_address(priv->mdev, netdev->dev_addr);
 	if (is_zero_ether_addr(netdev->dev_addr) &&
 	    !MLX5_CAP_GEN(priv->mdev, vport_group_manager)) {
 		eth_hw_addr_random(netdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 33f8f99681a5..abe8540d6879 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1394,7 +1394,7 @@ static void mlx5e_build_rep_netdev(struct net_device *netdev)
 		SET_NETDEV_DEV(netdev, mdev->device);
 		netdev->netdev_ops = &mlx5e_netdev_ops_uplink_rep;
 		/* we want a persistent mac for the uplink rep */
-		mlx5_query_nic_vport_mac_address(mdev, 0, netdev->dev_addr);
+		mlx5_query_mac_address(mdev, netdev->dev_addr);
 		netdev->ethtool_ops = &mlx5e_uplink_rep_ethtool_ops;
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 		if (MLX5_CAP_GEN(mdev, qos))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 0c75219d91b5..a758755d7a08 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -897,7 +897,7 @@ static void esw_vport_change_handle_locked(struct mlx5_vport *vport)
 	struct mlx5_eswitch *esw = dev->priv.eswitch;
 	u8 mac[ETH_ALEN];
 
-	mlx5_query_nic_vport_mac_address(dev, vport->vport, mac);
+	mlx5_query_nic_vport_mac_address(dev, vport->vport, true, mac);
 	esw_debug(dev, "vport[%d] Context Changed: perm mac: %pM\n",
 		  vport->vport, mac);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 105c21069c0c..b253bdf75dd6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1407,7 +1407,7 @@ int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 	if (!esw->offloads.vport_reps)
 		return -ENOMEM;
 
-	mlx5_query_nic_vport_mac_address(dev, 0, hw_id);
+	mlx5_query_mac_address(dev, hw_id);
 
 	mlx5_esw_for_all_reps(esw, vport_index, rep) {
 		rep->vport = mlx5_eswitch_index_to_vport_num(esw, vport_index);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index ca2296a2f9ee..d61d536f4e17 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -867,7 +867,7 @@ struct mlx5_fpga_conn *mlx5_fpga_conn_create(struct mlx5_fpga_device *fdev,
 	conn->cb_arg = attr->cb_arg;
 
 	remote_mac = MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, remote_mac_47_32);
-	err = mlx5_query_nic_vport_mac_address(fdev->mdev, 0, remote_mac);
+	err = mlx5_query_mac_address(fdev->mdev, remote_mac);
 	if (err) {
 		mlx5_fpga_err(fdev, "Failed to query local MAC: %d\n", err);
 		ret = ERR_PTR(err);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c
index 401441aefbcb..17ce9dd56b13 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c
@@ -126,7 +126,7 @@ static void mlx5_rdma_make_default_gid(struct mlx5_core_dev *dev, union ib_gid *
 {
 	u8 hw_id[ETH_ALEN];
 
-	mlx5_query_nic_vport_mac_address(dev, 0, hw_id);
+	mlx5_query_mac_address(dev, hw_id);
 	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
 	addrconf_addr_eui48(&gid->raw[8], hw_id);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 95cdc8cbcba4..670fa493c5f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -155,11 +155,12 @@ int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev,
 }
 
 int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
-				     u16 vport, u8 *addr)
+				     u16 vport, bool other, u8 *addr)
 {
-	u32 *out;
 	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+	u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {};
 	u8 *out_addr;
+	u32 *out;
 	int err;
 
 	out = kvzalloc(outlen, GFP_KERNEL);
@@ -169,7 +170,12 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 	out_addr = MLX5_ADDR_OF(query_nic_vport_context_out, out,
 				nic_vport_context.permanent_address);
 
-	err = mlx5_query_nic_vport_context(mdev, vport, out, outlen);
+	MLX5_SET(query_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
+	MLX5_SET(query_nic_vport_context_in, in, vport_number, vport);
+	MLX5_SET(query_nic_vport_context_in, in, other_vport, other);
+
+	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen);
 	if (!err)
 		ether_addr_copy(addr, &out_addr[2]);
 
@@ -178,6 +184,12 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 }
 EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mac_address);
 
+int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr)
+{
+	return mlx5_query_nic_vport_mac_address(mdev, 0, false, addr);
+}
+EXPORT_SYMBOL_GPL(mlx5_query_mac_address);
+
 int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 				      u16 vport, u8 *addr)
 {
@@ -194,9 +206,7 @@ int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 	MLX5_SET(modify_nic_vport_context_in, in,
 		 field_select.permanent_address, 1);
 	MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport);
-
-	if (vport)
-		MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1);
+	MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1);
 
 	nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in,
 				     in, nic_vport_context);
@@ -291,9 +301,7 @@ int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev,
 		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
 	MLX5_SET(query_nic_vport_context_in, in, allowed_list_type, list_type);
 	MLX5_SET(query_nic_vport_context_in, in, vport_number, vport);
-
-	if (vport)
-		MLX5_SET(query_nic_vport_context_in, in, other_vport, 1);
+	MLX5_SET(query_nic_vport_context_in, in, other_vport, 1);
 
 	err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz);
 	if (err)
@@ -483,7 +491,7 @@ int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev,
 	MLX5_SET(modify_nic_vport_context_in, in,
 		 field_select.node_guid, 1);
 	MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport);
-	MLX5_SET(modify_nic_vport_context_in, in, other_vport, !!vport);
+	MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1);
 
 	nic_vport_context = MLX5_ADDR_OF(modify_nic_vport_context_in,
 					 in, nic_vport_context);
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 3d1c6cdbbba7..c147acc7bf70 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -69,7 +69,8 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport);
 int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 				  u16 vport, u8 other_vport, u8 state);
 int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
-				     u16 vport, u8 *addr);
+				     u16 vport, bool other, u8 *addr);
+int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr);
 int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
 				    u16 vport, u8 *min_inline);
 void mlx5_query_min_inline(struct mlx5_core_dev *mdev, u8 *min_inline);
-- 
cgit v1.2.3


From f6455de0b0e52dcb11aeb503151b12ec87f9c5e4 Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Fri, 28 Jun 2019 22:36:15 +0000
Subject: net/mlx5: E-Switch, Refactor eswitch SR-IOV interface

Devlink eswitch mode is not necessarily related to SR-IOV, e.g., ECPF
can be in offloads mode when SR-IOV is not enabled.

Rename the interface and eswitch mode names to decouple from SR-IOV,
and cleanup eswitch messages accordingly.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/ib_rep.h                |  2 +-
 drivers/infiniband/hw/mlx5/main.c                  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 85 +++++++++++-----------
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  8 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 32 ++++----
 drivers/net/ethernet/mellanox/mlx5/core/lag.c      |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/sriov.c    |  4 +-
 include/linux/mlx5/eswitch.h                       |  6 +-
 11 files changed, 77 insertions(+), 76 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h
index 22adce2d6795..478503ce20df 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.h
+++ b/drivers/infiniband/hw/mlx5/ib_rep.h
@@ -28,7 +28,7 @@ struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
 #else /* CONFIG_MLX5_ESWITCH */
 static inline u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw)
 {
-	return SRIOV_NONE;
+	return MLX5_ESWITCH_NONE;
 }
 
 static inline
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 602ac3feea5d..798aa5e0941e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6814,7 +6814,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	printk_once(KERN_INFO "%s", mlx5_version);
 
 	if (MLX5_ESWITCH_MANAGER(mdev) &&
-	    mlx5_ib_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) {
+	    mlx5_ib_eswitch_mode(mdev->priv.eswitch) == MLX5_ESWITCH_OFFLOADS) {
 		if (!mlx5_core_mp_enabled(mdev))
 			mlx5_ib_register_vport_reps(mdev);
 		return mdev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bc9150f18116..f83fdb67e760 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -5115,7 +5115,7 @@ static void *mlx5e_add(struct mlx5_core_dev *mdev)
 
 #ifdef CONFIG_MLX5_ESWITCH
 	if (MLX5_ESWITCH_MANAGER(mdev) &&
-	    mlx5_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) {
+	    mlx5_eswitch_mode(mdev->priv.eswitch) == MLX5_ESWITCH_OFFLOADS) {
 		mlx5e_rep_register_vport_reps(mdev);
 		return mdev;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index abe8540d6879..ef6d61c1d886 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -391,7 +391,7 @@ static int mlx5e_rep_get_port_parent_id(struct net_device *dev,
 	struct mlx5e_priv *uplink_priv = NULL;
 	struct net_device *uplink_dev;
 
-	if (esw->mode == SRIOV_NONE)
+	if (esw->mode == MLX5_ESWITCH_NONE)
 		return -EOPNOTSUPP;
 
 	uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH);
@@ -419,7 +419,7 @@ static void mlx5e_sqs2vport_stop(struct mlx5_eswitch *esw,
 	struct mlx5e_rep_sq *rep_sq, *tmp;
 	struct mlx5e_rep_priv *rpriv;
 
-	if (esw->mode != SRIOV_OFFLOADS)
+	if (esw->mode != MLX5_ESWITCH_OFFLOADS)
 		return;
 
 	rpriv = mlx5e_rep_to_rep_priv(rep);
@@ -440,7 +440,7 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
 	int err;
 	int i;
 
-	if (esw->mode != SRIOV_OFFLOADS)
+	if (esw->mode != MLX5_ESWITCH_OFFLOADS)
 		return 0;
 
 	rpriv = mlx5e_rep_to_rep_priv(rep);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 8ff1ca46d8d3..1ff9785c2f83 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -3342,7 +3342,7 @@ mlx5e_tc_add_flow(struct mlx5e_priv *priv,
 	if (!tc_can_offload_extack(priv->netdev, f->common.extack))
 		return -EOPNOTSUPP;
 
-	if (esw && esw->mode == SRIOV_OFFLOADS)
+	if (esw && esw->mode == MLX5_ESWITCH_OFFLOADS)
 		err = mlx5e_add_fdb_flow(priv, f, flow_flags,
 					 filter_dev, flow);
 	else
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index a758755d7a08..b42540e1ba6e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -497,7 +497,7 @@ static int esw_add_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
 
 fdb_add:
 	/* SRIOV is enabled: Forward UC MAC to vport */
-	if (esw->fdb_table.legacy.fdb && esw->mode == SRIOV_LEGACY)
+	if (esw->fdb_table.legacy.fdb && esw->mode == MLX5_ESWITCH_LEGACY)
 		vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport);
 
 	esw_debug(esw->dev, "\tADDED UC MAC: vport[%d] %pM fr(%p)\n",
@@ -1577,7 +1577,7 @@ static void esw_apply_vport_conf(struct mlx5_eswitch *esw,
 			       flags);
 
 	/* Only legacy mode needs ACLs */
-	if (esw->mode == SRIOV_LEGACY) {
+	if (esw->mode == MLX5_ESWITCH_LEGACY) {
 		esw_vport_ingress_config(esw, vport);
 		esw_vport_egress_config(esw, vport);
 	}
@@ -1629,7 +1629,7 @@ static void esw_enable_vport(struct mlx5_eswitch *esw, struct mlx5_vport *vport,
 	esw_debug(esw->dev, "Enabling VPORT(%d)\n", vport_num);
 
 	/* Create steering drop counters for ingress and egress ACLs */
-	if (vport_num && esw->mode == SRIOV_LEGACY)
+	if (vport_num && esw->mode == MLX5_ESWITCH_LEGACY)
 		esw_vport_create_drop_counters(vport);
 
 	/* Restore old vport configuration */
@@ -1683,7 +1683,7 @@ static void esw_disable_vport(struct mlx5_eswitch *esw,
 	vport->enabled_events = 0;
 	esw_vport_disable_qos(esw, vport);
 	if (esw->manager_vport != vport_num &&
-	    esw->mode == SRIOV_LEGACY) {
+	    esw->mode == MLX5_ESWITCH_LEGACY) {
 		mlx5_modify_vport_admin_state(esw->dev,
 					      MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
 					      vport_num, 1,
@@ -1728,7 +1728,7 @@ int mlx5_esw_query_functions(struct mlx5_core_dev *dev, u32 *out, int outlen)
 /* Public E-Switch API */
 #define ESW_ALLOWED(esw) ((esw) && MLX5_ESWITCH_MANAGER((esw)->dev))
 
-int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
+int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int nvfs, int mode)
 {
 	struct mlx5_vport *vport;
 	int total_nvports = 0;
@@ -1737,19 +1737,17 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 
 	if (!ESW_ALLOWED(esw) ||
 	    !MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ft_support)) {
-		esw_warn(esw->dev, "E-Switch FDB is not supported, aborting ...\n");
+		esw_warn(esw->dev, "FDB is not supported, aborting ...\n");
 		return -EOPNOTSUPP;
 	}
 
 	if (!MLX5_CAP_ESW_INGRESS_ACL(esw->dev, ft_support))
-		esw_warn(esw->dev, "E-Switch ingress ACL is not supported by FW\n");
+		esw_warn(esw->dev, "ingress ACL is not supported by FW\n");
 
 	if (!MLX5_CAP_ESW_EGRESS_ACL(esw->dev, ft_support))
-		esw_warn(esw->dev, "E-Switch engress ACL is not supported by FW\n");
+		esw_warn(esw->dev, "engress ACL is not supported by FW\n");
 
-	esw_info(esw->dev, "E-Switch enable SRIOV: nvfs(%d) mode (%d)\n", nvfs, mode);
-
-	if (mode == SRIOV_OFFLOADS) {
+	if (mode == MLX5_ESWITCH_OFFLOADS) {
 		if (mlx5_core_is_ecpf_esw_manager(esw->dev))
 			total_nvports = esw->total_vports;
 		else
@@ -1760,7 +1758,7 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 
 	mlx5_lag_update(esw->dev);
 
-	if (mode == SRIOV_LEGACY) {
+	if (mode == MLX5_ESWITCH_LEGACY) {
 		err = esw_create_legacy_table(esw);
 		if (err)
 			goto abort;
@@ -1777,11 +1775,11 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 	if (err)
 		esw_warn(esw->dev, "Failed to create eswitch TSAR");
 
-	/* Don't enable vport events when in SRIOV_OFFLOADS mode, since:
+	/* Don't enable vport events when in MLX5_ESWITCH_OFFLOADS mode, since:
 	 * 1. L2 table (MPFS) is programmed by PF/VF representors netdevs set_rx_mode
 	 * 2. FDB/Eswitch is programmed by user space tools
 	 */
-	enabled_events = (mode == SRIOV_LEGACY) ? SRIOV_VPORT_EVENTS : 0;
+	enabled_events = (mode == MLX5_ESWITCH_LEGACY) ? SRIOV_VPORT_EVENTS : 0;
 
 	/* Enable PF vport */
 	vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF);
@@ -1797,19 +1795,21 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 	mlx5_esw_for_each_vf_vport(esw, i, vport, nvfs)
 		esw_enable_vport(esw, vport, enabled_events);
 
-	if (mode == SRIOV_LEGACY) {
+	if (mode == MLX5_ESWITCH_LEGACY) {
 		MLX5_NB_INIT(&esw->nb, eswitch_vport_event, NIC_VPORT_CHANGE);
 		mlx5_eq_notifier_register(esw->dev, &esw->nb);
 	}
 
-	esw_info(esw->dev, "SRIOV enabled: active vports(%d)\n",
-		 esw->enabled_vports);
+	esw_info(esw->dev, "Enable: mode(%s), nvfs(%d), active vports(%d)\n",
+		 mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS",
+		 nvfs, esw->enabled_vports);
+
 	return 0;
 
 abort:
-	esw->mode = SRIOV_NONE;
+	esw->mode = MLX5_ESWITCH_NONE;
 
-	if (mode == SRIOV_OFFLOADS) {
+	if (mode == MLX5_ESWITCH_OFFLOADS) {
 		mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_IB);
 		mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_ETH);
 	}
@@ -1817,22 +1817,23 @@ abort:
 	return err;
 }
 
-void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
+void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
 {
 	struct esw_mc_addr *mc_promisc;
 	struct mlx5_vport *vport;
 	int old_mode;
 	int i;
 
-	if (!ESW_ALLOWED(esw) || esw->mode == SRIOV_NONE)
+	if (!ESW_ALLOWED(esw) || esw->mode == MLX5_ESWITCH_NONE)
 		return;
 
-	esw_info(esw->dev, "disable SRIOV: active vports(%d) mode(%d)\n",
-		 esw->enabled_vports, esw->mode);
+	esw_info(esw->dev, "Disable: mode(%s), nvfs(%d), active vports(%d)\n",
+		 esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS",
+		 esw->dev->priv.sriov.num_vfs, esw->enabled_vports);
 
 	mc_promisc = &esw->mc_promisc;
 
-	if (esw->mode == SRIOV_LEGACY)
+	if (esw->mode == MLX5_ESWITCH_LEGACY)
 		mlx5_eq_notifier_unregister(esw->dev, &esw->nb);
 
 	mlx5_esw_for_all_vports(esw, i, vport)
@@ -1843,17 +1844,17 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
 
 	esw_destroy_tsar(esw);
 
-	if (esw->mode == SRIOV_LEGACY)
+	if (esw->mode == MLX5_ESWITCH_LEGACY)
 		esw_destroy_legacy_table(esw);
-	else if (esw->mode == SRIOV_OFFLOADS)
+	else if (esw->mode == MLX5_ESWITCH_OFFLOADS)
 		esw_offloads_cleanup(esw);
 
 	old_mode = esw->mode;
-	esw->mode = SRIOV_NONE;
+	esw->mode = MLX5_ESWITCH_NONE;
 
 	mlx5_lag_update(esw->dev);
 
-	if (old_mode == SRIOV_OFFLOADS) {
+	if (old_mode == MLX5_ESWITCH_OFFLOADS) {
 		mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_IB);
 		mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_ETH);
 	}
@@ -1914,7 +1915,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 	}
 
 	esw->enabled_vports = 0;
-	esw->mode = SRIOV_NONE;
+	esw->mode = MLX5_ESWITCH_NONE;
 	esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
 	if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, reformat) &&
 	    MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap))
@@ -1984,7 +1985,7 @@ int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw,
 
 	ether_addr_copy(evport->info.mac, mac);
 	evport->info.node_guid = node_guid;
-	if (evport->enabled && esw->mode == SRIOV_LEGACY)
+	if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY)
 		err = esw_vport_ingress_config(esw, evport);
 
 unlock:
@@ -2068,7 +2069,7 @@ int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw,
 
 	evport->info.vlan = vlan;
 	evport->info.qos = qos;
-	if (evport->enabled && esw->mode == SRIOV_LEGACY) {
+	if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY) {
 		err = esw_vport_ingress_config(esw, evport);
 		if (err)
 			goto unlock;
@@ -2110,7 +2111,7 @@ int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw,
 		mlx5_core_warn(esw->dev,
 			       "Spoofchk in set while MAC is invalid, vport(%d)\n",
 			       evport->vport);
-	if (evport->enabled && esw->mode == SRIOV_LEGACY)
+	if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY)
 		err = esw_vport_ingress_config(esw, evport);
 	if (err)
 		evport->info.spoofchk = pschk;
@@ -2206,7 +2207,7 @@ int mlx5_eswitch_set_vepa(struct mlx5_eswitch *esw, u8 setting)
 		return -EPERM;
 
 	mutex_lock(&esw->state_lock);
-	if (esw->mode != SRIOV_LEGACY) {
+	if (esw->mode != MLX5_ESWITCH_LEGACY) {
 		err = -EOPNOTSUPP;
 		goto out;
 	}
@@ -2229,7 +2230,7 @@ int mlx5_eswitch_get_vepa(struct mlx5_eswitch *esw, u8 *setting)
 		return -EPERM;
 
 	mutex_lock(&esw->state_lock);
-	if (esw->mode != SRIOV_LEGACY) {
+	if (esw->mode != MLX5_ESWITCH_LEGACY) {
 		err = -EOPNOTSUPP;
 		goto out;
 	}
@@ -2372,7 +2373,7 @@ static int mlx5_eswitch_query_vport_drop_stats(struct mlx5_core_dev *dev,
 	u64 bytes = 0;
 	int err = 0;
 
-	if (!vport->enabled || esw->mode != SRIOV_LEGACY)
+	if (!vport->enabled || esw->mode != MLX5_ESWITCH_LEGACY)
 		return 0;
 
 	if (vport->egress.drop_counter)
@@ -2482,7 +2483,7 @@ free_out:
 
 u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw)
 {
-	return ESW_ALLOWED(esw) ? esw->mode : SRIOV_NONE;
+	return ESW_ALLOWED(esw) ? esw->mode : MLX5_ESWITCH_NONE;
 }
 EXPORT_SYMBOL_GPL(mlx5_eswitch_mode);
 
@@ -2499,10 +2500,10 @@ EXPORT_SYMBOL(mlx5_eswitch_get_encap_mode);
 
 bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1)
 {
-	if ((dev0->priv.eswitch->mode == SRIOV_NONE &&
-	     dev1->priv.eswitch->mode == SRIOV_NONE) ||
-	    (dev0->priv.eswitch->mode == SRIOV_OFFLOADS &&
-	     dev1->priv.eswitch->mode == SRIOV_OFFLOADS))
+	if ((dev0->priv.eswitch->mode == MLX5_ESWITCH_NONE &&
+	     dev1->priv.eswitch->mode == MLX5_ESWITCH_NONE) ||
+	    (dev0->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS &&
+	     dev1->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS))
 		return true;
 
 	return false;
@@ -2511,6 +2512,6 @@ bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1)
 bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
 			       struct mlx5_core_dev *dev1)
 {
-	return (dev0->priv.eswitch->mode == SRIOV_OFFLOADS &&
-		dev1->priv.eswitch->mode == SRIOV_OFFLOADS);
+	return (dev0->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS &&
+		dev1->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 335cbeee1b9e..273a17243275 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -253,8 +253,8 @@ void esw_vport_del_ingress_acl_modify_metadata(struct mlx5_eswitch *esw,
 /* E-Switch API */
 int mlx5_eswitch_init(struct mlx5_core_dev *dev);
 void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw);
-int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode);
-void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw);
+int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int nvfs, int mode);
+void mlx5_eswitch_disable(struct mlx5_eswitch *esw);
 int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw,
 			       int vport, u8 mac[ETH_ALEN]);
 int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw,
@@ -528,8 +528,8 @@ bool mlx5_eswitch_is_vf_vport(const struct mlx5_eswitch *esw, u16 vport_num);
 /* eswitch API stubs */
 static inline int  mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; }
 static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {}
-static inline int  mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) { return 0; }
-static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) {}
+static inline int  mlx5_eswitch_enable(struct mlx5_eswitch *esw, int nvfs, int mode) { return 0; }
+static inline void mlx5_eswitch_disable(struct mlx5_eswitch *esw) {}
 static inline bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1) { return true; }
 static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev) { return false; }
 static inline int
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index b253bdf75dd6..a1beada1cdbf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -147,7 +147,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	struct mlx5_flow_table *fdb;
 	int j, i = 0;
 
-	if (esw->mode != SRIOV_OFFLOADS)
+	if (esw->mode != MLX5_ESWITCH_OFFLOADS)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	flow_act.action = attr->action;
@@ -1358,19 +1358,19 @@ static int esw_offloads_start(struct mlx5_eswitch *esw,
 {
 	int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs;
 
-	if (esw->mode != SRIOV_LEGACY &&
+	if (esw->mode != MLX5_ESWITCH_LEGACY &&
 	    !mlx5_core_is_ecpf_esw_manager(esw->dev)) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can't set offloads mode, SRIOV legacy not enabled");
 		return -EINVAL;
 	}
 
-	mlx5_eswitch_disable_sriov(esw);
-	err = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_OFFLOADS);
+	mlx5_eswitch_disable(esw);
+	err = mlx5_eswitch_enable(esw, num_vfs, MLX5_ESWITCH_OFFLOADS);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Failed setting eswitch to offloads");
-		err1 = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_LEGACY);
+		err1 = mlx5_eswitch_enable(esw, num_vfs, MLX5_ESWITCH_LEGACY);
 		if (err1) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Failed setting eswitch back to legacy");
@@ -2174,11 +2174,11 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw,
 {
 	int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs;
 
-	mlx5_eswitch_disable_sriov(esw);
-	err = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_LEGACY);
+	mlx5_eswitch_disable(esw);
+	err = mlx5_eswitch_enable(esw, num_vfs, MLX5_ESWITCH_LEGACY);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack, "Failed setting eswitch to legacy");
-		err1 = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_OFFLOADS);
+		err1 = mlx5_eswitch_enable(esw, num_vfs, MLX5_ESWITCH_OFFLOADS);
 		if (err1) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Failed setting eswitch back to offloads");
@@ -2203,10 +2203,10 @@ static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode)
 {
 	switch (mode) {
 	case DEVLINK_ESWITCH_MODE_LEGACY:
-		*mlx5_mode = SRIOV_LEGACY;
+		*mlx5_mode = MLX5_ESWITCH_LEGACY;
 		break;
 	case DEVLINK_ESWITCH_MODE_SWITCHDEV:
-		*mlx5_mode = SRIOV_OFFLOADS;
+		*mlx5_mode = MLX5_ESWITCH_OFFLOADS;
 		break;
 	default:
 		return -EINVAL;
@@ -2218,10 +2218,10 @@ static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode)
 static int esw_mode_to_devlink(u16 mlx5_mode, u16 *mode)
 {
 	switch (mlx5_mode) {
-	case SRIOV_LEGACY:
+	case MLX5_ESWITCH_LEGACY:
 		*mode = DEVLINK_ESWITCH_MODE_LEGACY;
 		break;
-	case SRIOV_OFFLOADS:
+	case MLX5_ESWITCH_OFFLOADS:
 		*mode = DEVLINK_ESWITCH_MODE_SWITCHDEV;
 		break;
 	default:
@@ -2285,7 +2285,7 @@ static int mlx5_devlink_eswitch_check(struct devlink *devlink)
 	if(!MLX5_ESWITCH_MANAGER(dev))
 		return -EPERM;
 
-	if (dev->priv.eswitch->mode == SRIOV_NONE &&
+	if (dev->priv.eswitch->mode == MLX5_ESWITCH_NONE &&
 	    !mlx5_core_is_ecpf_esw_manager(dev))
 		return -EOPNOTSUPP;
 
@@ -2408,7 +2408,7 @@ int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode)
 	if (!MLX5_CAP_GEN(dev, vport_group_manager))
 		return -EOPNOTSUPP;
 
-	if (esw->mode == SRIOV_NONE)
+	if (esw->mode == MLX5_ESWITCH_NONE)
 		return -EOPNOTSUPP;
 
 	switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) {
@@ -2455,7 +2455,7 @@ int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink,
 	if (encap && encap != DEVLINK_ESWITCH_ENCAP_MODE_BASIC)
 		return -EOPNOTSUPP;
 
-	if (esw->mode == SRIOV_LEGACY) {
+	if (esw->mode == MLX5_ESWITCH_LEGACY) {
 		esw->offloads.encap = encap;
 		return 0;
 	}
@@ -2522,7 +2522,7 @@ void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
 	struct mlx5_eswitch_rep *rep;
 	int i;
 
-	if (esw->mode == SRIOV_OFFLOADS)
+	if (esw->mode == MLX5_ESWITCH_OFFLOADS)
 		__unload_reps_all_vport(esw, max_vf, rep_type);
 
 	mlx5_esw_for_all_reps(esw, i, rep)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
index 959605559858..c5ef2ff26465 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
@@ -305,8 +305,8 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
 			   !mlx5_sriov_is_enabled(dev1);
 
 #ifdef CONFIG_MLX5_ESWITCH
-		roce_lag &= dev0->priv.eswitch->mode == SRIOV_NONE &&
-			    dev1->priv.eswitch->mode == SRIOV_NONE;
+		roce_lag &= dev0->priv.eswitch->mode == MLX5_ESWITCH_NONE &&
+			    dev1->priv.eswitch->mode == MLX5_ESWITCH_NONE;
 #endif
 
 		if (roce_lag)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
index 9d9ff4511306..d4c90f029f49 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
@@ -77,7 +77,7 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs)
 	if (!MLX5_ESWITCH_MANAGER(dev))
 		goto enable_vfs_hca;
 
-	err = mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs, SRIOV_LEGACY);
+	err = mlx5_eswitch_enable(dev->priv.eswitch, num_vfs, MLX5_ESWITCH_LEGACY);
 	if (err) {
 		mlx5_core_warn(dev,
 			       "failed to enable eswitch SRIOV (%d)\n", err);
@@ -126,7 +126,7 @@ static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev)
 	}
 
 	if (MLX5_ESWITCH_MANAGER(dev))
-		mlx5_eswitch_disable_sriov(dev->priv.eswitch);
+		mlx5_eswitch_disable(dev->priv.eswitch);
 
 	if (mlx5_wait_for_pages(dev, &dev->priv.vfs_pages))
 		mlx5_core_warn(dev, "timeout reclaiming VFs pages\n");
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 36cb641188b0..d4731199edb4 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -12,9 +12,9 @@
 #define MLX5_ESWITCH_MANAGER(mdev) MLX5_CAP_GEN(mdev, eswitch_manager)
 
 enum {
-	SRIOV_NONE,
-	SRIOV_LEGACY,
-	SRIOV_OFFLOADS
+	MLX5_ESWITCH_NONE,
+	MLX5_ESWITCH_LEGACY,
+	MLX5_ESWITCH_OFFLOADS
 };
 
 enum {
-- 
cgit v1.2.3


From 411ec9e0b45792e2ac7c55f94a635d5ce894910b Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Fri, 28 Jun 2019 22:36:22 +0000
Subject: net/mlx5: E-Switch, Consider host PF for inline mode and vlan pop

When ECPF is the eswitch manager, host PF is treated like other VFs.
Driver should do the same for inline mode and vlan pop.

Add new iterators to include host PF if ECPF is the eswitch manager.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  | 26 ++++++++++++++++++++++
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 13 ++++++-----
 include/linux/mlx5/vport.h                         |  1 +
 4 files changed, 35 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index b256f397f112..935b9429bb2a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1891,6 +1891,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 
 	esw->dev = dev;
 	esw->manager_vport = mlx5_eswitch_manager_vport(dev);
+	esw->first_host_vport = mlx5_eswitch_first_host_vport_num(dev);
 
 	esw->work_queue = create_singlethread_workqueue("mlx5_esw_wq");
 	if (!esw->work_queue) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index bfc32bcbf544..f59183440d7f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -227,6 +227,7 @@ struct mlx5_eswitch {
 	int                     mode;
 	int                     nvports;
 	u16                     manager_vport;
+	u16                     first_host_vport;
 	struct mlx5_esw_functions esw_funcs;
 };
 
@@ -422,6 +423,12 @@ static inline u16 mlx5_eswitch_manager_vport(struct mlx5_core_dev *dev)
 		MLX5_VPORT_ECPF : MLX5_VPORT_PF;
 }
 
+static inline u16 mlx5_eswitch_first_host_vport_num(struct mlx5_core_dev *dev)
+{
+	return mlx5_core_is_ecpf_esw_manager(dev) ?
+		MLX5_VPORT_PF : MLX5_VPORT_FIRST_VF;
+}
+
 static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev)
 {
 	/* Ideally device should have the functions changed supported
@@ -518,6 +525,25 @@ void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw);
 #define mlx5_esw_for_each_vf_vport_num_reverse(esw, vport, nvfs)	\
 	for ((vport) = (nvfs); (vport) >= MLX5_VPORT_FIRST_VF; (vport)--)
 
+/* Includes host PF (vport 0) if it's not esw manager. */
+#define mlx5_esw_for_each_host_func_rep(esw, i, rep, nvfs)	\
+	for ((i) = (esw)->first_host_vport;			\
+	     (rep) = &(esw)->offloads.vport_reps[i],		\
+	     (i) <= (nvfs); (i)++)
+
+#define mlx5_esw_for_each_host_func_rep_reverse(esw, i, rep, nvfs)	\
+	for ((i) = (nvfs);						\
+	     (rep) = &(esw)->offloads.vport_reps[i],			\
+	     (i) >= (esw)->first_host_vport; (i)--)
+
+#define mlx5_esw_for_each_host_func_vport(esw, vport, nvfs)	\
+	for ((vport) = (esw)->first_host_vport;			\
+	     (vport) <= (nvfs); (vport)++)
+
+#define mlx5_esw_for_each_host_func_vport_reverse(esw, vport, nvfs)	\
+	for ((vport) = (nvfs);						\
+	     (vport) >= (esw)->first_host_vport; (vport)--)
+
 struct mlx5_vport *__must_check
 mlx5_eswitch_get_vport(struct mlx5_eswitch *esw, u16 vport_num);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index aecfb636fbc6..50e5841c1698 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -344,10 +344,10 @@ mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw,
 static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val)
 {
 	struct mlx5_eswitch_rep *rep;
-	int vf_vport, err = 0;
+	int i, err = 0;
 
 	esw_debug(esw->dev, "%s applying global %s policy\n", __func__, val ? "pop" : "none");
-	mlx5_esw_for_each_vf_rep(esw, vf_vport, rep, esw->esw_funcs.num_vfs) {
+	mlx5_esw_for_each_host_func_rep(esw, i, rep, esw->esw_funcs.num_vfs) {
 		if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED)
 			continue;
 
@@ -2330,7 +2330,7 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode,
 	if (err)
 		goto out;
 
-	mlx5_esw_for_each_vf_vport_num(esw, vport, esw->esw_funcs.num_vfs) {
+	mlx5_esw_for_each_host_func_vport(esw, vport, esw->esw_funcs.num_vfs) {
 		err = mlx5_modify_nic_vport_min_inline(dev, vport, mlx5_mode);
 		if (err) {
 			NL_SET_ERR_MSG_MOD(extack,
@@ -2344,7 +2344,7 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode,
 
 revert_inline_mode:
 	num_vport = --vport;
-	mlx5_esw_for_each_vf_vport_num_reverse(esw, vport, num_vport)
+	mlx5_esw_for_each_host_func_vport_reverse(esw, vport, num_vport)
 		mlx5_modify_nic_vport_min_inline(dev,
 						 vport,
 						 esw->offloads.inline_mode);
@@ -2389,9 +2389,10 @@ int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, u8 *mode)
 	}
 
 query_vports:
-	mlx5_esw_for_each_vf_vport_num(esw, vport, esw->esw_funcs.num_vfs) {
+	mlx5_query_nic_vport_min_inline(dev, esw->first_host_vport, &prev_mlx5_mode);
+	mlx5_esw_for_each_host_func_vport(esw, vport, esw->esw_funcs.num_vfs) {
 		mlx5_query_nic_vport_min_inline(dev, vport, &mlx5_mode);
-		if (vport > 1 && prev_mlx5_mode != mlx5_mode)
+		if (prev_mlx5_mode != mlx5_mode)
 			return -EINVAL;
 		prev_mlx5_mode = mlx5_mode;
 	}
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index c147acc7bf70..6cbf29229749 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -58,6 +58,7 @@ enum {
 	MLX5_CAP_INLINE_MODE_NOT_REQUIRED,
 };
 
+/* Vport number for each function must keep unchanged */
 enum {
 	MLX5_VPORT_PF			= 0x0,
 	MLX5_VPORT_FIRST_VF		= 0x1,
-- 
cgit v1.2.3


From 362b87f5b1c6603b72699e8bb18661ecc4efc0bb Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 28 Jun 2019 16:40:21 +0200
Subject: netlink: use 48 byte ctx instead of 6 signed longs for callback

People are inclined to stuff random things into cb->args[n] because it
looks like an array of integers. Sometimes people even put u64s in there
with comments noting that a certain member takes up two slots. The
horror! Really this should mirror the usage of skb->cb, which are just
48 opaque bytes suitable for casting a struct. Then people can create
their usual casting macros for accessing strongly typed members of a
struct.

As a plus, this also gives us the same amount of space on 32bit and 64bit.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 593d1b9c33a8..205fa7b1f07a 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -192,7 +192,14 @@ struct netlink_callback {
 	bool			strict_check;
 	u16			answer_flags;
 	unsigned int		prev_seq, seq;
-	long			args[6];
+	union {
+		u8		ctx[48];
+
+		/* args is deprecated. Cast a struct over ctx instead
+		 * for proper type safety.
+		 */
+		long		args[6];
+	};
 };
 
 struct netlink_notify {
-- 
cgit v1.2.3


From e33d2b74d805af0e4c8060f41040595ba105a520 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Fri, 28 Jun 2019 11:03:41 -0700
Subject: idr: fix overflow case for idr_for_each_entry_ul()

idr_for_each_entry_ul() is buggy as it can't handle the overflow
case correctly. When we have an ID == UINT_MAX, it becomes an
infinite loop. This happens when running on a 32-bit CPU, where
unsigned long has the same size as unsigned int.

There is no better way to fix this than casting it to a larger
integer, but we can't just use a 64 bit integer on a 32 bit CPU.
Instead we could just use an additional integer to help us detect
this overflow case, that is, adding a new parameter to this macro.
Fortunately tc action is its only user right now.

Fixes: 65a206c01e8e ("net/sched: Change act_api and act_xxx modules to use IDR")
Reported-by: Li Shuang <shuali@redhat.com>
Tested-by: Davide Caratti <dcaratti@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Chris Mi <chrism@mellanox.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/idr.h | 7 +++++--
 net/sched/act_api.c | 9 ++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/idr.h b/include/linux/idr.h
index ee7abae143d3..68528a72d10d 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -191,14 +191,17 @@ static inline void idr_preload_end(void)
  * idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type.
  * @idr: IDR handle.
  * @entry: The type * to use as cursor.
+ * @tmp: A temporary placeholder for ID.
  * @id: Entry ID.
  *
  * @entry and @id do not need to be initialized before the loop, and
  * after normal termination @entry is left with the value NULL.  This
  * is convenient for a "not found" value.
  */
-#define idr_for_each_entry_ul(idr, entry, id)			\
-	for (id = 0; ((entry) = idr_get_next_ul(idr, &(id))) != NULL; ++id)
+#define idr_for_each_entry_ul(idr, entry, tmp, id)			\
+	for (tmp = 0, id = 0;						\
+	     tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \
+	     tmp = id, ++id)
 
 /**
  * idr_for_each_entry_continue() - Continue iteration over an IDR's elements of a given type
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 4e5d2e9ace5d..339712296164 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -221,12 +221,13 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
 	struct idr *idr = &idrinfo->action_idr;
 	struct tc_action *p;
 	unsigned long id = 1;
+	unsigned long tmp;
 
 	mutex_lock(&idrinfo->lock);
 
 	s_i = cb->args[0];
 
-	idr_for_each_entry_ul(idr, p, id) {
+	idr_for_each_entry_ul(idr, p, tmp, id) {
 		index++;
 		if (index < s_i)
 			continue;
@@ -292,6 +293,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
 	struct idr *idr = &idrinfo->action_idr;
 	struct tc_action *p;
 	unsigned long id = 1;
+	unsigned long tmp;
 
 	nest = nla_nest_start_noflag(skb, 0);
 	if (nest == NULL)
@@ -300,7 +302,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
 		goto nla_put_failure;
 
 	mutex_lock(&idrinfo->lock);
-	idr_for_each_entry_ul(idr, p, id) {
+	idr_for_each_entry_ul(idr, p, tmp, id) {
 		ret = tcf_idr_release_unsafe(p);
 		if (ret == ACT_P_DELETED) {
 			module_put(ops->owner);
@@ -533,8 +535,9 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
 	struct tc_action *p;
 	int ret;
 	unsigned long id = 1;
+	unsigned long tmp;
 
-	idr_for_each_entry_ul(idr, p, id) {
+	idr_for_each_entry_ul(idr, p, tmp, id) {
 		ret = __tcf_idr_release(p, false, true);
 		if (ret == ACT_P_DELETED)
 			module_put(ops->owner);
-- 
cgit v1.2.3


From d39d714969cda5cbda291402c8c6b1fb1047f42e Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Fri, 28 Jun 2019 11:03:42 -0700
Subject: idr: introduce idr_for_each_entry_continue_ul()

Similarly, other callers of idr_get_next_ul() suffer the same
overflow bug as they don't handle it properly either.

Introduce idr_for_each_entry_continue_ul() to help these callers
iterate from a given ID.

cls_flower needs more care here because it still overflows when it
does arg->cookie++; we have to fold its nested loops into one
and remove the arg->cookie++.

Fixes: 01683a146999 ("net: sched: refactor flower walk to iterate over idr")
Fixes: 12d6066c3b29 ("net/mlx5: Add flow counters idr")
Reported-by: Li Shuang <shuali@redhat.com>
Cc: Davide Caratti <dcaratti@redhat.com>
Cc: Vlad Buslov <vladbu@mellanox.com>
Cc: Chris Mi <chrism@mellanox.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Tested-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  | 10 ++++----
 include/linux/idr.h                                | 14 +++++++++++
 net/sched/cls_flower.c                             | 27 ++++++----------------
 3 files changed, 27 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index c6c28f56aa29..b3762123a69c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -102,13 +102,15 @@ static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev,
 	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
 	unsigned long next_id = (unsigned long)id + 1;
 	struct mlx5_fc *counter;
+	unsigned long tmp;
 
 	rcu_read_lock();
 	/* skip counters that are in idr, but not yet in counters list */
-	while ((counter = idr_get_next_ul(&fc_stats->counters_idr,
-					  &next_id)) != NULL &&
-	       list_empty(&counter->list))
-		next_id++;
+	idr_for_each_entry_continue_ul(&fc_stats->counters_idr,
+				       counter, tmp, next_id) {
+		if (!list_empty(&counter->list))
+			break;
+	}
 	rcu_read_unlock();
 
 	return counter ? &counter->list : &fc_stats->counters;
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 68528a72d10d..4ec8986e5dfb 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -216,6 +216,20 @@ static inline void idr_preload_end(void)
 	     entry;							\
 	     ++id, (entry) = idr_get_next((idr), &(id)))
 
+/**
+ * idr_for_each_entry_continue_ul() - Continue iteration over an IDR's elements of a given type
+ * @idr: IDR handle.
+ * @entry: The type * to use as a cursor.
+ * @tmp: A temporary placeholder for ID.
+ * @id: Entry ID.
+ *
+ * Continue to iterate over entries, continuing after the current position.
+ */
+#define idr_for_each_entry_continue_ul(idr, entry, tmp, id)		\
+	for (tmp = id;							\
+	     tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \
+	     tmp = id, ++id)
+
 /*
  * IDA - ID Allocator, use when translation from id to pointer isn't necessary.
  */
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index eedd5786c084..fdeede3af72e 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -524,24 +524,6 @@ static struct cls_fl_filter *__fl_get(struct cls_fl_head *head, u32 handle)
 	return f;
 }
 
-static struct cls_fl_filter *fl_get_next_filter(struct tcf_proto *tp,
-						unsigned long *handle)
-{
-	struct cls_fl_head *head = fl_head_dereference(tp);
-	struct cls_fl_filter *f;
-
-	rcu_read_lock();
-	while ((f = idr_get_next_ul(&head->handle_idr, handle))) {
-		/* don't return filters that are being deleted */
-		if (refcount_inc_not_zero(&f->refcnt))
-			break;
-		++(*handle);
-	}
-	rcu_read_unlock();
-
-	return f;
-}
-
 static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
 		       bool *last, bool rtnl_held,
 		       struct netlink_ext_ack *extack)
@@ -1691,20 +1673,25 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
 static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
 		    bool rtnl_held)
 {
+	struct cls_fl_head *head = fl_head_dereference(tp);
+	unsigned long id = arg->cookie, tmp;
 	struct cls_fl_filter *f;
 
 	arg->count = arg->skip;
 
-	while ((f = fl_get_next_filter(tp, &arg->cookie)) != NULL) {
+	idr_for_each_entry_continue_ul(&head->handle_idr, f, tmp, id) {
+		/* don't return filters that are being deleted */
+		if (!refcount_inc_not_zero(&f->refcnt))
+			continue;
 		if (arg->fn(tp, f, arg) < 0) {
 			__fl_put(f);
 			arg->stop = 1;
 			break;
 		}
 		__fl_put(f);
-		arg->cookie++;
 		arg->count++;
 	}
+	arg->cookie = id;
 }
 
 static struct cls_fl_filter *
-- 
cgit v1.2.3


From acd3e96d53a24d219f720ed4012b62723ae05da1 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 28 Jun 2019 16:11:39 -0700
Subject: net/tls: make sure offload also gets the keys wiped

Commit 86029d10af18 ("tls: zero the crypto information from tls_context
before freeing") added memzero_explicit() calls to clear the key material
before freeing struct tls_context, but it missed that tls_device.c has
its own way of freeing this structure. Replace the missing free.

Fixes: 86029d10af18 ("tls: zero the crypto information from tls_context before freeing")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h    | 1 +
 net/tls/tls_device.c | 2 +-
 net/tls/tls_main.c   | 4 ++--
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 53d96bca220d..889df0312cd1 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -313,6 +313,7 @@ struct tls_offload_context_rx {
 	(ALIGN(sizeof(struct tls_offload_context_rx), sizeof(void *)) + \
 	 TLS_DRIVER_STATE_SIZE)
 
+void tls_ctx_free(struct tls_context *ctx);
 int wait_on_pending_writer(struct sock *sk, long *timeo);
 int tls_sk_query(struct sock *sk, int optname, char __user *optval,
 		int __user *optlen);
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 397990407ed6..eb8f24f420f0 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -61,7 +61,7 @@ static void tls_device_free_ctx(struct tls_context *ctx)
 	if (ctx->rx_conf == TLS_HW)
 		kfree(tls_offload_ctx_rx(ctx));
 
-	kfree(ctx);
+	tls_ctx_free(ctx);
 }
 
 static void tls_device_gc_task(struct work_struct *work)
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index e2b69e805d46..4674e57e66b0 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -251,7 +251,7 @@ static void tls_write_space(struct sock *sk)
 	ctx->sk_write_space(sk);
 }
 
-static void tls_ctx_free(struct tls_context *ctx)
+void tls_ctx_free(struct tls_context *ctx)
 {
 	if (!ctx)
 		return;
@@ -643,7 +643,7 @@ static void tls_hw_sk_destruct(struct sock *sk)
 
 	ctx->sk_destruct(sk);
 	/* Free ctx */
-	kfree(ctx);
+	tls_ctx_free(ctx);
 	icsk->icsk_ulp_data = NULL;
 }
 
-- 
cgit v1.2.3


From bb005f2a70742d66f8fe44d57e217c696e805d97 Mon Sep 17 00:00:00 2001
From: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Date: Sat, 29 Jun 2019 08:23:24 +0300
Subject: net: page_pool: add helper function for retrieving dma direction

Since the dma direction is stored in page pool params, offer an API
helper for drivers that choose not to keep track of it locally.

Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index f07c518ef8a5..ee9c871d2043 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -112,6 +112,15 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 	return page_pool_alloc_pages(pool, gfp);
 }
 
+/* get the stored dma direction. A driver might decide to treat this locally and
+ * avoid the extra cache line from page_pool to determine the direction
+ */
+static
+inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
+{
+	return pool->p.dma_dir;
+}
+
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
 void __page_pool_free(struct page_pool *pool);
-- 
cgit v1.2.3


From 4de83b88c66a1e4dba426b29766fb68e61d93792 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Mon, 1 Jul 2019 14:38:49 -0700
Subject: loopback: create blackhole net device similar to loopback.

Create a blackhole net device that can be used for "dead"
dst entries instead of the loopback device. This blackhole device
differs from loopback in a few aspects: (a) it's not per-ns, (b) the
MTU on this device is ETH_MIN_MTU, (c) the xmit function is essentially
kfree_skb(), and (d) since it's not registered it won't have an ifindex.

The lower MTU effectively makes the device not pass the MTU check during
the route check when a dst associated with the skb is dead.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/loopback.c    | 76 +++++++++++++++++++++++++++++++++++++++++------
 include/linux/netdevice.h |  2 ++
 2 files changed, 69 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 87d361666cdd..3b39def5471e 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -55,6 +55,13 @@
 #include <net/net_namespace.h>
 #include <linux/u64_stats_sync.h>
 
+/* blackhole_netdev - a device used for dsts that are marked expired!
+ * This is global device (instead of per-net-ns) since it's not needed
+ * to be per-ns and gets initialized at boot time.
+ */
+struct net_device *blackhole_netdev;
+EXPORT_SYMBOL(blackhole_netdev);
+
 /* The higher levels take care of making this non-reentrant (it's
  * called with bh's disabled).
  */
@@ -150,12 +157,14 @@ static const struct net_device_ops loopback_ops = {
 	.ndo_set_mac_address = eth_mac_addr,
 };
 
-/* The loopback device is special. There is only one instance
- * per network namespace.
- */
-static void loopback_setup(struct net_device *dev)
+static void gen_lo_setup(struct net_device *dev,
+			 unsigned int mtu,
+			 const struct ethtool_ops *eth_ops,
+			 const struct header_ops *hdr_ops,
+			 const struct net_device_ops *dev_ops,
+			 void (*dev_destructor)(struct net_device *dev))
 {
-	dev->mtu		= 64 * 1024;
+	dev->mtu		= mtu;
 	dev->hard_header_len	= ETH_HLEN;	/* 14	*/
 	dev->min_header_len	= ETH_HLEN;	/* 14	*/
 	dev->addr_len		= ETH_ALEN;	/* 6	*/
@@ -174,11 +183,20 @@ static void loopback_setup(struct net_device *dev)
 		| NETIF_F_NETNS_LOCAL
 		| NETIF_F_VLAN_CHALLENGED
 		| NETIF_F_LOOPBACK;
-	dev->ethtool_ops	= &loopback_ethtool_ops;
-	dev->header_ops		= &eth_header_ops;
-	dev->netdev_ops		= &loopback_ops;
+	dev->ethtool_ops	= eth_ops;
+	dev->header_ops		= hdr_ops;
+	dev->netdev_ops		= dev_ops;
 	dev->needs_free_netdev	= true;
-	dev->priv_destructor	= loopback_dev_free;
+	dev->priv_destructor	= dev_destructor;
+}
+
+/* The loopback device is special. There is only one instance
+ * per network namespace.
+ */
+static void loopback_setup(struct net_device *dev)
+{
+	gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, &eth_header_ops,
+		     &loopback_ops, loopback_dev_free);
 }
 
 /* Setup and register the loopback device. */
@@ -213,3 +231,43 @@ out:
 struct pernet_operations __net_initdata loopback_net_ops = {
 	.init = loopback_net_init,
 };
+
+/* blackhole netdevice */
+static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb,
+					 struct net_device *dev)
+{
+	kfree_skb(skb);
+	net_warn_ratelimited("%s(): Dropping skb.\n", __func__);
+	return NETDEV_TX_OK;
+}
+
+static const struct net_device_ops blackhole_netdev_ops = {
+	.ndo_start_xmit = blackhole_netdev_xmit,
+};
+
+/* This is a dst-dummy device used specifically for invalidated
+ * DSTs and unlike loopback, this is not per-ns.
+ */
+static void blackhole_netdev_setup(struct net_device *dev)
+{
+	gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL);
+}
+
+/* Setup and register the blackhole_netdev. */
+static int __init blackhole_netdev_init(void)
+{
+	blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN,
+					blackhole_netdev_setup);
+	if (!blackhole_netdev)
+		return -ENOMEM;
+
+	dev_init_scheduler(blackhole_netdev);
+	dev_activate(blackhole_netdev);
+
+	blackhole_netdev->flags |= IFF_UP | IFF_RUNNING;
+	dev_net_set(blackhole_netdev, &init_net);
+
+	return 0;
+}
+
+device_initcall(blackhole_netdev_init);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eeacebd7debb..88292953aa6f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4870,4 +4870,6 @@ do {								\
 #define PTYPE_HASH_SIZE	(16)
 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 
+extern struct net_device *blackhole_netdev;
+
 #endif	/* _LINUX_NETDEVICE_H */
-- 
cgit v1.2.3


From 99f0eae653b2db64917d0b58099eb51e300b311d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 2 Jul 2019 16:04:19 +0100
Subject: rxrpc: Fix oops in tracepoint

If the rxrpc_eproto tracepoint is enabled, an oops will be caused by the
trace line that rxrpc_extract_header() tries to emit when a protocol error
occurs (typically because the packet is short) because the call argument is
NULL.

Fix this by using ?: to assume 0 as the debug_id if call is NULL.

This can then be induced by:

	echo -e '\0\0\0\0\0\0\0\0' | ncat -4u --send-only <addr> 20001

where addr has the following program running on it:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <arpa/inet.h>
	#include <linux/rxrpc.h>
	int main(void)
	{
		struct sockaddr_rxrpc srx;
		int fd;
		memset(&srx, 0, sizeof(srx));
		srx.srx_family			= AF_RXRPC;
		srx.srx_service			= 0;
		srx.transport_type		= AF_INET;
		srx.transport_len		= sizeof(srx.transport.sin);
		srx.transport.sin.sin_family	= AF_INET;
		srx.transport.sin.sin_port	= htons(0x4e21);
		fd = socket(AF_RXRPC, SOCK_DGRAM, AF_INET6);
		bind(fd, (struct sockaddr *)&srx, sizeof(srx));
		sleep(20);
		return 0;
	}

It results in the following oops.

	BUG: kernel NULL pointer dereference, address: 0000000000000340
	#PF: supervisor read access in kernel mode
	#PF: error_code(0x0000) - not-present page
	...
	RIP: 0010:trace_event_raw_event_rxrpc_rx_eproto+0x47/0xac
	...
	Call Trace:
	 <IRQ>
	 rxrpc_extract_header+0x86/0x171
	 ? rcu_read_lock_sched_held+0x5d/0x63
	 ? rxrpc_new_skb+0xd4/0x109
	 rxrpc_input_packet+0xef/0x14fc
	 ? rxrpc_input_data+0x986/0x986
	 udp_queue_rcv_one_skb+0xbf/0x3d0
	 udp_unicast_rcv_skb.isra.8+0x64/0x71
	 ip_protocol_deliver_rcu+0xe4/0x1b4
	 ip_local_deliver+0xf0/0x154
	 __netif_receive_skb_one_core+0x50/0x6c
	 netif_receive_skb_internal+0x26b/0x2e9
	 napi_gro_receive+0xf8/0x1da
	 rtl8169_poll+0x303/0x4c4
	 net_rx_action+0x10e/0x333
	 __do_softirq+0x1a5/0x38f
	 irq_exit+0x54/0xc4
	 do_IRQ+0xda/0xf8
	 common_interrupt+0xf/0xf
	 </IRQ>
	 ...
	 ? cpuidle_enter_state+0x23c/0x34d
	 cpuidle_enter+0x2a/0x36
	 do_idle+0x163/0x1ea
	 cpu_startup_entry+0x1d/0x1f
	 start_secondary+0x157/0x172
	 secondary_startup_64+0xa4/0xb0

Fixes: a25e21f0bcd2 ("rxrpc, afs: Use debug_ids rather than pointers in traces")
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/rxrpc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index d85816878a52..cc1d060cbf13 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -1379,7 +1379,7 @@ TRACE_EVENT(rxrpc_rx_eproto,
 			     ),
 
 	    TP_fast_assign(
-		    __entry->call = call->debug_id;
+		    __entry->call = call ? call->debug_id : 0;
 		    __entry->serial = serial;
 		    __entry->why = why;
 			   ),
-- 
cgit v1.2.3


From 455302d1c9ae9318660aaeb9748a01ff414c9741 Mon Sep 17 00:00:00 2001
From: Ilya Maximets <i.maximets@samsung.com>
Date: Fri, 28 Jun 2019 11:04:07 +0300
Subject: xdp: fix hang while unregistering device bound to xdp socket

A device that is bound to an XDP socket will not have a zero refcount
until the userspace application closes it. This leads to a hang inside
'netdev_wait_allrefs()' if unregistering of the device is requested:

  # ip link del p1
  < hang on recvmsg on netlink socket >

  # ps -x | grep ip
  5126  pts/0    D+   0:00 ip link del p1

  # journalctl -b

  Jun 05 07:19:16 kernel:
  unregister_netdevice: waiting for p1 to become free. Usage count = 1

  Jun 05 07:19:27 kernel:
  unregister_netdevice: waiting for p1 to become free. Usage count = 1
  ...

Fix that by implementing a NETDEV_UNREGISTER event notification handler
to properly clean up all the resources and unref the device.

This should also allow socket killing via ss(8) utility.

Fixes: 965a99098443 ("xsk: add support for bind for Rx")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h |  5 +++
 net/xdp/xdp_umem.c     | 10 +++---
 net/xdp/xdp_umem.h     |  1 +
 net/xdp/xsk.c          | 87 +++++++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 87 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index d074b6d60f8a..7da155164947 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -61,6 +61,11 @@ struct xdp_sock {
 	struct xsk_queue *tx ____cacheline_aligned_in_smp;
 	struct list_head list;
 	bool zc;
+	enum {
+		XSK_READY = 0,
+		XSK_BOUND,
+		XSK_UNBOUND,
+	} state;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
 	/* Mutual exclusion of NAPI TX thread and sendmsg error paths
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 267b82a4cbcf..20c91f02d3d8 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -140,11 +140,13 @@ out_rtnl_unlock:
 	return err;
 }
 
-static void xdp_umem_clear_dev(struct xdp_umem *umem)
+void xdp_umem_clear_dev(struct xdp_umem *umem)
 {
 	struct netdev_bpf bpf;
 	int err;
 
+	ASSERT_RTNL();
+
 	if (!umem->dev)
 		return;
 
@@ -153,17 +155,13 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem)
 		bpf.xsk.umem = NULL;
 		bpf.xsk.queue_id = umem->queue_id;
 
-		rtnl_lock();
 		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
-		rtnl_unlock();
 
 		if (err)
 			WARN(1, "failed to disable umem!\n");
 	}
 
-	rtnl_lock();
 	xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
-	rtnl_unlock();
 
 	dev_put(umem->dev);
 	umem->dev = NULL;
@@ -195,7 +193,9 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
 
 static void xdp_umem_release(struct xdp_umem *umem)
 {
+	rtnl_lock();
 	xdp_umem_clear_dev(umem);
+	rtnl_unlock();
 
 	ida_simple_remove(&umem_ida, umem->id);
 
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 27603227601b..a63a9fb251f5 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -10,6 +10,7 @@
 
 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 			u16 queue_id, u16 flags);
+void xdp_umem_clear_dev(struct xdp_umem *umem);
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
 void xdp_get_umem(struct xdp_umem *umem);
 void xdp_put_umem(struct xdp_umem *umem);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index a14e8864e4fa..f53a6ef7c155 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -335,6 +335,22 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
 	return 0;
 }
 
+static void xsk_unbind_dev(struct xdp_sock *xs)
+{
+	struct net_device *dev = xs->dev;
+
+	if (!dev || xs->state != XSK_BOUND)
+		return;
+
+	xs->state = XSK_UNBOUND;
+
+	/* Wait for driver to stop using the xdp socket. */
+	xdp_del_sk_umem(xs->umem, xs);
+	xs->dev = NULL;
+	synchronize_net();
+	dev_put(dev);
+}
+
 static int xsk_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -354,15 +370,7 @@ static int xsk_release(struct socket *sock)
 	sock_prot_inuse_add(net, sk->sk_prot, -1);
 	local_bh_enable();
 
-	if (xs->dev) {
-		struct net_device *dev = xs->dev;
-
-		/* Wait for driver to stop using the xdp socket. */
-		xdp_del_sk_umem(xs->umem, xs);
-		xs->dev = NULL;
-		synchronize_net();
-		dev_put(dev);
-	}
+	xsk_unbind_dev(xs);
 
 	xskq_destroy(xs->rx);
 	xskq_destroy(xs->tx);
@@ -412,7 +420,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		return -EINVAL;
 
 	mutex_lock(&xs->mutex);
-	if (xs->dev) {
+	if (xs->state != XSK_READY) {
 		err = -EBUSY;
 		goto out_release;
 	}
@@ -492,6 +500,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 out_unlock:
 	if (err)
 		dev_put(dev);
+	else
+		xs->state = XSK_BOUND;
 out_release:
 	mutex_unlock(&xs->mutex);
 	return err;
@@ -520,6 +530,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
+		if (xs->state != XSK_READY) {
+			mutex_unlock(&xs->mutex);
+			return -EBUSY;
+		}
 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
 		err = xsk_init_queue(entries, q, false);
 		mutex_unlock(&xs->mutex);
@@ -534,7 +548,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
-		if (xs->umem) {
+		if (xs->state != XSK_READY || xs->umem) {
 			mutex_unlock(&xs->mutex);
 			return -EBUSY;
 		}
@@ -561,6 +575,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
+		if (xs->state != XSK_READY) {
+			mutex_unlock(&xs->mutex);
+			return -EBUSY;
+		}
 		if (!xs->umem) {
 			mutex_unlock(&xs->mutex);
 			return -EINVAL;
@@ -662,6 +680,9 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 	unsigned long pfn;
 	struct page *qpg;
 
+	if (xs->state != XSK_READY)
+		return -EBUSY;
+
 	if (offset == XDP_PGOFF_RX_RING) {
 		q = READ_ONCE(xs->rx);
 	} else if (offset == XDP_PGOFF_TX_RING) {
@@ -693,6 +714,38 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 			       size, vma->vm_page_prot);
 }
 
+static int xsk_notifier(struct notifier_block *this,
+			unsigned long msg, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct net *net = dev_net(dev);
+	struct sock *sk;
+
+	switch (msg) {
+	case NETDEV_UNREGISTER:
+		mutex_lock(&net->xdp.lock);
+		sk_for_each(sk, &net->xdp.list) {
+			struct xdp_sock *xs = xdp_sk(sk);
+
+			mutex_lock(&xs->mutex);
+			if (xs->dev == dev) {
+				sk->sk_err = ENETDOWN;
+				if (!sock_flag(sk, SOCK_DEAD))
+					sk->sk_error_report(sk);
+
+				xsk_unbind_dev(xs);
+
+				/* Clear device references in umem. */
+				xdp_umem_clear_dev(xs->umem);
+			}
+			mutex_unlock(&xs->mutex);
+		}
+		mutex_unlock(&net->xdp.lock);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
 static struct proto xsk_proto = {
 	.name =		"XDP",
 	.owner =	THIS_MODULE,
@@ -764,6 +817,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
 	sock_set_flag(sk, SOCK_RCU_FREE);
 
 	xs = xdp_sk(sk);
+	xs->state = XSK_READY;
 	mutex_init(&xs->mutex);
 	spin_lock_init(&xs->tx_completion_lock);
 
@@ -784,6 +838,10 @@ static const struct net_proto_family xsk_family_ops = {
 	.owner	= THIS_MODULE,
 };
 
+static struct notifier_block xsk_netdev_notifier = {
+	.notifier_call	= xsk_notifier,
+};
+
 static int __net_init xsk_net_init(struct net *net)
 {
 	mutex_init(&net->xdp.lock);
@@ -816,8 +874,15 @@ static int __init xsk_init(void)
 	err = register_pernet_subsys(&xsk_net_ops);
 	if (err)
 		goto out_sk;
+
+	err = register_netdevice_notifier(&xsk_netdev_notifier);
+	if (err)
+		goto out_pernet;
+
 	return 0;
 
+out_pernet:
+	unregister_pernet_subsys(&xsk_net_ops);
 out_sk:
 	sock_unregister(PF_XDP);
 out_proto:
-- 
cgit v1.2.3


From 23729ff23186424e54b4d6678fcd526cdacef4d3 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 2 Jul 2019 09:13:56 -0700
Subject: bpf: add BPF_CGROUP_SOCK_OPS callback that is executed on every RTT

Performance impact should be minimal because it's under a new
BPF_SOCK_OPS_RTT_CB_FLAG flag that has to be explicitly enabled.

Suggested-by: Eric Dumazet <edumazet@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Priyaranjan Jha <priyarjha@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/tcp.h        | 8 ++++++++
 include/uapi/linux/bpf.h | 6 +++++-
 net/ipv4/tcp_input.c     | 4 ++++
 3 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9d36cc88d043..e16d8a3fd3b4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2221,6 +2221,14 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
 }
 
+static inline void tcp_bpf_rtt(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTT_CB_FLAG))
+		tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
+}
+
 #if IS_ENABLED(CONFIG_SMC)
 extern struct static_key_false tcp_have_smc;
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cffea1826a1f..9cdd0aaeba06 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1770,6 +1770,7 @@ union bpf_attr {
  * 		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
  * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
  * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ * 		* **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
  *
  * 		Therefore, this function can be used to clear a callback flag by
  * 		setting the appropriate bit to zero. e.g. to disable the RTO
@@ -3314,7 +3315,8 @@ struct bpf_sock_ops {
 #define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
 #define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
 #define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
-#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x7		/* Mask of all currently
+#define BPF_SOCK_OPS_RTT_CB_FLAG	(1<<3)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0xF		/* Mask of all currently
 							 * supported cb flags
 							 */
 
@@ -3369,6 +3371,8 @@ enum {
 	BPF_SOCK_OPS_TCP_LISTEN_CB,	/* Called on listen(2), right after
 					 * socket transition to LISTEN state.
 					 */
+	BPF_SOCK_OPS_RTT_CB,		/* Called on every RTT.
+					 */
 };
 
 /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b71efeb0ae5b..c21e8a22fb3b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -778,6 +778,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
 			tp->rtt_seq = tp->snd_nxt;
 			tp->mdev_max_us = tcp_rto_min_us(sk);
+
+			tcp_bpf_rtt(sk);
 		}
 	} else {
 		/* no previous measure. */
@@ -786,6 +788,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
 		tp->mdev_max_us = tp->rttvar_us;
 		tp->rtt_seq = tp->snd_nxt;
+
+		tcp_bpf_rtt(sk);
 	}
 	tp->srtt_us = max(1U, srtt);
 }
-- 
cgit v1.2.3


From 0357746d1e40a8226f68a42c8d7222a12d7c451f Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 2 Jul 2019 09:13:58 -0700
Subject: bpf: add dsack_dups/delivered{, _ce} to bpf_tcp_sock

Add more fields to bpf_tcp_sock that might be useful for debugging
congestion control issues.

Cc: Eric Dumazet <edumazet@google.com>
Cc: Priyaranjan Jha <priyarjha@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  5 +++++
 net/core/filter.c        | 11 ++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9cdd0aaeba06..bfb0b1a76684 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3073,6 +3073,11 @@ struct bpf_tcp_sock {
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
 				 */
+	__u32 dsack_dups;	/* RFC4898 tcpEStatsStackDSACKDups
+				 * total number of DSACK blocks received
+				 */
+	__u32 delivered;	/* Total data packets delivered incl. rexmits */
+	__u32 delivered_ce;	/* Like the above but only ECE marked packets */
 };
 
 struct bpf_sock_tuple {
diff --git a/net/core/filter.c b/net/core/filter.c
index ad908526545d..3da4b6c38b46 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5544,7 +5544,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
 bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
 				  struct bpf_insn_access_aux *info)
 {
-	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked))
+	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, delivered_ce))
 		return false;
 
 	if (off % size != 0)
@@ -5652,6 +5652,15 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_tcp_sock, bytes_acked):
 		BPF_TCP_SOCK_GET_COMMON(bytes_acked);
 		break;
+	case offsetof(struct bpf_tcp_sock, dsack_dups):
+		BPF_TCP_SOCK_GET_COMMON(dsack_dups);
+		break;
+	case offsetof(struct bpf_tcp_sock, delivered):
+		BPF_TCP_SOCK_GET_COMMON(delivered);
+		break;
+	case offsetof(struct bpf_tcp_sock, delivered_ce):
+		BPF_TCP_SOCK_GET_COMMON(delivered_ce);
+		break;
 	}
 
 	return insn - insn_buf;
-- 
cgit v1.2.3


From c2cb5e82a720c05b707701c75dfeb356fe184787 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 2 Jul 2019 09:13:59 -0700
Subject: bpf: add icsk_retransmits to bpf_tcp_sock

Add some inet_connection_sock fields to bpf_tcp_sock that might be useful
for debugging congestion control issues.

Cc: Eric Dumazet <edumazet@google.com>
Cc: Priyaranjan Jha <priyarjha@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  1 +
 net/core/filter.c        | 20 +++++++++++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bfb0b1a76684..ead27aebf491 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3078,6 +3078,7 @@ struct bpf_tcp_sock {
 				 */
 	__u32 delivered;	/* Total data packets delivered incl. rexmits */
 	__u32 delivered_ce;	/* Like the above but only ECE marked packets */
+	__u32 icsk_retransmits;	/* Number of unrecovered [RTO] timeouts */
 };
 
 struct bpf_sock_tuple {
diff --git a/net/core/filter.c b/net/core/filter.c
index 3da4b6c38b46..089aaea0ccc6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5544,7 +5544,8 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
 bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
 				  struct bpf_insn_access_aux *info)
 {
-	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, delivered_ce))
+	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
+					  icsk_retransmits))
 		return false;
 
 	if (off % size != 0)
@@ -5575,6 +5576,20 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(struct tcp_sock, FIELD)); \
 	} while (0)
 
+#define BPF_INET_SOCK_GET_COMMON(FIELD)					\
+	do {								\
+		BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock,	\
+					  FIELD) >			\
+			     FIELD_SIZEOF(struct bpf_tcp_sock, FIELD));	\
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			\
+					struct inet_connection_sock,	\
+					FIELD),				\
+				      si->dst_reg, si->src_reg,		\
+				      offsetof(				\
+					struct inet_connection_sock,	\
+					FIELD));			\
+	} while (0)
+
 	if (insn > insn_buf)
 		return insn - insn_buf;
 
@@ -5661,6 +5676,9 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_tcp_sock, delivered_ce):
 		BPF_TCP_SOCK_GET_COMMON(delivered_ce);
 		break;
+	case offsetof(struct bpf_tcp_sock, icsk_retransmits):
+		BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
+		break;
 	}
 
 	return insn - insn_buf;
-- 
cgit v1.2.3


From b9a7ba5562074855e8a3f92ea7e1174b61a3e87d Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:23 +0300
Subject: net/mlx5: Use event mask based on device capabilities

Use the reported device capabilities for the supported user events (i.e.
affiliated and un-affiliated) to set the EQ mask.

As the event mask can be up to 256 bits wide, defined by 4 entries of u64,
change the applicable code to work accordingly.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/odp.c             |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 40 ++++++++++++++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/fw.c |  6 +++++
 include/linux/mlx5/device.h                  |  6 ++++-
 include/linux/mlx5/eq.h                      |  2 +-
 include/linux/mlx5/mlx5_ifc.h                | 13 ++++++---
 6 files changed, 55 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index c594489eb2d7..831c450b271a 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1558,9 +1558,9 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
 	param = (struct mlx5_eq_param) {
 		.irq_index = 0,
-		.mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
 		.nent = MLX5_IB_NUM_PF_EQE,
 	};
+	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
 	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
 	if (IS_ERR(eq->core)) {
 		err = PTR_ERR(eq->core);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 8000d2a4a7e2..33f78d4d3724 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -256,6 +256,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	int inlen;
 	u32 *in;
 	int err;
+	int i;
 
 	/* Init CQ table */
 	memset(cq_table, 0, sizeof(*cq_table));
@@ -283,10 +284,12 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	mlx5_fill_page_array(&eq->buf, pas);
 
 	MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ);
-	if (!param->mask && MLX5_CAP_GEN(dev, log_max_uctx))
+	if (!param->mask[0] && MLX5_CAP_GEN(dev, log_max_uctx))
 		MLX5_SET(create_eq_in, in, uid, MLX5_SHARED_RESOURCE_UID);
 
-	MLX5_SET64(create_eq_in, in, event_bitmask, param->mask);
+	for (i = 0; i < 4; i++)
+		MLX5_ARRAY_SET64(create_eq_in, in, event_bitmask, i,
+				 param->mask[i]);
 
 	eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry);
 	MLX5_SET(eqc, eqc, log_eq_size, ilog2(eq->nent));
@@ -507,7 +510,23 @@ static int cq_err_event_notifier(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
-static u64 gather_async_events_mask(struct mlx5_core_dev *dev)
+static void gather_user_async_events(struct mlx5_core_dev *dev, u64 mask[4])
+{
+	__be64 *user_unaffiliated_events;
+	__be64 *user_affiliated_events;
+	int i;
+
+	user_affiliated_events =
+		MLX5_CAP_DEV_EVENT(dev, user_affiliated_events);
+	user_unaffiliated_events =
+		MLX5_CAP_DEV_EVENT(dev, user_unaffiliated_events);
+
+	for (i = 0; i < 4; i++)
+		mask[i] |= be64_to_cpu(user_affiliated_events[i] |
+				       user_unaffiliated_events[i]);
+}
+
+static void gather_async_events_mask(struct mlx5_core_dev *dev, u64 mask[4])
 {
 	u64 async_event_mask = MLX5_ASYNC_EVENT_MASK;
 
@@ -544,7 +563,10 @@ static u64 gather_async_events_mask(struct mlx5_core_dev *dev)
 		async_event_mask |=
 			(1ull << MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED);
 
-	return async_event_mask;
+	mask[0] = async_event_mask;
+
+	if (MLX5_CAP_GEN(dev, event_cap))
+		gather_user_async_events(dev, mask);
 }
 
 static int create_async_eqs(struct mlx5_core_dev *dev)
@@ -559,9 +581,10 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	table->cmd_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
 		.irq_index = 0,
-		.mask = 1ull << MLX5_EVENT_TYPE_CMD,
 		.nent = MLX5_NUM_CMD_EQE,
 	};
+
+	param.mask[0] = 1ull << MLX5_EVENT_TYPE_CMD;
 	err = create_async_eq(dev, &table->cmd_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
@@ -577,9 +600,10 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	table->async_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
 		.irq_index = 0,
-		.mask = gather_async_events_mask(dev),
 		.nent = MLX5_NUM_ASYNC_EQE,
 	};
+
+	gather_async_events_mask(dev, param.mask);
 	err = create_async_eq(dev, &table->async_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
@@ -595,9 +619,10 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	table->pages_eq.irq_nb.notifier_call = mlx5_eq_async_int;
 	param = (struct mlx5_eq_param) {
 		.irq_index = 0,
-		.mask =  1 << MLX5_EVENT_TYPE_PAGE_REQUEST,
 		.nent = /* TODO: sriov max_vf + */ 1,
 	};
+
+	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_REQUEST;
 	err = create_async_eq(dev, &table->pages_eq.core, &param);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
@@ -789,7 +814,6 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 		eq->irq_nb.notifier_call = mlx5_eq_comp_int;
 		param = (struct mlx5_eq_param) {
 			.irq_index = vecidx,
-			.mask = 0,
 			.nent = nent,
 		};
 		err = create_map_eq(dev, &eq->core, &param);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 1ab6f7e3bec6..05367f15c3a7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -202,6 +202,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev)
 			return err;
 	}
 
+	if (MLX5_CAP_GEN(dev, event_cap)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_DEV_EVENT);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 5e760067ac41..0d1abe097627 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -351,7 +351,7 @@ enum mlx5_event {
 
 	MLX5_EVENT_TYPE_DEVICE_TRACER      = 0x26,
 
-	MLX5_EVENT_TYPE_MAX                = MLX5_EVENT_TYPE_DEVICE_TRACER + 1,
+	MLX5_EVENT_TYPE_MAX                = 0x100,
 };
 
 enum {
@@ -1077,6 +1077,7 @@ enum mlx5_cap_type {
 	MLX5_CAP_DEBUG,
 	MLX5_CAP_RESERVED_14,
 	MLX5_CAP_DEV_MEM,
+	MLX5_CAP_DEV_EVENT = 0x14,
 	/* NUM OF CAP Types */
 	MLX5_CAP_NUM
 };
@@ -1255,6 +1256,9 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP64_DEV_MEM(mdev, cap)\
 	MLX5_GET64(device_mem_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_MEM], cap)
 
+#define MLX5_CAP_DEV_EVENT(mdev, cap)\
+	MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca_cur[MLX5_CAP_DEV_EVENT], cap)
+
 enum {
 	MLX5_CMD_STAT_OK			= 0x0,
 	MLX5_CMD_STAT_INT_ERR			= 0x1,
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index 70e16dcfb4c4..e49d8c0d4f26 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -15,7 +15,7 @@ struct mlx5_core_dev;
 struct mlx5_eq_param {
 	u8             irq_index;
 	int            nent;
-	u64            mask;
+	u64            mask[4];
 };
 
 struct mlx5_eq *
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 031db53e94ce..4148c47a65ed 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -860,6 +860,12 @@ struct mlx5_ifc_device_mem_cap_bits {
 	u8         reserved_at_180[0x680];
 };
 
+struct mlx5_ifc_device_event_cap_bits {
+	u8         user_affiliated_events[4][0x40];
+
+	u8         user_unaffiliated_events[4][0x40];
+};
+
 enum {
 	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_1_BYTE     = 0x0,
 	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_2_BYTES    = 0x2,
@@ -1017,7 +1023,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         log_max_srq_sz[0x8];
 	u8         log_max_qp_sz[0x8];
-	u8         reserved_at_90[0x8];
+	u8         event_cap[0x1];
+	u8         reserved_at_91[0x7];
 	u8         prio_tag_required[0x1];
 	u8         reserved_at_99[0x2];
 	u8         log_max_qp[0x5];
@@ -7422,9 +7429,9 @@ struct mlx5_ifc_create_eq_in_bits {
 
 	u8         reserved_at_280[0x40];
 
-	u8         event_bitmask[0x40];
+	u8         event_bitmask[4][0x40];
 
-	u8         reserved_at_300[0x580];
+	u8         reserved_at_3c0[0x4c0];
 
 	u8         pas[0][0x40];
 };
-- 
cgit v1.2.3


From c0670781f54839fb9d0b2c0eaee58862601981bf Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:24 +0300
Subject: net/mlx5: Expose the API to register for ANY event

Expose the API to register for ANY event, so that mlx5_ib will be able to
use this functionality for its needs.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c     | 2 ++
 drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h | 3 ---
 include/linux/mlx5/driver.h                      | 2 ++
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 33f78d4d3724..c634a78d5cdd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -949,6 +949,7 @@ int mlx5_eq_notifier_register(struct mlx5_core_dev *dev, struct mlx5_nb *nb)
 
 	return atomic_notifier_chain_register(&eqt->nh[nb->event_type], &nb->nb);
 }
+EXPORT_SYMBOL(mlx5_eq_notifier_register);
 
 int mlx5_eq_notifier_unregister(struct mlx5_core_dev *dev, struct mlx5_nb *nb)
 {
@@ -959,3 +960,4 @@ int mlx5_eq_notifier_unregister(struct mlx5_core_dev *dev, struct mlx5_nb *nb)
 
 	return atomic_notifier_chain_unregister(&eqt->nh[nb->event_type], &nb->nb);
 }
+EXPORT_SYMBOL(mlx5_eq_notifier_unregister);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
index d826e63d5a17..3dfab91ae5f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
@@ -97,7 +97,4 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev);
 struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev);
 #endif
 
-int mlx5_eq_notifier_register(struct mlx5_core_dev *dev, struct mlx5_nb *nb);
-int mlx5_eq_notifier_unregister(struct mlx5_core_dev *dev, struct mlx5_nb *nb);
-
 #endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7658a4908431..24b02ab206c3 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1043,6 +1043,8 @@ int mlx5_register_interface(struct mlx5_interface *intf);
 void mlx5_unregister_interface(struct mlx5_interface *intf);
 int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb);
 int mlx5_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb);
+int mlx5_eq_notifier_register(struct mlx5_core_dev *dev, struct mlx5_nb *nb);
+int mlx5_eq_notifier_unregister(struct mlx5_core_dev *dev, struct mlx5_nb *nb);
 
 int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id);
 
-- 
cgit v1.2.3


From 38164b771947be9baf06e78ffdfb650f8f3e908e Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:25 +0300
Subject: net/mlx5: mlx5_core_create_cq() enhancements

Enhance mlx5_core_create_cq() to get the command out buffer from the
callers to let them use the output.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c                     | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/cq.c        | 7 +++----
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c   | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 3 ++-
 include/linux/mlx5/cq.h                             | 2 +-
 5 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 0220736b073e..d323b822b694 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -891,6 +891,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 	int entries = attr->cqe;
 	int vector = attr->comp_vector;
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 	struct mlx5_ib_cq *cq;
 	int uninitialized_var(index);
 	int uninitialized_var(inlen);
@@ -958,7 +959,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 	if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN)
 		MLX5_SET(cqc, cqc, oi, 1);
 
-	err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen);
+	err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out, sizeof(out));
 	if (err)
 		goto err_cqb;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index 703d88332bc6..1bd4336392a2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -87,11 +87,10 @@ static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq)
 }
 
 int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
-			u32 *in, int inlen)
+			u32 *in, int inlen, u32 *out, int outlen)
 {
 	int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), c_eqn);
 	u32 dout[MLX5_ST_SZ_DW(destroy_cq_out)];
-	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 	u32 din[MLX5_ST_SZ_DW(destroy_cq_in)];
 	struct mlx5_eq_comp *eq;
 	int err;
@@ -100,9 +99,9 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 	if (IS_ERR(eq))
 		return PTR_ERR(eq);
 
-	memset(out, 0, sizeof(out));
+	memset(out, 0, outlen);
 	MLX5_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ);
-	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	err = mlx5_cmd_exec(dev, in, inlen, out, outlen);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index f83fdb67e760..9ae55e93286d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1517,6 +1517,7 @@ static void mlx5e_free_cq(struct mlx5e_cq *cq)
 
 static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 {
+	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 	struct mlx5_core_dev *mdev = cq->mdev;
 	struct mlx5_core_cq *mcq = &cq->mcq;
 
@@ -1551,7 +1552,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 					    MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(cqc, cqc, dbr_addr,      cq->wq_ctrl.db.dma);
 
-	err = mlx5_core_create_cq(mdev, mcq, in, inlen);
+	err = mlx5_core_create_cq(mdev, mcq, in, inlen, out, sizeof(out));
 
 	kvfree(in);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index d61d536f4e17..1fa922698a88 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -429,6 +429,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
 	struct mlx5_fpga_device *fdev = conn->fdev;
 	struct mlx5_core_dev *mdev = fdev->mdev;
 	u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {0};
+	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 	struct mlx5_wq_param wqp;
 	struct mlx5_cqe64 *cqe;
 	int inlen, err, eqn;
@@ -476,7 +477,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
 	mlx5_fill_page_frag_array(&conn->cq.wq_ctrl.buf, pas);
 
-	err = mlx5_core_create_cq(mdev, &conn->cq.mcq, in, inlen);
+	err = mlx5_core_create_cq(mdev, &conn->cq.mcq, in, inlen, out, sizeof(out));
 	kvfree(in);
 
 	if (err)
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index 769326ea1d9b..e44157a2b7db 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -185,7 +185,7 @@ static inline void mlx5_cq_put(struct mlx5_core_cq *cq)
 }
 
 int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
-			u32 *in, int inlen);
+			u32 *in, int inlen, u32 *out, int outlen);
 int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq);
 int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 		       u32 *out, int outlen);
-- 
cgit v1.2.3


From 4e0e2ea1886afe8c001971ff767f6670312a9b04 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:27 +0300
Subject: net/mlx5: Report EQE data upon CQ completion

Report EQE data upon CQ completion to let upper layers use this data.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c                     | 2 +-
 drivers/infiniband/hw/mlx5/main.c                   | 2 +-
 drivers/infiniband/hw/mlx5/qp.c                     | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/cq.c        | 5 +++--
 drivers/net/ethernet/mellanox/mlx5/core/en.h        | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c   | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c        | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 3 ++-
 include/linux/mlx5/cq.h                             | 4 ++--
 9 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index d323b822b694..4efbbd2fce0c 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -37,7 +37,7 @@
 #include "mlx5_ib.h"
 #include "srq.h"
 
-static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq)
+static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe)
 {
 	struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
 
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 798aa5e0941e..26b1ce2359ba 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4492,7 +4492,7 @@ static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
 	 * lock/unlock above locks Now need to arm all involved CQs.
 	 */
 	list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
-		mcq->comp(mcq);
+		mcq->comp(mcq, NULL);
 	}
 	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
 }
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index f6623c77443a..768c7e81f688 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -6297,7 +6297,7 @@ static void handle_drain_completion(struct ib_cq *cq,
 		/* Run the CQ handler - this makes sure that the drain WR will
 		 * be processed if wasn't processed yet.
 		 */
-		mcq->mcq.comp(&mcq->mcq);
+		mcq->mcq.comp(&mcq->mcq, NULL);
 	}
 
 	wait_for_completion(&sdrain->done);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index 1bd4336392a2..818edc63e428 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -58,7 +58,7 @@ void mlx5_cq_tasklet_cb(unsigned long data)
 	list_for_each_entry_safe(mcq, temp, &ctx->process_list,
 				 tasklet_ctx.list) {
 		list_del_init(&mcq->tasklet_ctx.list);
-		mcq->tasklet_ctx.comp(mcq);
+		mcq->tasklet_ctx.comp(mcq, NULL);
 		mlx5_cq_put(mcq);
 		if (time_after(jiffies, end))
 			break;
@@ -68,7 +68,8 @@ void mlx5_cq_tasklet_cb(unsigned long data)
 		tasklet_schedule(&ctx->task);
 }
 
-static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq)
+static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq,
+				   struct mlx5_eqe *eqe)
 {
 	unsigned long flags;
 	struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3a183d690e23..16753f263079 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -780,7 +780,7 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 			  struct mlx5e_tx_wqe *wqe, u16 pi, bool xmit_more);
 
 void mlx5e_trigger_irq(struct mlx5e_icosq *sq);
-void mlx5e_completion_event(struct mlx5_core_cq *mcq);
+void mlx5e_completion_event(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe);
 void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event);
 int mlx5e_napi_poll(struct napi_struct *napi, int budget);
 bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index f9862bf75491..c665ae0f22bd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -136,7 +136,7 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 	return work_done;
 }
 
-void mlx5e_completion_event(struct mlx5_core_cq *mcq)
+void mlx5e_completion_event(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
 {
 	struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 678454535460..41f25ea2e8d9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -153,7 +153,7 @@ static int mlx5_eq_comp_int(struct notifier_block *nb,
 		cq = mlx5_eq_cq_get(eq, cqn);
 		if (likely(cq)) {
 			++cq->arm_sn;
-			cq->comp(cq);
+			cq->comp(cq, eqe);
 			mlx5_cq_put(cq);
 		} else {
 			mlx5_core_warn(eq->dev, "Completion event for bogus CQ 0x%x\n", cqn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index 1fa922698a88..4c50efe4e7f1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -414,7 +414,8 @@ static void mlx5_fpga_conn_cq_tasklet(unsigned long data)
 	mlx5_fpga_conn_cqes(conn, MLX5_FPGA_CQ_BUDGET);
 }
 
-static void mlx5_fpga_conn_cq_complete(struct mlx5_core_cq *mcq)
+static void mlx5_fpga_conn_cq_complete(struct mlx5_core_cq *mcq,
+				       struct mlx5_eqe *eqe)
 {
 	struct mlx5_fpga_conn *conn;
 
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index e44157a2b7db..40748fc1b11b 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -47,7 +47,7 @@ struct mlx5_core_cq {
 	struct completion	free;
 	unsigned		vector;
 	unsigned int		irqn;
-	void (*comp)		(struct mlx5_core_cq *);
+	void (*comp)(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe);
 	void (*event)		(struct mlx5_core_cq *, enum mlx5_event);
 	u32			cons_index;
 	unsigned		arm_sn;
@@ -55,7 +55,7 @@ struct mlx5_core_cq {
 	int			pid;
 	struct {
 		struct list_head list;
-		void (*comp)(struct mlx5_core_cq *);
+		void (*comp)(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe);
 		void		*priv;
 	} tasklet_ctx;
 	int			reset_notify_added;
-- 
cgit v1.2.3


From e4075c44287638b9a99430fea79a2d1468fbc27d Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 30 Jun 2019 19:23:28 +0300
Subject: net/mlx5: Expose device definitions for object events

Expose extra device definitions for object events.

It includes: object_type values for legacy objects and generic data
header for any other object.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 4148c47a65ed..be92401a25a0 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -91,6 +91,20 @@ enum {
 
 enum {
 	MLX5_OBJ_TYPE_GENEVE_TLV_OPT = 0x000b,
+	MLX5_OBJ_TYPE_MKEY = 0xff01,
+	MLX5_OBJ_TYPE_QP = 0xff02,
+	MLX5_OBJ_TYPE_PSV = 0xff03,
+	MLX5_OBJ_TYPE_RMP = 0xff04,
+	MLX5_OBJ_TYPE_XRC_SRQ = 0xff05,
+	MLX5_OBJ_TYPE_RQ = 0xff06,
+	MLX5_OBJ_TYPE_SQ = 0xff07,
+	MLX5_OBJ_TYPE_TIR = 0xff08,
+	MLX5_OBJ_TYPE_TIS = 0xff09,
+	MLX5_OBJ_TYPE_DCT = 0xff0a,
+	MLX5_OBJ_TYPE_XRQ = 0xff0b,
+	MLX5_OBJ_TYPE_RQT = 0xff0e,
+	MLX5_OBJ_TYPE_FLOW_COUNTER = 0xff0f,
+	MLX5_OBJ_TYPE_CQ = 0xff10,
 };
 
 enum {
@@ -9944,4 +9958,11 @@ struct mlx5_ifc_alloc_sf_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_affiliated_event_header_bits {
+	u8         reserved_at_0[0x10];
+	u8         obj_type[0x10];
+
+	u8         obj_id[0x20];
+};
+
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3


From 2752b823169b216db142c4466b43269281962dcf Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Wed, 15 May 2019 00:04:27 -0500
Subject: net/mlx5: Introduce and use mlx5_eswitch_get_total_vports()

Instead of MLX5_TOTAL_VPORTS, use mlx5_eswitch_get_total_vports().
In a subsequent patch, mlx5_eswitch_get_total_vports() accounts for SF
vports as well.
Expanding MLX5_TOTAL_VPORTS macro would require exposing SF internals to
more generic vport.h header file. Such exposure is not desired.
Hence a mlx5_eswitch_get_total_vports() is introduced.

Given that mlx5_eswitch_get_total_vports() API wants to work on const
mlx5_core_dev*, change its helper functions also to accept const *dev.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/ib_rep.c                |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  4 +++-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 26 +++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/vport.c    | 15 +++++++++++++
 include/linux/mlx5/driver.h                        |  9 ++++----
 include/linux/mlx5/eswitch.h                       |  3 +++
 include/linux/mlx5/vport.h                         |  3 ---
 8 files changed, 43 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 3065c5d0ee96..f2cb789d2331 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -29,7 +29,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 static int
 mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 {
-	int num_ports = MLX5_TOTAL_VPORTS(dev);
+	int num_ports = mlx5_eswitch_get_total_vports(dev);
 	const struct mlx5_ib_profile *profile;
 	struct mlx5_ib_dev *ibdev;
 	int vport_index;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 89f52370e770..9137a8390216 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1868,14 +1868,16 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
 
 int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 {
-	int total_vports = MLX5_TOTAL_VPORTS(dev);
 	struct mlx5_eswitch *esw;
 	struct mlx5_vport *vport;
+	int total_vports;
 	int err, i;
 
 	if (!MLX5_VPORT_MANAGER(dev))
 		return 0;
 
+	total_vports = mlx5_eswitch_get_total_vports(dev);
+
 	esw_info(dev,
 		 "Total vports %d, per vport: max uc(%d) max mc(%d)\n",
 		 total_vports,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 50e5841c1698..5c8fb2597bfa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1394,7 +1394,7 @@ void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw)
 
 int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 {
-	int total_vports = MLX5_TOTAL_VPORTS(esw->dev);
+	int total_vports = esw->total_vports;
 	struct mlx5_core_dev *dev = esw->dev;
 	struct mlx5_eswitch_rep *rep;
 	u8 hw_id[ETH_ALEN], rep_type;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 9f5544ac6b8a..8162252585ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2090,7 +2090,7 @@ struct mlx5_flow_namespace *mlx5_get_flow_vport_acl_namespace(struct mlx5_core_d
 {
 	struct mlx5_flow_steering *steering = dev->priv.steering;
 
-	if (!steering || vport >= MLX5_TOTAL_VPORTS(dev))
+	if (!steering || vport >= mlx5_eswitch_get_total_vports(dev))
 		return NULL;
 
 	switch (type) {
@@ -2421,7 +2421,7 @@ static void cleanup_egress_acls_root_ns(struct mlx5_core_dev *dev)
 	if (!steering->esw_egress_root_ns)
 		return;
 
-	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++)
+	for (i = 0; i < mlx5_eswitch_get_total_vports(dev); i++)
 		cleanup_root_ns(steering->esw_egress_root_ns[i]);
 
 	kfree(steering->esw_egress_root_ns);
@@ -2435,7 +2435,7 @@ static void cleanup_ingress_acls_root_ns(struct mlx5_core_dev *dev)
 	if (!steering->esw_ingress_root_ns)
 		return;
 
-	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++)
+	for (i = 0; i < mlx5_eswitch_get_total_vports(dev); i++)
 		cleanup_root_ns(steering->esw_ingress_root_ns[i]);
 
 	kfree(steering->esw_ingress_root_ns);
@@ -2614,16 +2614,18 @@ static int init_ingress_acl_root_ns(struct mlx5_flow_steering *steering, int vpo
 static int init_egress_acls_root_ns(struct mlx5_core_dev *dev)
 {
 	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int total_vports = mlx5_eswitch_get_total_vports(dev);
 	int err;
 	int i;
 
-	steering->esw_egress_root_ns = kcalloc(MLX5_TOTAL_VPORTS(dev),
-					       sizeof(*steering->esw_egress_root_ns),
-					       GFP_KERNEL);
+	steering->esw_egress_root_ns =
+			kcalloc(total_vports,
+				sizeof(*steering->esw_egress_root_ns),
+				GFP_KERNEL);
 	if (!steering->esw_egress_root_ns)
 		return -ENOMEM;
 
-	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) {
+	for (i = 0; i < total_vports; i++) {
 		err = init_egress_acl_root_ns(steering, i);
 		if (err)
 			goto cleanup_root_ns;
@@ -2641,16 +2643,18 @@ cleanup_root_ns:
 static int init_ingress_acls_root_ns(struct mlx5_core_dev *dev)
 {
 	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int total_vports = mlx5_eswitch_get_total_vports(dev);
 	int err;
 	int i;
 
-	steering->esw_ingress_root_ns = kcalloc(MLX5_TOTAL_VPORTS(dev),
-						sizeof(*steering->esw_ingress_root_ns),
-						GFP_KERNEL);
+	steering->esw_ingress_root_ns =
+			kcalloc(total_vports,
+				sizeof(*steering->esw_ingress_root_ns),
+				GFP_KERNEL);
 	if (!steering->esw_ingress_root_ns)
 		return -ENOMEM;
 
-	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) {
+	for (i = 0; i < total_vports; i++) {
 		err = init_ingress_acl_root_ns(steering, i);
 		if (err)
 			goto cleanup_root_ns;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 670fa493c5f5..c912d82ca64b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -34,6 +34,7 @@
 #include <linux/etherdevice.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/vport.h>
+#include <linux/mlx5/eswitch.h>
 #include "mlx5_core.h"
 
 /* Mutex to hold while enabling or disabling RoCE */
@@ -1165,3 +1166,17 @@ u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev)
 	return tmp;
 }
 EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid);
+
+/**
+ * mlx5_eswitch_get_total_vports - Get total vports of the eswitch
+ *
+ * @dev:	Pointer to core device
+ *
+ * mlx5_eswitch_get_total_vports returns total number of vports for
+ * the eswitch.
+ */
+u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev)
+{
+	return MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev);
+}
+EXPORT_SYMBOL(mlx5_eswitch_get_total_vports);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 24b02ab206c3..031043341ed5 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1085,7 +1085,7 @@ enum {
 	MLX5_PCI_DEV_IS_VF		= 1 << 0,
 };
 
-static inline bool mlx5_core_is_pf(struct mlx5_core_dev *dev)
+static inline bool mlx5_core_is_pf(const struct mlx5_core_dev *dev)
 {
 	return dev->coredev_type == MLX5_COREDEV_PF;
 }
@@ -1095,17 +1095,18 @@ static inline bool mlx5_core_is_ecpf(struct mlx5_core_dev *dev)
 	return dev->caps.embedded_cpu;
 }
 
-static inline bool mlx5_core_is_ecpf_esw_manager(struct mlx5_core_dev *dev)
+static inline bool
+mlx5_core_is_ecpf_esw_manager(const struct mlx5_core_dev *dev)
 {
 	return dev->caps.embedded_cpu && MLX5_CAP_GEN(dev, eswitch_manager);
 }
 
-static inline bool mlx5_ecpf_vport_exists(struct mlx5_core_dev *dev)
+static inline bool mlx5_ecpf_vport_exists(const struct mlx5_core_dev *dev)
 {
 	return mlx5_core_is_pf(dev) && MLX5_CAP_ESW(dev, ecpf_vport_exists);
 }
 
-static inline u16 mlx5_core_max_vfs(struct mlx5_core_dev *dev)
+static inline u16 mlx5_core_max_vfs(const struct mlx5_core_dev *dev)
 {
 	return dev->priv.sriov.max_vfs;
 }
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index d4731199edb4..61db37aa9642 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -66,6 +66,8 @@ struct mlx5_flow_handle *
 mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw,
 				    int vport, u32 sqn);
 
+u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev);
+
 #ifdef CONFIG_MLX5_ESWITCH
 enum devlink_eswitch_encap_mode
 mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev);
@@ -93,4 +95,5 @@ mlx5_eswitch_get_vport_metadata_for_match(const struct mlx5_eswitch *esw,
 	return 0;
 };
 #endif /* CONFIG_MLX5_ESWITCH */
+
 #endif
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 6cbf29229749..16060fb9b5e5 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -44,9 +44,6 @@
 				   MLX5_VPORT_UPLINK_PLACEHOLDER +	\
 				   MLX5_VPORT_ECPF_PLACEHOLDER(mdev))
 
-#define MLX5_TOTAL_VPORTS(mdev)	(MLX5_SPECIAL_VPORTS(mdev) +		\
-				 mlx5_core_max_vfs(mdev))
-
 #define MLX5_VPORT_MANAGER(mdev)					\
 	(MLX5_CAP_GEN(mdev, vport_group_manager) &&			\
 	 (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&	\
-- 
cgit v1.2.3


From a12ff35e0fb770b4d060298be147189313ec002c Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Wed, 3 Apr 2019 13:05:50 +0300
Subject: net/mlx5: Introduce TLS TX offload hardware bits and structures

Add TLS offload related IFC structs, layouts and enumerations.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/device.h   |  14 ++++++
 include/linux/mlx5/mlx5_ifc.h | 104 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 114 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 0d1abe097627..7358d64e76fa 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -437,6 +437,7 @@ enum {
 	MLX5_OPCODE_SET_PSV		= 0x20,
 	MLX5_OPCODE_GET_PSV		= 0x21,
 	MLX5_OPCODE_CHECK_PSV		= 0x22,
+	MLX5_OPCODE_DUMP		= 0x23,
 	MLX5_OPCODE_RGET_PSV		= 0x26,
 	MLX5_OPCODE_RCHECK_PSV		= 0x27,
 
@@ -444,6 +445,14 @@ enum {
 
 };
 
+enum {
+	MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS = 0x20,
+};
+
+enum {
+	MLX5_OPC_MOD_TLS_TIS_PROGRESS_PARAMS = 0x20,
+};
+
 enum {
 	MLX5_SET_PORT_RESET_QKEY	= 0,
 	MLX5_SET_PORT_GUID0		= 16,
@@ -1077,6 +1086,8 @@ enum mlx5_cap_type {
 	MLX5_CAP_DEBUG,
 	MLX5_CAP_RESERVED_14,
 	MLX5_CAP_DEV_MEM,
+	MLX5_CAP_RESERVED_16,
+	MLX5_CAP_TLS,
 	MLX5_CAP_DEV_EVENT = 0x14,
 	/* NUM OF CAP Types */
 	MLX5_CAP_NUM
@@ -1256,6 +1267,9 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP64_DEV_MEM(mdev, cap)\
 	MLX5_GET64(device_mem_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_MEM], cap)
 
+#define MLX5_CAP_TLS(mdev, cap) \
+	MLX5_GET(tls_cap, (mdev)->caps.hca_cur[MLX5_CAP_TLS], cap)
+
 #define MLX5_CAP_DEV_EVENT(mdev, cap)\
 	MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca_cur[MLX5_CAP_DEV_EVENT], cap)
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index be92401a25a0..f03ec31e3232 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -973,6 +973,16 @@ struct mlx5_ifc_vector_calc_cap_bits {
 	u8         reserved_at_c0[0x720];
 };
 
+struct mlx5_ifc_tls_cap_bits {
+	u8         tls_1_2_aes_gcm_128[0x1];
+	u8         tls_1_3_aes_gcm_128[0x1];
+	u8         tls_1_2_aes_gcm_256[0x1];
+	u8         tls_1_3_aes_gcm_256[0x1];
+	u8         reserved_at_4[0x1c];
+
+	u8         reserved_at_20[0x7e0];
+};
+
 enum {
 	MLX5_WQ_TYPE_LINKED_LIST  = 0x0,
 	MLX5_WQ_TYPE_CYCLIC       = 0x1,
@@ -1303,7 +1313,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         reserved_at_440[0x20];
 
-	u8         reserved_at_460[0x3];
+	u8         tls[0x1];
+	u8         reserved_at_461[0x2];
 	u8         log_max_uctx[0x5];
 	u8         reserved_at_468[0x3];
 	u8         log_max_umem[0x5];
@@ -1328,7 +1339,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         max_geneve_tlv_option_data_len[0x5];
 	u8         reserved_at_570[0x10];
 
-	u8         reserved_at_580[0x3c];
+	u8         reserved_at_580[0x33];
+	u8         log_max_dek[0x5];
+	u8         reserved_at_5b8[0x4];
 	u8         mini_cqe_resp_stride_index[0x1];
 	u8         cqe_128_always[0x1];
 	u8         cqe_compression_128[0x1];
@@ -2607,6 +2620,7 @@ union mlx5_ifc_hca_cap_union_bits {
 	struct mlx5_ifc_qos_cap_bits qos_cap;
 	struct mlx5_ifc_debug_cap_bits debug_cap;
 	struct mlx5_ifc_fpga_cap_bits fpga_cap;
+	struct mlx5_ifc_tls_cap_bits tls_cap;
 	u8         reserved_at_0[0x8000];
 };
 
@@ -2746,7 +2760,8 @@ struct mlx5_ifc_traffic_counter_bits {
 
 struct mlx5_ifc_tisc_bits {
 	u8         strict_lag_tx_port_affinity[0x1];
-	u8         reserved_at_1[0x3];
+	u8         tls_en[0x1];
+	u8         reserved_at_1[0x2];
 	u8         lag_tx_port_affinity[0x04];
 
 	u8         reserved_at_8[0x4];
@@ -2760,7 +2775,11 @@ struct mlx5_ifc_tisc_bits {
 
 	u8         reserved_at_140[0x8];
 	u8         underlay_qpn[0x18];
-	u8         reserved_at_160[0x3a0];
+
+	u8         reserved_at_160[0x8];
+	u8         pd[0x18];
+
+	u8         reserved_at_180[0x380];
 };
 
 enum {
@@ -9965,4 +9984,81 @@ struct mlx5_ifc_affiliated_event_header_bits {
 	u8         obj_id[0x20];
 };
 
+enum {
+	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT(0xc),
+};
+
+enum {
+	MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = 0xc,
+};
+
+struct mlx5_ifc_encryption_key_obj_bits {
+	u8         modify_field_select[0x40];
+
+	u8         reserved_at_40[0x14];
+	u8         key_size[0x4];
+	u8         reserved_at_58[0x4];
+	u8         key_type[0x4];
+
+	u8         reserved_at_60[0x8];
+	u8         pd[0x18];
+
+	u8         reserved_at_80[0x180];
+	u8         key[8][0x20];
+
+	u8         reserved_at_300[0x500];
+};
+
+struct mlx5_ifc_create_encryption_key_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr;
+	struct mlx5_ifc_encryption_key_obj_bits encryption_key_object;
+};
+
+enum {
+	MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_128 = 0x0,
+	MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_256 = 0x1,
+};
+
+enum {
+	MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_DEK = 0x1,
+};
+
+struct mlx5_ifc_tls_static_params_bits {
+	u8         const_2[0x2];
+	u8         tls_version[0x4];
+	u8         const_1[0x2];
+	u8         reserved_at_8[0x14];
+	u8         encryption_standard[0x4];
+
+	u8         reserved_at_20[0x20];
+
+	u8         initial_record_number[0x40];
+
+	u8         resync_tcp_sn[0x20];
+
+	u8         gcm_iv[0x20];
+
+	u8         implicit_iv[0x40];
+
+	u8         reserved_at_100[0x8];
+	u8         dek_index[0x18];
+
+	u8         reserved_at_120[0xe0];
+};
+
+struct mlx5_ifc_tls_progress_params_bits {
+	u8         valid[0x1];
+	u8         reserved_at_1[0x7];
+	u8         pd[0x18];
+
+	u8         next_record_tcp_sn[0x20];
+
+	u8         hw_resync_tcp_sn[0x20];
+
+	u8         record_tracker_state[0x2];
+	u8         auth_state[0x2];
+	u8         reserved_at_64[0x4];
+	u8         hw_offset_record_number[0x18];
+};
+
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3


From 0718edf528c552c66a5dc3525ffb145971efa766 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Tue, 2 Jul 2019 17:12:09 +0300
Subject: net/mlx5: Properly name the generic WQE control field

A generic WQE control field is used for different purposes
in different cases.
Use union to allow using the proper name in each case.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/qp.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index d1f353c64797..127d224443e3 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -202,7 +202,12 @@ struct mlx5_wqe_ctrl_seg {
 	u8			signature;
 	u8			rsvd[2];
 	u8			fm_ce_se;
-	__be32			imm;
+	union {
+		__be32		general_id;
+		__be32		imm;
+		__be32		umr_mkey;
+		__be32		tisn;
+	};
 };
 
 #define MLX5_WQE_CTRL_DS_MASK 0x3f
-- 
cgit v1.2.3


From e473093639945cb0a07ad4d51d5fd3fc3c3708cf Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 3 Jul 2019 16:06:52 +0200
Subject: inet: factor out inet_send_prepare()

The same code is replicated verbatim in multiple places, and the next
patches will introduce an additional user for it. Factor out a
helper and use it where appropriate. No functional change intended.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_common.h |  1 +
 net/ipv4/af_inet.c        | 21 +++++++++++++--------
 2 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 975901a95c0f..ae2ba897675c 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -25,6 +25,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 		       int addr_len, int flags);
 int inet_accept(struct socket *sock, struct socket *newsock, int flags,
 		bool kern);
+int inet_send_prepare(struct sock *sk);
 int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
 ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 		      size_t size, int flags);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 52bdb881a506..8421e2f5bbb3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -784,10 +784,8 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 }
 EXPORT_SYMBOL(inet_getname);
 
-int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+int inet_send_prepare(struct sock *sk)
 {
-	struct sock *sk = sock->sk;
-
 	sock_rps_record_flow(sk);
 
 	/* We may need to bind the socket. */
@@ -795,6 +793,17 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 	    inet_autobind(sk))
 		return -EAGAIN;
 
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inet_send_prepare);
+
+int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+	struct sock *sk = sock->sk;
+
+	if (unlikely(inet_send_prepare(sk)))
+		return -EAGAIN;
+
 	return sk->sk_prot->sendmsg(sk, msg, size);
 }
 EXPORT_SYMBOL(inet_sendmsg);
@@ -804,11 +813,7 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 {
 	struct sock *sk = sock->sk;
 
-	sock_rps_record_flow(sk);
-
-	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
-	    inet_autobind(sk))
+	if (unlikely(inet_send_prepare(sk)))
 		return -EAGAIN;
 
 	if (sk->sk_prot->sendpage)
-- 
cgit v1.2.3


From f0c1aab2bd1ad131d9d7528b9dcbf9253a74e5da Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 21 Jun 2019 17:37:48 +0200
Subject: netfilter: rename nf_SYNPROXY.h to nf_synproxy.h

Uppercase is a remnant of the iptables infrastructure; rename
this header before it is included in stable kernels.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_SYNPROXY.h | 19 -------------------
 include/uapi/linux/netfilter/nf_synproxy.h | 19 +++++++++++++++++++
 include/uapi/linux/netfilter/xt_SYNPROXY.h |  2 +-
 net/netfilter/nf_synproxy_core.c           |  2 +-
 4 files changed, 21 insertions(+), 21 deletions(-)
 delete mode 100644 include/uapi/linux/netfilter/nf_SYNPROXY.h
 create mode 100644 include/uapi/linux/netfilter/nf_synproxy.h

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_SYNPROXY.h b/include/uapi/linux/netfilter/nf_SYNPROXY.h
deleted file mode 100644
index 068d1b3a6f06..000000000000
--- a/include/uapi/linux/netfilter/nf_SYNPROXY.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NF_SYNPROXY_H
-#define _NF_SYNPROXY_H
-
-#include <linux/types.h>
-
-#define NF_SYNPROXY_OPT_MSS		0x01
-#define NF_SYNPROXY_OPT_WSCALE		0x02
-#define NF_SYNPROXY_OPT_SACK_PERM	0x04
-#define NF_SYNPROXY_OPT_TIMESTAMP	0x08
-#define NF_SYNPROXY_OPT_ECN		0x10
-
-struct nf_synproxy_info {
-	__u8	options;
-	__u8	wscale;
-	__u16	mss;
-};
-
-#endif /* _NF_SYNPROXY_H */
diff --git a/include/uapi/linux/netfilter/nf_synproxy.h b/include/uapi/linux/netfilter/nf_synproxy.h
new file mode 100644
index 000000000000..068d1b3a6f06
--- /dev/null
+++ b/include/uapi/linux/netfilter/nf_synproxy.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NF_SYNPROXY_H
+#define _NF_SYNPROXY_H
+
+#include <linux/types.h>
+
+#define NF_SYNPROXY_OPT_MSS		0x01
+#define NF_SYNPROXY_OPT_WSCALE		0x02
+#define NF_SYNPROXY_OPT_SACK_PERM	0x04
+#define NF_SYNPROXY_OPT_TIMESTAMP	0x08
+#define NF_SYNPROXY_OPT_ECN		0x10
+
+struct nf_synproxy_info {
+	__u8	options;
+	__u8	wscale;
+	__u16	mss;
+};
+
+#endif /* _NF_SYNPROXY_H */
diff --git a/include/uapi/linux/netfilter/xt_SYNPROXY.h b/include/uapi/linux/netfilter/xt_SYNPROXY.h
index 4d5611d647df..19c04ed86172 100644
--- a/include/uapi/linux/netfilter/xt_SYNPROXY.h
+++ b/include/uapi/linux/netfilter/xt_SYNPROXY.h
@@ -2,7 +2,7 @@
 #ifndef _XT_SYNPROXY_H
 #define _XT_SYNPROXY_H
 
-#include <linux/netfilter/nf_SYNPROXY.h>
+#include <linux/netfilter/nf_synproxy.h>
 
 #define XT_SYNPROXY_OPT_MSS		NF_SYNPROXY_OPT_MSS
 #define XT_SYNPROXY_OPT_WSCALE		NF_SYNPROXY_OPT_WSCALE
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 409722d23302..b101f187eda8 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -11,7 +11,7 @@
 #include <linux/proc_fs.h>
 
 #include <linux/netfilter_ipv6.h>
-#include <linux/netfilter/nf_SYNPROXY.h>
+#include <linux/netfilter/nf_synproxy.h>
 
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
-- 
cgit v1.2.3


From 0d9cb300acad29f25ea23d2592e69970bc61f14c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 2 Jul 2019 20:41:14 +0200
Subject: netfilter: nf_queue: remove unused hook entries pointer

It's not used anywhere, so remove it.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_queue.h | 3 +--
 net/bridge/br_input.c            | 2 +-
 net/netfilter/core.c             | 2 +-
 net/netfilter/nf_queue.c         | 8 +++-----
 4 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 7239105d9d2e..3cb6dcf53a4e 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -120,6 +120,5 @@ nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family,
 }
 
 int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
-	     const struct nf_hook_entries *entries, unsigned int index,
-	     unsigned int verdict);
+	     unsigned int index, unsigned int verdict);
 #endif /* _NF_QUEUE_H */
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 21b74e7a7b2f..512383d5e53f 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -234,7 +234,7 @@ static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb)
 			kfree_skb(skb);
 			return RX_HANDLER_CONSUMED;
 		case NF_QUEUE:
-			ret = nf_queue(skb, &state, e, i, verdict);
+			ret = nf_queue(skb, &state, i, verdict);
 			if (ret == 1)
 				continue;
 			return RX_HANDLER_CONSUMED;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 817a9e5d16e4..5d5bdf450091 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -520,7 +520,7 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 				ret = -EPERM;
 			return ret;
 		case NF_QUEUE:
-			ret = nf_queue(skb, state, e, s, verdict);
+			ret = nf_queue(skb, state, s, verdict);
 			if (ret == 1)
 				continue;
 			return ret;
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index b5b2be55ca82..c72a5bdd123f 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -156,7 +156,6 @@ static void nf_ip6_saveroute(const struct sk_buff *skb,
 }
 
 static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
-		      const struct nf_hook_entries *entries,
 		      unsigned int index, unsigned int queuenum)
 {
 	int status = -ENOENT;
@@ -225,12 +224,11 @@ err:
 
 /* Packets leaving via this function must come back through nf_reinject(). */
 int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
-	     const struct nf_hook_entries *entries, unsigned int index,
-	     unsigned int verdict)
+	     unsigned int index, unsigned int verdict)
 {
 	int ret;
 
-	ret = __nf_queue(skb, state, entries, index, verdict >> NF_VERDICT_QBITS);
+	ret = __nf_queue(skb, state, index, verdict >> NF_VERDICT_QBITS);
 	if (ret < 0) {
 		if (ret == -ESRCH &&
 		    (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
@@ -336,7 +334,7 @@ next_hook:
 		local_bh_enable();
 		break;
 	case NF_QUEUE:
-		err = nf_queue(skb, &entry->state, hooks, i, verdict);
+		err = nf_queue(skb, &entry->state, i, verdict);
 		if (err == 1)
 			goto next_hook;
 		break;
-- 
cgit v1.2.3


From 6f7b841bc939e7c811ad32427b58d54edbcfa6ed Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vfedorenko@yandex-team.ru>
Date: Mon, 1 Jul 2019 19:49:34 +0300
Subject: ipvs: allow tunneling with gre encapsulation

Windows real servers can handle GRE tunnels. This patch allows
GRE encapsulation with the tunneling method, thereby letting IPVS
act as a load balancer for Windows-based services.

Signed-off-by: Vadim Fedorenko <vfedorenko@yandex-team.ru>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/ip_vs.h      |  1 +
 net/netfilter/ipvs/ip_vs_ctl.c  |  1 +
 net/netfilter/ipvs/ip_vs_xmit.c | 66 +++++++++++++++++++++++++++++++++++++++--
 3 files changed, 65 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index e4f18061a4fd..4102ddcb4e14 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -128,6 +128,7 @@
 enum {
 	IP_VS_CONN_F_TUNNEL_TYPE_IPIP = 0,	/* IPIP */
 	IP_VS_CONN_F_TUNNEL_TYPE_GUE,		/* GUE */
+	IP_VS_CONN_F_TUNNEL_TYPE_GRE,		/* GRE */
 	IP_VS_CONN_F_TUNNEL_TYPE_MAX,
 };
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 84384d896e29..998353bec74f 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -525,6 +525,7 @@ static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
 			port = dest->tun_port;
 			break;
 		case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
+		case IP_VS_CONN_F_TUNNEL_TYPE_GRE:
 			port = 0;
 			break;
 		default:
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 71fc6d63a67f..9c464d24beec 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -29,6 +29,7 @@
 #include <linux/tcp.h>                  /* for tcphdr */
 #include <net/ip.h>
 #include <net/gue.h>
+#include <net/gre.h>
 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
 #include <net/udp.h>
 #include <net/icmp.h>                   /* for icmp_send */
@@ -388,6 +389,12 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
 			    skb->ip_summed == CHECKSUM_PARTIAL)
 				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+			__be16 tflags = 0;
+
+			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+				tflags |= TUNNEL_CSUM;
+			mtu -= gre_calc_hlen(tflags);
 		}
 		if (mtu < 68) {
 			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
@@ -548,6 +555,12 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
 			    skb->ip_summed == CHECKSUM_PARTIAL)
 				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+			__be16 tflags = 0;
+
+			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+				tflags |= TUNNEL_CSUM;
+			mtu -= gre_calc_hlen(tflags);
 		}
 		if (mtu < IPV6_MIN_MTU) {
 			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
@@ -1079,6 +1092,24 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb,
 	return 0;
 }
 
+static void
+ipvs_gre_encap(struct net *net, struct sk_buff *skb,
+	       struct ip_vs_conn *cp, __u8 *next_protocol)
+{
+	__be16 proto = *next_protocol == IPPROTO_IPIP ?
+				htons(ETH_P_IP) : htons(ETH_P_IPV6);
+	__be16 tflags = 0;
+	size_t hdrlen;
+
+	if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+		tflags |= TUNNEL_CSUM;
+
+	hdrlen = gre_calc_hlen(tflags);
+	gre_build_header(skb, hdrlen, tflags, proto, 0, 0);
+
+	*next_protocol = IPPROTO_GRE;
+}
+
 /*
  *   IP Tunneling transmitter
  *
@@ -1151,6 +1182,15 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
 
 		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
+	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+		size_t gre_hdrlen;
+		__be16 tflags = 0;
+
+		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+			tflags |= TUNNEL_CSUM;
+		gre_hdrlen = gre_calc_hlen(tflags);
+
+		max_headroom += gre_hdrlen;
 	}
 
 	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
@@ -1172,6 +1212,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		    skb->ip_summed == CHECKSUM_PARTIAL) {
 			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
 		}
+	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+			gso_type |= SKB_GSO_GRE_CSUM;
+		else
+			gso_type |= SKB_GSO_GRE;
 	}
 
 	if (iptunnel_handle_offloads(skb, gso_type))
@@ -1192,8 +1237,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 			check = true;
 
 		udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
-	}
-
+	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
+		ipvs_gre_encap(net, skb, cp, &next_protocol);
 
 	skb_push(skb, sizeof(struct iphdr));
 	skb_reset_network_header(skb);
@@ -1287,6 +1332,15 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
 
 		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
+	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+		size_t gre_hdrlen;
+		__be16 tflags = 0;
+
+		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+			tflags |= TUNNEL_CSUM;
+		gre_hdrlen = gre_calc_hlen(tflags);
+
+		max_headroom += gre_hdrlen;
 	}
 
 	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
@@ -1306,6 +1360,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		    skb->ip_summed == CHECKSUM_PARTIAL) {
 			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
 		}
+	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+			gso_type |= SKB_GSO_GRE_CSUM;
+		else
+			gso_type |= SKB_GSO_GRE;
 	}
 
 	if (iptunnel_handle_offloads(skb, gso_type))
@@ -1326,7 +1385,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 			check = true;
 
 		udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
-	}
+	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
+		ipvs_gre_encap(net, skb, cp, &next_protocol);
 
 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
-- 
cgit v1.2.3


From f8efee08dd9d41ab71010e9b16c9ead51753b7d6 Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Tue, 2 Jul 2019 13:02:30 +0300
Subject: net/mlx5: Add rts2rts_qp_counters_set_id field in hca cap

Add rts2rts_qp_counters_set_id field in hca cap so that RTS2RTS
qp modification can be used to change the counter of a QP.

Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f03ec31e3232..06881b79167e 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1096,7 +1096,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         cc_modify_allowed[0x1];
 	u8         start_pad[0x1];
 	u8         cache_line_128byte[0x1];
-	u8         reserved_at_165[0xa];
+	u8         reserved_at_165[0x4];
+	u8         rts2rts_qp_counters_set_id[0x1];
+	u8         reserved_at_16a[0x5];
 	u8         qcam_reg[0x1];
 	u8         gid_table_size[0x10];
 
-- 
cgit v1.2.3


From 07a4ddec3ce9b0a533b5f90f582f1057390d5e63 Mon Sep 17 00:00:00 2001
From: Vincent Bernat <vincent@bernat.ch>
Date: Tue, 2 Jul 2019 19:43:54 +0200
Subject: bonding: add an option to specify a delay between peer notifications

Currently, gratuitous ARP/ND packets are sent every `miimon'
milliseconds. This commit allows a user to specify a custom delay
through a new option, `peer_notif_delay'.

As with `updelay' and `downdelay', this delay should be a multiple of
`miimon' to avoid managing an additional work queue. The configuration
logic is copied from `updelay' and `downdelay'. However, the default
value cannot be set using a module parameter: Netlink or sysfs should
be used to configure this feature.

When setting `miimon' to 100 and `peer_notif_delay' to 500, we can
observe the 500 ms delay is respected:

    20:30:19.354693 ARP, Request who-has 203.0.113.10 tell 203.0.113.10, length 28
    20:30:19.874892 ARP, Request who-has 203.0.113.10 tell 203.0.113.10, length 28
    20:30:20.394919 ARP, Request who-has 203.0.113.10 tell 203.0.113.10, length 28
    20:30:20.914963 ARP, Request who-has 203.0.113.10 tell 203.0.113.10, length 28

In bond_mii_monitor(), I have tried to keep the lock logic readable.
The change is due to the fact we cannot rely on a notification to
lower the value of `bond->send_peer_notif' as `NETDEV_NOTIFY_PEERS' is
only triggered once every N times, while we need to decrement the
counter each time.

iproute2 also needs to be updated to be able to specify this new
attribute through `ip link'.

Signed-off-by: Vincent Bernat <vincent@bernat.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c    | 31 +++++++++++------
 drivers/net/bonding/bond_netlink.c | 14 ++++++++
 drivers/net/bonding/bond_options.c | 71 ++++++++++++++++++++++++--------------
 drivers/net/bonding/bond_procfs.c  |  2 ++
 drivers/net/bonding/bond_sysfs.c   | 13 +++++++
 include/net/bond_options.h         |  1 +
 include/net/bonding.h              |  1 +
 include/uapi/linux/if_link.h       |  1 +
 tools/include/uapi/linux/if_link.h |  1 +
 9 files changed, 98 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 84168455aded..302499ae05e6 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -796,6 +796,8 @@ static bool bond_should_notify_peers(struct bonding *bond)
 		   slave ? slave->dev->name : "NULL");
 
 	if (!slave || !bond->send_peer_notif ||
+	    bond->send_peer_notif %
+	    max(1, bond->params.peer_notif_delay) != 0 ||
 	    !netif_carrier_ok(bond->dev) ||
 	    test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state))
 		return false;
@@ -886,15 +888,18 @@ void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
 
 			if (netif_running(bond->dev)) {
 				bond->send_peer_notif =
-					bond->params.num_peer_notif;
+					bond->params.num_peer_notif *
+					max(1, bond->params.peer_notif_delay);
 				should_notify_peers =
 					bond_should_notify_peers(bond);
 			}
 
 			call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, bond->dev);
-			if (should_notify_peers)
+			if (should_notify_peers) {
+				bond->send_peer_notif--;
 				call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
 							 bond->dev);
+			}
 		}
 	}
 
@@ -2279,6 +2284,7 @@ static void bond_mii_monitor(struct work_struct *work)
 	struct bonding *bond = container_of(work, struct bonding,
 					    mii_work.work);
 	bool should_notify_peers = false;
+	bool commit;
 	unsigned long delay;
 	struct slave *slave;
 	struct list_head *iter;
@@ -2289,12 +2295,19 @@ static void bond_mii_monitor(struct work_struct *work)
 		goto re_arm;
 
 	rcu_read_lock();
-
 	should_notify_peers = bond_should_notify_peers(bond);
-
-	if (bond_miimon_inspect(bond)) {
+	commit = !!bond_miimon_inspect(bond);
+	if (bond->send_peer_notif) {
+		rcu_read_unlock();
+		if (rtnl_trylock()) {
+			bond->send_peer_notif--;
+			rtnl_unlock();
+		}
+	} else {
 		rcu_read_unlock();
+	}
 
+	if (commit) {
 		/* Race avoidance with bond_close cancel of workqueue */
 		if (!rtnl_trylock()) {
 			delay = 1;
@@ -2308,8 +2321,7 @@ static void bond_mii_monitor(struct work_struct *work)
 		bond_miimon_commit(bond);
 
 		rtnl_unlock();	/* might sleep, hold no other locks */
-	} else
-		rcu_read_unlock();
+	}
 
 re_arm:
 	if (bond->params.miimon)
@@ -3065,10 +3077,6 @@ static int bond_master_netdev_event(unsigned long event,
 	case NETDEV_REGISTER:
 		bond_create_proc_entry(event_bond);
 		break;
-	case NETDEV_NOTIFY_PEERS:
-		if (event_bond->send_peer_notif)
-			event_bond->send_peer_notif--;
-		break;
 	default:
 		break;
 	}
@@ -4691,6 +4699,7 @@ static int bond_check_params(struct bond_params *params)
 	params->arp_all_targets = arp_all_targets_value;
 	params->updelay = updelay;
 	params->downdelay = downdelay;
+	params->peer_notif_delay = 0;
 	params->use_carrier = use_carrier;
 	params->lacp_fast = lacp_fast;
 	params->primary[0] = 0;
diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index b24cce48ae35..a259860a7208 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -108,6 +108,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = {
 	[IFLA_BOND_AD_ACTOR_SYSTEM]	= { .type = NLA_BINARY,
 					    .len  = ETH_ALEN },
 	[IFLA_BOND_TLB_DYNAMIC_LB]	= { .type = NLA_U8 },
+	[IFLA_BOND_PEER_NOTIF_DELAY]    = { .type = NLA_U32 },
 };
 
 static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = {
@@ -215,6 +216,14 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 		if (err)
 			return err;
 	}
+	if (data[IFLA_BOND_PEER_NOTIF_DELAY]) {
+		int delay = nla_get_u32(data[IFLA_BOND_PEER_NOTIF_DELAY]);
+
+		bond_opt_initval(&newval, delay);
+		err = __bond_opt_set(bond, BOND_OPT_PEER_NOTIF_DELAY, &newval);
+		if (err)
+			return err;
+	}
 	if (data[IFLA_BOND_USE_CARRIER]) {
 		int use_carrier = nla_get_u8(data[IFLA_BOND_USE_CARRIER]);
 
@@ -494,6 +503,7 @@ static size_t bond_get_size(const struct net_device *bond_dev)
 		nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_USER_PORT_KEY */
 		nla_total_size(ETH_ALEN) + /* IFLA_BOND_AD_ACTOR_SYSTEM */
 		nla_total_size(sizeof(u8)) + /* IFLA_BOND_TLB_DYNAMIC_LB */
+		nla_total_size(sizeof(u32)) +	/* IFLA_BOND_PEER_NOTIF_DELAY */
 		0;
 }
 
@@ -536,6 +546,10 @@ static int bond_fill_info(struct sk_buff *skb,
 			bond->params.downdelay * bond->params.miimon))
 		goto nla_put_failure;
 
+	if (nla_put_u32(skb, IFLA_BOND_PEER_NOTIF_DELAY,
+			bond->params.peer_notif_delay * bond->params.miimon))
+		goto nla_put_failure;
+
 	if (nla_put_u8(skb, IFLA_BOND_USE_CARRIER, bond->params.use_carrier))
 		goto nla_put_failure;
 
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 0d852fe9da7c..ddb3916d3506 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -24,6 +24,8 @@ static int bond_option_updelay_set(struct bonding *bond,
 				   const struct bond_opt_value *newval);
 static int bond_option_downdelay_set(struct bonding *bond,
 				     const struct bond_opt_value *newval);
+static int bond_option_peer_notif_delay_set(struct bonding *bond,
+					    const struct bond_opt_value *newval);
 static int bond_option_use_carrier_set(struct bonding *bond,
 				       const struct bond_opt_value *newval);
 static int bond_option_arp_interval_set(struct bonding *bond,
@@ -424,6 +426,13 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 		.desc = "Number of peer notifications to send on failover event",
 		.values = bond_num_peer_notif_tbl,
 		.set = bond_option_num_peer_notif_set
+	},
+	[BOND_OPT_PEER_NOTIF_DELAY] = {
+		.id = BOND_OPT_PEER_NOTIF_DELAY,
+		.name = "peer_notif_delay",
+		.desc = "Delay between each peer notification on failover event, in milliseconds",
+		.values = bond_intmax_tbl,
+		.set = bond_option_peer_notif_delay_set
 	}
 };
 
@@ -841,6 +850,9 @@ static int bond_option_miimon_set(struct bonding *bond,
 	if (bond->params.downdelay)
 		netdev_dbg(bond->dev, "Note: Updating downdelay (to %d) since it is a multiple of the miimon value\n",
 			   bond->params.downdelay * bond->params.miimon);
+	if (bond->params.peer_notif_delay)
+		netdev_dbg(bond->dev, "Note: Updating peer_notif_delay (to %d) since it is a multiple of the miimon value\n",
+			   bond->params.peer_notif_delay * bond->params.miimon);
 	if (newval->value && bond->params.arp_interval) {
 		netdev_dbg(bond->dev, "MII monitoring cannot be used with ARP monitoring - disabling ARP monitoring...\n");
 		bond->params.arp_interval = 0;
@@ -864,52 +876,59 @@ static int bond_option_miimon_set(struct bonding *bond,
 	return 0;
 }
 
-/* Set up and down delays. These must be multiples of the
- * MII monitoring value, and are stored internally as the multiplier.
- * Thus, we must translate to MS for the real world.
+/* Set up, down and peer notification delays. These must be multiples
+ * of the MII monitoring value, and are stored internally as the
+ * multiplier. Thus, we must translate to MS for the real world.
  */
-static int bond_option_updelay_set(struct bonding *bond,
-				   const struct bond_opt_value *newval)
+static int _bond_option_delay_set(struct bonding *bond,
+				  const struct bond_opt_value *newval,
+				  const char *name,
+				  int *target)
 {
 	int value = newval->value;
 
 	if (!bond->params.miimon) {
-		netdev_err(bond->dev, "Unable to set up delay as MII monitoring is disabled\n");
+		netdev_err(bond->dev, "Unable to set %s as MII monitoring is disabled\n",
+			   name);
 		return -EPERM;
 	}
 	if ((value % bond->params.miimon) != 0) {
-		netdev_warn(bond->dev, "up delay (%d) is not a multiple of miimon (%d), updelay rounded to %d ms\n",
+		netdev_warn(bond->dev,
+			    "%s (%d) is not a multiple of miimon (%d), value rounded to %d ms\n",
+			    name,
 			    value, bond->params.miimon,
 			    (value / bond->params.miimon) *
 			    bond->params.miimon);
 	}
-	bond->params.updelay = value / bond->params.miimon;
-	netdev_dbg(bond->dev, "Setting up delay to %d\n",
-		   bond->params.updelay * bond->params.miimon);
+	*target = value / bond->params.miimon;
+	netdev_dbg(bond->dev, "Setting %s to %d\n",
+		   name,
+		   *target * bond->params.miimon);
 
 	return 0;
 }
 
+static int bond_option_updelay_set(struct bonding *bond,
+				   const struct bond_opt_value *newval)
+{
+	return _bond_option_delay_set(bond, newval, "up delay",
+				      &bond->params.updelay);
+}
+
 static int bond_option_downdelay_set(struct bonding *bond,
 				     const struct bond_opt_value *newval)
 {
-	int value = newval->value;
-
-	if (!bond->params.miimon) {
-		netdev_err(bond->dev, "Unable to set down delay as MII monitoring is disabled\n");
-		return -EPERM;
-	}
-	if ((value % bond->params.miimon) != 0) {
-		netdev_warn(bond->dev, "down delay (%d) is not a multiple of miimon (%d), delay rounded to %d ms\n",
-			    value, bond->params.miimon,
-			    (value / bond->params.miimon) *
-			    bond->params.miimon);
-	}
-	bond->params.downdelay = value / bond->params.miimon;
-	netdev_dbg(bond->dev, "Setting down delay to %d\n",
-		   bond->params.downdelay * bond->params.miimon);
+	return _bond_option_delay_set(bond, newval, "down delay",
+				      &bond->params.downdelay);
+}
 
-	return 0;
+static int bond_option_peer_notif_delay_set(struct bonding *bond,
+					    const struct bond_opt_value *newval)
+{
+	int ret = _bond_option_delay_set(bond, newval,
+					 "peer notification delay",
+					 &bond->params.peer_notif_delay);
+	return ret;
 }
 
 static int bond_option_use_carrier_set(struct bonding *bond,
diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c
index 9f7d83e827c3..fd5c9cbe45b1 100644
--- a/drivers/net/bonding/bond_procfs.c
+++ b/drivers/net/bonding/bond_procfs.c
@@ -104,6 +104,8 @@ static void bond_info_show_master(struct seq_file *seq)
 		   bond->params.updelay * bond->params.miimon);
 	seq_printf(seq, "Down Delay (ms): %d\n",
 		   bond->params.downdelay * bond->params.miimon);
+	seq_printf(seq, "Peer Notification Delay (ms): %d\n",
+		   bond->params.peer_notif_delay * bond->params.miimon);
 
 
 	/* ARP information */
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 94214eaf53c5..2d615a93685e 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -327,6 +327,18 @@ static ssize_t bonding_show_updelay(struct device *d,
 static DEVICE_ATTR(updelay, 0644,
 		   bonding_show_updelay, bonding_sysfs_store_option);
 
+static ssize_t bonding_show_peer_notif_delay(struct device *d,
+					     struct device_attribute *attr,
+					     char *buf)
+{
+	struct bonding *bond = to_bond(d);
+
+	return sprintf(buf, "%d\n",
+		       bond->params.peer_notif_delay * bond->params.miimon);
+}
+static DEVICE_ATTR(peer_notif_delay, 0644,
+		   bonding_show_peer_notif_delay, bonding_sysfs_store_option);
+
 /* Show the LACP interval. */
 static ssize_t bonding_show_lacp(struct device *d,
 				 struct device_attribute *attr,
@@ -718,6 +730,7 @@ static struct attribute *per_bond_attrs[] = {
 	&dev_attr_arp_ip_target.attr,
 	&dev_attr_downdelay.attr,
 	&dev_attr_updelay.attr,
+	&dev_attr_peer_notif_delay.attr,
 	&dev_attr_lacp_rate.attr,
 	&dev_attr_ad_select.attr,
 	&dev_attr_xmit_hash_policy.attr,
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index 2a05cc349018..9d382f2f0bc5 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -63,6 +63,7 @@ enum {
 	BOND_OPT_AD_ACTOR_SYSTEM,
 	BOND_OPT_AD_USER_PORT_KEY,
 	BOND_OPT_NUM_PEER_NOTIF_ALIAS,
+	BOND_OPT_PEER_NOTIF_DELAY,
 	BOND_OPT_LAST
 };
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 676e7fae05a3..f7fe45689142 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -123,6 +123,7 @@ struct bond_params {
 	int fail_over_mac;
 	int updelay;
 	int downdelay;
+	int peer_notif_delay;
 	int lacp_fast;
 	unsigned int min_links;
 	int ad_select;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 6f75bda2c2d7..4a8c02cafa9a 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -636,6 +636,7 @@ enum {
 	IFLA_BOND_AD_USER_PORT_KEY,
 	IFLA_BOND_AD_ACTOR_SYSTEM,
 	IFLA_BOND_TLB_DYNAMIC_LB,
+	IFLA_BOND_PEER_NOTIF_DELAY,
 	__IFLA_BOND_MAX,
 };
 
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index 5b225ff63b48..7d113a9602f0 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -636,6 +636,7 @@ enum {
 	IFLA_BOND_AD_USER_PORT_KEY,
 	IFLA_BOND_AD_ACTOR_SYSTEM,
 	IFLA_BOND_TLB_DYNAMIC_LB,
+	IFLA_BOND_PEER_NOTIF_DELAY,
 	__IFLA_BOND_MAX,
 };
 
-- 
cgit v1.2.3


From ad49d86e07a497e834cb06f2b151dccd75f8e148 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Wed, 26 Jun 2019 12:59:19 +0200
Subject: netfilter: nf_tables: Add synproxy support

Add synproxy support for nf_tables. This behaves like the iptables
synproxy target but it is structured in a way that allows us to propose
improvements in the future.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_synproxy.h |   1 +
 include/net/netfilter/nf_synproxy.h           |   5 +
 include/uapi/linux/netfilter/nf_synproxy.h    |   4 +
 include/uapi/linux/netfilter/nf_tables.h      |  16 ++
 net/netfilter/Kconfig                         |  11 +
 net/netfilter/Makefile                        |   1 +
 net/netfilter/nft_synproxy.c                  | 287 ++++++++++++++++++++++++++
 7 files changed, 325 insertions(+)
 create mode 100644 net/netfilter/nft_synproxy.c

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h
index c5659dcf5b1a..8f00125b06f4 100644
--- a/include/net/netfilter/nf_conntrack_synproxy.h
+++ b/include/net/netfilter/nf_conntrack_synproxy.h
@@ -2,6 +2,7 @@
 #ifndef _NF_CONNTRACK_SYNPROXY_H
 #define _NF_CONNTRACK_SYNPROXY_H
 
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netns/generic.h>
 
 struct nf_conn_synproxy {
diff --git a/include/net/netfilter/nf_synproxy.h b/include/net/netfilter/nf_synproxy.h
index 3e8b3f03b687..87d73fb5279d 100644
--- a/include/net/netfilter/nf_synproxy.h
+++ b/include/net/netfilter/nf_synproxy.h
@@ -39,6 +39,11 @@ unsigned int ipv6_synproxy_hook(void *priv, struct sk_buff *skb,
 				const struct nf_hook_state *nhs);
 int nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net);
 void nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net);
+#else
+static inline int
+nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net) { return 0; }
+static inline void
+nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net) {};
 #endif /* CONFIG_IPV6 */
 
 #endif /* _NF_SYNPROXY_SHARED_H */
diff --git a/include/uapi/linux/netfilter/nf_synproxy.h b/include/uapi/linux/netfilter/nf_synproxy.h
index 068d1b3a6f06..6f3791c8946f 100644
--- a/include/uapi/linux/netfilter/nf_synproxy.h
+++ b/include/uapi/linux/netfilter/nf_synproxy.h
@@ -9,6 +9,10 @@
 #define NF_SYNPROXY_OPT_SACK_PERM	0x04
 #define NF_SYNPROXY_OPT_TIMESTAMP	0x08
 #define NF_SYNPROXY_OPT_ECN		0x10
+#define NF_SYNPROXY_OPT_MASK		(NF_SYNPROXY_OPT_MSS | \
+					 NF_SYNPROXY_OPT_WSCALE | \
+					 NF_SYNPROXY_OPT_SACK_PERM | \
+					 NF_SYNPROXY_OPT_TIMESTAMP)
 
 struct nf_synproxy_info {
 	__u8	options;
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index c6c8ec5c7c00..c53d581643fe 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1551,6 +1551,22 @@ enum nft_osf_flags {
 	NFT_OSF_F_VERSION = (1 << 0),
 };
 
+/**
+ * enum nft_synproxy_attributes - nf_tables synproxy expression netlink attributes
+ *
+ * @NFTA_SYNPROXY_MSS: mss value sent to the backend (NLA_U16)
+ * @NFTA_SYNPROXY_WSCALE: wscale value sent to the backend (NLA_U8)
+ * @NFTA_SYNPROXY_FLAGS: flags (NLA_U32)
+ */
+enum nft_synproxy_attributes {
+	NFTA_SYNPROXY_UNSPEC,
+	NFTA_SYNPROXY_MSS,
+	NFTA_SYNPROXY_WSCALE,
+	NFTA_SYNPROXY_FLAGS,
+	__NFTA_SYNPROXY_MAX,
+};
+#define NFTA_SYNPROXY_MAX (__NFTA_SYNPROXY_MAX - 1)
+
 /**
  * enum nft_device_attributes - nf_tables device netlink attributes
  *
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 21025c2c605b..d59742408d9b 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -651,6 +651,17 @@ config NFT_TPROXY
 	help
 	  This makes transparent proxy support available in nftables.
 
+config NFT_SYNPROXY
+	tristate "Netfilter nf_tables SYNPROXY expression support"
+	depends on NF_CONNTRACK && NETFILTER_ADVANCED
+	select NETFILTER_SYNPROXY
+	select SYN_COOKIES
+	help
+	  The SYNPROXY expression allows you to intercept TCP connections and
+	  establish them using syncookies before they are passed on to the
+	  server. This avoids conntrack and server resource usage
+	  during SYN-flood attacks.
+
 if NF_TABLES_NETDEV
 
 config NF_DUP_NETDEV
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 72cca6b48960..deada20975ff 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -110,6 +110,7 @@ obj-$(CONFIG_NFT_SOCKET)	+= nft_socket.o
 obj-$(CONFIG_NFT_OSF)		+= nft_osf.o
 obj-$(CONFIG_NFT_TPROXY)	+= nft_tproxy.o
 obj-$(CONFIG_NFT_XFRM)		+= nft_xfrm.o
+obj-$(CONFIG_NFT_SYNPROXY)	+= nft_synproxy.o
 
 obj-$(CONFIG_NFT_NAT)		+= nft_chain_nat.o
 
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
new file mode 100644
index 000000000000..80060ade8a5b
--- /dev/null
+++ b/net/netfilter/nft_synproxy.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/netlink.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_synproxy.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nf_synproxy.h>
+
+struct nft_synproxy {
+	struct nf_synproxy_info	info;
+};
+
+static const struct nla_policy nft_synproxy_policy[NFTA_SYNPROXY_MAX + 1] = {
+	[NFTA_SYNPROXY_MSS]		= { .type = NLA_U16 },
+	[NFTA_SYNPROXY_WSCALE]		= { .type = NLA_U8 },
+	[NFTA_SYNPROXY_FLAGS]		= { .type = NLA_U32 },
+};
+
+static void nft_synproxy_tcp_options(struct synproxy_options *opts,
+				     const struct tcphdr *tcp,
+				     struct synproxy_net *snet,
+				     struct nf_synproxy_info *info,
+				     struct nft_synproxy *priv)
+{
+	this_cpu_inc(snet->stats->syn_received);
+	if (tcp->ece && tcp->cwr)
+		opts->options |= NF_SYNPROXY_OPT_ECN;
+
+	opts->options &= priv->info.options;
+	if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
+		synproxy_init_timestamp_cookie(info, opts);
+	else
+		opts->options &= ~(NF_SYNPROXY_OPT_WSCALE |
+				   NF_SYNPROXY_OPT_SACK_PERM |
+				   NF_SYNPROXY_OPT_ECN);
+}
+
+static void nft_synproxy_eval_v4(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt,
+				 const struct tcphdr *tcp,
+				 struct tcphdr *_tcph,
+				 struct synproxy_options *opts)
+{
+	struct nft_synproxy *priv = nft_expr_priv(expr);
+	struct nf_synproxy_info info = priv->info;
+	struct net *net = nft_net(pkt);
+	struct synproxy_net *snet = synproxy_pernet(net);
+	struct sk_buff *skb = pkt->skb;
+
+	if (tcp->syn) {
+		/* Initial SYN from client */
+		nft_synproxy_tcp_options(opts, tcp, snet, &info, priv);
+		synproxy_send_client_synack(net, skb, tcp, opts);
+		consume_skb(skb);
+		regs->verdict.code = NF_STOLEN;
+	} else if (tcp->ack) {
+		/* ACK from client */
+		if (synproxy_recv_client_ack(net, skb, tcp, opts,
+					     ntohl(tcp->seq))) {
+			consume_skb(skb);
+			regs->verdict.code = NF_STOLEN;
+		} else {
+			regs->verdict.code = NF_DROP;
+		}
+	}
+}
+
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+static void nft_synproxy_eval_v6(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt,
+				 const struct tcphdr *tcp,
+				 struct tcphdr *_tcph,
+				 struct synproxy_options *opts)
+{
+	struct nft_synproxy *priv = nft_expr_priv(expr);
+	struct nf_synproxy_info info = priv->info;
+	struct net *net = nft_net(pkt);
+	struct synproxy_net *snet = synproxy_pernet(net);
+	struct sk_buff *skb = pkt->skb;
+
+	if (tcp->syn) {
+		/* Initial SYN from client */
+		nft_synproxy_tcp_options(opts, tcp, snet, &info, priv);
+		synproxy_send_client_synack_ipv6(net, skb, tcp, opts);
+		consume_skb(skb);
+		regs->verdict.code = NF_STOLEN;
+	} else if (tcp->ack) {
+		/* ACK from client */
+		if (synproxy_recv_client_ack_ipv6(net, skb, tcp, opts,
+						  ntohl(tcp->seq))) {
+			consume_skb(skb);
+			regs->verdict.code = NF_STOLEN;
+		} else {
+			regs->verdict.code = NF_DROP;
+		}
+	}
+}
+#endif /* CONFIG_NF_TABLES_IPV6 */
+
+static void nft_synproxy_eval(const struct nft_expr *expr,
+			      struct nft_regs *regs,
+			      const struct nft_pktinfo *pkt)
+{
+	struct synproxy_options opts = {};
+	struct sk_buff *skb = pkt->skb;
+	int thoff = pkt->xt.thoff;
+	const struct tcphdr *tcp;
+	struct tcphdr _tcph;
+
+	if (pkt->tprot != IPPROTO_TCP) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
+	if (nf_ip_checksum(skb, nft_hook(pkt), thoff, IPPROTO_TCP)) {
+		regs->verdict.code = NF_DROP;
+		return;
+	}
+
+	tcp = skb_header_pointer(skb, pkt->xt.thoff,
+				 sizeof(struct tcphdr),
+				 &_tcph);
+	if (!tcp) {
+		regs->verdict.code = NF_DROP;
+		return;
+	}
+
+	if (!synproxy_parse_options(skb, thoff, tcp, &opts)) {
+		regs->verdict.code = NF_DROP;
+		return;
+	}
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		nft_synproxy_eval_v4(expr, regs, pkt, tcp, &_tcph, &opts);
+		return;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+	case htons(ETH_P_IPV6):
+		nft_synproxy_eval_v6(expr, regs, pkt, tcp, &_tcph, &opts);
+		return;
+#endif
+	}
+	regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_synproxy_init(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr,
+			     const struct nlattr * const tb[])
+{
+	struct synproxy_net *snet = synproxy_pernet(ctx->net);
+	struct nft_synproxy *priv = nft_expr_priv(expr);
+	u32 flags;
+	int err;
+
+	if (tb[NFTA_SYNPROXY_MSS])
+		priv->info.mss = ntohs(nla_get_be16(tb[NFTA_SYNPROXY_MSS]));
+	if (tb[NFTA_SYNPROXY_WSCALE])
+		priv->info.wscale = nla_get_u8(tb[NFTA_SYNPROXY_WSCALE]);
+	if (tb[NFTA_SYNPROXY_FLAGS]) {
+		flags = ntohl(nla_get_be32(tb[NFTA_SYNPROXY_FLAGS]));
+		if (flags & ~NF_SYNPROXY_OPT_MASK)
+			return -EOPNOTSUPP;
+		priv->info.options = flags;
+	}
+
+	err = nf_ct_netns_get(ctx->net, ctx->family);
+	if (err)
+		return err;
+
+	switch (ctx->family) {
+	case NFPROTO_IPV4:
+		err = nf_synproxy_ipv4_init(snet, ctx->net);
+		if (err)
+			goto nf_ct_failure;
+		break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+	case NFPROTO_IPV6:
+		err = nf_synproxy_ipv6_init(snet, ctx->net);
+		if (err)
+			goto nf_ct_failure;
+		break;
+#endif
+	case NFPROTO_INET:
+	case NFPROTO_BRIDGE:
+		err = nf_synproxy_ipv4_init(snet, ctx->net);
+		if (err)
+			goto nf_ct_failure;
+		err = nf_synproxy_ipv6_init(snet, ctx->net);
+		if (err)
+			goto nf_ct_failure;
+		break;
+	}
+
+	return 0;
+
+nf_ct_failure:
+	nf_ct_netns_put(ctx->net, ctx->family);
+	return err;
+}
+
+static void nft_synproxy_destroy(const struct nft_ctx *ctx,
+				 const struct nft_expr *expr)
+{
+	struct synproxy_net *snet = synproxy_pernet(ctx->net);
+
+	switch (ctx->family) {
+	case NFPROTO_IPV4:
+		nf_synproxy_ipv4_fini(snet, ctx->net);
+		break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+	case NFPROTO_IPV6:
+		nf_synproxy_ipv6_fini(snet, ctx->net);
+		break;
+#endif
+	case NFPROTO_INET:
+	case NFPROTO_BRIDGE:
+		nf_synproxy_ipv4_fini(snet, ctx->net);
+		nf_synproxy_ipv6_fini(snet, ctx->net);
+		break;
+	}
+	nf_ct_netns_put(ctx->net, ctx->family);
+}
+
+static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_synproxy *priv = nft_expr_priv(expr);
+
+	if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) ||
+	    nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) ||
+	    nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options)))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static int nft_synproxy_validate(const struct nft_ctx *ctx,
+				 const struct nft_expr *expr,
+				 const struct nft_data **data)
+{
+	return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
+						    (1 << NF_INET_FORWARD));
+}
+
+static struct nft_expr_type nft_synproxy_type;
+static const struct nft_expr_ops nft_synproxy_ops = {
+	.eval		= nft_synproxy_eval,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_synproxy)),
+	.init		= nft_synproxy_init,
+	.destroy	= nft_synproxy_destroy,
+	.dump		= nft_synproxy_dump,
+	.type		= &nft_synproxy_type,
+	.validate	= nft_synproxy_validate,
+};
+
+static struct nft_expr_type nft_synproxy_type __read_mostly = {
+	.ops		= &nft_synproxy_ops,
+	.name		= "synproxy",
+	.owner		= THIS_MODULE,
+	.policy		= nft_synproxy_policy,
+	.maxattr	= NFTA_SYNPROXY_MAX,
+};
+
+static int __init nft_synproxy_module_init(void)
+{
+	return nft_register_expr(&nft_synproxy_type);
+}
+
+static void __exit nft_synproxy_module_exit(void)
+{
+	nft_unregister_expr(&nft_synproxy_type);
+}
+
+module_init(nft_synproxy_module_init);
+module_exit(nft_synproxy_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
+MODULE_ALIAS_NFT_EXPR("synproxy");
-- 
cgit v1.2.3


From 30e103fe24debce6f35f2e53cc763ed7be292df3 Mon Sep 17 00:00:00 2001
From: wenxu <wenxu@ucloud.cn>
Date: Fri, 5 Jul 2019 21:16:32 +0800
Subject: netfilter: nft_meta: move bridge meta keys into nft_meta_bridge

Separate the bridge meta keys from nft_meta into nft_meta_bridge to avoid
a dependency between the bridge module and nft_meta when using the bridge
API available through include/linux/if_bridge.h.

Signed-off-by: wenxu <wenxu@ucloud.cn>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_meta.h       |  44 ++++++++++++
 net/bridge/netfilter/Kconfig           |   6 ++
 net/bridge/netfilter/Makefile          |   1 +
 net/bridge/netfilter/nft_meta_bridge.c | 127 +++++++++++++++++++++++++++++++++
 net/netfilter/nf_tables_core.c         |   1 +
 net/netfilter/nft_meta.c               |  81 ++++++++-------------
 6 files changed, 207 insertions(+), 53 deletions(-)
 create mode 100644 include/net/netfilter/nft_meta.h
 create mode 100644 net/bridge/netfilter/nft_meta_bridge.c

(limited to 'include')

diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h
new file mode 100644
index 000000000000..5c69e9b09388
--- /dev/null
+++ b/include/net/netfilter/nft_meta.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NFT_META_H_
+#define _NFT_META_H_
+
+struct nft_meta {
+	enum nft_meta_keys	key:8;
+	union {
+		enum nft_registers	dreg:8;
+		enum nft_registers	sreg:8;
+	};
+};
+
+extern const struct nla_policy nft_meta_policy[];
+
+int nft_meta_get_init(const struct nft_ctx *ctx,
+		      const struct nft_expr *expr,
+		      const struct nlattr * const tb[]);
+
+int nft_meta_set_init(const struct nft_ctx *ctx,
+		      const struct nft_expr *expr,
+		      const struct nlattr * const tb[]);
+
+int nft_meta_get_dump(struct sk_buff *skb,
+		      const struct nft_expr *expr);
+
+int nft_meta_set_dump(struct sk_buff *skb,
+		      const struct nft_expr *expr);
+
+void nft_meta_get_eval(const struct nft_expr *expr,
+		       struct nft_regs *regs,
+		       const struct nft_pktinfo *pkt);
+
+void nft_meta_set_eval(const struct nft_expr *expr,
+		       struct nft_regs *regs,
+		       const struct nft_pktinfo *pkt);
+
+void nft_meta_set_destroy(const struct nft_ctx *ctx,
+			  const struct nft_expr *expr);
+
+int nft_meta_set_validate(const struct nft_ctx *ctx,
+			  const struct nft_expr *expr,
+			  const struct nft_data **data);
+
+#endif
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index f4fb0b9b927d..fbc708508360 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -9,6 +9,12 @@ menuconfig NF_TABLES_BRIDGE
 	bool "Ethernet Bridge nf_tables support"
 
 if NF_TABLES_BRIDGE
+
+config NFT_BRIDGE_META
+	tristate "Netfilter nf_tables bridge meta support"
+	help
+	  Add support for bridge dedicated meta key.
+
 config NFT_BRIDGE_REJECT
 	tristate "Netfilter nf_tables bridge reject support"
 	depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 9d7767322a64..8e2c5759d964 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -3,6 +3,7 @@
 # Makefile for the netfilter modules for Link Layer filtering on a bridge.
 #
 
+obj-$(CONFIG_NFT_BRIDGE_META)  += nft_meta_bridge.o
 obj-$(CONFIG_NFT_BRIDGE_REJECT)  += nft_reject_bridge.o
 
 # connection tracking
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
new file mode 100644
index 000000000000..dde8651254ac
--- /dev/null
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_meta.h>
+
+#include "../br_private.h"
+
+static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
+				     struct nft_regs *regs,
+				     const struct nft_pktinfo *pkt)
+{
+	const struct nft_meta *priv = nft_expr_priv(expr);
+	const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
+	u32 *dest = &regs->data[priv->dreg];
+	const struct net_bridge_port *p;
+
+	switch (priv->key) {
+	case NFT_META_BRI_IIFNAME:
+		if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
+			goto err;
+		break;
+	case NFT_META_BRI_OIFNAME:
+		if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
+			goto err;
+		break;
+	default:
+		goto out;
+	}
+
+	strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
+	return;
+out:
+	return nft_meta_get_eval(expr, regs, pkt);
+err:
+	regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
+				    const struct nft_expr *expr,
+				    const struct nlattr * const tb[])
+{
+	struct nft_meta *priv = nft_expr_priv(expr);
+	unsigned int len;
+
+	priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+	switch (priv->key) {
+	case NFT_META_BRI_IIFNAME:
+	case NFT_META_BRI_OIFNAME:
+		len = IFNAMSIZ;
+		break;
+	default:
+		return nft_meta_get_init(ctx, expr, tb);
+	}
+
+	priv->dreg = nft_parse_register(tb[NFTA_META_DREG]);
+	return nft_validate_register_store(ctx, priv->dreg, NULL,
+					   NFT_DATA_VALUE, len);
+}
+
+static struct nft_expr_type nft_meta_bridge_type;
+static const struct nft_expr_ops nft_meta_bridge_get_ops = {
+	.type		= &nft_meta_bridge_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+	.eval		= nft_meta_bridge_get_eval,
+	.init		= nft_meta_bridge_get_init,
+	.dump		= nft_meta_get_dump,
+};
+
+static const struct nft_expr_ops nft_meta_bridge_set_ops = {
+	.type		= &nft_meta_bridge_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+	.eval		= nft_meta_set_eval,
+	.init		= nft_meta_set_init,
+	.destroy	= nft_meta_set_destroy,
+	.dump		= nft_meta_set_dump,
+	.validate	= nft_meta_set_validate,
+};
+
+static const struct nft_expr_ops *
+nft_meta_bridge_select_ops(const struct nft_ctx *ctx,
+			   const struct nlattr * const tb[])
+{
+	if (tb[NFTA_META_KEY] == NULL)
+		return ERR_PTR(-EINVAL);
+
+	if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
+		return ERR_PTR(-EINVAL);
+
+	if (tb[NFTA_META_DREG])
+		return &nft_meta_bridge_get_ops;
+
+	if (tb[NFTA_META_SREG])
+		return &nft_meta_bridge_set_ops;
+
+	return ERR_PTR(-EINVAL);
+}
+
+static struct nft_expr_type nft_meta_bridge_type __read_mostly = {
+	.family         = NFPROTO_BRIDGE,
+	.name           = "meta",
+	.select_ops     = nft_meta_bridge_select_ops,
+	.policy         = nft_meta_policy,
+	.maxattr        = NFTA_META_MAX,
+	.owner          = THIS_MODULE,
+};
+
+static int __init nft_meta_bridge_module_init(void)
+{
+	return nft_register_expr(&nft_meta_bridge_type);
+}
+
+static void __exit nft_meta_bridge_module_exit(void)
+{
+	nft_unregister_expr(&nft_meta_bridge_type);
+}
+
+module_init(nft_meta_bridge_module_init);
+module_exit(nft_meta_bridge_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("wenxu <wenxu@ucloud.cn>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta");
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index b950cd31348b..96c74c4c7176 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -19,6 +19,7 @@
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_log.h>
+#include <net/netfilter/nft_meta.h>
 
 static noinline void __nft_trace_packet(struct nft_traceinfo *info,
 					const struct nft_chain *chain,
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index a54329b8634a..18a848b01759 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -21,23 +21,12 @@
 #include <net/tcp_states.h> /* for TCP_TIME_WAIT */
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nft_meta.h>
 
 #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
 
-struct nft_meta {
-	enum nft_meta_keys	key:8;
-	union {
-		enum nft_registers	dreg:8;
-		enum nft_registers	sreg:8;
-	};
-};
-
 static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
 
-#ifdef CONFIG_NF_TABLES_BRIDGE
-#include "../bridge/br_private.h"
-#endif
-
 void nft_meta_get_eval(const struct nft_expr *expr,
 		       struct nft_regs *regs,
 		       const struct nft_pktinfo *pkt)
@@ -47,9 +36,6 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 	const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
 	struct sock *sk;
 	u32 *dest = &regs->data[priv->dreg];
-#ifdef CONFIG_NF_TABLES_BRIDGE
-	const struct net_bridge_port *p;
-#endif
 
 	switch (priv->key) {
 	case NFT_META_LEN:
@@ -228,18 +214,6 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 	case NFT_META_SECPATH:
 		nft_reg_store8(dest, secpath_exists(skb));
 		break;
-#endif
-#ifdef CONFIG_NF_TABLES_BRIDGE
-	case NFT_META_BRI_IIFNAME:
-		if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
-			goto err;
-		strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
-		return;
-	case NFT_META_BRI_OIFNAME:
-		if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
-			goto err;
-		strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
-		return;
 #endif
 	case NFT_META_IIFKIND:
 		if (in == NULL || in->rtnl_link_ops == NULL)
@@ -260,10 +234,11 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 err:
 	regs->verdict.code = NFT_BREAK;
 }
+EXPORT_SYMBOL_GPL(nft_meta_get_eval);
 
-static void nft_meta_set_eval(const struct nft_expr *expr,
-			      struct nft_regs *regs,
-			       const struct nft_pktinfo *pkt)
+void nft_meta_set_eval(const struct nft_expr *expr,
+		       struct nft_regs *regs,
+		       const struct nft_pktinfo *pkt)
 {
 	const struct nft_meta *meta = nft_expr_priv(expr);
 	struct sk_buff *skb = pkt->skb;
@@ -300,16 +275,18 @@ static void nft_meta_set_eval(const struct nft_expr *expr,
 		WARN_ON(1);
 	}
 }
+EXPORT_SYMBOL_GPL(nft_meta_set_eval);
 
-static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
+const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
 	[NFTA_META_DREG]	= { .type = NLA_U32 },
 	[NFTA_META_KEY]		= { .type = NLA_U32 },
 	[NFTA_META_SREG]	= { .type = NLA_U32 },
 };
+EXPORT_SYMBOL_GPL(nft_meta_policy);
 
-static int nft_meta_get_init(const struct nft_ctx *ctx,
-			     const struct nft_expr *expr,
-			     const struct nlattr * const tb[])
+int nft_meta_get_init(const struct nft_ctx *ctx,
+		      const struct nft_expr *expr,
+		      const struct nlattr * const tb[])
 {
 	struct nft_meta *priv = nft_expr_priv(expr);
 	unsigned int len;
@@ -359,14 +336,6 @@ static int nft_meta_get_init(const struct nft_ctx *ctx,
 	case NFT_META_SECPATH:
 		len = sizeof(u8);
 		break;
-#endif
-#ifdef CONFIG_NF_TABLES_BRIDGE
-	case NFT_META_BRI_IIFNAME:
-	case NFT_META_BRI_OIFNAME:
-		if (ctx->family != NFPROTO_BRIDGE)
-			return -EOPNOTSUPP;
-		len = IFNAMSIZ;
-		break;
 #endif
 	default:
 		return -EOPNOTSUPP;
@@ -376,6 +345,7 @@ static int nft_meta_get_init(const struct nft_ctx *ctx,
 	return nft_validate_register_store(ctx, priv->dreg, NULL,
 					   NFT_DATA_VALUE, len);
 }
+EXPORT_SYMBOL_GPL(nft_meta_get_init);
 
 static int nft_meta_get_validate(const struct nft_ctx *ctx,
 				 const struct nft_expr *expr,
@@ -409,9 +379,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
 #endif
 }
 
-static int nft_meta_set_validate(const struct nft_ctx *ctx,
-				 const struct nft_expr *expr,
-				 const struct nft_data **data)
+int nft_meta_set_validate(const struct nft_ctx *ctx,
+			  const struct nft_expr *expr,
+			  const struct nft_data **data)
 {
 	struct nft_meta *priv = nft_expr_priv(expr);
 	unsigned int hooks;
@@ -437,10 +407,11 @@ static int nft_meta_set_validate(const struct nft_ctx *ctx,
 
 	return nft_chain_validate_hooks(ctx->chain, hooks);
 }
+EXPORT_SYMBOL_GPL(nft_meta_set_validate);
 
-static int nft_meta_set_init(const struct nft_ctx *ctx,
-			     const struct nft_expr *expr,
-			     const struct nlattr * const tb[])
+int nft_meta_set_init(const struct nft_ctx *ctx,
+		      const struct nft_expr *expr,
+		      const struct nlattr * const tb[])
 {
 	struct nft_meta *priv = nft_expr_priv(expr);
 	unsigned int len;
@@ -475,9 +446,10 @@ static int nft_meta_set_init(const struct nft_ctx *ctx,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(nft_meta_set_init);
 
-static int nft_meta_get_dump(struct sk_buff *skb,
-			     const struct nft_expr *expr)
+int nft_meta_get_dump(struct sk_buff *skb,
+		      const struct nft_expr *expr)
 {
 	const struct nft_meta *priv = nft_expr_priv(expr);
 
@@ -490,8 +462,9 @@ static int nft_meta_get_dump(struct sk_buff *skb,
 nla_put_failure:
 	return -1;
 }
+EXPORT_SYMBOL_GPL(nft_meta_get_dump);
 
-static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_meta *priv = nft_expr_priv(expr);
 
@@ -505,15 +478,17 @@ static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
 nla_put_failure:
 	return -1;
 }
+EXPORT_SYMBOL_GPL(nft_meta_set_dump);
 
-static void nft_meta_set_destroy(const struct nft_ctx *ctx,
-				 const struct nft_expr *expr)
+void nft_meta_set_destroy(const struct nft_ctx *ctx,
+			  const struct nft_expr *expr)
 {
 	const struct nft_meta *priv = nft_expr_priv(expr);
 
 	if (priv->key == NFT_META_NFTRACE)
 		static_branch_dec(&nft_trace_enabled);
 }
+EXPORT_SYMBOL_GPL(nft_meta_set_destroy);
 
 static const struct nft_expr_ops nft_meta_get_ops = {
 	.type		= &nft_meta_type,
-- 
cgit v1.2.3


From 7582f5b70f9a2335f3713edb9a2614a50f1f1a90 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 5 Jul 2019 21:16:34 +0800
Subject: bridge: add br_vlan_get_pvid_rcu()

This new function allows you to fetch the bridge pvid from the packet path.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
 include/linux/if_bridge.h |  6 ++++++
 net/bridge/br_vlan.c      | 19 +++++++++++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index f3fab5d0ea97..950db1dad830 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -88,6 +88,7 @@ static inline bool br_multicast_router(const struct net_device *dev)
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
 bool br_vlan_enabled(const struct net_device *dev);
 int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid);
+int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid);
 int br_vlan_get_info(const struct net_device *dev, u16 vid,
 		     struct bridge_vlan_info *p_vinfo);
 #else
@@ -101,6 +102,11 @@ static inline int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
 	return -EINVAL;
 }
 
+static inline int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid)
+{
+	return -EINVAL;
+}
+
 static inline int br_vlan_get_info(const struct net_device *dev, u16 vid,
 				   struct bridge_vlan_info *p_vinfo)
 {
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index f47f526b4f19..8d97b91ad503 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1227,13 +1227,11 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
 	}
 }
 
-int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+static int __br_vlan_get_pvid(const struct net_device *dev,
+			      struct net_bridge_port *p, u16 *p_pvid)
 {
 	struct net_bridge_vlan_group *vg;
-	struct net_bridge_port *p;
 
-	ASSERT_RTNL();
-	p = br_port_get_check_rtnl(dev);
 	if (p)
 		vg = nbp_vlan_group(p);
 	else if (netif_is_bridge_master(dev))
@@ -1244,8 +1242,21 @@ int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
 	*p_pvid = br_get_pvid(vg);
 	return 0;
 }
+
+int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+{
+	ASSERT_RTNL();
+
+	return __br_vlan_get_pvid(dev, br_port_get_check_rtnl(dev), p_pvid);
+}
 EXPORT_SYMBOL_GPL(br_vlan_get_pvid);
 
+int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid)
+{
+	return __br_vlan_get_pvid(dev, br_port_get_check_rcu(dev), p_pvid);
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu);
+
 int br_vlan_get_info(const struct net_device *dev, u16 vid,
 		     struct bridge_vlan_info *p_vinfo)
 {
-- 
cgit v1.2.3


From c54c7c685494fc0f1662091d4d0c4fc26e810471 Mon Sep 17 00:00:00 2001
From: wenxu <wenxu@ucloud.cn>
Date: Fri, 5 Jul 2019 21:16:35 +0800
Subject: netfilter: nft_meta_bridge: add NFT_META_BRI_IIFPVID support

This patch allows you to match on the bridge port pvid, eg.

nft add rule bridge firewall zones counter meta ibrpvid 10

Signed-off-by: wenxu <wenxu@ucloud.cn>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/bridge/netfilter/nft_meta_bridge.c   | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index c53d581643fe..87474920615a 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -795,6 +795,7 @@ enum nft_exthdr_attributes {
  * @NFT_META_SECPATH: boolean, secpath_exists (!!skb->sp)
  * @NFT_META_IIFKIND: packet input interface kind name (dev->rtnl_link_ops->kind)
  * @NFT_META_OIFKIND: packet output interface kind name (dev->rtnl_link_ops->kind)
+ * @NFT_META_BRI_IIFPVID: packet input bridge port pvid
  */
 enum nft_meta_keys {
 	NFT_META_LEN,
@@ -825,6 +826,7 @@ enum nft_meta_keys {
 	NFT_META_SECPATH,
 	NFT_META_IIFKIND,
 	NFT_META_OIFKIND,
+	NFT_META_BRI_IIFPVID,
 };
 
 /**
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index 2ea8acb4bc4a..9487d42f657a 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -7,6 +7,7 @@
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nft_meta.h>
+#include <linux/if_bridge.h>
 
 static const struct net_device *
 nft_meta_get_bridge(const struct net_device *dev)
@@ -37,6 +38,17 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
 		if (!br_dev)
 			goto err;
 		break;
+	case NFT_META_BRI_IIFPVID: {
+		u16 p_pvid;
+
+		br_dev = nft_meta_get_bridge(in);
+		if (!br_dev || !br_vlan_enabled(br_dev))
+			goto err;
+
+		br_vlan_get_pvid_rcu(in, &p_pvid);
+		nft_reg_store16(dest, p_pvid);
+		return;
+	}
 	default:
 		goto out;
 	}
@@ -62,6 +74,9 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
 	case NFT_META_BRI_OIFNAME:
 		len = IFNAMSIZ;
 		break;
+	case NFT_META_BRI_IIFPVID:
+		len = sizeof(u16);
+		break;
 	default:
 		return nft_meta_get_init(ctx, expr, tb);
 	}
-- 
cgit v1.2.3


From 31aed46fedbba65abece57e14d24f00b52389c4f Mon Sep 17 00:00:00 2001
From: wenxu <wenxu@ucloud.cn>
Date: Fri, 5 Jul 2019 21:16:36 +0800
Subject: bridge: add br_vlan_get_proto()

This new function allows you to fetch the bridge port vlan protocol.

Signed-off-by: wenxu <wenxu@ucloud.cn>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/if_bridge.h |  6 ++++++
 net/bridge/br_vlan.c      | 10 ++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 950db1dad830..9e57c4411734 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -89,6 +89,7 @@ static inline bool br_multicast_router(const struct net_device *dev)
 bool br_vlan_enabled(const struct net_device *dev);
 int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid);
 int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid);
+int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto);
 int br_vlan_get_info(const struct net_device *dev, u16 vid,
 		     struct bridge_vlan_info *p_vinfo);
 #else
@@ -102,6 +103,11 @@ static inline int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
 	return -EINVAL;
 }
 
+static inline int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto)
+{
+	return -EINVAL;
+}
+
 static inline int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid)
 {
 	return -EINVAL;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 8d97b91ad503..021cc9f66804 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -797,6 +797,16 @@ bool br_vlan_enabled(const struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(br_vlan_enabled);
 
+int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto)
+{
+	struct net_bridge *br = netdev_priv(dev);
+
+	*p_proto = ntohs(br->vlan_proto);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_proto);
+
 int __br_vlan_set_proto(struct net_bridge *br, __be16 proto)
 {
 	int err = 0;
-- 
cgit v1.2.3


From 2a3a93ef0ba5166e8b5766bb232f216fd412d40b Mon Sep 17 00:00:00 2001
From: wenxu <wenxu@ucloud.cn>
Date: Fri, 5 Jul 2019 21:16:37 +0800
Subject: netfilter: nft_meta_bridge: Add NFT_META_BRI_IIFVPROTO support

This patch allows you to match on bridge vlan protocol, eg.

nft add rule bridge firewall zones counter meta ibrvproto 0x8100

Signed-off-by: wenxu <wenxu@ucloud.cn>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/bridge/netfilter/nft_meta_bridge.c   | 12 ++++++++++++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 87474920615a..0e3462dfb182 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -796,6 +796,7 @@ enum nft_exthdr_attributes {
  * @NFT_META_IIFKIND: packet input interface kind name (dev->rtnl_link_ops->kind)
  * @NFT_META_OIFKIND: packet output interface kind name (dev->rtnl_link_ops->kind)
  * @NFT_META_BRI_IIFPVID: packet input bridge port pvid
+ * @NFT_META_BRI_IIFVPROTO: packet input bridge vlan proto
  */
 enum nft_meta_keys {
 	NFT_META_LEN,
@@ -827,6 +828,7 @@ enum nft_meta_keys {
 	NFT_META_IIFKIND,
 	NFT_META_OIFKIND,
 	NFT_META_BRI_IIFPVID,
+	NFT_META_BRI_IIFVPROTO,
 };
 
 /**
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index 9487d42f657a..bed66f536b34 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -49,6 +49,17 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
 		nft_reg_store16(dest, p_pvid);
 		return;
 	}
+	case NFT_META_BRI_IIFVPROTO: {
+		u16 p_proto;
+
+		br_dev = nft_meta_get_bridge(in);
+		if (!br_dev || !br_vlan_enabled(br_dev))
+			goto err;
+
+		br_vlan_get_proto(br_dev, &p_proto);
+		nft_reg_store16(dest, p_proto);
+		return;
+	}
 	default:
 		goto out;
 	}
@@ -75,6 +86,7 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
 		len = IFNAMSIZ;
 		break;
 	case NFT_META_BRI_IIFPVID:
+	case NFT_META_BRI_IIFVPROTO:
 		len = sizeof(u16);
 		break;
 	default:
-- 
cgit v1.2.3


From e4aa33ad595936391f7356f25c0c839011f14ead Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Thu, 4 Jul 2019 17:03:26 +0800
Subject: net: remove unused parameter from skb_checksum_try_convert

The check parameter is never used, so remove it from
skb_checksum_try_convert() and __skb_checksum_convert().

Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 8 +++-----
 net/ipv4/gre_demux.c   | 2 +-
 net/ipv4/udp.c         | 3 +--
 net/ipv6/udp.c         | 3 +--
 4 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b5d427b149c9..7ece49d5f8ef 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3919,18 +3919,16 @@ static inline bool __skb_checksum_convert_check(struct sk_buff *skb)
 	return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid);
 }
 
-static inline void __skb_checksum_convert(struct sk_buff *skb,
-					  __sum16 check, __wsum pseudo)
+static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo)
 {
 	skb->csum = ~pseudo;
 	skb->ip_summed = CHECKSUM_COMPLETE;
 }
 
-#define skb_checksum_try_convert(skb, proto, check, compute_pseudo)	\
+#define skb_checksum_try_convert(skb, proto, compute_pseudo)	\
 do {									\
 	if (__skb_checksum_convert_check(skb))				\
-		__skb_checksum_convert(skb, check,			\
-				       compute_pseudo(skb, proto));	\
+		__skb_checksum_convert(skb, compute_pseudo(skb, proto)); \
 } while (0)
 
 static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr,
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 293acfb36376..44bfeecac33e 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -83,7 +83,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 	options = (__be32 *)(greh + 1);
 	if (greh->flags & GRE_CSUM) {
 		if (!skb_checksum_simple_validate(skb)) {
-			skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+			skb_checksum_try_convert(skb, IPPROTO_GRE,
 						 null_compute_pseudo);
 		} else if (csum_err) {
 			*csum_err = true;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1b971bd95786..c21862ba9c02 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2224,8 +2224,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
 	int ret;
 
 	if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
-		skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
-					 inet_compute_pseudo);
+		skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);
 
 	ret = udp_queue_rcv_skb(sk, skb);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 66ca5a4b17c4..4406e059da68 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -826,8 +826,7 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
 	int ret;
 
 	if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
-		skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
-					 ip6_compute_pseudo);
+		skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo);
 
 	ret = udpv6_queue_rcv_skb(sk, skb);
 
-- 
cgit v1.2.3


From e2869fb2068be603b46cd62bc980b4765948c6ed Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Fri, 5 Jul 2019 18:30:12 +0300
Subject: net/mlx5: Kconfig, Better organize compilation flags

Always keep all acceleration function declarations in the
'accel' files, independent of the flag settings.
For this, introduce new flags CONFIG_MLX5_FPGA_{IPSEC,TLS} and use stubs
where needed.

This obsoletes the need for stubs in 'fpga' files. Remove them.

Also use the new flags in the Makefile to decide whether to compile
the TLS-specific and IPsec-specific objects.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig    | 43 ++++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |  7 +-
 .../net/ethernet/mellanox/mlx5/core/accel/ipsec.c  |  4 ++
 .../net/ethernet/mellanox/mlx5/core/accel/ipsec.h  |  2 +-
 .../net/ethernet/mellanox/mlx5/core/accel/tls.c    |  3 +
 .../net/ethernet/mellanox/mlx5/core/accel/tls.h    |  4 +-
 .../net/ethernet/mellanox/mlx5/core/fpga/ipsec.h   | 75 ----------------------
 include/linux/mlx5/accel.h                         |  2 +-
 8 files changed, 47 insertions(+), 93 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 7845aa5bf6be..6556490d809c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -97,26 +97,49 @@ config MLX5_CORE_IPOIB
 	---help---
 	  MLX5 IPoIB offloads & acceleration support.
 
+config MLX5_FPGA_IPSEC
+	bool "Mellanox Technologies IPsec Innova support"
+	depends on MLX5_CORE
+	depends on MLX5_FPGA
+	default n
+	help
+	Build IPsec support for the Innova family of network cards by Mellanox
+	Technologies. Innova network cards are comprised of a ConnectX chip
+	and an FPGA chip on one board. If you select this option, the
+	mlx5_core driver will include the Innova FPGA core and allow building
+	sandbox-specific client drivers.
+
 config MLX5_EN_IPSEC
 	bool "IPSec XFRM cryptography-offload accelaration"
-	depends on MLX5_ACCEL
 	depends on MLX5_CORE_EN
 	depends on XFRM_OFFLOAD
 	depends on INET_ESP_OFFLOAD || INET6_ESP_OFFLOAD
+	depends on MLX5_FPGA_IPSEC
 	default n
-	---help---
+	help
 	  Build support for IPsec cryptography-offload accelaration in the NIC.
 	  Note: Support for hardware with this capability needs to be selected
 	  for this option to become available.
 
-config MLX5_EN_TLS
-	bool "TLS cryptography-offload accelaration"
-	depends on MLX5_CORE_EN
+config MLX5_FPGA_TLS
+	bool "Mellanox Technologies TLS Innova support"
 	depends on TLS_DEVICE
 	depends on TLS=y || MLX5_CORE=m
-	depends on MLX5_ACCEL
+	depends on MLX5_FPGA
 	default n
-	---help---
-	  Build support for TLS cryptography-offload accelaration in the NIC.
-	  Note: Support for hardware with this capability needs to be selected
-	  for this option to become available.
+	help
+	Build TLS support for the Innova family of network cards by Mellanox
+	Technologies. Innova network cards are comprised of a ConnectX chip
+	and an FPGA chip on one board. If you select this option, the
+	mlx5_core driver will include the Innova FPGA core and allow building
+	sandbox-specific client drivers.
+
+config MLX5_EN_TLS
+	bool "TLS cryptography-offload accelaration"
+	depends on MLX5_CORE_EN
+	depends on MLX5_FPGA_TLS
+	default y
+	help
+	Build support for TLS cryptography-offload accelaration in the NIC.
+	Note: Support for hardware with this capability needs to be selected
+	for this option to become available.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8456b19d79cd..d3409870646a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -53,10 +53,11 @@ mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o ipoib/ipoib
 #
 # Accelerations & FPGA
 #
-mlx5_core-$(CONFIG_MLX5_ACCEL) += accel/ipsec.o accel/tls.o
+mlx5_core-$(CONFIG_MLX5_FPGA_IPSEC) += fpga/ipsec.o
+mlx5_core-$(CONFIG_MLX5_FPGA_TLS)   += fpga/tls.o
+mlx5_core-$(CONFIG_MLX5_ACCEL)      += accel/tls.o accel/ipsec.o
 
-mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \
-				 fpga/ipsec.o fpga/tls.o
+mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o
 
 mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \
 				     en_accel/ipsec_stats.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
index d1e76d5a413b..eddc34e4a762 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
@@ -31,6 +31,8 @@
  *
  */
 
+#ifdef CONFIG_MLX5_FPGA_IPSEC
+
 #include <linux/mlx5/device.h>
 
 #include "accel/ipsec.h"
@@ -112,3 +114,5 @@ int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm,
 	return mlx5_fpga_esp_modify_xfrm(xfrm, attrs);
 }
 EXPORT_SYMBOL_GPL(mlx5_accel_esp_modify_xfrm);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h
index 93b3f5faddb5..530e428d46ab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h
@@ -37,7 +37,7 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/accel.h>
 
-#ifdef CONFIG_MLX5_ACCEL
+#ifdef CONFIG_MLX5_FPGA_IPSEC
 
 #define MLX5_IPSEC_DEV(mdev) (mlx5_accel_ipsec_device_caps(mdev) & \
 			      MLX5_ACCEL_IPSEC_CAP_DEVICE)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
index da7bd26368f9..a2c9eda1ebf5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
@@ -35,6 +35,8 @@
 
 #include "accel/tls.h"
 #include "mlx5_core.h"
+
+#ifdef CONFIG_MLX5_FPGA_TLS
 #include "fpga/tls.h"
 
 int mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
@@ -78,3 +80,4 @@ void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev)
 {
 	mlx5_fpga_tls_cleanup(mdev);
 }
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
index def4093ebfae..e5d306ad7f91 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
@@ -37,8 +37,7 @@
 #include <linux/mlx5/driver.h>
 #include <linux/tls.h>
 
-#ifdef CONFIG_MLX5_ACCEL
-
+#ifdef CONFIG_MLX5_FPGA_TLS
 enum {
 	MLX5_ACCEL_TLS_TX = BIT(0),
 	MLX5_ACCEL_TLS_RX = BIT(1),
@@ -88,7 +87,6 @@ static inline bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev) { return
 static inline u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev) { return 0; }
 static inline int mlx5_accel_tls_init(struct mlx5_core_dev *mdev) { return 0; }
 static inline void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev) { }
-
 #endif
 
 #endif	/* __MLX5_ACCEL_TLS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h
index 2b5e63b0d4d6..382985e65b48 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h
@@ -37,8 +37,6 @@
 #include "accel/ipsec.h"
 #include "fs_cmd.h"
 
-#ifdef CONFIG_MLX5_FPGA
-
 u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev);
 unsigned int mlx5_fpga_ipsec_counters_count(struct mlx5_core_dev *mdev);
 int mlx5_fpga_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters,
@@ -66,77 +64,4 @@ int mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm,
 const struct mlx5_flow_cmds *
 mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type);
 
-#else
-
-static inline u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev)
-{
-	return 0;
-}
-
-static inline unsigned int
-mlx5_fpga_ipsec_counters_count(struct mlx5_core_dev *mdev)
-{
-	return 0;
-}
-
-static inline int mlx5_fpga_ipsec_counters_read(struct mlx5_core_dev *mdev,
-						u64 *counters)
-{
-	return 0;
-}
-
-static inline void *
-mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev,
-			      struct mlx5_accel_esp_xfrm *accel_xfrm,
-			      const __be32 saddr[4],
-			      const __be32 daddr[4],
-			      const __be32 spi, bool is_ipv6)
-{
-	return NULL;
-}
-
-static inline void mlx5_fpga_ipsec_delete_sa_ctx(void *context)
-{
-}
-
-static inline int mlx5_fpga_ipsec_init(struct mlx5_core_dev *mdev)
-{
-	return 0;
-}
-
-static inline void mlx5_fpga_ipsec_cleanup(struct mlx5_core_dev *mdev)
-{
-}
-
-static inline void mlx5_fpga_ipsec_build_fs_cmds(void)
-{
-}
-
-static inline struct mlx5_accel_esp_xfrm *
-mlx5_fpga_esp_create_xfrm(struct mlx5_core_dev *mdev,
-			  const struct mlx5_accel_esp_xfrm_attrs *attrs,
-			  u32 flags)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void mlx5_fpga_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm)
-{
-}
-
-static inline int
-mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm,
-			  const struct mlx5_accel_esp_xfrm_attrs *attrs)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline const struct mlx5_flow_cmds *
-mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type)
-{
-	return mlx5_fs_cmd_get_default(type);
-}
-
-#endif /* CONFIG_MLX5_FPGA */
-
 #endif	/* __MLX5_FPGA_SADB_H__ */
diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h
index 70e7e5673ce9..5613e677a5f9 100644
--- a/include/linux/mlx5/accel.h
+++ b/include/linux/mlx5/accel.h
@@ -114,7 +114,7 @@ enum mlx5_accel_ipsec_cap {
 	MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN	= 1 << 7,
 };
 
-#ifdef CONFIG_MLX5_ACCEL
+#ifdef CONFIG_MLX5_FPGA_IPSEC
 
 u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev);
 
-- 
cgit v1.2.3


From 302975cba1a4244d84e645773c82edbcfae1875f Mon Sep 17 00:00:00 2001
From: Spoorthi Ravishankar Koppad <spoorthix.k@intel.com>
Date: Fri, 21 Jun 2019 14:51:56 +0530
Subject: Bluetooth: Add support for LE ping feature

Add the HCI Write Authenticated Payload Timeout command for the
LE Ping feature.

The command is implemented as described in the Bluetooth Core
Specification 5.0, Volume 2, Part E, Section 7.3.94.

Signed-off-by: Spoorthi Ravishankar Koppad <spoorthix.k@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci.h      | 20 +++++++++++
 include/net/bluetooth/hci_core.h |  4 +++
 net/bluetooth/hci_conn.c         |  3 ++
 net/bluetooth/hci_core.c         |  1 +
 net/bluetooth/hci_debugfs.c      | 31 +++++++++++++++++
 net/bluetooth/hci_event.c        | 72 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 131 insertions(+)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 9a5330eed794..5bc1e30dedde 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -1143,6 +1143,26 @@ struct hci_cp_write_sc_support {
 	__u8	support;
 } __packed;
 
+#define HCI_OP_READ_AUTH_PAYLOAD_TO    0x0c7b
+struct hci_cp_read_auth_payload_to {
+	__le16  handle;
+} __packed;
+struct hci_rp_read_auth_payload_to {
+	__u8    status;
+	__le16  handle;
+	__le16  timeout;
+} __packed;
+
+#define HCI_OP_WRITE_AUTH_PAYLOAD_TO    0x0c7c
+struct hci_cp_write_auth_payload_to {
+	__le16  handle;
+	__le16  timeout;
+} __packed;
+struct hci_rp_write_auth_payload_to {
+	__u8    status;
+	__le16  handle;
+} __packed;
+
 #define HCI_OP_READ_LOCAL_OOB_EXT_DATA	0x0c7d
 struct hci_rp_read_local_oob_ext_data {
 	__u8     status;
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 05b1b96f4d9e..ded574b32c20 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -199,6 +199,8 @@ struct adv_info {
 /* Default min/max age of connection information (1s/3s) */
 #define DEFAULT_CONN_INFO_MIN_AGE	1000
 #define DEFAULT_CONN_INFO_MAX_AGE	3000
+/* Default authenticated payload timeout 30s */
+#define DEFAULT_AUTH_PAYLOAD_TIMEOUT   0x0bb8
 
 struct amp_assoc {
 	__u16	len;
@@ -275,6 +277,7 @@ struct hci_dev {
 	__u16		discov_interleaved_timeout;
 	__u16		conn_info_min_age;
 	__u16		conn_info_max_age;
+	__u16		auth_payload_timeout;
 	__u8		ssp_debug_mode;
 	__u8		hw_error_code;
 	__u32		clock;
@@ -481,6 +484,7 @@ struct hci_conn {
 	__u16		disc_timeout;
 	__u16		conn_timeout;
 	__u16		setting;
+	__u16		auth_payload_timeout;
 	__u16		le_conn_min_interval;
 	__u16		le_conn_max_interval;
 	__u16		le_conn_interval;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 15d1cb5aee18..17e5111daa11 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -520,6 +520,9 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 	set_bit(HCI_CONN_POWER_SAVE, &conn->flags);
 	conn->disc_timeout = HCI_DISCONN_TIMEOUT;
 
+	/* Set Default Authenticated payload timeout to 30s */
+	conn->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT;
+
 	if (conn->role == HCI_ROLE_MASTER)
 		conn->out = true;
 
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index b81bf53c5ac4..ff9a755f4df3 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3200,6 +3200,7 @@ struct hci_dev *hci_alloc_dev(void)
 	hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT;
 	hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE;
 	hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE;
+	hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT;
 
 	mutex_init(&hdev->lock);
 	mutex_init(&hdev->req_lock);
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 51f5b1efc3a5..bb67f4a5479a 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -941,6 +941,35 @@ static int adv_max_interval_get(void *data, u64 *val)
 DEFINE_SIMPLE_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get,
 			adv_max_interval_set, "%llu\n");
 
+static int auth_payload_timeout_set(void *data, u64 val)
+{
+	struct hci_dev *hdev = data;
+
+	if (val < 0x0001 || val > 0xffff)
+		return -EINVAL;
+
+	hci_dev_lock(hdev);
+	hdev->auth_payload_timeout = val;
+	hci_dev_unlock(hdev);
+
+	return 0;
+}
+
+static int auth_payload_timeout_get(void *data, u64 *val)
+{
+	struct hci_dev *hdev = data;
+
+	hci_dev_lock(hdev);
+	*val = hdev->auth_payload_timeout;
+	hci_dev_unlock(hdev);
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(auth_payload_timeout_fops,
+			auth_payload_timeout_get,
+			auth_payload_timeout_set, "%llu\n");
+
 DEFINE_QUIRK_ATTRIBUTE(quirk_strict_duplicate_filter,
 		       HCI_QUIRK_STRICT_DUPLICATE_FILTER);
 DEFINE_QUIRK_ATTRIBUTE(quirk_simultaneous_discovery,
@@ -994,6 +1023,8 @@ void hci_debugfs_create_le(struct hci_dev *hdev)
 			    &adv_max_interval_fops);
 	debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs,
 			   &hdev->discov_interleaved_timeout);
+	debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev,
+			    &auth_payload_timeout_fops);
 
 	debugfs_create_file("quirk_strict_duplicate_filter", 0644,
 			    hdev->debugfs, hdev,
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 9e4fcf406d9c..c1d3a303d97f 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -579,6 +579,51 @@ static void hci_cc_read_local_commands(struct hci_dev *hdev,
 		memcpy(hdev->commands, rp->commands, sizeof(hdev->commands));
 }
 
+static void hci_cc_read_auth_payload_timeout(struct hci_dev *hdev,
+					     struct sk_buff *skb)
+{
+	struct hci_rp_read_auth_payload_to *rp = (void *)skb->data;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+	if (conn)
+		conn->auth_payload_timeout = __le16_to_cpu(rp->timeout);
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cc_write_auth_payload_timeout(struct hci_dev *hdev,
+					      struct sk_buff *skb)
+{
+	struct hci_rp_write_auth_payload_to *rp = (void *)skb->data;
+	struct hci_conn *conn;
+	void *sent;
+
+	BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO);
+	if (!sent)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+	if (conn)
+		conn->auth_payload_timeout = get_unaligned_le16(sent + 2);
+
+	hci_dev_unlock(hdev);
+}
+
 static void hci_cc_read_local_features(struct hci_dev *hdev,
 				       struct sk_buff *skb)
 {
@@ -2975,6 +3020,25 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
 		goto unlock;
 	}
 
+	/* Set the default Authenticated Payload Timeout after
+	 * an LE Link is established. As per Core Spec v5.0, Vol 2, Part B
+	 * Section 3.3, the HCI command WRITE_AUTH_PAYLOAD_TIMEOUT should be
+	 * sent when the link is active and Encryption is enabled, the conn
+	 * type can be either LE or ACL and controller must support LMP Ping.
+	 * Ensure for AES-CCM encryption as well.
+	 */
+	if (test_bit(HCI_CONN_ENCRYPT, &conn->flags) &&
+	    test_bit(HCI_CONN_AES_CCM, &conn->flags) &&
+	    ((conn->type == ACL_LINK && lmp_ping_capable(hdev)) ||
+	     (conn->type == LE_LINK && (hdev->le_features[0] & HCI_LE_PING)))) {
+		struct hci_cp_write_auth_payload_to cp;
+
+		cp.handle = cpu_to_le16(conn->handle);
+		cp.timeout = cpu_to_le16(hdev->auth_payload_timeout);
+		hci_send_cmd(conn->hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO,
+			     sizeof(cp), &cp);
+	}
+
 notify:
 	if (conn->state == BT_CONFIG) {
 		if (!ev->status)
@@ -3170,6 +3234,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
 		hci_cc_write_sc_support(hdev, skb);
 		break;
 
+	case HCI_OP_READ_AUTH_PAYLOAD_TO:
+		hci_cc_read_auth_payload_timeout(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_AUTH_PAYLOAD_TO:
+		hci_cc_write_auth_payload_timeout(hdev, skb);
+		break;
+
 	case HCI_OP_READ_LOCAL_VERSION:
 		hci_cc_read_local_version(hdev, skb);
 		break;
-- 
cgit v1.2.3


From 600c70bad6594cb124c641ed05355ca134650ea4 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 1 Jul 2019 10:38:39 -0700
Subject: bpf: allow wide (u64) aligned stores for some fields of bpf_sock_addr

Since commit cd17d7770578 ("bpf/tools: sync bpf.h") clang decided
that it can do a single u64 store into user_ip6[2] instead of two
separate u32 ones:

 #  17: (18) r2 = 0x100000000000000
 #  ; ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_2);
 #  19: (7b) *(u64 *)(r1 +16) = r2
 #  invalid bpf_context access off=16 size=8

This looks like a correct thing to do from the compiler's point of
view, so let's support it on the kernel side.

Credit to Andrii Nakryiko for a proper implementation of
bpf_ctx_wide_store_ok.

Cc: Andrii Nakryiko <andriin@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Fixes: cd17d7770578 ("bpf/tools: sync bpf.h")
Reported-by: kernel test robot <rong.a.chen@intel.com>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h   |  6 ++++++
 include/uapi/linux/bpf.h |  6 +++---
 net/core/filter.c        | 22 ++++++++++++++--------
 3 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1fe53e78c7e3..6d944369ca87 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -747,6 +747,12 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
 	return size <= size_default && (size & (size - 1)) == 0;
 }
 
+#define bpf_ctx_wide_store_ok(off, size, type, field)			\
+	(size == sizeof(__u64) &&					\
+	off >= offsetof(type, field) &&					\
+	off + sizeof(__u64) <= offsetofend(type, field) &&		\
+	off % sizeof(__u64) == 0)
+
 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
 
 static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ead27aebf491..c318385aba51 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3247,7 +3247,7 @@ struct bpf_sock_addr {
 	__u32 user_ip4;		/* Allows 1,2,4-byte read and 4-byte write.
 				 * Stored in network byte order.
 				 */
-	__u32 user_ip6[4];	/* Allows 1,2,4-byte read an 4-byte write.
+	__u32 user_ip6[4];	/* Allows 1,2,4-byte read and 4,8-byte write.
 				 * Stored in network byte order.
 				 */
 	__u32 user_port;	/* Allows 4-byte read and write.
@@ -3256,10 +3256,10 @@ struct bpf_sock_addr {
 	__u32 family;		/* Allows 4-byte read, but no write */
 	__u32 type;		/* Allows 4-byte read, but no write */
 	__u32 protocol;		/* Allows 4-byte read, but no write */
-	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read an 4-byte write.
+	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read and 4-byte write.
 				 * Stored in network byte order.
 				 */
-	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read an 4-byte write.
+	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read and 4,8-byte write.
 				 * Stored in network byte order.
 				 */
 	__bpf_md_ptr(struct bpf_sock *, sk);
diff --git a/net/core/filter.c b/net/core/filter.c
index 089aaea0ccc6..4481e950f020 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6890,6 +6890,16 @@ static bool sock_addr_is_valid_access(int off, int size,
 			if (!bpf_ctx_narrow_access_ok(off, size, size_default))
 				return false;
 		} else {
+			if (bpf_ctx_wide_store_ok(off, size,
+						  struct bpf_sock_addr,
+						  user_ip6))
+				return true;
+
+			if (bpf_ctx_wide_store_ok(off, size,
+						  struct bpf_sock_addr,
+						  msg_src_ip6))
+				return true;
+
 			if (size != size_default)
 				return false;
 		}
@@ -7730,9 +7740,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
  * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
  *
- * It doesn't support SIZE argument though since narrow stores are not
- * supported for now.
- *
  * In addition it uses Temporary Field TF (member of struct S) as the 3rd
  * "register" since two registers available in convert_ctx_access are not
  * enough: we can't override neither SRC, since it contains value to store, nor
@@ -7740,7 +7747,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
  * instructions. But we need a temporary place to save pointer to nested
  * structure whose field we want to store to.
  */
-#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF)		       \
+#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF)	       \
 	do {								       \
 		int tmp_reg = BPF_REG_9;				       \
 		if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)	       \
@@ -7751,8 +7758,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(S, TF));			       \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg,	       \
 				      si->dst_reg, offsetof(S, F));	       \
-		*insn++ = BPF_STX_MEM(					       \
-			BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg,	       \
+		*insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg,	       \
 			bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF),	       \
 				       target_size)			       \
 				+ OFF);					       \
@@ -7764,8 +7770,8 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 						      TF)		       \
 	do {								       \
 		if (type == BPF_WRITE) {				       \
-			SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF,    \
-							 TF);		       \
+			SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE,   \
+							 OFF, TF);	       \
 		} else {						       \
 			SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(		       \
 				S, NS, F, NF, SIZE, OFF);  \
-- 
cgit v1.2.3


From bef8e2639242e7f7214f1ab3b37ace1415a4f750 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 8 Jul 2019 14:57:21 +0200
Subject: bpf: avoid unused variable warning in tcp_bpf_rtt()

When CONFIG_BPF is disabled, we get a warning for an unused
variable:

In file included from drivers/target/target_core_device.c:26:
include/net/tcp.h:2226:19: error: unused variable 'tp' [-Werror,-Wunused-variable]
        struct tcp_sock *tp = tcp_sk(sk);

The variable is only used in one place, so it can be
replaced with its value there to avoid the warning.

Fixes: 23729ff23186 ("bpf: add BPF_CGROUP_SOCK_OPS callback that is executed on every RTT")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/tcp.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e16d8a3fd3b4..cca3c59b98bf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2223,9 +2223,7 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 
 static inline void tcp_bpf_rtt(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTT_CB_FLAG))
+	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG))
 		tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
 }
 
-- 
cgit v1.2.3


From 1da4bbeffe41ba318812d7590955faee8636668b Mon Sep 17 00:00:00 2001
From: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
Date: Tue, 9 Jul 2019 00:34:28 +0300
Subject: net: core: page_pool: add user refcnt and reintroduce
 page_pool_destroy

Jesper recently removed page_pool_destroy() (from driver invocation)
and moved shutdown and free of page_pool into xdp_rxq_info_unreg(),
in order to handle in-flight packets/pages. This created an asymmetry
in drivers' create/destroy pairs.

This patch reintroduces page_pool_destroy() and adds a page_pool user
refcnt. This simplifies driver error handling, as drivers now always
call page_pool_destroy() and don't need to track whether
xdp_rxq_info_reg_mem_model() was unsuccessful.

This could be used for special cases where a single RX-queue (with a
single page_pool) provides packets for two net_devices, and thus
needs to register the same page_pool twice with two xdp_rxq_info
structures.

This patch is primarily to ease API usage for drivers. The recently
merged netsec driver actually has a bug in this area, which is
fixed by this API change.

This patch is a modified version of Ivan Khoronzhuk's original patch.

Link: https://lore.kernel.org/netdev/20190625175948.24771-2-ivan.khoronzhuk@linaro.org/
Fixes: 5c67bf0ec4d0 ("net: netsec: Use page_pool API")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  4 ++--
 drivers/net/ethernet/socionext/netsec.c           |  8 ++------
 include/net/page_pool.h                           | 25 +++++++++++++++++++++++
 net/core/page_pool.c                              |  8 ++++++++
 net/core/xdp.c                                    |  3 +++
 5 files changed, 40 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 83194d56434d..10efd69de7ef 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -577,8 +577,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		}
 		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
 						 MEM_TYPE_PAGE_POOL, rq->page_pool);
-		if (err)
-			page_pool_free(rq->page_pool);
 	}
 	if (err)
 		goto err_free;
@@ -646,6 +644,7 @@ err_rq_wq_destroy:
 	if (rq->xdp_prog)
 		bpf_prog_put(rq->xdp_prog);
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	page_pool_destroy(rq->page_pool);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 
 	return err;
@@ -680,6 +679,7 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 	}
 
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	page_pool_destroy(rq->page_pool);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 }
 
diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index 460777449cd9..d7307ab90d74 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -1212,15 +1212,11 @@ static void netsec_uninit_pkt_dring(struct netsec_priv *priv, int id)
 		}
 	}
 
-	/* Rx is currently using page_pool
-	 * since the pool is created during netsec_setup_rx_dring(), we need to
-	 * free the pool manually if the registration failed
-	 */
+	/* Rx is currently using page_pool */
 	if (id == NETSEC_RING_RX) {
 		if (xdp_rxq_info_is_reg(&dring->xdp_rxq))
 			xdp_rxq_info_unreg(&dring->xdp_rxq);
-		else
-			page_pool_free(dring->page_pool);
+		page_pool_destroy(dring->page_pool);
 	}
 
 	memset(dring->desc, 0, sizeof(struct netsec_desc) * DESC_NUM);
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index ee9c871d2043..2cbcdbdec254 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -101,6 +101,12 @@ struct page_pool {
 	struct ptr_ring ring;
 
 	atomic_t pages_state_release_cnt;
+
+	/* A page_pool is strictly tied to a single RX-queue being
+	 * protected by NAPI, due to above pp_alloc_cache. This
+	 * refcnt serves purpose is to simplify drivers error handling.
+	 */
+	refcount_t user_cnt;
 };
 
 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
@@ -134,6 +140,15 @@ static inline void page_pool_free(struct page_pool *pool)
 #endif
 }
 
+/* Drivers use this instead of page_pool_free */
+static inline void page_pool_destroy(struct page_pool *pool)
+{
+	if (!pool)
+		return;
+
+	page_pool_free(pool);
+}
+
 /* Never call this directly, use helpers below */
 void __page_pool_put_page(struct page_pool *pool,
 			  struct page *page, bool allow_direct);
@@ -201,4 +216,14 @@ static inline bool is_page_pool_compiled_in(void)
 #endif
 }
 
+static inline void page_pool_get(struct page_pool *pool)
+{
+	refcount_inc(&pool->user_cnt);
+}
+
+static inline bool page_pool_put(struct page_pool *pool)
+{
+	return refcount_dec_and_test(&pool->user_cnt);
+}
+
 #endif /* _NET_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index b366f59885c1..3272dc7a8c81 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -49,6 +49,9 @@ static int page_pool_init(struct page_pool *pool,
 
 	atomic_set(&pool->pages_state_release_cnt, 0);
 
+	/* Driver calling page_pool_create() also call page_pool_destroy() */
+	refcount_set(&pool->user_cnt, 1);
+
 	if (pool->p.flags & PP_FLAG_DMA_MAP)
 		get_device(pool->p.dev);
 
@@ -70,6 +73,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
 		kfree(pool);
 		return ERR_PTR(err);
 	}
+
 	return pool;
 }
 EXPORT_SYMBOL(page_pool_create);
@@ -356,6 +360,10 @@ static void __warn_in_flight(struct page_pool *pool)
 
 void __page_pool_free(struct page_pool *pool)
 {
+	/* Only last user actually free/release resources */
+	if (!page_pool_put(pool))
+		return;
+
 	WARN(pool->alloc.count, "API usage violation");
 	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
 
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 829377cc83db..d7bf62ffbb5e 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -370,6 +370,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 		goto err;
 	}
 
+	if (type == MEM_TYPE_PAGE_POOL)
+		page_pool_get(xdp_alloc->page_pool);
+
 	mutex_unlock(&mem_id_lock);
 
 	trace_mem_connect(xdp_alloc, xdp_rxq);
-- 
cgit v1.2.3


From d4117d63a30876a3654f587c3a419db63d8b529d Mon Sep 17 00:00:00 2001
From: Kweh Hock Leong <hock.leong.kweh@intel.com>
Date: Sat, 6 Jul 2019 01:33:27 +0800
Subject: net: stmmac: enable clause 45 mdio support

DWMAC4 is capable of supporting clause 45 MDIO communication.
This patch enables the feature in stmmac_mdio_write() and
stmmac_mdio_read(), following the mdiobus read/write format
used by phy_write_mmd() and phy_read_mmd().

Reviewed-by: Li, Yifan <yifan2.li@intel.com>
Signed-off-by: Kweh Hock Leong <hock.leong.kweh@intel.com>
Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: Voon Weifeng <weifeng.voon@intel.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 43 ++++++++++++++++++-----
 include/linux/phy.h                               |  2 ++
 2 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 18cadf0b0d66..4304c1abc5d1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -24,11 +24,14 @@
 
 #define MII_BUSY 0x00000001
 #define MII_WRITE 0x00000002
+#define MII_DATA_MASK GENMASK(15, 0)
 
 /* GMAC4 defines */
 #define MII_GMAC4_GOC_SHIFT		2
+#define MII_GMAC4_REG_ADDR_SHIFT	16
 #define MII_GMAC4_WRITE			(1 << MII_GMAC4_GOC_SHIFT)
 #define MII_GMAC4_READ			(3 << MII_GMAC4_GOC_SHIFT)
+#define MII_GMAC4_C45E			BIT(1)
 
 /* XGMAC defines */
 #define MII_XGMAC_SADDR			BIT(18)
@@ -155,22 +158,34 @@ static int stmmac_mdio_read(struct mii_bus *bus, int phyaddr, int phyreg)
 	struct stmmac_priv *priv = netdev_priv(ndev);
 	unsigned int mii_address = priv->hw->mii.addr;
 	unsigned int mii_data = priv->hw->mii.data;
-	u32 v;
-	int data;
 	u32 value = MII_BUSY;
+	int data = 0;
+	u32 v;
 
 	value |= (phyaddr << priv->hw->mii.addr_shift)
 		& priv->hw->mii.addr_mask;
 	value |= (phyreg << priv->hw->mii.reg_shift) & priv->hw->mii.reg_mask;
 	value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift)
 		& priv->hw->mii.clk_csr_mask;
-	if (priv->plat->has_gmac4)
+	if (priv->plat->has_gmac4) {
 		value |= MII_GMAC4_READ;
+		if (phyreg & MII_ADDR_C45) {
+			value |= MII_GMAC4_C45E;
+			value &= ~priv->hw->mii.reg_mask;
+			value |= ((phyreg >> MII_DEVADDR_C45_SHIFT) <<
+			       priv->hw->mii.reg_shift) &
+			       priv->hw->mii.reg_mask;
+
+			data |= (phyreg & MII_REGADDR_C45_MASK) <<
+				MII_GMAC4_REG_ADDR_SHIFT;
+		}
+	}
 
 	if (readl_poll_timeout(priv->ioaddr + mii_address, v, !(v & MII_BUSY),
 			       100, 10000))
 		return -EBUSY;
 
+	writel(data, priv->ioaddr + mii_data);
 	writel(value, priv->ioaddr + mii_address);
 
 	if (readl_poll_timeout(priv->ioaddr + mii_address, v, !(v & MII_BUSY),
@@ -178,7 +193,7 @@ static int stmmac_mdio_read(struct mii_bus *bus, int phyaddr, int phyreg)
 		return -EBUSY;
 
 	/* Read the data from the MII data register */
-	data = (int)readl(priv->ioaddr + mii_data);
+	data = (int)readl(priv->ioaddr + mii_data) & MII_DATA_MASK;
 
 	return data;
 }
@@ -198,8 +213,9 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg,
 	struct stmmac_priv *priv = netdev_priv(ndev);
 	unsigned int mii_address = priv->hw->mii.addr;
 	unsigned int mii_data = priv->hw->mii.data;
-	u32 v;
 	u32 value = MII_BUSY;
+	int data = phydata;
+	u32 v;
 
 	value |= (phyaddr << priv->hw->mii.addr_shift)
 		& priv->hw->mii.addr_mask;
@@ -207,10 +223,21 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg,
 
 	value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift)
 		& priv->hw->mii.clk_csr_mask;
-	if (priv->plat->has_gmac4)
+	if (priv->plat->has_gmac4) {
 		value |= MII_GMAC4_WRITE;
-	else
+		if (phyreg & MII_ADDR_C45) {
+			value |= MII_GMAC4_C45E;
+			value &= ~priv->hw->mii.reg_mask;
+			value |= ((phyreg >> MII_DEVADDR_C45_SHIFT) <<
+			       priv->hw->mii.reg_shift) &
+			       priv->hw->mii.reg_mask;
+
+			data |= (phyreg & MII_REGADDR_C45_MASK) <<
+				MII_GMAC4_REG_ADDR_SHIFT;
+		}
+	} else {
 		value |= MII_WRITE;
+	}
 
 	/* Wait until any existing MII operation is complete */
 	if (readl_poll_timeout(priv->ioaddr + mii_address, v, !(v & MII_BUSY),
@@ -218,7 +245,7 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg,
 		return -EBUSY;
 
 	/* Set the MII address register to write */
-	writel(phydata, priv->ioaddr + mii_data);
+	writel(data, priv->ioaddr + mii_data);
 	writel(value, priv->ioaddr + mii_address);
 
 	/* Wait until any existing MII operation is complete */
diff --git a/include/linux/phy.h b/include/linux/phy.h
index d0af7d37fdf9..1739c6dc470e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -195,6 +195,8 @@ static inline const char *phy_modes(phy_interface_t interface)
 /* Or MII_ADDR_C45 into regnum for read/write on mii_bus to enable the 21 bit
    IEEE 802.3ae clause 45 addressing mode used by 10GIGE phy chips. */
 #define MII_ADDR_C45 (1<<30)
+#define MII_DEVADDR_C45_SHIFT	16
+#define MII_REGADDR_C45_MASK	GENMASK(15, 0)
 
 struct device;
 struct phylink;
-- 
cgit v1.2.3


From bf0bdd1343efbbf65b4d53aef1fce14acbd79d50 Mon Sep 17 00:00:00 2001
From: Ilya Maximets <i.maximets@samsung.com>
Date: Wed, 3 Jul 2019 15:09:16 +0300
Subject: xdp: fix race on generic receive path

Unlike driver mode, generic XDP receive can be triggered
by different threads on different CPU cores at the same time,
leading to fill and rx queue breakage. For example, this
can happen while sending packets from two processes to the
first interface of a veth pair while the second end of it is
opened with an AF_XDP socket.

Take a lock around each generic receive to avoid the race.

Fixes: c497176cb2e4 ("xsk: add Rx receive functions and poll support")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Tested-by: William Tu <u9012063@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h |  2 ++
 net/xdp/xsk.c          | 31 ++++++++++++++++++++++---------
 2 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 057b159ff8b9..de4e3a353df3 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -67,6 +67,8 @@ struct xdp_sock {
 	 * in the SKB destructor callback.
 	 */
 	spinlock_t tx_completion_lock;
+	/* Protects generic receive. */
+	spinlock_t rx_lock;
 	u64 rx_dropped;
 };
 
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 74417a851ed5..0574f008954c 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -129,13 +129,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	u64 addr;
 	int err;
 
-	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
-		return -EINVAL;
+	spin_lock_bh(&xs->rx_lock);
+
+	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
 
 	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
 	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
-		xs->rx_dropped++;
-		return -ENOSPC;
+		err = -ENOSPC;
+		goto out_drop;
 	}
 
 	addr += xs->umem->headroom;
@@ -144,13 +148,21 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	memcpy(buffer, xdp->data_meta, len + metalen);
 	addr += metalen;
 	err = xskq_produce_batch_desc(xs->rx, addr, len);
-	if (!err) {
-		xskq_discard_addr(xs->umem->fq);
-		xsk_flush(xs);
-		return 0;
-	}
+	if (err)
+		goto out_drop;
+
+	xskq_discard_addr(xs->umem->fq);
+	xskq_produce_flush_desc(xs->rx);
 
+	spin_unlock_bh(&xs->rx_lock);
+
+	xs->sk.sk_data_ready(&xs->sk);
+	return 0;
+
+out_drop:
 	xs->rx_dropped++;
+out_unlock:
+	spin_unlock_bh(&xs->rx_lock);
 	return err;
 }
 
@@ -787,6 +799,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
 
 	xs = xdp_sk(sk);
 	mutex_init(&xs->mutex);
+	spin_lock_init(&xs->rx_lock);
 	spin_lock_init(&xs->tx_completion_lock);
 
 	mutex_lock(&net->xdp.lock);
-- 
cgit v1.2.3


From 333f7909a8573145811c4ab7d8c9092301707721 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 5 Jul 2019 20:14:16 +0100
Subject: coallocate socket_wq with socket itself

socket->wq is assign-once, set when we are initializing both the
struct socket it's in and the struct socket_wq it points to.  As a
matter of fact, the only reason for separate allocation was the
ability to RCU-delay freeing of socket_wq.  RCU-delaying the
freeing of socket itself gets rid of that need, so we can just
fold struct socket_wq into the end of struct socket and simplify
life both for sock_alloc_inode() (one allocation instead of
two) and for tun/tap oddballs, where we used to embed struct socket
and struct socket_wq into the same structure (now - embedding just
the struct socket).

Note that reference to struct socket_wq in struct sock does remain
a reference - that's unchanged.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tap.c      |  5 ++---
 drivers/net/tun.c      |  8 +++-----
 include/linux/if_tap.h |  1 -
 include/linux/net.h    |  4 ++--
 include/net/sock.h     |  4 ++--
 net/core/sock.c        |  2 +-
 net/socket.c           | 19 +++++--------------
 7 files changed, 15 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 8e01390c738e..dd614c2cd994 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -520,8 +520,7 @@ static int tap_open(struct inode *inode, struct file *file)
 		goto err;
 	}
 
-	RCU_INIT_POINTER(q->sock.wq, &q->wq);
-	init_waitqueue_head(&q->wq.wait);
+	init_waitqueue_head(&q->sock.wq.wait);
 	q->sock.type = SOCK_RAW;
 	q->sock.state = SS_CONNECTED;
 	q->sock.file = file;
@@ -579,7 +578,7 @@ static __poll_t tap_poll(struct file *file, poll_table *wait)
 		goto out;
 
 	mask = 0;
-	poll_wait(file, &q->wq.wait, wait);
+	poll_wait(file, &q->sock.wq.wait, wait);
 
 	if (!ptr_ring_empty(&q->ring))
 		mask |= EPOLLIN | EPOLLRDNORM;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d7c55e0fa8f4..3d443597bd04 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -160,7 +160,6 @@ struct tun_pcpu_stats {
 struct tun_file {
 	struct sock sk;
 	struct socket socket;
-	struct socket_wq wq;
 	struct tun_struct __rcu *tun;
 	struct fasync_struct *fasync;
 	/* only used for fasnyc */
@@ -2165,7 +2164,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 		goto out;
 	}
 
-	add_wait_queue(&tfile->wq.wait, &wait);
+	add_wait_queue(&tfile->socket.wq.wait, &wait);
 
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -2185,7 +2184,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 	}
 
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&tfile->wq.wait, &wait);
+	remove_wait_queue(&tfile->socket.wq.wait, &wait);
 
 out:
 	*err = error;
@@ -3415,8 +3414,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	tfile->flags = 0;
 	tfile->ifindex = 0;
 
-	init_waitqueue_head(&tfile->wq.wait);
-	RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);
+	init_waitqueue_head(&tfile->socket.wq.wait);
 
 	tfile->socket.file = file;
 	tfile->socket.ops = &tun_socket_ops;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 8e66866c11be..915a187cfabd 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -62,7 +62,6 @@ struct tap_dev {
 struct tap_queue {
 	struct sock sk;
 	struct socket sock;
-	struct socket_wq wq;
 	int vnet_hdr_sz;
 	struct tap_dev __rcu *tap;
 	struct file *file;
diff --git a/include/linux/net.h b/include/linux/net.h
index f7d672cf25b5..9cafb5f353a9 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -116,11 +116,11 @@ struct socket {
 
 	unsigned long		flags;
 
-	struct socket_wq	*wq;
-
 	struct file		*file;
 	struct sock		*sk;
 	const struct proto_ops	*ops;
+
+	struct socket_wq	wq;
 };
 
 struct vm_area_struct;
diff --git a/include/net/sock.h b/include/net/sock.h
index 6cbc16136357..228db3998e46 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1822,7 +1822,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 {
 	WARN_ON(parent->sk);
 	write_lock_bh(&sk->sk_callback_lock);
-	rcu_assign_pointer(sk->sk_wq, parent->wq);
+	rcu_assign_pointer(sk->sk_wq, &parent->wq);
 	parent->sk = sk;
 	sk_set_socket(sk, parent);
 	sk->sk_uid = SOCK_INODE(parent)->i_uid;
@@ -2100,7 +2100,7 @@ static inline void sock_poll_wait(struct file *filp, struct socket *sock,
 				  poll_table *p)
 {
 	if (!poll_does_not_wait(p)) {
-		poll_wait(filp, &sock->wq->wait, p);
+		poll_wait(filp, &sock->wq.wait, p);
 		/* We need to be sure we are in sync with the
 		 * socket flags modification.
 		 *
diff --git a/net/core/sock.c b/net/core/sock.c
index 0eb21384079d..3e073ca6138f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2847,7 +2847,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 	if (sock) {
 		sk->sk_type	=	sock->type;
-		RCU_INIT_POINTER(sk->sk_wq, sock->wq);
+		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
 		sock->sk	=	sk;
 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
 	} else {
diff --git a/net/socket.c b/net/socket.c
index 541719a2443d..16449d6daeca 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -234,20 +234,13 @@ static struct kmem_cache *sock_inode_cachep __ro_after_init;
 static struct inode *sock_alloc_inode(struct super_block *sb)
 {
 	struct socket_alloc *ei;
-	struct socket_wq *wq;
 
 	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
-	if (!wq) {
-		kmem_cache_free(sock_inode_cachep, ei);
-		return NULL;
-	}
-	init_waitqueue_head(&wq->wait);
-	wq->fasync_list = NULL;
-	wq->flags = 0;
-	ei->socket.wq = wq;
+	init_waitqueue_head(&ei->socket.wq.wait);
+	ei->socket.wq.fasync_list = NULL;
+	ei->socket.wq.flags = 0;
 
 	ei->socket.state = SS_UNCONNECTED;
 	ei->socket.flags = 0;
@@ -263,7 +256,6 @@ static void sock_free_inode(struct inode *inode)
 	struct socket_alloc *ei;
 
 	ei = container_of(inode, struct socket_alloc, vfs_inode);
-	kfree(ei->socket.wq);
 	kmem_cache_free(sock_inode_cachep, ei);
 }
 
@@ -599,7 +591,7 @@ static void __sock_release(struct socket *sock, struct inode *inode)
 		module_put(owner);
 	}
 
-	if (sock->wq->fasync_list)
+	if (sock->wq.fasync_list)
 		pr_err("%s: fasync list not empty!\n", __func__);
 
 	if (!sock->file) {
@@ -1288,13 +1280,12 @@ static int sock_fasync(int fd, struct file *filp, int on)
 {
 	struct socket *sock = filp->private_data;
 	struct sock *sk = sock->sk;
-	struct socket_wq *wq;
+	struct socket_wq *wq = &sock->wq;
 
 	if (sk == NULL)
 		return -EINVAL;
 
 	lock_sock(sk);
-	wq = sock->wq;
 	fasync_helper(fd, filp, on, &wq->fasync_list);
 
 	if (!wq->fasync_list)
-- 
cgit v1.2.3


From 59c820b2317f0ffe1ab9b5d2c0515cdbfe714e6e Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Sun, 7 Jul 2019 05:34:45 -0400
Subject: ipv6: elide flowlabel check if no exclusive leases exist

Processes can request ipv6 flowlabels with cmsg IPV6_FLOWINFO.
If not set, by default an autogenerated flowlabel is selected.

Explicit flowlabels require a control operation per label plus a
datapath check on every connection (every datagram if unconnected).
This is particularly expensive on unconnected sockets multiplexing
many flows, such as QUIC.

In the common case, where no lease is exclusive, the check can be
safely elided, as both lease request and check trivially succeed.
Indeed, autoflowlabel does the same even with exclusive leases.

Elide the check if no process has requested an exclusive lease.

fl6_sock_lookup previously returned either a reference to a lease or
NULL to denote failure. Modify it to return a real error and update
all callers. On a NULL return, they can use the label and will elide
the atomic_dec in fl6_sock_release.

This is an optimization. Robust applications still have to revert to
requesting leases if the fast path fails due to an exclusive lease.

Changes RFC->v1:
  - use static_key_false_deferred to rate limit jump label operations
    - call static_key_deferred_flush to stop timers on exit
  - move decrement out of RCU context
  - defer optimization also if opt data is associated with a lease
  - updated all fl6_sock_lookup callers, not just udp

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h       | 14 +++++++++++++-
 net/dccp/ipv6.c          |  2 +-
 net/ipv6/ip6_flowlabel.c | 27 +++++++++++++++++++++++----
 net/ipv6/raw.c           |  4 ++--
 net/ipv6/tcp_ipv6.c      |  2 +-
 net/ipv6/udp.c           |  4 ++--
 net/l2tp/l2tp_ip6.c      |  4 ++--
 net/sctp/ipv6.c          |  2 +-
 8 files changed, 45 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 8eca5fb30376..8dfc65639aa4 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -13,6 +13,7 @@
 #include <linux/hardirq.h>
 #include <linux/jhash.h>
 #include <linux/refcount.h>
+#include <linux/jump_label_ratelimit.h>
 #include <net/if_inet6.h>
 #include <net/ndisc.h>
 #include <net/flow.h>
@@ -389,7 +390,18 @@ static inline void txopt_put(struct ipv6_txoptions *opt)
 		kfree_rcu(opt, rcu);
 }
 
-struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label);
+struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label);
+
+extern struct static_key_false_deferred ipv6_flowlabel_exclusive;
+static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk,
+						    __be32 label)
+{
+	if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key))
+		return __fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT);
+
+	return NULL;
+}
+
 struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
 					 struct ip6_flowlabel *fl,
 					 struct ipv6_txoptions *fopt);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 85c10c8f50bd..1b7381ff787b 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -830,7 +830,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
 			struct ip6_flowlabel *flowlabel;
 			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
-			if (flowlabel == NULL)
+			if (IS_ERR(flowlabel))
 				return -EINVAL;
 			fl6_sock_release(flowlabel);
 		}
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 545e339b8c4f..ad284b1fd308 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -17,6 +17,7 @@
 #include <linux/slab.h>
 #include <linux/export.h>
 #include <linux/pid_namespace.h>
+#include <linux/jump_label_ratelimit.h>
 
 #include <net/net_namespace.h>
 #include <net/sock.h>
@@ -53,6 +54,9 @@ static DEFINE_SPINLOCK(ip6_fl_lock);
 
 static DEFINE_SPINLOCK(ip6_sk_fl_lock);
 
+DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ);
+EXPORT_SYMBOL(ipv6_flowlabel_exclusive);
+
 #define for_each_fl_rcu(hash, fl)				\
 	for (fl = rcu_dereference_bh(fl_ht[(hash)]);		\
 	     fl != NULL;					\
@@ -90,6 +94,13 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label)
 	return fl;
 }
 
+static bool fl_shared_exclusive(struct ip6_flowlabel *fl)
+{
+	return fl->share == IPV6_FL_S_EXCL ||
+	       fl->share == IPV6_FL_S_PROCESS ||
+	       fl->share == IPV6_FL_S_USER;
+}
+
 static void fl_free_rcu(struct rcu_head *head)
 {
 	struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu);
@@ -103,8 +114,13 @@ static void fl_free_rcu(struct rcu_head *head)
 
 static void fl_free(struct ip6_flowlabel *fl)
 {
-	if (fl)
-		call_rcu(&fl->rcu, fl_free_rcu);
+	if (!fl)
+		return;
+
+	if (fl_shared_exclusive(fl) || fl->opt)
+		static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive);
+
+	call_rcu(&fl->rcu, fl_free_rcu);
 }
 
 static void fl_release(struct ip6_flowlabel *fl)
@@ -240,7 +256,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
 
 /* Socket flowlabel lists */
 
-struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label)
+struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label)
 {
 	struct ipv6_fl_socklist *sfl;
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -260,7 +276,7 @@ struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label)
 	rcu_read_unlock_bh();
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(fl6_sock_lookup);
+EXPORT_SYMBOL_GPL(__fl6_sock_lookup);
 
 void fl6_free_socklist(struct sock *sk)
 {
@@ -419,6 +435,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
 	}
 	fl->dst = freq->flr_dst;
 	atomic_set(&fl->users, 1);
+	if (fl_shared_exclusive(fl) || fl->opt)
+		static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
 	switch (fl->share) {
 	case IPV6_FL_S_EXCL:
 	case IPV6_FL_S_ANY:
@@ -854,6 +872,7 @@ int ip6_flowlabel_init(void)
 
 void ip6_flowlabel_cleanup(void)
 {
+	static_key_deferred_flush(&ipv6_flowlabel_exclusive);
 	del_timer(&ip6_fl_gc_timer);
 	unregister_pernet_subsys(&ip6_flowlabel_net_ops);
 }
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 70693bc7ad9d..8a6131991e38 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -834,7 +834,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
 			if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
 				flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
-				if (!flowlabel)
+				if (IS_ERR(flowlabel))
 					return -EINVAL;
 			}
 		}
@@ -876,7 +876,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		}
 		if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
 			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
-			if (!flowlabel)
+			if (IS_ERR(flowlabel))
 				return -EINVAL;
 		}
 		if (!(opt->opt_nflen|opt->opt_flen))
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4f3f99b39820..d56a9019a0fe 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -171,7 +171,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
 			struct ip6_flowlabel *flowlabel;
 			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
-			if (!flowlabel)
+			if (IS_ERR(flowlabel))
 				return -EINVAL;
 			fl6_sock_release(flowlabel);
 		}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4406e059da68..827fe7385078 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1319,7 +1319,7 @@ do_udp_sendmsg:
 			fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
 			if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
 				flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
-				if (!flowlabel)
+				if (IS_ERR(flowlabel))
 					return -EINVAL;
 			}
 		}
@@ -1371,7 +1371,7 @@ do_udp_sendmsg:
 		}
 		if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
 			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
-			if (!flowlabel)
+			if (IS_ERR(flowlabel))
 				return -EINVAL;
 		}
 		if (!(opt->opt_nflen|opt->opt_flen))
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 1a76a0a4e3ab..687e23a8b326 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -536,7 +536,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK;
 			if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
 				flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
-				if (flowlabel == NULL)
+				if (IS_ERR(flowlabel))
 					return -EINVAL;
 			}
 		}
@@ -577,7 +577,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		}
 		if ((fl6.flowlabel & IPV6_FLOWLABEL_MASK) && !flowlabel) {
 			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
-			if (flowlabel == NULL)
+			if (IS_ERR(flowlabel))
 				return -EINVAL;
 		}
 		if (!(opt->opt_nflen|opt->opt_flen))
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 64e0a594a651..e5f2fc726a98 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -253,7 +253,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 		struct ip6_flowlabel *flowlabel;
 
 		flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
-		if (!flowlabel)
+		if (IS_ERR(flowlabel))
 			goto out;
 		fl6_sock_release(flowlabel);
 	}
-- 
cgit v1.2.3


From 6413139dfc641aaaa30580b59696a5f7ea274194 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Sun, 7 Jul 2019 05:51:55 -0400
Subject: skbuff: increase verbosity when dumping skb data

skb_warn_bad_offload and netdev_rx_csum_fault trigger on hard to debug
issues. Dump more state and the header.

Optionally dump the entire packet and linear segment. This is required
to debug checksum bugs that may include bytes past skb_tail_pointer().

Both call sites call this function inside a net_ratelimit() block.
Limit full packet log further to a hard limit of can_dump_full (5).

Based on an earlier patch by Cong Wang, see link below.

Changes v1 -> v2
  - dump frag_list only on full_pkt

Link: https://patchwork.ozlabs.org/patch/1000841/
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  1 +
 net/core/dev.c         | 16 ++------
 net/core/skbuff.c      | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 104 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7ece49d5f8ef..1fdfdbb34e8e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1024,6 +1024,7 @@ static inline bool skb_unref(struct sk_buff *skb)
 void skb_release_head_state(struct sk_buff *skb);
 void kfree_skb(struct sk_buff *skb);
 void kfree_skb_list(struct sk_buff *segs);
+void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt);
 void skb_tx_error(struct sk_buff *skb);
 void consume_skb(struct sk_buff *skb);
 void __consume_stateless_skb(struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 58529318b3a9..fc676b2610e3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2900,12 +2900,10 @@ static void skb_warn_bad_offload(const struct sk_buff *skb)
 		else
 			name = netdev_name(dev);
 	}
-	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
-	     "gso_type=%d ip_summed=%d\n",
+	skb_dump(KERN_WARNING, skb, false);
+	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 	     name, dev ? &dev->features : &null_features,
-	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
-	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
-	     skb_shinfo(skb)->gso_type, skb->ip_summed);
+	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 }
 
 /*
@@ -3124,13 +3122,7 @@ void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 {
 	if (net_ratelimit()) {
 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
-		if (dev)
-			pr_err("dev features: %pNF\n", &dev->features);
-		pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n",
-		       skb->len, skb->data_len, skb->pkt_type,
-		       skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type,
-		       skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum,
-		       skb->csum_complete_sw, skb->csum_valid, skb->csum_level);
+		skb_dump(KERN_ERR, skb, true);
 		dump_stack();
 	}
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5323441a12cc..cdb0ccdaac0b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -707,6 +707,105 @@ void kfree_skb_list(struct sk_buff *segs)
 }
 EXPORT_SYMBOL(kfree_skb_list);
 
+/* Dump skb information and contents.
+ *
+ * Must only be called from net_ratelimit()-ed paths.
+ *
+ * Dumps up to can_dump_full whole packets if full_pkt, headers otherwise.
+ */
+void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
+{
+	static atomic_t can_dump_full = ATOMIC_INIT(5);
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct net_device *dev = skb->dev;
+	struct sock *sk = skb->sk;
+	struct sk_buff *list_skb;
+	bool has_mac, has_trans;
+	int headroom, tailroom;
+	int i, len, seg_len;
+
+	if (full_pkt)
+		full_pkt = atomic_dec_if_positive(&can_dump_full) >= 0;
+
+	if (full_pkt)
+		len = skb->len;
+	else
+		len = min_t(int, skb->len, MAX_HEADER + 128);
+
+	headroom = skb_headroom(skb);
+	tailroom = skb_tailroom(skb);
+
+	has_mac = skb_mac_header_was_set(skb);
+	has_trans = skb_transport_header_was_set(skb);
+
+	printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
+	       "mac=(%d,%d) net=(%d,%d) trans=%d\n"
+	       "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
+	       "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
+	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
+	       level, skb->len, headroom, skb_headlen(skb), tailroom,
+	       has_mac ? skb->mac_header : -1,
+	       has_mac ? skb_mac_header_len(skb) : -1,
+	       skb->network_header,
+	       has_trans ? skb_network_header_len(skb) : -1,
+	       has_trans ? skb->transport_header : -1,
+	       sh->tx_flags, sh->nr_frags,
+	       sh->gso_size, sh->gso_type, sh->gso_segs,
+	       skb->csum, skb->ip_summed, skb->csum_complete_sw,
+	       skb->csum_valid, skb->csum_level,
+	       skb->hash, skb->sw_hash, skb->l4_hash,
+	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
+
+	if (dev)
+		printk("%sdev name=%s feat=0x%pNF\n",
+		       level, dev->name, &dev->features);
+	if (sk)
+		printk("%ssk family=%hu type=%hu proto=%hu\n",
+		       level, sk->sk_family, sk->sk_type, sk->sk_protocol);
+
+	if (full_pkt && headroom)
+		print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
+			       16, 1, skb->head, headroom, false);
+
+	seg_len = min_t(int, skb_headlen(skb), len);
+	if (seg_len)
+		print_hex_dump(level, "skb linear:   ", DUMP_PREFIX_OFFSET,
+			       16, 1, skb->data, seg_len, false);
+	len -= seg_len;
+
+	if (full_pkt && tailroom)
+		print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
+			       16, 1, skb_tail_pointer(skb), tailroom, false);
+
+	for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		u32 p_off, p_len, copied;
+		struct page *p;
+		u8 *vaddr;
+
+		skb_frag_foreach_page(frag, frag->page_offset,
+				      skb_frag_size(frag), p, p_off, p_len,
+				      copied) {
+			seg_len = min_t(int, p_len, len);
+			vaddr = kmap_atomic(p);
+			print_hex_dump(level, "skb frag:     ",
+				       DUMP_PREFIX_OFFSET,
+				       16, 1, vaddr + p_off, seg_len, false);
+			kunmap_atomic(vaddr);
+			len -= seg_len;
+			if (!len)
+				break;
+		}
+	}
+
+	if (full_pkt && skb_has_frag_list(skb)) {
+		printk("skb fraglist:\n");
+		skb_walk_frags(skb, list_skb)
+			skb_dump(level, list_skb, true);
+	}
+}
+EXPORT_SYMBOL(skb_dump);
+
 /**
  *	skb_tx_error - report an sk_buff xmit error
  *	@skb: buffer that triggered an error
-- 
cgit v1.2.3


From 8822e270d697010e6a4fd42a319dbefc33db91e1 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Sun, 7 Jul 2019 15:01:54 +0100
Subject: net: core: move push MPLS functionality from OvS to core helper

Open vSwitch provides code to push an MPLS header to a packet. In
preparation for supporting this in TC, move the push code to an skb helper
that can be reused.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  1 +
 net/core/skbuff.c         | 64 +++++++++++++++++++++++++++++++++++++++++++++++
 net/openvswitch/actions.c | 31 +++--------------------
 3 files changed, 69 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1fdfdbb34e8e..1dc55000710c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3447,6 +3447,7 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len);
 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
 int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
+int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto);
 struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
 			     gfp_t gfp);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index cdb0ccdaac0b..495fd743a935 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -66,6 +66,7 @@
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
 #include <net/xfrm.h>
+#include <net/mpls.h>
 
 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -5425,6 +5426,69 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
 }
 EXPORT_SYMBOL(skb_vlan_push);
 
+/* Update the ethertype of hdr and the skb csum value if required. */
+static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
+			     __be16 ethertype)
+{
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		__be16 diff[] = { ~hdr->h_proto, ethertype };
+
+		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
+	}
+
+	hdr->h_proto = ethertype;
+}
+
+/**
+ * skb_mpls_push() - push a new MPLS header after the mac header
+ *
+ * @skb: buffer
+ * @mpls_lse: MPLS label stack entry to push
+ * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto)
+{
+	struct mpls_shim_hdr *lse;
+	int err;
+
+	if (unlikely(!eth_p_mpls(mpls_proto)))
+		return -EINVAL;
+
+	/* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
+	if (skb->encapsulation)
+		return -EINVAL;
+
+	err = skb_cow_head(skb, MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	if (!skb->inner_protocol) {
+		skb_set_inner_network_header(skb, skb->mac_len);
+		skb_set_inner_protocol(skb, skb->protocol);
+	}
+
+	skb_push(skb, MPLS_HLEN);
+	memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
+		skb->mac_len);
+	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, skb->mac_len);
+
+	lse = mpls_hdr(skb);
+	lse->label_stack_entry = mpls_lse;
+	skb_postpush_rcsum(skb, lse, MPLS_HLEN);
+
+	if (skb->dev && skb->dev->type == ARPHRD_ETHER)
+		skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
+	skb->protocol = mpls_proto;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_push);
+
 /**
  * alloc_skb_with_frags - allocate skb with page frags
  *
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index bd131469e4ca..a9a6c9cbf946 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -175,34 +175,11 @@ static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
 static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 		     const struct ovs_action_push_mpls *mpls)
 {
-	struct mpls_shim_hdr *new_mpls_lse;
-
-	/* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
-	if (skb->encapsulation)
-		return -ENOTSUPP;
-
-	if (skb_cow_head(skb, MPLS_HLEN) < 0)
-		return -ENOMEM;
-
-	if (!skb->inner_protocol) {
-		skb_set_inner_network_header(skb, skb->mac_len);
-		skb_set_inner_protocol(skb, skb->protocol);
-	}
-
-	skb_push(skb, MPLS_HLEN);
-	memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
-		skb->mac_len);
-	skb_reset_mac_header(skb);
-	skb_set_network_header(skb, skb->mac_len);
-
-	new_mpls_lse = mpls_hdr(skb);
-	new_mpls_lse->label_stack_entry = mpls->mpls_lse;
-
-	skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
+	int err;
 
-	if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET)
-		update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
-	skb->protocol = mpls->mpls_ethertype;
+	err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype);
+	if (err)
+		return err;
 
 	invalidate_flow_key(key);
 	return 0;
-- 
cgit v1.2.3


From ed246cee09b9865145a2e1e34f63ec0e31dd83a5 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Sun, 7 Jul 2019 15:01:55 +0100
Subject: net: core: move pop MPLS functionality from OvS to core helper

Open vSwitch provides code to pop an MPLS header from a packet. In
preparation for supporting this in TC, move the pop code to an skb helper
that can be reused.

Remove the now-unused update_ethertype static function from OvS.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  1 +
 net/core/skbuff.c         | 42 ++++++++++++++++++++++++++++++++++++++++++
 net/openvswitch/actions.c | 37 ++-----------------------------------
 3 files changed, 45 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1dc55000710c..08d1c8e70540 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3448,6 +3448,7 @@ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
 int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
 int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto);
+int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto);
 struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
 			     gfp_t gfp);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 495fd743a935..8c00be4d8919 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5489,6 +5489,48 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto)
 }
 EXPORT_SYMBOL_GPL(skb_mpls_push);
 
+/**
+ * skb_mpls_pop() - pop the outermost MPLS header
+ *
+ * @skb: buffer
+ * @next_proto: ethertype of header after popped MPLS header
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto)
+{
+	int err;
+
+	if (unlikely(!eth_p_mpls(skb->protocol)))
+		return -EINVAL;
+
+	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
+	memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
+		skb->mac_len);
+
+	__skb_pull(skb, MPLS_HLEN);
+	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, skb->mac_len);
+
+	if (skb->dev && skb->dev->type == ARPHRD_ETHER) {
+		struct ethhdr *hdr;
+
+		/* use mpls_hdr() to get ethertype to account for VLANs. */
+		hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
+		skb_mod_eth_type(skb, hdr, next_proto);
+	}
+	skb->protocol = next_proto;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_pop);
+
 /**
  * alloc_skb_with_frags - allocate skb with page frags
  *
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a9a6c9cbf946..62715bb8d611 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -160,18 +160,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 			      struct sw_flow_key *key,
 			      const struct nlattr *attr, int len);
 
-static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
-			     __be16 ethertype)
-{
-	if (skb->ip_summed == CHECKSUM_COMPLETE) {
-		__be16 diff[] = { ~(hdr->h_proto), ethertype };
-
-		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
-	}
-
-	hdr->h_proto = ethertype;
-}
-
 static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 		     const struct ovs_action_push_mpls *mpls)
 {
@@ -190,31 +178,10 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 {
 	int err;
 
-	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
-	if (unlikely(err))
+	err = skb_mpls_pop(skb, ethertype);
+	if (err)
 		return err;
 
-	skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
-
-	memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
-		skb->mac_len);
-
-	__skb_pull(skb, MPLS_HLEN);
-	skb_reset_mac_header(skb);
-	skb_set_network_header(skb, skb->mac_len);
-
-	if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) {
-		struct ethhdr *hdr;
-
-		/* mpls_hdr() is used to locate the ethertype field correctly in the
-		 * presence of VLAN tags.
-		 */
-		hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
-		update_ethertype(skb, hdr, ethertype);
-	}
-	if (eth_p_mpls(skb->protocol))
-		skb->protocol = ethertype;
-
 	invalidate_flow_key(key);
 	return 0;
 }
-- 
cgit v1.2.3


From d27cf5c59a12f66425df29cd81f61aa73ef14ac1 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Sun, 7 Jul 2019 15:01:56 +0100
Subject: net: core: add MPLS update core helper and use in OvS

Open vSwitch allows the updating of an existing MPLS header on a packet.
In preparation for supporting similar functionality in TC, move this to a
common skb helper function.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  1 +
 net/core/skbuff.c         | 33 +++++++++++++++++++++++++++++++++
 net/openvswitch/actions.c | 13 +++----------
 3 files changed, 37 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 08d1c8e70540..9f7e01f2be83 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3449,6 +3449,7 @@ int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
 int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto);
 int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto);
+int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse);
 struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
 			     gfp_t gfp);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8c00be4d8919..93443a01ab39 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5531,6 +5531,39 @@ int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto)
 }
 EXPORT_SYMBOL_GPL(skb_mpls_pop);
 
+/**
+ * skb_mpls_update_lse() - modify outermost MPLS header and update csum
+ *
+ * @skb: buffer
+ * @mpls_lse: new MPLS label stack entry to update to
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
+{
+	int err;
+
+	if (unlikely(!eth_p_mpls(skb->protocol)))
+		return -EINVAL;
+
+	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		__be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse };
+
+		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
+	}
+
+	mpls_hdr(skb)->label_stack_entry = mpls_lse;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
+
 /**
  * alloc_skb_with_frags - allocate skb with page frags
  *
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 62715bb8d611..3572e11b6f21 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -193,19 +193,12 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
 	__be32 lse;
 	int err;
 
-	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
-	if (unlikely(err))
-		return err;
-
 	stack = mpls_hdr(skb);
 	lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask);
-	if (skb->ip_summed == CHECKSUM_COMPLETE) {
-		__be32 diff[] = { ~(stack->label_stack_entry), lse };
-
-		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
-	}
+	err = skb_mpls_update_lse(skb, lse);
+	if (err)
+		return err;
 
-	stack->label_stack_entry = lse;
 	flow_key->mpls.top_lse = lse;
 	return 0;
 }
-- 
cgit v1.2.3


From 2a2ea50870baa3fb4de0872c5b60828138654ca7 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Sun, 7 Jul 2019 15:01:57 +0100
Subject: net: sched: add mpls manipulation actions to TC

Currently, TC offers the ability to match on the MPLS fields of a packet
through the use of the flow_dissector_key_mpls struct. However, as yet, TC
actions do not allow the modification or manipulation of such fields.

Add a new module that registers TC action ops to allow manipulation of
MPLS. This includes the ability to push and pop headers as well as modify
the contents of new or existing headers. A further action to decrement the
TTL field of an MPLS header is also provided with a new helper added to
support this.

Examples of the usage of the new action with flower rules to push and pop
MPLS labels are:

tc filter add dev eth0 protocol ip parent ffff: flower \
    action mpls push protocol mpls_uc label 123  \
    action mirred egress redirect dev eth1

tc filter add dev eth0 protocol mpls_uc parent ffff: flower \
    action mpls pop protocol ipv4  \
    action mirred egress redirect dev eth1

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h              |   1 +
 include/net/tc_act/tc_mpls.h        |  30 +++
 include/uapi/linux/pkt_cls.h        |   3 +-
 include/uapi/linux/tc_act/tc_mpls.h |  33 +++
 net/core/skbuff.c                   |  30 +++
 net/sched/Kconfig                   |  11 +
 net/sched/Makefile                  |   1 +
 net/sched/act_mpls.c                | 406 ++++++++++++++++++++++++++++++++++++
 8 files changed, 514 insertions(+), 1 deletion(-)
 create mode 100644 include/net/tc_act/tc_mpls.h
 create mode 100644 include/uapi/linux/tc_act/tc_mpls.h
 create mode 100644 net/sched/act_mpls.c

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9f7e01f2be83..9d7a2c28ea35 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3450,6 +3450,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
 int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto);
 int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto);
 int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse);
+int skb_mpls_dec_ttl(struct sk_buff *skb);
 struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
 			     gfp_t gfp);
 
diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h
new file mode 100644
index 000000000000..4bc3d9250ef0
--- /dev/null
+++ b/include/net/tc_act/tc_mpls.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#ifndef __NET_TC_MPLS_H
+#define __NET_TC_MPLS_H
+
+#include <linux/tc_act/tc_mpls.h>
+#include <net/act_api.h>
+
+struct tcf_mpls_params {
+	int tcfm_action;
+	u32 tcfm_label;
+	u8 tcfm_tc;
+	u8 tcfm_ttl;
+	u8 tcfm_bos;
+	__be16 tcfm_proto;
+	struct rcu_head	rcu;
+};
+
+#define ACT_MPLS_TC_NOT_SET	0xff
+#define ACT_MPLS_BOS_NOT_SET	0xff
+#define ACT_MPLS_LABEL_NOT_SET	0xffffffff
+
+struct tcf_mpls {
+	struct tc_action common;
+	struct tcf_mpls_params __rcu *mpls_p;
+};
+#define to_mpls(a) ((struct tcf_mpls *)a)
+
+#endif /* __NET_TC_MPLS_H */
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 8cc6b6777b3c..e22ef4a940bc 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -104,8 +104,9 @@ enum tca_id {
 	TCA_ID_SIMP = TCA_ACT_SIMP,
 	TCA_ID_IFE = TCA_ACT_IFE,
 	TCA_ID_SAMPLE = TCA_ACT_SAMPLE,
-	/* other actions go here */
 	TCA_ID_CTINFO,
+	TCA_ID_MPLS,
+	/* other actions go here */
 	__TCA_ID_MAX = 255
 };
 
diff --git a/include/uapi/linux/tc_act/tc_mpls.h b/include/uapi/linux/tc_act/tc_mpls.h
new file mode 100644
index 000000000000..9360e95273c7
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_mpls.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#ifndef __LINUX_TC_MPLS_H
+#define __LINUX_TC_MPLS_H
+
+#include <linux/pkt_cls.h>
+
+#define TCA_MPLS_ACT_POP	1
+#define TCA_MPLS_ACT_PUSH	2
+#define TCA_MPLS_ACT_MODIFY	3
+#define TCA_MPLS_ACT_DEC_TTL	4
+
+struct tc_mpls {
+	tc_gen;		/* generic TC action fields. */
+	int m_action;	/* action of type TCA_MPLS_ACT_*. */
+};
+
+enum {
+	TCA_MPLS_UNSPEC,
+	TCA_MPLS_TM,	/* struct tcf_t; time values associated with action. */
+	TCA_MPLS_PARMS,	/* struct tc_mpls; action type and general TC fields. */
+	TCA_MPLS_PAD,
+	TCA_MPLS_PROTO,	/* be16; eth_type of pushed or next (for pop) header. */
+	TCA_MPLS_LABEL,	/* u32; MPLS label. Lower 20 bits are used. */
+	TCA_MPLS_TC,	/* u8; MPLS TC field. Lower 3 bits are used. */
+	TCA_MPLS_TTL,	/* u8; MPLS TTL field. Must not be 0. */
+	TCA_MPLS_BOS,	/* u8; MPLS BOS field. Either 1 or 0. */
+	__TCA_MPLS_MAX,
+};
+#define TCA_MPLS_MAX (__TCA_MPLS_MAX - 1)
+
+#endif
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 93443a01ab39..6f1e31f674a3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -59,6 +59,7 @@
 #include <linux/errqueue.h>
 #include <linux/prefetch.h>
 #include <linux/if_vlan.h>
+#include <linux/mpls.h>
 
 #include <net/protocol.h>
 #include <net/dst.h>
@@ -5564,6 +5565,35 @@ int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
 }
 EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
 
+/**
+ * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
+ *
+ * @skb: buffer
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_dec_ttl(struct sk_buff *skb)
+{
+	u32 lse;
+	u8 ttl;
+
+	if (unlikely(!eth_p_mpls(skb->protocol)))
+		return -EINVAL;
+
+	lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
+	ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
+	if (!--ttl)
+		return -EINVAL;
+
+	lse &= ~MPLS_LS_TTL_MASK;
+	lse |= ttl << MPLS_LS_TTL_SHIFT;
+
+	return skb_mpls_update_lse(skb, cpu_to_be32(lse));
+}
+EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
+
 /**
  * alloc_skb_with_frags - allocate skb with page frags
  *
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 360fdd3eaa77..731f5fbc2a3c 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -842,6 +842,17 @@ config NET_ACT_CSUM
 	  To compile this code as a module, choose M here: the
 	  module will be called act_csum.
 
+config NET_ACT_MPLS
+	tristate "MPLS manipulation"
+	depends on NET_CLS_ACT
+	help
+	  Say Y here to push or pop MPLS headers.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_mpls.
+
 config NET_ACT_VLAN
         tristate "Vlan manipulation"
         depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index d54bfcbd7981..c26603606c22 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_NET_ACT_PEDIT)	+= act_pedit.o
 obj-$(CONFIG_NET_ACT_SIMP)	+= act_simple.o
 obj-$(CONFIG_NET_ACT_SKBEDIT)	+= act_skbedit.o
 obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
+obj-$(CONFIG_NET_ACT_MPLS)	+= act_mpls.o
 obj-$(CONFIG_NET_ACT_VLAN)	+= act_vlan.o
 obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
 obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
new file mode 100644
index 000000000000..ca2597ce4ac9
--- /dev/null
+++ b/net/sched/act_mpls.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mpls.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/tc_act/tc_mpls.h>
+#include <net/mpls.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_mpls.h>
+
+static unsigned int mpls_net_id;
+static struct tc_action_ops act_mpls_ops;
+
+#define ACT_MPLS_TTL_DEFAULT	255
+
+static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse,
+			       struct tcf_mpls_params *p, bool set_bos)
+{
+	u32 new_lse = 0;
+
+	if (lse)
+		new_lse = be32_to_cpu(lse->label_stack_entry);
+
+	if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET) {
+		new_lse &= ~MPLS_LS_LABEL_MASK;
+		new_lse |= p->tcfm_label << MPLS_LS_LABEL_SHIFT;
+	}
+	if (p->tcfm_ttl) {
+		new_lse &= ~MPLS_LS_TTL_MASK;
+		new_lse |= p->tcfm_ttl << MPLS_LS_TTL_SHIFT;
+	}
+	if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET) {
+		new_lse &= ~MPLS_LS_TC_MASK;
+		new_lse |= p->tcfm_tc << MPLS_LS_TC_SHIFT;
+	}
+	if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET) {
+		new_lse &= ~MPLS_LS_S_MASK;
+		new_lse |= p->tcfm_bos << MPLS_LS_S_SHIFT;
+	} else if (set_bos) {
+		new_lse |= 1 << MPLS_LS_S_SHIFT;
+	}
+
+	return cpu_to_be32(new_lse);
+}
+
+static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
+			struct tcf_result *res)
+{
+	struct tcf_mpls *m = to_mpls(a);
+	struct tcf_mpls_params *p;
+	__be32 new_lse;
+	int ret;
+
+	tcf_lastuse_update(&m->tcf_tm);
+	bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
+
+	/* Ensure 'data' points at mac_header prior calling mpls manipulating
+	 * functions.
+	 */
+	if (skb_at_tc_ingress(skb))
+		skb_push_rcsum(skb, skb->mac_len);
+
+	ret = READ_ONCE(m->tcf_action);
+
+	p = rcu_dereference_bh(m->mpls_p);
+
+	switch (p->tcfm_action) {
+	case TCA_MPLS_ACT_POP:
+		if (skb_mpls_pop(skb, p->tcfm_proto))
+			goto drop;
+		break;
+	case TCA_MPLS_ACT_PUSH:
+		new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol));
+		if (skb_mpls_push(skb, new_lse, p->tcfm_proto))
+			goto drop;
+		break;
+	case TCA_MPLS_ACT_MODIFY:
+		new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false);
+		if (skb_mpls_update_lse(skb, new_lse))
+			goto drop;
+		break;
+	case TCA_MPLS_ACT_DEC_TTL:
+		if (skb_mpls_dec_ttl(skb))
+			goto drop;
+		break;
+	}
+
+	if (skb_at_tc_ingress(skb))
+		skb_pull_rcsum(skb, skb->mac_len);
+
+	return ret;
+
+drop:
+	qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats));
+	return TC_ACT_SHOT;
+}
+
+static int valid_label(const struct nlattr *attr,
+		       struct netlink_ext_ack *extack)
+{
+	const u32 *label = nla_data(attr);
+
+	if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) {
+		NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
+	[TCA_MPLS_UNSPEC]	= { .strict_start_type = TCA_MPLS_UNSPEC + 1 },
+	[TCA_MPLS_PARMS]	= NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)),
+	[TCA_MPLS_PROTO]	= { .type = NLA_U16 },
+	[TCA_MPLS_LABEL]	= NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label),
+	[TCA_MPLS_TC]		= NLA_POLICY_RANGE(NLA_U8, 0, 7),
+	[TCA_MPLS_TTL]		= NLA_POLICY_MIN(NLA_U8, 1),
+	[TCA_MPLS_BOS]		= NLA_POLICY_RANGE(NLA_U8, 0, 1),
+};
+
+static int tcf_mpls_init(struct net *net, struct nlattr *nla,
+			 struct nlattr *est, struct tc_action **a,
+			 int ovr, int bind, bool rtnl_held,
+			 struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, mpls_net_id);
+	struct nlattr *tb[TCA_MPLS_MAX + 1];
+	struct tcf_chain *goto_ch = NULL;
+	struct tcf_mpls_params *p;
+	struct tc_mpls *parm;
+	bool exists = false;
+	struct tcf_mpls *m;
+	int ret = 0, err;
+	u8 mpls_ttl = 0;
+
+	if (!nla) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing netlink attributes");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_MPLS_MAX, nla, mpls_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_MPLS_PARMS]) {
+		NL_SET_ERR_MSG_MOD(extack, "No MPLS params");
+		return -EINVAL;
+	}
+	parm = nla_data(tb[TCA_MPLS_PARMS]);
+
+	/* Verify parameters against action type. */
+	switch (parm->m_action) {
+	case TCA_MPLS_ACT_POP:
+		if (!tb[TCA_MPLS_PROTO]) {
+			NL_SET_ERR_MSG_MOD(extack, "Protocol must be set for MPLS pop");
+			return -EINVAL;
+		}
+		if (!eth_proto_is_802_3(nla_get_be16(tb[TCA_MPLS_PROTO]))) {
+			NL_SET_ERR_MSG_MOD(extack, "Invalid protocol type for MPLS pop");
+			return -EINVAL;
+		}
+		if (tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] ||
+		    tb[TCA_MPLS_BOS]) {
+			NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC or BOS cannot be used with MPLS pop");
+			return -EINVAL;
+		}
+		break;
+	case TCA_MPLS_ACT_DEC_TTL:
+		if (tb[TCA_MPLS_PROTO] || tb[TCA_MPLS_LABEL] ||
+		    tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) {
+			NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC, BOS or protocol cannot be used with MPLS dec_ttl");
+			return -EINVAL;
+		}
+		break;
+	case TCA_MPLS_ACT_PUSH:
+		if (!tb[TCA_MPLS_LABEL]) {
+			NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push");
+			return -EINVAL;
+		}
+		if (tb[TCA_MPLS_PROTO] &&
+		    !eth_p_mpls(nla_get_be16(tb[TCA_MPLS_PROTO]))) {
+			NL_SET_ERR_MSG_MOD(extack, "Protocol must be an MPLS type for MPLS push");
+			return -EPROTONOSUPPORT;
+		}
+		/* Push needs a TTL - if not specified, set a default value. */
+		if (!tb[TCA_MPLS_TTL]) {
+#if IS_ENABLED(CONFIG_MPLS)
+			mpls_ttl = net->mpls.default_ttl ?
+				   net->mpls.default_ttl : ACT_MPLS_TTL_DEFAULT;
+#else
+			mpls_ttl = ACT_MPLS_TTL_DEFAULT;
+#endif
+		}
+		break;
+	case TCA_MPLS_ACT_MODIFY:
+		if (tb[TCA_MPLS_PROTO]) {
+			NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be used with MPLS modify");
+			return -EINVAL;
+		}
+		break;
+	default:
+		NL_SET_ERR_MSG_MOD(extack, "Unknown MPLS action");
+		return -EINVAL;
+	}
+
+	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	if (err < 0)
+		return err;
+	exists = err;
+	if (exists && bind)
+		return 0;
+
+	if (!exists) {
+		ret = tcf_idr_create(tn, parm->index, est, a,
+				     &act_mpls_ops, bind, true);
+		if (ret) {
+			tcf_idr_cleanup(tn, parm->index);
+			return ret;
+		}
+
+		ret = ACT_P_CREATED;
+	} else if (!ovr) {
+		tcf_idr_release(*a, bind);
+		return -EEXIST;
+	}
+
+	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+	if (err < 0)
+		goto release_idr;
+
+	m = to_mpls(*a);
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p) {
+		err = -ENOMEM;
+		goto put_chain;
+	}
+
+	p->tcfm_action = parm->m_action;
+	p->tcfm_label = tb[TCA_MPLS_LABEL] ? nla_get_u32(tb[TCA_MPLS_LABEL]) :
+					     ACT_MPLS_LABEL_NOT_SET;
+	p->tcfm_tc = tb[TCA_MPLS_TC] ? nla_get_u8(tb[TCA_MPLS_TC]) :
+				       ACT_MPLS_TC_NOT_SET;
+	p->tcfm_ttl = tb[TCA_MPLS_TTL] ? nla_get_u8(tb[TCA_MPLS_TTL]) :
+					 mpls_ttl;
+	p->tcfm_bos = tb[TCA_MPLS_BOS] ? nla_get_u8(tb[TCA_MPLS_BOS]) :
+					 ACT_MPLS_BOS_NOT_SET;
+	p->tcfm_proto = tb[TCA_MPLS_PROTO] ? nla_get_be16(tb[TCA_MPLS_PROTO]) :
+					     htons(ETH_P_MPLS_UC);
+
+	spin_lock_bh(&m->tcf_lock);
+	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+	rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
+	spin_unlock_bh(&m->tcf_lock);
+
+	if (goto_ch)
+		tcf_chain_put_by_act(goto_ch);
+	if (p)
+		kfree_rcu(p, rcu);
+
+	if (ret == ACT_P_CREATED)
+		tcf_idr_insert(tn, *a);
+	return ret;
+put_chain:
+	if (goto_ch)
+		tcf_chain_put_by_act(goto_ch);
+release_idr:
+	tcf_idr_release(*a, bind);
+	return err;
+}
+
+static void tcf_mpls_cleanup(struct tc_action *a)
+{
+	struct tcf_mpls *m = to_mpls(a);
+	struct tcf_mpls_params *p;
+
+	p = rcu_dereference_protected(m->mpls_p, 1);
+	if (p)
+		kfree_rcu(p, rcu);
+}
+
+static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a,
+			 int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_mpls *m = to_mpls(a);
+	struct tcf_mpls_params *p;
+	struct tc_mpls opt = {
+		.index    = m->tcf_index,
+		.refcnt   = refcount_read(&m->tcf_refcnt) - ref,
+		.bindcnt  = atomic_read(&m->tcf_bindcnt) - bind,
+	};
+	struct tcf_t t;
+
+	spin_lock_bh(&m->tcf_lock);
+	opt.action = m->tcf_action;
+	p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock));
+	opt.m_action = p->tcfm_action;
+
+	if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET &&
+	    nla_put_u32(skb, TCA_MPLS_LABEL, p->tcfm_label))
+		goto nla_put_failure;
+
+	if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET &&
+	    nla_put_u8(skb, TCA_MPLS_TC, p->tcfm_tc))
+		goto nla_put_failure;
+
+	if (p->tcfm_ttl && nla_put_u8(skb, TCA_MPLS_TTL, p->tcfm_ttl))
+		goto nla_put_failure;
+
+	if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET &&
+	    nla_put_u8(skb, TCA_MPLS_BOS, p->tcfm_bos))
+		goto nla_put_failure;
+
+	if (nla_put_be16(skb, TCA_MPLS_PROTO, p->tcfm_proto))
+		goto nla_put_failure;
+
+	tcf_tm_dump(&t, &m->tcf_tm);
+
+	if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD))
+		goto nla_put_failure;
+
+	spin_unlock_bh(&m->tcf_lock);
+
+	return skb->len;
+
+nla_put_failure:
+	spin_unlock_bh(&m->tcf_lock);
+	nlmsg_trim(skb, b);
+	return -EMSGSIZE;
+}
+
+static int tcf_mpls_walker(struct net *net, struct sk_buff *skb,
+			   struct netlink_callback *cb, int type,
+			   const struct tc_action_ops *ops,
+			   struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_mpls_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+	return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_mpls_ops = {
+	.kind		=	"mpls",
+	.id		=	TCA_ID_MPLS,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_mpls_act,
+	.dump		=	tcf_mpls_dump,
+	.init		=	tcf_mpls_init,
+	.cleanup	=	tcf_mpls_cleanup,
+	.walk		=	tcf_mpls_walker,
+	.lookup		=	tcf_mpls_search,
+	.size		=	sizeof(struct tcf_mpls),
+};
+
+static __net_init int mpls_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+	return tc_action_net_init(tn, &act_mpls_ops);
+}
+
+static void __net_exit mpls_exit_net(struct list_head *net_list)
+{
+	tc_action_net_exit(net_list, mpls_net_id);
+}
+
+static struct pernet_operations mpls_net_ops = {
+	.init = mpls_init_net,
+	.exit_batch = mpls_exit_net,
+	.id   = &mpls_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+static int __init mpls_init_module(void)
+{
+	return tcf_register_action(&act_mpls_ops, &mpls_net_ops);
+}
+
+static void __exit mpls_cleanup_module(void)
+{
+	tcf_unregister_action(&act_mpls_ops, &mpls_net_ops);
+}
+
+module_init(mpls_init_module);
+module_exit(mpls_cleanup_module);
+
+MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPLS manipulation actions");
-- 
cgit v1.2.3


From a96701fb3534c45bd6fe5e6f6d3a91e3acc19b59 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 9 Jul 2019 00:57:04 +0800
Subject: sctp: remove reconf_enable from asoc

asoc's reconf support is actually decided by the 4-way handshake
negotiation, not something that users can set by sockopt.
asoc->peer.reconf_capable already serves this purpose. So remove
reconf_enable from asoc.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 3 +--
 net/sctp/associola.c       | 1 -
 net/sctp/sm_make_chunk.c   | 5 ++---
 net/sctp/socket.c          | 7 ++-----
 4 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 0767701ef362..d9e0e1a53f99 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -2051,8 +2051,7 @@ struct sctp_association {
 	     temp:1,		/* Is it a temporary association? */
 	     force_delay:1,
 	     intl_enable:1,
-	     prsctp_enable:1,
-	     reconf_enable:1;
+	     prsctp_enable:1;
 
 	__u8 strreset_enable;
 	__u8 strreset_outstanding; /* request param count on the fly */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 1999237ce481..321c199edacf 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -262,7 +262,6 @@ static struct sctp_association *sctp_association_init(
 
 	asoc->active_key_id = ep->active_key_id;
 	asoc->prsctp_enable = ep->prsctp_enable;
-	asoc->reconf_enable = ep->reconf_enable;
 	asoc->strreset_enable = ep->strreset_enable;
 
 	/* Save the hmacs and chunks list into this association */
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9b0e5b0d701a..d784dc176d70 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -261,7 +261,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 		num_ext += 2;
 	}
 
-	if (asoc->reconf_enable) {
+	if (asoc->ep->reconf_enable) {
 		extensions[num_ext] = SCTP_CID_RECONF;
 		num_ext += 1;
 	}
@@ -2007,8 +2007,7 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
 	for (i = 0; i < num_ext; i++) {
 		switch (param.ext->chunks[i]) {
 		case SCTP_CID_RECONF:
-			if (asoc->reconf_enable &&
-			    !asoc->peer.reconf_capable)
+			if (asoc->ep->reconf_enable)
 				asoc->peer.reconf_capable = 1;
 			break;
 		case SCTP_CID_FWD_TSN:
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f33aa9ee9e27..d8bcc4711d4a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4226,10 +4226,7 @@ static int sctp_setsockopt_reconfig_supported(struct sock *sk,
 	    sctp_style(sk, UDP))
 		goto out;
 
-	if (asoc)
-		asoc->reconf_enable = !!params.assoc_value;
-	else
-		sctp_sk(sk)->ep->reconf_enable = !!params.assoc_value;
+	sctp_sk(sk)->ep->reconf_enable = !!params.assoc_value;
 
 	retval = 0;
 
@@ -7536,7 +7533,7 @@ static int sctp_getsockopt_reconfig_supported(struct sock *sk, int len,
 		goto out;
 	}
 
-	params.assoc_value = asoc ? asoc->reconf_enable
+	params.assoc_value = asoc ? asoc->peer.reconf_capable
 				  : sctp_sk(sk)->ep->reconf_enable;
 
 	if (put_user(len, optlen))
-- 
cgit v1.2.3


From 1c13475368b697d4fc9c0630b5d4ee51d5ca0790 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 9 Jul 2019 00:57:05 +0800
Subject: sctp: remove prsctp_enable from asoc

Like reconf_enable, prsctp_enable should also be removed from asoc,
as asoc->peer.prsctp_capable has taken its job.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 3 +--
 net/sctp/associola.c       | 1 -
 net/sctp/sm_make_chunk.c   | 8 ++++----
 net/sctp/socket.c          | 2 +-
 4 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index d9e0e1a53f99..7f35b8ee9f65 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -2050,8 +2050,7 @@ struct sctp_association {
 	__u8 need_ecne:1,	/* Need to send an ECNE Chunk? */
 	     temp:1,		/* Is it a temporary association? */
 	     force_delay:1,
-	     intl_enable:1,
-	     prsctp_enable:1;
+	     intl_enable:1;
 
 	__u8 strreset_enable;
 	__u8 strreset_outstanding; /* request param count on the fly */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 321c199edacf..5010cce52c93 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -261,7 +261,6 @@ static struct sctp_association *sctp_association_init(
 		goto stream_free;
 
 	asoc->active_key_id = ep->active_key_id;
-	asoc->prsctp_enable = ep->prsctp_enable;
 	asoc->strreset_enable = ep->strreset_enable;
 
 	/* Save the hmacs and chunks list into this association */
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index d784dc176d70..227bbac5222f 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -247,7 +247,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 	chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types));
 	chunksize += sizeof(ecap_param);
 
-	if (asoc->prsctp_enable)
+	if (asoc->ep->prsctp_enable)
 		chunksize += sizeof(prsctp_param);
 
 	/* ADDIP: Section 4.2.7:
@@ -348,7 +348,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 		sctp_addto_param(retval, num_ext, extensions);
 	}
 
-	if (asoc->prsctp_enable)
+	if (asoc->ep->prsctp_enable)
 		sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
 
 	if (sp->adaptation_ind) {
@@ -2011,7 +2011,7 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
 				asoc->peer.reconf_capable = 1;
 			break;
 		case SCTP_CID_FWD_TSN:
-			if (asoc->prsctp_enable && !asoc->peer.prsctp_capable)
+			if (asoc->ep->prsctp_enable)
 				asoc->peer.prsctp_capable = 1;
 			break;
 		case SCTP_CID_AUTH:
@@ -2636,7 +2636,7 @@ do_addr_param:
 		break;
 
 	case SCTP_PARAM_FWD_TSN_SUPPORT:
-		if (asoc->prsctp_enable) {
+		if (asoc->ep->prsctp_enable) {
 			asoc->peer.prsctp_capable = 1;
 			break;
 		}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index d8bcc4711d4a..54ceece59ea5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7325,7 +7325,7 @@ static int sctp_getsockopt_pr_supported(struct sock *sk, int len,
 		goto out;
 	}
 
-	params.assoc_value = asoc ? asoc->prsctp_enable
+	params.assoc_value = asoc ? asoc->peer.prsctp_capable
 				  : sctp_sk(sk)->ep->prsctp_enable;
 
 	if (put_user(len, optlen))
-- 
cgit v1.2.3


From da1f6d4de7b743c86cb49015ea05b184fea1388c Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 9 Jul 2019 00:57:06 +0800
Subject: sctp: rename asoc intl_enable to asoc peer.intl_capable

To keep consistent with other asoc features, we move intl_enable
to peer.intl_capable in asoc.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h   | 33 +++++++++++++++++----------------
 net/sctp/sm_make_chunk.c     |  4 ++--
 net/sctp/socket.c            |  2 +-
 net/sctp/stream_interleave.c |  4 ++--
 net/sctp/stream_sched.c      |  2 +-
 5 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 7f35b8ee9f65..c41b57ba04bb 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1679,28 +1679,30 @@ struct sctp_association {
 		__be16 addip_disabled_mask;
 
 		/* These are capabilities which our peer advertised.  */
-		__u8	ecn_capable:1,      /* Can peer do ECN? */
+		__u16	ecn_capable:1,      /* Can peer do ECN? */
 			ipv4_address:1,     /* Peer understands IPv4 addresses? */
 			ipv6_address:1,     /* Peer understands IPv6 addresses? */
 			hostname_address:1, /* Peer understands DNS addresses? */
 			asconf_capable:1,   /* Does peer support ADDIP? */
 			prsctp_capable:1,   /* Can peer do PR-SCTP? */
 			reconf_capable:1,   /* Can peer do RE-CONFIG? */
-			auth_capable:1;     /* Is peer doing SCTP-AUTH? */
-
-		/* sack_needed : This flag indicates if the next received
-		 *             : packet is to be responded to with a
-		 *             : SACK. This is initialized to 0.  When a packet
-		 *             : is received sack_cnt is incremented. If this value
-		 *             : reaches 2 or more, a SACK is sent and the
-		 *             : value is reset to 0. Note: This is used only
-		 *             : when no DATA chunks are received out of
-		 *             : order.  When DATA chunks are out of order,
-		 *             : SACK's are not delayed (see Section 6).
-		 */
-		__u8    sack_needed:1,     /* Do we need to sack the peer? */
+			intl_capable:1,     /* Can peer do INTERLEAVE */
+			auth_capable:1,     /* Is peer doing SCTP-AUTH? */
+			/* sack_needed:
+			 *   This flag indicates if the next received
+			 *   packet is to be responded to with a
+			 *   SACK. This is initialized to 0.  When a packet
+			 *   is received sack_cnt is incremented. If this value
+			 *   reaches 2 or more, a SACK is sent and the
+			 *   value is reset to 0. Note: This is used only
+			 *   when no DATA chunks are received out of
+			 *   order.  When DATA chunks are out of order,
+			 *   SACK's are not delayed (see Section 6).
+			 */
+			sack_needed:1,     /* Do we need to sack the peer? */
 			sack_generation:1,
 			zero_window_announced:1;
+
 		__u32	sack_cnt;
 
 		__u32   adaptation_ind;	 /* Adaptation Code point. */
@@ -2049,8 +2051,7 @@ struct sctp_association {
 
 	__u8 need_ecne:1,	/* Need to send an ECNE Chunk? */
 	     temp:1,		/* Is it a temporary association? */
-	     force_delay:1,
-	     intl_enable:1;
+	     force_delay:1;
 
 	__u8 strreset_enable;
 	__u8 strreset_outstanding; /* request param count on the fly */
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 227bbac5222f..31ab2c605e06 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -438,7 +438,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
 	if (sp->adaptation_ind)
 		chunksize += sizeof(aiparam);
 
-	if (asoc->intl_enable) {
+	if (asoc->peer.intl_capable) {
 		extensions[num_ext] = SCTP_CID_I_DATA;
 		num_ext += 1;
 	}
@@ -2028,7 +2028,7 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
 			break;
 		case SCTP_CID_I_DATA:
 			if (sctp_sk(asoc->base.sk)->strm_interleave)
-				asoc->intl_enable = 1;
+				asoc->peer.intl_capable = 1;
 			break;
 		default:
 			break;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 54ceece59ea5..226661fe8c45 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7692,7 +7692,7 @@ static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len,
 		goto out;
 	}
 
-	params.assoc_value = asoc ? asoc->intl_enable
+	params.assoc_value = asoc ? asoc->peer.intl_capable
 				  : sctp_sk(sk)->strm_interleave;
 
 	if (put_user(len, optlen))
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index afbf1223d91c..40c40be23fcb 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -1358,6 +1358,6 @@ void sctp_stream_interleave_init(struct sctp_stream *stream)
 	struct sctp_association *asoc;
 
 	asoc = container_of(stream, struct sctp_association, stream);
-	stream->si = asoc->intl_enable ? &sctp_stream_interleave_1
-				       : &sctp_stream_interleave_0;
+	stream->si = asoc->peer.intl_capable ? &sctp_stream_interleave_1
+					     : &sctp_stream_interleave_0;
 }
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index b8fa7ab3e394..99e5f69fbb74 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -228,7 +228,7 @@ int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid,
 void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch)
 {
 	if (!list_is_last(&ch->frag_list, &ch->msg->chunks) &&
-	    !q->asoc->intl_enable) {
+	    !q->asoc->peer.intl_capable) {
 		struct sctp_stream_out *sout;
 		__u16 sid;
 
-- 
cgit v1.2.3


From e55f4b8bf4622103badac8694cdabceec06f9b38 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 9 Jul 2019 00:57:07 +0800
Subject: sctp: rename sp strm_interleave to ep intl_enable

Like other endpoint features, strm_interleave should be moved to
sctp_endpoint and renamed to intl_enable.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 2 +-
 net/sctp/sm_make_chunk.c   | 4 ++--
 net/sctp/socket.c          | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index c41b57ba04bb..ba5c4f6eede5 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -219,7 +219,6 @@ struct sctp_sock {
 		disable_fragments:1,
 		v4mapped:1,
 		frag_interleave:1,
-		strm_interleave:1,
 		recvrcvinfo:1,
 		recvnxtinfo:1,
 		data_ready_signalled:1;
@@ -1324,6 +1323,7 @@ struct sctp_endpoint {
 	struct list_head endpoint_shared_keys;
 	__u16 active_key_id;
 	__u8  auth_enable:1,
+	      intl_enable:1,
 	      prsctp_enable:1,
 	      reconf_enable:1;
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 31ab2c605e06..ed39396b9bba 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -269,7 +269,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 	if (sp->adaptation_ind)
 		chunksize += sizeof(aiparam);
 
-	if (sp->strm_interleave) {
+	if (asoc->ep->intl_enable) {
 		extensions[num_ext] = SCTP_CID_I_DATA;
 		num_ext += 1;
 	}
@@ -2027,7 +2027,7 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
 				asoc->peer.asconf_capable = 1;
 			break;
 		case SCTP_CID_I_DATA:
-			if (sctp_sk(asoc->base.sk)->strm_interleave)
+			if (asoc->ep->intl_enable)
 				asoc->peer.intl_capable = 1;
 			break;
 		default:
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 226661fe8c45..aa80cda36581 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1913,7 +1913,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 		if (err)
 			goto err;
 
-		if (sp->strm_interleave) {
+		if (asoc->ep->intl_enable) {
 			timeo = sock_sndtimeo(sk, 0);
 			err = sctp_wait_for_connect(asoc, &timeo);
 			if (err) {
@@ -3581,7 +3581,7 @@ static int sctp_setsockopt_fragment_interleave(struct sock *sk,
 	sctp_sk(sk)->frag_interleave = !!val;
 
 	if (!sctp_sk(sk)->frag_interleave)
-		sctp_sk(sk)->strm_interleave = 0;
+		sctp_sk(sk)->ep->intl_enable = 0;
 
 	return 0;
 }
@@ -4484,7 +4484,7 @@ static int sctp_setsockopt_interleaving_supported(struct sock *sk,
 		goto out;
 	}
 
-	sp->strm_interleave = !!params.assoc_value;
+	sp->ep->intl_enable = !!params.assoc_value;
 
 	retval = 0;
 
@@ -7693,7 +7693,7 @@ static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len,
 	}
 
 	params.assoc_value = asoc ? asoc->peer.intl_capable
-				  : sctp_sk(sk)->strm_interleave;
+				  : sctp_sk(sk)->ep->intl_enable;
 
 	if (put_user(len, optlen))
 		goto out;
-- 
cgit v1.2.3


From b5d9a834f4fd1b6abfa527ec351c871084dd23a3 Mon Sep 17 00:00:00 2001
From: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Date: Mon, 8 Jul 2019 19:53:13 -0700
Subject: net/tls: don't clear TX resync flag on error

Introduce a return code for the tls_dev_resync callback.

When the driver TX resync fails, the kernel can retry the resync
until it succeeds.  This prevents drivers from attempting to offload
TLS packets if the connection is known to be out of sync.

We don't worry about RX resyncs since they will be retried naturally
as more encrypted records get received.

Signed-off-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c |  8 +++++---
 drivers/net/ethernet/netronome/nfp/crypto/tls.c        | 13 +++++++++----
 include/net/tls.h                                      |  6 +++---
 net/tls/tls_device.c                                   |  8 ++++++--
 4 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
index f8b93b62a7d2..ca07c86427a7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
@@ -160,9 +160,9 @@ static void mlx5e_tls_del(struct net_device *netdev,
 				direction == TLS_OFFLOAD_CTX_DIR_TX);
 }
 
-static void mlx5e_tls_resync(struct net_device *netdev, struct sock *sk,
-			     u32 seq, u8 *rcd_sn_data,
-			     enum tls_offload_ctx_dir direction)
+static int mlx5e_tls_resync(struct net_device *netdev, struct sock *sk,
+			    u32 seq, u8 *rcd_sn_data,
+			    enum tls_offload_ctx_dir direction)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct mlx5e_priv *priv = netdev_priv(netdev);
@@ -177,6 +177,8 @@ static void mlx5e_tls_resync(struct net_device *netdev, struct sock *sk,
 		    be64_to_cpu(rcd_sn));
 	mlx5_accel_tls_resync_rx(priv->mdev, rx_ctx->handle, seq, rcd_sn);
 	atomic64_inc(&priv->tls->sw_stats.rx_tls_resync_reply);
+
+	return 0;
 }
 
 static const struct tlsdev_ops mlx5e_tls_ops = {
diff --git a/drivers/net/ethernet/netronome/nfp/crypto/tls.c b/drivers/net/ethernet/netronome/nfp/crypto/tls.c
index b49405b4af55..d448c6de8ea4 100644
--- a/drivers/net/ethernet/netronome/nfp/crypto/tls.c
+++ b/drivers/net/ethernet/netronome/nfp/crypto/tls.c
@@ -403,7 +403,7 @@ nfp_net_tls_del(struct net_device *netdev, struct tls_context *tls_ctx,
 	nfp_net_tls_del_fw(nn, ntls->fw_handle);
 }
 
-static void
+static int
 nfp_net_tls_resync(struct net_device *netdev, struct sock *sk, u32 seq,
 		   u8 *rcd_sn, enum tls_offload_ctx_dir direction)
 {
@@ -412,11 +412,12 @@ nfp_net_tls_resync(struct net_device *netdev, struct sock *sk, u32 seq,
 	struct nfp_crypto_req_update *req;
 	struct sk_buff *skb;
 	gfp_t flags;
+	int err;
 
 	flags = direction == TLS_OFFLOAD_CTX_DIR_TX ? GFP_KERNEL : GFP_ATOMIC;
 	skb = nfp_net_tls_alloc_simple(nn, sizeof(*req), flags);
 	if (!skb)
-		return;
+		return -ENOMEM;
 
 	ntls = tls_driver_ctx(sk, direction);
 	req = (void *)skb->data;
@@ -428,13 +429,17 @@ nfp_net_tls_resync(struct net_device *netdev, struct sock *sk, u32 seq,
 	memcpy(req->rec_no, rcd_sn, sizeof(req->rec_no));
 
 	if (direction == TLS_OFFLOAD_CTX_DIR_TX) {
-		nfp_net_tls_communicate_simple(nn, skb, "sync",
-					       NFP_CCM_TYPE_CRYPTO_UPDATE);
+		err = nfp_net_tls_communicate_simple(nn, skb, "sync",
+						     NFP_CCM_TYPE_CRYPTO_UPDATE);
+		if (err)
+			return err;
 		ntls->next_seq = seq;
 	} else {
 		nfp_ccm_mbox_post(nn, skb, NFP_CCM_TYPE_CRYPTO_UPDATE,
 				  sizeof(struct nfp_crypto_reply_simple));
 	}
+
+	return 0;
 }
 
 static const struct tlsdev_ops nfp_net_tls_ops = {
diff --git a/include/net/tls.h b/include/net/tls.h
index 176d0b039f32..584609174fe0 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -304,9 +304,9 @@ struct tlsdev_ops {
 	void (*tls_dev_del)(struct net_device *netdev,
 			    struct tls_context *ctx,
 			    enum tls_offload_ctx_dir direction);
-	void (*tls_dev_resync)(struct net_device *netdev,
-			       struct sock *sk, u32 seq, u8 *rcd_sn,
-			       enum tls_offload_ctx_dir direction);
+	int (*tls_dev_resync)(struct net_device *netdev,
+			      struct sock *sk, u32 seq, u8 *rcd_sn,
+			      enum tls_offload_ctx_dir direction);
 };
 
 enum tls_offload_sync_type {
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 92fd1352c037..77fa3b5f2b49 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -214,6 +214,7 @@ static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx,
 {
 	struct net_device *netdev;
 	struct sk_buff *skb;
+	int err = 0;
 	u8 *rcd_sn;
 
 	skb = tcp_write_queue_tail(sk);
@@ -225,9 +226,12 @@ static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx,
 	down_read(&device_offload_lock);
 	netdev = tls_ctx->netdev;
 	if (netdev)
-		netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn,
-						   TLS_OFFLOAD_CTX_DIR_TX);
+		err = netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq,
+							 rcd_sn,
+							 TLS_OFFLOAD_CTX_DIR_TX);
 	up_read(&device_offload_lock);
+	if (err)
+		return;
 
 	clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags);
 }
-- 
cgit v1.2.3


From 378ef01b5f75e6c485b8f16b4f6a7842a312aa07 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Mon, 8 Jul 2019 23:17:35 -0500
Subject: devlink: Refactor physical port attributes

To support additional devlink port flavours and to support few common
and few different port attributes, move physical port attributes to a
different structure.

Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 13 ++++++++++--
 net/core/devlink.c    | 58 +++++++++++++++++++++++++++++++++------------------
 2 files changed, 49 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 6625ea068d5e..4538c80fe293 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -38,14 +38,23 @@ struct devlink {
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
+struct devlink_port_phys_attrs {
+	u32 port_number; /* Same value as "split group".
+			  * A physical port which is visible to the user
+			  * for a given port flavour.
+			  */
+	u32 split_subport_number;
+};
+
 struct devlink_port_attrs {
 	u8 set:1,
 	   split:1,
 	   switch_port:1;
 	enum devlink_port_flavour flavour;
-	u32 port_number; /* same value as "split group" */
-	u32 split_subport_number;
 	struct netdev_phys_item_id switch_id;
+	union {
+		struct devlink_port_phys_attrs phys;
+	};
 };
 
 struct devlink_port {
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 89c533778135..eacaf37b5108 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -515,14 +515,16 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
 		return 0;
 	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
 		return -EMSGSIZE;
-	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
+	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER,
+			attrs->phys.port_number))
 		return -EMSGSIZE;
 	if (!attrs->split)
 		return 0;
-	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number))
+	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+			attrs->phys.port_number))
 		return -EMSGSIZE;
 	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
-			attrs->split_subport_number))
+			attrs->phys.split_subport_number))
 		return -EMSGSIZE;
 	return 0;
 }
@@ -5738,6 +5740,29 @@ void devlink_port_type_clear(struct devlink_port *devlink_port)
 }
 EXPORT_SYMBOL_GPL(devlink_port_type_clear);
 
+static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
+				    enum devlink_port_flavour flavour,
+				    const unsigned char *switch_id,
+				    unsigned char switch_id_len)
+{
+	struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+	if (WARN_ON(devlink_port->registered))
+		return -EEXIST;
+	attrs->set = true;
+	attrs->flavour = flavour;
+	if (switch_id) {
+		attrs->switch_port = true;
+		if (WARN_ON(switch_id_len > MAX_PHYS_ITEM_ID_LEN))
+			switch_id_len = MAX_PHYS_ITEM_ID_LEN;
+		memcpy(attrs->switch_id.id, switch_id, switch_id_len);
+		attrs->switch_id.id_len = switch_id_len;
+	} else {
+		attrs->switch_port = false;
+	}
+	return 0;
+}
+
 /**
  *	devlink_port_attrs_set - Set port attributes
  *
@@ -5760,23 +5785,15 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
 			    unsigned char switch_id_len)
 {
 	struct devlink_port_attrs *attrs = &devlink_port->attrs;
+	int ret;
 
-	if (WARN_ON(devlink_port->registered))
+	ret = __devlink_port_attrs_set(devlink_port, flavour,
+				       switch_id, switch_id_len);
+	if (ret)
 		return;
-	attrs->set = true;
-	attrs->flavour = flavour;
-	attrs->port_number = port_number;
 	attrs->split = split;
-	attrs->split_subport_number = split_subport_number;
-	if (switch_id) {
-		attrs->switch_port = true;
-		if (WARN_ON(switch_id_len > MAX_PHYS_ITEM_ID_LEN))
-			switch_id_len = MAX_PHYS_ITEM_ID_LEN;
-		memcpy(attrs->switch_id.id, switch_id, switch_id_len);
-		attrs->switch_id.id_len = switch_id_len;
-	} else {
-		attrs->switch_port = false;
-	}
+	attrs->phys.port_number = port_number;
+	attrs->phys.split_subport_number = split_subport_number;
 }
 EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
 
@@ -5792,10 +5809,11 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
 	switch (attrs->flavour) {
 	case DEVLINK_PORT_FLAVOUR_PHYSICAL:
 		if (!attrs->split)
-			n = snprintf(name, len, "p%u", attrs->port_number);
+			n = snprintf(name, len, "p%u", attrs->phys.port_number);
 		else
-			n = snprintf(name, len, "p%us%u", attrs->port_number,
-				     attrs->split_subport_number);
+			n = snprintf(name, len, "p%us%u",
+				     attrs->phys.port_number,
+				     attrs->phys.split_subport_number);
 		break;
 	case DEVLINK_PORT_FLAVOUR_CPU:
 	case DEVLINK_PORT_FLAVOUR_DSA:
-- 
cgit v1.2.3


From 98fd2d6563fe4a799934a2a74d632601cd089beb Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Mon, 8 Jul 2019 23:17:37 -0500
Subject: devlink: Introduce PCI PF port flavour and port attribute

In an eswitch, PCI PF may have port which is normally represented
using a representor netdevice.
To have better visibility of eswitch port, its association with
PF and a representor netdevice, introduce a PCI PF port
flavour and port attribute.

When devlink port flavour is PCI PF, fill up PCI PF attributes of the
port.

Extend port name creation using PCI PF number on best effort basis.
So that vendor drivers can skip defining their own scheme.

$ devlink port show
pci/0000:05:00.0/0: type eth netdev eth0 flavour pcipf pfnum 0

Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  8 ++++++++
 include/uapi/linux/devlink.h |  5 +++++
 net/core/devlink.c           | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4538c80fe293..97cef896e4d0 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -46,6 +46,10 @@ struct devlink_port_phys_attrs {
 	u32 split_subport_number;
 };
 
+struct devlink_port_pci_pf_attrs {
+	u16 pf;	/* Associated PCI PF for this port. */
+};
+
 struct devlink_port_attrs {
 	u8 set:1,
 	   split:1,
@@ -54,6 +58,7 @@ struct devlink_port_attrs {
 	struct netdev_phys_item_id switch_id;
 	union {
 		struct devlink_port_phys_attrs phys;
+		struct devlink_port_pci_pf_attrs pci_pf;
 	};
 };
 
@@ -599,6 +604,9 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
 			    u32 split_subport_number,
 			    const unsigned char *switch_id,
 			    unsigned char switch_id_len);
+void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port,
+				   const unsigned char *switch_id,
+				   unsigned char switch_id_len, u16 pf);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 5287b42c181f..f7323884c3fe 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -169,6 +169,10 @@ enum devlink_port_flavour {
 	DEVLINK_PORT_FLAVOUR_DSA, /* Distributed switch architecture
 				   * interconnect port.
 				   */
+	DEVLINK_PORT_FLAVOUR_PCI_PF, /* Represents eswitch port for
+				      * the PCI PF. It is an internal
+				      * port that faces the PCI PF.
+				      */
 };
 
 enum devlink_param_cmode {
@@ -337,6 +341,7 @@ enum devlink_attr {
 	DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,	/* u64 */
 	DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,	/* u64 */
 
+	DEVLINK_ATTR_PORT_PCI_PF_NUMBER,	/* u16 */
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index a9c4e5d8a99c..d362652a5cc7 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -515,6 +515,11 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
 		return 0;
 	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
 		return -EMSGSIZE;
+	if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_PF) {
+		if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
+				attrs->pci_pf.pf))
+			return -EMSGSIZE;
+	}
 	if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PHYSICAL &&
 	    devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
 	    devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA)
@@ -5801,6 +5806,32 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
 }
 EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
 
+/**
+ *	devlink_port_attrs_pci_pf_set - Set PCI PF port attributes
+ *
+ *	@devlink_port: devlink port
+ *	@pf: associated PF for the devlink port instance
+ *	@switch_id: if the port is part of switch, this is buffer with ID,
+ *	            otherwise this is NULL
+ *	@switch_id_len: length of the switch_id buffer
+ */
+void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port,
+				   const unsigned char *switch_id,
+				   unsigned char switch_id_len, u16 pf)
+{
+	struct devlink_port_attrs *attrs = &devlink_port->attrs;
+	int ret;
+
+	ret = __devlink_port_attrs_set(devlink_port,
+				       DEVLINK_PORT_FLAVOUR_PCI_PF,
+				       switch_id, switch_id_len);
+	if (ret)
+		return;
+
+	attrs->pci_pf.pf = pf;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
+
 static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
 					     char *name, size_t len)
 {
@@ -5826,6 +5857,9 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
 		 */
 		WARN_ON(1);
 		return -EINVAL;
+	case DEVLINK_PORT_FLAVOUR_PCI_PF:
+		n = snprintf(name, len, "pf%u", attrs->pci_pf.pf);
+		break;
 	}
 
 	if (n >= len)
-- 
cgit v1.2.3


From e41b6bf3cdd474dc9c587cb55906b0256835bf6d Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Mon, 8 Jul 2019 23:17:38 -0500
Subject: devlink: Introduce PCI VF port flavour and port attribute

In an eswitch, PCI VF may have port which is normally represented using
a representor netdevice.
To have better visibility of eswitch port, its association with VF,
and its representor netdevice, introduce a PCI VF port flavour.

When devlink port flavour is PCI VF, fill up PCI VF attributes of
the port.

Extend port name creation using PCI PF and VF number scheme on best
effort basis, so that vendor drivers can skip defining their own scheme.

$ devlink port show
pci/0000:05:00.0/0: type eth netdev eth0 flavour pcipf pfnum 0
pci/0000:05:00.0/1: type eth netdev eth1 flavour pcivf pfnum 0 vfnum 0
pci/0000:05:00.0/2: type eth netdev eth2 flavour pcivf pfnum 0 vfnum 1

Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        | 10 ++++++++++
 include/uapi/linux/devlink.h |  6 ++++++
 net/core/devlink.c           | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 97cef896e4d0..bc36f942a7d5 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -50,6 +50,11 @@ struct devlink_port_pci_pf_attrs {
 	u16 pf;	/* Associated PCI PF for this port. */
 };
 
+struct devlink_port_pci_vf_attrs {
+	u16 pf;	/* Associated PCI PF for this port. */
+	u16 vf;	/* Associated PCI VF of the PCI PF for this port. */
+};
+
 struct devlink_port_attrs {
 	u8 set:1,
 	   split:1,
@@ -59,6 +64,7 @@ struct devlink_port_attrs {
 	union {
 		struct devlink_port_phys_attrs phys;
 		struct devlink_port_pci_pf_attrs pci_pf;
+		struct devlink_port_pci_vf_attrs pci_vf;
 	};
 };
 
@@ -607,6 +613,10 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
 void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port,
 				   const unsigned char *switch_id,
 				   unsigned char switch_id_len, u16 pf);
+void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port,
+				   const unsigned char *switch_id,
+				   unsigned char switch_id_len,
+				   u16 pf, u16 vf);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index f7323884c3fe..ffc993256527 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -173,6 +173,10 @@ enum devlink_port_flavour {
 				      * the PCI PF. It is an internal
 				      * port that faces the PCI PF.
 				      */
+	DEVLINK_PORT_FLAVOUR_PCI_VF, /* Represents eswitch port
+				      * for the PCI VF. It is an internal
+				      * port that faces the PCI VF.
+				      */
 };
 
 enum devlink_param_cmode {
@@ -342,6 +346,8 @@ enum devlink_attr {
 	DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,	/* u64 */
 
 	DEVLINK_ATTR_PORT_PCI_PF_NUMBER,	/* u16 */
+	DEVLINK_ATTR_PORT_PCI_VF_NUMBER,	/* u16 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index d362652a5cc7..4f40aeace902 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -519,6 +519,12 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
 		if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
 				attrs->pci_pf.pf))
 			return -EMSGSIZE;
+	} else if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_VF) {
+		if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
+				attrs->pci_vf.pf) ||
+		    nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER,
+				attrs->pci_vf.vf))
+			return -EMSGSIZE;
 	}
 	if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PHYSICAL &&
 	    devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
@@ -5832,6 +5838,34 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port,
 }
 EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
 
+/**
+ *	devlink_port_attrs_pci_vf_set - Set PCI VF port attributes
+ *
+ *	@devlink_port: devlink port
+ *	@pf: associated PF for the devlink port instance
+ *	@vf: associated VF of a PF for the devlink port instance
+ *	@switch_id: if the port is part of switch, this is buffer with ID,
+ *	            otherwise this is NULL
+ *	@switch_id_len: length of the switch_id buffer
+ */
+void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port,
+				   const unsigned char *switch_id,
+				   unsigned char switch_id_len,
+				   u16 pf, u16 vf)
+{
+	struct devlink_port_attrs *attrs = &devlink_port->attrs;
+	int ret;
+
+	ret = __devlink_port_attrs_set(devlink_port,
+				       DEVLINK_PORT_FLAVOUR_PCI_VF,
+				       switch_id, switch_id_len);
+	if (ret)
+		return;
+	attrs->pci_vf.pf = pf;
+	attrs->pci_vf.vf = vf;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);
+
 static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
 					     char *name, size_t len)
 {
@@ -5860,6 +5894,10 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
 	case DEVLINK_PORT_FLAVOUR_PCI_PF:
 		n = snprintf(name, len, "pf%u", attrs->pci_pf.pf);
 		break;
+	case DEVLINK_PORT_FLAVOUR_PCI_VF:
+		n = snprintf(name, len, "pf%uvf%u",
+			     attrs->pci_vf.pf, attrs->pci_vf.vf);
+		break;
 	}
 
 	if (n >= len)
-- 
cgit v1.2.3


From b57dc7c13ea90e09ae15f821d2583fa0231b4935 Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Tue, 9 Jul 2019 10:30:48 +0300
Subject: net/sched: Introduce action ct

Allow sending a packet to conntrack module for connection tracking.

The packet will be marked with conntrack connection's state, and
any metadata such as conntrack mark and label. This state metadata
can later be matched against with tc classifiers, for example with the
flower classifier as below.

In addition to committing new connections the user can optionally
specify a zone to track within, set a mark/label and configure nat
with an address range and port range.

Usage is as follows:
$ tc qdisc add dev ens1f0_0 ingress
$ tc qdisc add dev ens1f0_1 ingress

$ tc filter add dev ens1f0_0 ingress \
  prio 1 chain 0 proto ip \
  flower ip_proto tcp ct_state -trk \
  action ct zone 2 pipe \
  action goto chain 2
$ tc filter add dev ens1f0_0 ingress \
  prio 1 chain 2 proto ip \
  flower ct_state +trk+new \
  action ct zone 2 commit mark 0xbb nat src addr 5.5.5.7 pipe \
  action mirred egress redirect dev ens1f0_1
$ tc filter add dev ens1f0_0 ingress \
  prio 1 chain 2 proto ip \
  flower ct_zone 2 ct_mark 0xbb ct_state +trk+est \
  action ct nat pipe \
  action mirred egress redirect dev ens1f0_1

$ tc filter add dev ens1f0_1 ingress \
  prio 1 chain 0 proto ip \
  flower ip_proto tcp ct_state -trk \
  action ct zone 2 pipe \
  action goto chain 1
$ tc filter add dev ens1f0_1 ingress \
  prio 1 chain 1 proto ip \
  flower ct_zone 2 ct_mark 0xbb ct_state +trk+est \
  action ct nat pipe \
  action mirred egress redirect dev ens1f0_0

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Yossi Kuperman <yossiku@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>

Changelog:
V5->V6:
	Added CONFIG_NF_DEFRAG_IPV6 in handle fragments ipv6 case
V4->V5:
	Reordered nf_conntrack_put() in tcf_ct_skb_nfct_cached()
V3->V4:
	Added strict_start_type for act_ct policy
V2->V3:
	Fixed David's comments: Removed extra newline after rcu in tcf_ct_params, and indent of break in act_ct.c
V1->V2:
	Fixed parsing of ranges TCA_CT_NAT_IPV6_MAX as 'else' case overwritten ipv4 max
	Refactored NAT_PORT_MIN_MAX range handling as well
	Added ipv4/ipv6 defragmentation
	Removed extra skb pull push of nw offset in execute nat
	Refactored tcf_ct_skb_network_trim after pull
	Removed TCA_ACT_CT define

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h        |   5 +
 include/net/tc_act/tc_ct.h        |  63 +++
 include/uapi/linux/pkt_cls.h      |   1 +
 include/uapi/linux/tc_act/tc_ct.h |  41 ++
 net/sched/Kconfig                 |  11 +
 net/sched/Makefile                |   1 +
 net/sched/act_ct.c                | 984 ++++++++++++++++++++++++++++++++++++++
 net/sched/cls_api.c               |   5 +
 8 files changed, 1111 insertions(+)
 create mode 100644 include/net/tc_act/tc_ct.h
 create mode 100644 include/uapi/linux/tc_act/tc_ct.h
 create mode 100644 net/sched/act_ct.c

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 36127c1858a4..a09e256d2b27 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -129,6 +129,7 @@ enum flow_action_id {
 	FLOW_ACTION_QUEUE,
 	FLOW_ACTION_SAMPLE,
 	FLOW_ACTION_POLICE,
+	FLOW_ACTION_CT,
 };
 
 /* This is mirroring enum pedit_header_type definition for easy mapping between
@@ -178,6 +179,10 @@ struct flow_action_entry {
 			s64			burst;
 			u64			rate_bytes_ps;
 		} police;
+		struct {				/* FLOW_ACTION_CT */
+			int action;
+			u16 zone;
+		} ct;
 	};
 };
 
diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
new file mode 100644
index 000000000000..bdc20ab3b88d
--- /dev/null
+++ b/include/net/tc_act/tc_ct.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_TC_CT_H
+#define __NET_TC_CT_H
+
+#include <net/act_api.h>
+#include <uapi/linux/tc_act/tc_ct.h>
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+
+struct tcf_ct_params {
+	struct nf_conn *tmpl;
+	u16 zone;
+
+	u32 mark;
+	u32 mark_mask;
+
+	u32 labels[NF_CT_LABELS_MAX_SIZE / sizeof(u32)];
+	u32 labels_mask[NF_CT_LABELS_MAX_SIZE / sizeof(u32)];
+
+	struct nf_nat_range2 range;
+	bool ipv4_range;
+
+	u16 ct_action;
+
+	struct rcu_head rcu;
+};
+
+struct tcf_ct {
+	struct tc_action common;
+	struct tcf_ct_params __rcu *params;
+};
+
+#define to_ct(a) ((struct tcf_ct *)a)
+#define to_ct_params(a) ((struct tcf_ct_params *) \
+			 rtnl_dereference((to_ct(a)->params)))
+
+static inline uint16_t tcf_ct_zone(const struct tc_action *a)
+{
+	return to_ct_params(a)->zone;
+}
+
+static inline int tcf_ct_action(const struct tc_action *a)
+{
+	return to_ct_params(a)->ct_action;
+}
+
+#else
+static inline uint16_t tcf_ct_zone(const struct tc_action *a) { return 0; }
+static inline int tcf_ct_action(const struct tc_action *a) { return 0; }
+#endif /* CONFIG_NF_CONNTRACK */
+
+static inline bool is_tcf_ct(const struct tc_action *a)
+{
+#if defined(CONFIG_NET_CLS_ACT) && IS_ENABLED(CONFIG_NF_CONNTRACK)
+	if (a->ops && a->ops->id == TCA_ID_CT)
+		return true;
+#endif
+	return false;
+}
+
+#endif /* __NET_TC_CT_H */
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index e22ef4a940bc..31db5589b7ca 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -106,6 +106,7 @@ enum tca_id {
 	TCA_ID_SAMPLE = TCA_ACT_SAMPLE,
 	TCA_ID_CTINFO,
 	TCA_ID_MPLS,
+	TCA_ID_CT,
 	/* other actions go here */
 	__TCA_ID_MAX = 255
 };
diff --git a/include/uapi/linux/tc_act/tc_ct.h b/include/uapi/linux/tc_act/tc_ct.h
new file mode 100644
index 000000000000..5fb1d7ac1027
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_ct.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __UAPI_TC_CT_H
+#define __UAPI_TC_CT_H
+
+#include <linux/types.h>
+#include <linux/pkt_cls.h>
+
+enum {
+	TCA_CT_UNSPEC,
+	TCA_CT_PARMS,
+	TCA_CT_TM,
+	TCA_CT_ACTION,		/* u16 */
+	TCA_CT_ZONE,		/* u16 */
+	TCA_CT_MARK,		/* u32 */
+	TCA_CT_MARK_MASK,	/* u32 */
+	TCA_CT_LABELS,		/* u128 */
+	TCA_CT_LABELS_MASK,	/* u128 */
+	TCA_CT_NAT_IPV4_MIN,	/* be32 */
+	TCA_CT_NAT_IPV4_MAX,	/* be32 */
+	TCA_CT_NAT_IPV6_MIN,	/* struct in6_addr */
+	TCA_CT_NAT_IPV6_MAX,	/* struct in6_addr */
+	TCA_CT_NAT_PORT_MIN,	/* be16 */
+	TCA_CT_NAT_PORT_MAX,	/* be16 */
+	TCA_CT_PAD,
+	__TCA_CT_MAX
+};
+
+#define TCA_CT_MAX (__TCA_CT_MAX - 1)
+
+#define TCA_CT_ACT_COMMIT	(1 << 0)
+#define TCA_CT_ACT_FORCE	(1 << 1)
+#define TCA_CT_ACT_CLEAR	(1 << 2)
+#define TCA_CT_ACT_NAT		(1 << 3)
+#define TCA_CT_ACT_NAT_SRC	(1 << 4)
+#define TCA_CT_ACT_NAT_DST	(1 << 5)
+
+struct tc_ct {
+	tc_gen;
+};
+
+#endif /* __UAPI_TC_CT_H */
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 731f5fbc2a3c..dd55b9ac3a66 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -940,6 +940,17 @@ config NET_ACT_TUNNEL_KEY
 	  To compile this code as a module, choose M here: the
 	  module will be called act_tunnel_key.
 
+config NET_ACT_CT
+        tristate "connection tracking tc action"
+        depends on NET_CLS_ACT && NF_CONNTRACK
+        help
+	  Say Y here to allow sending the packets to conntrack module.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_ct.
+
 config NET_IFE_SKBMARK
         tristate "Support to encoding decoding skb mark on IFE action"
         depends on NET_ACT_IFE
diff --git a/net/sched/Makefile b/net/sched/Makefile
index c26603606c22..415d1e1f237e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_NET_IFE_SKBMARK)	+= act_meta_mark.o
 obj-$(CONFIG_NET_IFE_SKBPRIO)	+= act_meta_skbprio.o
 obj-$(CONFIG_NET_IFE_SKBTCINDEX)	+= act_meta_skbtcindex.o
 obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
+obj-$(CONFIG_NET_ACT_CT)	+= act_ct.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
new file mode 100644
index 000000000000..b501ce0cf116
--- /dev/null
+++ b/net/sched/act_ct.c
@@ -0,0 +1,984 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* -
+ * net/sched/act_ct.c  Connection Tracking action
+ *
+ * Authors:   Paul Blakey <paulb@mellanox.com>
+ *            Yossi Kuperman <yossiku@mellanox.com>
+ *            Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/act_api.h>
+#include <net/ip.h>
+#include <net/ipv6_frag.h>
+#include <uapi/linux/tc_act/tc_ct.h>
+#include <net/tc_act/tc_ct.h>
+
+#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+static struct tc_action_ops act_ct_ops;
+static unsigned int ct_net_id;
+
+struct tc_ct_action_net {
+	struct tc_action_net tn; /* Must be first */
+	bool labels;
+};
+
+/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
+static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
+				   u16 zone_id, bool force)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return false;
+	if (!net_eq(net, read_pnet(&ct->ct_net)))
+		return false;
+	if (nf_ct_zone(ct)->id != zone_id)
+		return false;
+
+	/* Force conntrack entry direction. */
+	if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+		if (nf_ct_is_confirmed(ct))
+			nf_ct_kill(ct);
+
+		nf_conntrack_put(&ct->ct_general);
+		nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+
+		return false;
+	}
+
+	return true;
+}
+
+/* Trim the skb to the length specified by the IP/IPv6 header,
+ * removing any trailing lower-layer padding. This prepares the skb
+ * for higher-layer processing that assumes skb->len excludes padding
+ * (such as nf_ip_checksum). The caller needs to pull the skb to the
+ * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
+ */
+static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family)
+{
+	unsigned int len;
+	int err;
+
+	switch (family) {
+	case NFPROTO_IPV4:
+		len = ntohs(ip_hdr(skb)->tot_len);
+		break;
+	case NFPROTO_IPV6:
+		len = sizeof(struct ipv6hdr)
+			+ ntohs(ipv6_hdr(skb)->payload_len);
+		break;
+	default:
+		len = skb->len;
+	}
+
+	err = pskb_trim_rcsum(skb, len);
+
+	return err;
+}
+
+static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
+{
+	u8 family = NFPROTO_UNSPEC;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		family = NFPROTO_IPV4;
+		break;
+	case htons(ETH_P_IPV6):
+		family = NFPROTO_IPV6;
+		break;
+	default:
+		break;
+	}
+
+	return family;
+}
+
+static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag)
+{
+	unsigned int len;
+
+	len =  skb_network_offset(skb) + sizeof(struct iphdr);
+	if (unlikely(skb->len < len))
+		return -EINVAL;
+	if (unlikely(!pskb_may_pull(skb, len)))
+		return -ENOMEM;
+
+	*frag = ip_is_fragment(ip_hdr(skb));
+	return 0;
+}
+
+static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag)
+{
+	unsigned int flags = 0, len, payload_ofs = 0;
+	unsigned short frag_off;
+	int nexthdr;
+
+	len =  skb_network_offset(skb) + sizeof(struct ipv6hdr);
+	if (unlikely(skb->len < len))
+		return -EINVAL;
+	if (unlikely(!pskb_may_pull(skb, len)))
+		return -ENOMEM;
+
+	nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
+	if (unlikely(nexthdr < 0))
+		return -EPROTO;
+
+	*frag = flags & IP6_FH_F_FRAG;
+	return 0;
+}
+
+static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
+				   u8 family, u16 zone)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	int err = 0;
+	bool frag;
+
+	/* Previously seen (loopback)? Ignore. */
+	ct = nf_ct_get(skb, &ctinfo);
+	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
+		return 0;
+
+	if (family == NFPROTO_IPV4)
+		err = tcf_ct_ipv4_is_fragment(skb, &frag);
+	else
+		err = tcf_ct_ipv6_is_fragment(skb, &frag);
+	if (err || !frag)
+		return err;
+
+	skb_get(skb);
+
+	if (family == NFPROTO_IPV4) {
+		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+
+		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+		local_bh_disable();
+		err = ip_defrag(net, skb, user);
+		local_bh_enable();
+		if (err && err != -EINPROGRESS)
+			goto out_free;
+	} else { /* NFPROTO_IPV6 */
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
+
+		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+		err = nf_ct_frag6_gather(net, skb, user);
+		if (err && err != -EINPROGRESS)
+			goto out_free;
+#else
+		err = -EOPNOTSUPP;
+		goto out_free;
+#endif
+	}
+
+	skb_clear_hash(skb);
+	skb->ignore_df = 1;
+	return err;
+
+out_free:
+	kfree_skb(skb);
+	return err;
+}
+
+static void tcf_ct_params_free(struct rcu_head *head)
+{
+	struct tcf_ct_params *params = container_of(head,
+						    struct tcf_ct_params, rcu);
+
+	if (params->tmpl)
+		nf_conntrack_put(&params->tmpl->ct_general);
+	kfree(params);
+}
+
+#if IS_ENABLED(CONFIG_NF_NAT)
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+			  enum ip_conntrack_info ctinfo,
+			  const struct nf_nat_range2 *range,
+			  enum nf_nat_manip_type maniptype)
+{
+	int hooknum, err = NF_ACCEPT;
+
+	/* See HOOK2MANIP(). */
+	if (maniptype == NF_NAT_MANIP_SRC)
+		hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+	else
+		hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED_REPLY:
+		if (skb->protocol == htons(ETH_P_IP) &&
+		    ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   hooknum))
+				err = NF_DROP;
+			goto out;
+		} else if (IS_ENABLED(CONFIG_IPV6) &&
+			   skb->protocol == htons(ETH_P_IPV6)) {
+			__be16 frag_off;
+			u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+			int hdrlen = ipv6_skip_exthdr(skb,
+						      sizeof(struct ipv6hdr),
+						      &nexthdr, &frag_off);
+
+			if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+				if (!nf_nat_icmpv6_reply_translation(skb, ct,
+								     ctinfo,
+								     hooknum,
+								     hdrlen))
+					err = NF_DROP;
+				goto out;
+			}
+		}
+		/* Non-ICMP, fall thru to initialize if needed. */
+		/* fall through */
+	case IP_CT_NEW:
+		/* Seen it before?  This can happen for loopback, retrans,
+		 * or local packets.
+		 */
+		if (!nf_nat_initialized(ct, maniptype)) {
+			/* Initialize according to the NAT action. */
+			err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+				/* Action is set up to establish a new
+				 * mapping.
+				 */
+				? nf_nat_setup_info(ct, range, maniptype)
+				: nf_nat_alloc_null_binding(ct, hooknum);
+			if (err != NF_ACCEPT)
+				goto out;
+		}
+		break;
+
+	case IP_CT_ESTABLISHED:
+	case IP_CT_ESTABLISHED_REPLY:
+		break;
+
+	default:
+		err = NF_DROP;
+		goto out;
+	}
+
+	err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+out:
+	return err;
+}
+#endif /* CONFIG_NF_NAT */
+
+static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+	u32 new_mark;
+
+	if (!mask)
+		return;
+
+	new_mark = mark | (ct->mark & ~(mask));
+	if (ct->mark != new_mark) {
+		ct->mark = new_mark;
+		if (nf_ct_is_confirmed(ct))
+			nf_conntrack_event_cache(IPCT_MARK, ct);
+	}
+#endif
+}
+
+static void tcf_ct_act_set_labels(struct nf_conn *ct,
+				  u32 *labels,
+				  u32 *labels_m)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
+	size_t labels_sz = FIELD_SIZEOF(struct tcf_ct_params, labels);
+
+	if (!memchr_inv(labels_m, 0, labels_sz))
+		return;
+
+	nf_connlabels_replace(ct, labels, labels_m, 4);
+#endif
+}
+
+static int tcf_ct_act_nat(struct sk_buff *skb,
+			  struct nf_conn *ct,
+			  enum ip_conntrack_info ctinfo,
+			  int ct_action,
+			  struct nf_nat_range2 *range,
+			  bool commit)
+{
+#if IS_ENABLED(CONFIG_NF_NAT)
+	enum nf_nat_manip_type maniptype;
+
+	if (!(ct_action & TCA_CT_ACT_NAT))
+		return NF_ACCEPT;
+
+	/* Add NAT extension if not confirmed yet. */
+	if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+		return NF_DROP;   /* Can't NAT. */
+
+	if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) &&
+	    (ctinfo != IP_CT_RELATED || commit)) {
+		/* NAT an established or related connection like before. */
+		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+			/* This is the REPLY direction for a connection
+			 * for which NAT was applied in the forward
+			 * direction.  Do the reverse NAT.
+			 */
+			maniptype = ct->status & IPS_SRC_NAT
+				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+		else
+			maniptype = ct->status & IPS_SRC_NAT
+				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+	} else if (ct_action & TCA_CT_ACT_NAT_SRC) {
+		maniptype = NF_NAT_MANIP_SRC;
+	} else if (ct_action & TCA_CT_ACT_NAT_DST) {
+		maniptype = NF_NAT_MANIP_DST;
+	} else {
+		return NF_ACCEPT;
+	}
+
+	return ct_nat_execute(skb, ct, ctinfo, range, maniptype);
+#else
+	return NF_ACCEPT;
+#endif
+}
+
+static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
+		      struct tcf_result *res)
+{
+	struct net *net = dev_net(skb->dev);
+	bool cached, commit, clear, force;
+	enum ip_conntrack_info ctinfo;
+	struct tcf_ct *c = to_ct(a);
+	struct nf_conn *tmpl = NULL;
+	struct nf_hook_state state;
+	int nh_ofs, err, retval;
+	struct tcf_ct_params *p;
+	struct nf_conn *ct;
+	u8 family;
+
+	p = rcu_dereference_bh(c->params);
+
+	retval = READ_ONCE(c->tcf_action);
+	commit = p->ct_action & TCA_CT_ACT_COMMIT;
+	clear = p->ct_action & TCA_CT_ACT_CLEAR;
+	force = p->ct_action & TCA_CT_ACT_FORCE;
+	tmpl = p->tmpl;
+
+	if (clear) {
+		ct = nf_ct_get(skb, &ctinfo);
+		if (ct) {
+			nf_conntrack_put(&ct->ct_general);
+			nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+		}
+
+		goto out;
+	}
+
+	family = tcf_ct_skb_nf_family(skb);
+	if (family == NFPROTO_UNSPEC)
+		goto drop;
+
+	/* The conntrack module expects to be working at L3.
+	 * We also try to pull the IPv4/6 header to linear area
+	 */
+	nh_ofs = skb_network_offset(skb);
+	skb_pull_rcsum(skb, nh_ofs);
+	err = tcf_ct_handle_fragments(net, skb, family, p->zone);
+	if (err == -EINPROGRESS) {
+		retval = TC_ACT_STOLEN;
+		goto out;
+	}
+	if (err)
+		goto drop;
+
+	err = tcf_ct_skb_network_trim(skb, family);
+	if (err)
+		goto drop;
+
+	/* If we are recirculating packets to match on ct fields and
+	 * committing with a separate ct action, then we don't need to
+	 * actually run the packet through conntrack twice unless it's for a
+	 * different zone.
+	 */
+	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
+	if (!cached) {
+		/* Associate skb with specified zone. */
+		if (tmpl) {
+			ct = nf_ct_get(skb, &ctinfo);
+			if (skb_nfct(skb))
+				nf_conntrack_put(skb_nfct(skb));
+			nf_conntrack_get(&tmpl->ct_general);
+			nf_ct_set(skb, tmpl, IP_CT_NEW);
+		}
+
+		state.hook = NF_INET_PRE_ROUTING;
+		state.net = net;
+		state.pf = family;
+		err = nf_conntrack_in(skb, &state);
+		if (err != NF_ACCEPT)
+			goto out_push;
+	}
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		goto out_push;
+	nf_ct_deliver_cached_events(ct);
+
+	err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit);
+	if (err != NF_ACCEPT)
+		goto drop;
+
+	if (commit) {
+		tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
+		tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);
+
+		/* This will take care of sending queued events
+		 * even if the connection is already confirmed.
+		 */
+		nf_conntrack_confirm(skb);
+	}
+
+out_push:
+	skb_push_rcsum(skb, nh_ofs);
+
+out:
+	bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
+	return retval;
+
+drop:
+	qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
+	return TC_ACT_SHOT;
+}
+
+static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
+	[TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 },
+	[TCA_CT_ACTION] = { .type = NLA_U16 },
+	[TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) },
+	[TCA_CT_ZONE] = { .type = NLA_U16 },
+	[TCA_CT_MARK] = { .type = NLA_U32 },
+	[TCA_CT_MARK_MASK] = { .type = NLA_U32 },
+	[TCA_CT_LABELS] = { .type = NLA_BINARY,
+			    .len = 128 / BITS_PER_BYTE },
+	[TCA_CT_LABELS_MASK] = { .type = NLA_BINARY,
+				 .len = 128 / BITS_PER_BYTE },
+	[TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
+	[TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
+	[TCA_CT_NAT_IPV6_MIN] = { .type = NLA_EXACT_LEN,
+				  .len = sizeof(struct in6_addr) },
+	[TCA_CT_NAT_IPV6_MAX] = { .type = NLA_EXACT_LEN,
+				   .len = sizeof(struct in6_addr) },
+	[TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
+	[TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
+};
+
+static int tcf_ct_fill_params_nat(struct tcf_ct_params *p,
+				  struct tc_ct *parm,
+				  struct nlattr **tb,
+				  struct netlink_ext_ack *extack)
+{
+	struct nf_nat_range2 *range;
+
+	if (!(p->ct_action & TCA_CT_ACT_NAT))
+		return 0;
+
+	if (!IS_ENABLED(CONFIG_NF_NAT)) {
+		NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel");
+		return -EOPNOTSUPP;
+	}
+
+	if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
+		return 0;
+
+	if ((p->ct_action & TCA_CT_ACT_NAT_SRC) &&
+	    (p->ct_action & TCA_CT_ACT_NAT_DST)) {
+		NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time");
+		return -EOPNOTSUPP;
+	}
+
+	range = &p->range;
+	if (tb[TCA_CT_NAT_IPV4_MIN]) {
+		struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX];
+
+		p->ipv4_range = true;
+		range->flags |= NF_NAT_RANGE_MAP_IPS;
+		range->min_addr.ip =
+			nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]);
+
+		range->max_addr.ip = max_attr ?
+				     nla_get_in_addr(max_attr) :
+				     range->min_addr.ip;
+	} else if (tb[TCA_CT_NAT_IPV6_MIN]) {
+		struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX];
+
+		p->ipv4_range = false;
+		range->flags |= NF_NAT_RANGE_MAP_IPS;
+		range->min_addr.in6 =
+			nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]);
+
+		range->max_addr.in6 = max_attr ?
+				      nla_get_in6_addr(max_attr) :
+				      range->min_addr.in6;
+	}
+
+	if (tb[TCA_CT_NAT_PORT_MIN]) {
+		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+		range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]);
+
+		range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ?
+				       nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) :
+				       range->min_proto.all;
+	}
+
+	return 0;
+}
+
+static void tcf_ct_set_key_val(struct nlattr **tb,
+			       void *val, int val_type,
+			       void *mask, int mask_type,
+			       int len)
+{
+	if (!tb[val_type])
+		return;
+	nla_memcpy(val, tb[val_type], len);
+
+	if (!mask)
+		return;
+
+	if (mask_type == TCA_CT_UNSPEC || !tb[mask_type])
+		memset(mask, 0xff, len);
+	else
+		nla_memcpy(mask, tb[mask_type], len);
+}
+
+static int tcf_ct_fill_params(struct net *net,
+			      struct tcf_ct_params *p,
+			      struct tc_ct *parm,
+			      struct nlattr **tb,
+			      struct netlink_ext_ack *extack)
+{
+	struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
+	struct nf_conntrack_zone zone;
+	struct nf_conn *tmpl;
+	int err;
+
+	p->zone = NF_CT_DEFAULT_ZONE_ID;
+
+	tcf_ct_set_key_val(tb,
+			   &p->ct_action, TCA_CT_ACTION,
+			   NULL, TCA_CT_UNSPEC,
+			   sizeof(p->ct_action));
+
+	if (p->ct_action & TCA_CT_ACT_CLEAR)
+		return 0;
+
+	err = tcf_ct_fill_params_nat(p, parm, tb, extack);
+	if (err)
+		return err;
+
+	if (tb[TCA_CT_MARK]) {
+		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
+			NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled.");
+			return -EOPNOTSUPP;
+		}
+		tcf_ct_set_key_val(tb,
+				   &p->mark, TCA_CT_MARK,
+				   &p->mark_mask, TCA_CT_MARK_MASK,
+				   sizeof(p->mark));
+	}
+
+	if (tb[TCA_CT_LABELS]) {
+		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
+			NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled.");
+			return -EOPNOTSUPP;
+		}
+
+		if (!tn->labels) {
+			NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length");
+			return -EOPNOTSUPP;
+		}
+		tcf_ct_set_key_val(tb,
+				   p->labels, TCA_CT_LABELS,
+				   p->labels_mask, TCA_CT_LABELS_MASK,
+				   sizeof(p->labels));
+	}
+
+	if (tb[TCA_CT_ZONE]) {
+		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
+			NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled.");
+			return -EOPNOTSUPP;
+		}
+
+		tcf_ct_set_key_val(tb,
+				   &p->zone, TCA_CT_ZONE,
+				   NULL, TCA_CT_UNSPEC,
+				   sizeof(p->zone));
+	}
+
+	if (p->zone == NF_CT_DEFAULT_ZONE_ID)
+		return 0;
+
+	nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0);
+	tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
+	if (!tmpl) {
+		NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
+		return -ENOMEM;
+	}
+	__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
+	nf_conntrack_get(&tmpl->ct_general);
+	p->tmpl = tmpl;
+
+	return 0;
+}
+
+static int tcf_ct_init(struct net *net, struct nlattr *nla,
+		       struct nlattr *est, struct tc_action **a,
+		       int replace, int bind, bool rtnl_held,
+		       struct tcf_proto *tp,
+		       struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, ct_net_id);
+	struct tcf_ct_params *params = NULL;
+	struct nlattr *tb[TCA_CT_MAX + 1];
+	struct tcf_chain *goto_ch = NULL;
+	struct tc_ct *parm;
+	struct tcf_ct *c;
+	int err, res = 0;
+
+	if (!nla) {
+		NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_CT_PARMS]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters");
+		return -EINVAL;
+	}
+	parm = nla_data(tb[TCA_CT_PARMS]);
+
+	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	if (err < 0)
+		return err;
+
+	if (!err) {
+		err = tcf_idr_create(tn, parm->index, est, a,
+				     &act_ct_ops, bind, true);
+		if (err) {
+			tcf_idr_cleanup(tn, parm->index);
+			return err;
+		}
+		res = ACT_P_CREATED;
+	} else {
+		if (bind)
+			return 0;
+
+		if (!replace) {
+			tcf_idr_release(*a, bind);
+			return -EEXIST;
+		}
+	}
+	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+	if (err < 0)
+		goto cleanup;
+
+	c = to_ct(*a);
+
+	params = kzalloc(sizeof(*params), GFP_KERNEL);
+	if (unlikely(!params)) {
+		err = -ENOMEM;
+		goto cleanup;
+	}
+
+	err = tcf_ct_fill_params(net, params, parm, tb, extack);
+	if (err)
+		goto cleanup;
+
+	spin_lock_bh(&c->tcf_lock);
+	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+	rcu_swap_protected(c->params, params, lockdep_is_held(&c->tcf_lock));
+	spin_unlock_bh(&c->tcf_lock);
+
+	if (goto_ch)
+		tcf_chain_put_by_act(goto_ch);
+	if (params)	/* old params after the swap: also put its tmpl reference */
+		call_rcu(&params->rcu, tcf_ct_params_free);
+	if (res == ACT_P_CREATED)
+		tcf_idr_insert(tn, *a);
+
+	return res;
+
+cleanup:
+	if (goto_ch)
+		tcf_chain_put_by_act(goto_ch);
+	kfree(params);
+	tcf_idr_release(*a, bind);
+	return err;
+}
+
+static void tcf_ct_cleanup(struct tc_action *a)
+{
+	struct tcf_ct_params *params;
+	struct tcf_ct *c = to_ct(a);
+
+	params = rcu_dereference_protected(c->params, 1);
+	if (params)
+		call_rcu(&params->rcu, tcf_ct_params_free);
+}
+
+static int tcf_ct_dump_key_val(struct sk_buff *skb,
+			       void *val, int val_type,
+			       void *mask, int mask_type,
+			       int len)
+{
+	int err;
+
+	if (mask && !memchr_inv(mask, 0, len))
+		return 0;
+
+	err = nla_put(skb, val_type, len, val);
+	if (err)
+		return err;
+
+	if (mask_type != TCA_CT_UNSPEC) {
+		err = nla_put(skb, mask_type, len, mask);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
+{
+	struct nf_nat_range2 *range = &p->range;
+
+	if (!(p->ct_action & TCA_CT_ACT_NAT))
+		return 0;
+
+	if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
+		return 0;
+
+	if (range->flags & NF_NAT_RANGE_MAP_IPS) {
+		if (p->ipv4_range) {
+			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN,
+					    range->min_addr.ip))
+				return -1;
+			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX,
+					    range->max_addr.ip))
+				return -1;
+		} else {
+			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN,
+					     &range->min_addr.in6))
+				return -1;
+			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX,
+					     &range->max_addr.in6))
+				return -1;
+		}
+	}
+
+	if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
+		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN,
+				 range->min_proto.all))
+			return -1;
+		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX,
+				 range->max_proto.all))
+			return -1;
+	}
+
+	return 0;
+}
+
+static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
+			      int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_ct *c = to_ct(a);
+	struct tcf_ct_params *p;
+
+	struct tc_ct opt = {
+		.index   = c->tcf_index,
+		.refcnt  = refcount_read(&c->tcf_refcnt) - ref,
+		.bindcnt = atomic_read(&c->tcf_bindcnt) - bind,
+	};
+	struct tcf_t t;
+
+	spin_lock_bh(&c->tcf_lock);
+	p = rcu_dereference_protected(c->params,
+				      lockdep_is_held(&c->tcf_lock));
+	opt.action = c->tcf_action;
+
+	if (tcf_ct_dump_key_val(skb,
+				&p->ct_action, TCA_CT_ACTION,
+				NULL, TCA_CT_UNSPEC,
+				sizeof(p->ct_action)))
+		goto nla_put_failure;
+
+	if (p->ct_action & TCA_CT_ACT_CLEAR)
+		goto skip_dump;
+
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+	    tcf_ct_dump_key_val(skb,
+				&p->mark, TCA_CT_MARK,
+				&p->mark_mask, TCA_CT_MARK_MASK,
+				sizeof(p->mark)))
+		goto nla_put_failure;
+
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+	    tcf_ct_dump_key_val(skb,
+				p->labels, TCA_CT_LABELS,
+				p->labels_mask, TCA_CT_LABELS_MASK,
+				sizeof(p->labels)))
+		goto nla_put_failure;
+
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+	    tcf_ct_dump_key_val(skb,
+				&p->zone, TCA_CT_ZONE,
+				NULL, TCA_CT_UNSPEC,
+				sizeof(p->zone)))
+		goto nla_put_failure;
+
+	if (tcf_ct_dump_nat(skb, p))
+		goto nla_put_failure;
+
+skip_dump:
+	if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	tcf_tm_dump(&t, &c->tcf_tm);
+	if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
+		goto nla_put_failure;
+	spin_unlock_bh(&c->tcf_lock);
+
+	return skb->len;
+nla_put_failure:
+	spin_unlock_bh(&c->tcf_lock);
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int tcf_ct_walker(struct net *net, struct sk_buff *skb,
+			 struct netlink_callback *cb, int type,
+			 const struct tc_action_ops *ops,
+			 struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, ct_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_ct_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, ct_net_id);
+
+	return tcf_idr_search(tn, a, index);
+}
+
+static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+			     u64 lastuse, bool hw)
+{
+	struct tcf_ct *c = to_ct(a);
+
+	_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+
+	if (hw)
+		_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+				   bytes, packets);
+	c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
+}
+
+static struct tc_action_ops act_ct_ops = {
+	.kind		=	"ct",
+	.id		=	TCA_ID_CT,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_ct_act,
+	.dump		=	tcf_ct_dump,
+	.init		=	tcf_ct_init,
+	.cleanup	=	tcf_ct_cleanup,
+	.walk		=	tcf_ct_walker,
+	.lookup		=	tcf_ct_search,
+	.stats_update	=	tcf_stats_update,
+	.size		=	sizeof(struct tcf_ct),
+};
+
+static __net_init int ct_init_net(struct net *net)
+{
+	unsigned int n_bits = FIELD_SIZEOF(struct tcf_ct_params, labels) * 8;
+	struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
+
+	if (nf_connlabels_get(net, n_bits - 1)) {
+		tn->labels = false;
+		pr_err("act_ct: Failed to set connlabels length");
+	} else {
+		tn->labels = true;
+	}
+
+	return tc_action_net_init(&tn->tn, &act_ct_ops);
+}
+
+static void __net_exit ct_exit_net(struct list_head *net_list)
+{
+	struct net *net;
+
+	rtnl_lock();
+	list_for_each_entry(net, net_list, exit_list) {
+		struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
+
+		if (tn->labels)
+			nf_connlabels_put(net);
+	}
+	rtnl_unlock();
+
+	tc_action_net_exit(net_list, ct_net_id);
+}
+
+static struct pernet_operations ct_net_ops = {
+	.init = ct_init_net,
+	.exit_batch = ct_exit_net,
+	.id   = &ct_net_id,
+	.size = sizeof(struct tc_ct_action_net),
+};
+
+static int __init ct_init_module(void)
+{
+	return tcf_register_action(&act_ct_ops, &ct_net_ops);
+}
+
+static void __exit ct_cleanup_module(void)
+{
+	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
+}
+
+module_init(ct_init_module);
+module_exit(ct_cleanup_module);
+MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>");
+MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>");
+MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>");
+MODULE_DESCRIPTION("Connection tracking action");
+MODULE_LICENSE("GPL v2");
+
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ad36bbcc583e..4a7331ce830d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -35,6 +35,7 @@
 #include <net/tc_act/tc_police.h>
 #include <net/tc_act/tc_sample.h>
 #include <net/tc_act/tc_skbedit.h>
+#include <net/tc_act/tc_ct.h>
 
 extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
 
@@ -3266,6 +3267,10 @@ int tc_setup_flow_action(struct flow_action *flow_action,
 			entry->police.burst = tcf_police_tcfp_burst(act);
 			entry->police.rate_bytes_ps =
 				tcf_police_rate_bytes_ps(act);
+		} else if (is_tcf_ct(act)) {
+			entry->id = FLOW_ACTION_CT;
+			entry->ct.action = tcf_ct_action(act);
+			entry->ct.zone = tcf_ct_zone(act);
 		} else {
 			goto err_out;
 		}
-- 
cgit v1.2.3


From 75a56758d6390ea6db523ad26ce378f34b907b0c Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Tue, 9 Jul 2019 10:30:49 +0300
Subject: net/flow_dissector: add connection tracking dissection

Retrieves connection tracking zone, mark, label, and state from
a SKB.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 10 ++++++++++
 include/net/flow_dissector.h | 15 +++++++++++++++
 net/core/flow_dissector.c    | 44 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9d7a2c28ea35..d8af86d995d6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1325,6 +1325,16 @@ void skb_flow_dissect_meta(const struct sk_buff *skb,
 			   struct flow_dissector *flow_dissector,
 			   void *target_container);
 
+/* Gets an skb's connection tracking info. ctinfo_map should be
+ * a map of mapsize entries that translates enum ip_conntrack_info
+ * states to user states.
+ */
+void
+skb_flow_dissect_ct(const struct sk_buff *skb,
+		    struct flow_dissector *flow_dissector,
+		    void *target_container,
+		    u16 *ctinfo_map,
+		    size_t mapsize);
 void
 skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 			     struct flow_dissector *flow_dissector,
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 02478e48fae4..90bd210be060 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -208,6 +208,20 @@ struct flow_dissector_key_meta {
 	int ingress_ifindex;
 };
 
+/**
+ * struct flow_dissector_key_ct:
+ * @ct_state: conntrack state after converting with map
+ * @ct_mark: conntrack mark
+ * @ct_zone: conntrack zone
+ * @ct_labels: conntrack labels
+ */
+struct flow_dissector_key_ct {
+	u16	ct_state;
+	u16	ct_zone;
+	u32	ct_mark;
+	u32	ct_labels[4];
+};
+
 enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
 	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
@@ -234,6 +248,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */
 	FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */
 	FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */
+	FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 01ad60b5aa75..3e6fedb57bc1 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -27,6 +27,10 @@
 #include <scsi/fc/fc_fcoe.h>
 #include <uapi/linux/batadv_packet.h>
 #include <linux/bpf.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#endif
 
 static DEFINE_MUTEX(flow_dissector_mutex);
 
@@ -231,6 +235,46 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
 	ctrl->addr_type = type;
 }
 
+void
+skb_flow_dissect_ct(const struct sk_buff *skb,
+		    struct flow_dissector *flow_dissector,
+		    void *target_container,
+		    u16 *ctinfo_map,
+		    size_t mapsize)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	struct flow_dissector_key_ct *key;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn_labels *cl;
+	struct nf_conn *ct;
+
+	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT))
+		return;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return;
+
+	key = skb_flow_dissector_target(flow_dissector,
+					FLOW_DISSECTOR_KEY_CT,
+					target_container);
+
+	if (ctinfo < mapsize)
+		key->ct_state = ctinfo_map[ctinfo];
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
+	key->ct_zone = ct->zone.id;
+#endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+	key->ct_mark = ct->mark;
+#endif
+
+	cl = nf_ct_labels_find(ct);
+	if (cl)
+		memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels));
+#endif /* CONFIG_NF_CONNTRACK */
+}
+EXPORT_SYMBOL(skb_flow_dissect_ct);
+
 void
 skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 			     struct flow_dissector *flow_dissector,
-- 
cgit v1.2.3


From e0ace68af2acfe474bc89a3d9a2e24d700bf245d Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Tue, 9 Jul 2019 10:30:50 +0300
Subject: net/sched: cls_flower: Add matching on conntrack info

New matches for conntrack mark, label, zone, and state.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Yossi Kuperman <yossiku@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  16 ++++++
 net/sched/cls_flower.c       | 127 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 138 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 31db5589b7ca..b057aeeb6338 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -537,11 +537,27 @@ enum {
 	TCA_FLOWER_KEY_PORT_DST_MIN,	/* be16 */
 	TCA_FLOWER_KEY_PORT_DST_MAX,	/* be16 */
 
+	TCA_FLOWER_KEY_CT_STATE,	/* u16 */
+	TCA_FLOWER_KEY_CT_STATE_MASK,	/* u16 */
+	TCA_FLOWER_KEY_CT_ZONE,		/* u16 */
+	TCA_FLOWER_KEY_CT_ZONE_MASK,	/* u16 */
+	TCA_FLOWER_KEY_CT_MARK,		/* u32 */
+	TCA_FLOWER_KEY_CT_MARK_MASK,	/* u32 */
+	TCA_FLOWER_KEY_CT_LABELS,	/* u128 */
+	TCA_FLOWER_KEY_CT_LABELS_MASK,	/* u128 */
+
 	__TCA_FLOWER_MAX,
 };
 
 #define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
 
+enum {
+	TCA_FLOWER_KEY_CT_FLAGS_NEW = 1 << 0, /* Beginning of a new connection. */
+	TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing connection. */
+	TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */
+	TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */
+};
+
 enum {
 	TCA_FLOWER_KEY_ENC_OPTS_UNSPEC,
 	TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 5d4935b51e6f..bec37e16347f 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -26,6 +26,8 @@
 #include <net/dst.h>
 #include <net/dst_metadata.h>
 
+#include <uapi/linux/netfilter/nf_conntrack_common.h>
+
 struct fl_flow_key {
 	struct flow_dissector_key_meta meta;
 	struct flow_dissector_key_control control;
@@ -54,6 +56,7 @@ struct fl_flow_key {
 	struct flow_dissector_key_enc_opts enc_opts;
 	struct flow_dissector_key_ports tp_min;
 	struct flow_dissector_key_ports tp_max;
+	struct flow_dissector_key_ct ct;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
 
 struct fl_flow_mask_range {
@@ -272,14 +275,27 @@ static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
 	return __fl_lookup(mask, mkey);
 }
 
+static u16 fl_ct_info_to_flower_map[] = {
+	[IP_CT_ESTABLISHED] =		TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+					TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED,
+	[IP_CT_RELATED] =		TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+					TCA_FLOWER_KEY_CT_FLAGS_RELATED,
+	[IP_CT_ESTABLISHED_REPLY] =	TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+					TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED,
+	[IP_CT_RELATED_REPLY] =		TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+					TCA_FLOWER_KEY_CT_FLAGS_RELATED,
+	[IP_CT_NEW] =			TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+					TCA_FLOWER_KEY_CT_FLAGS_NEW,
+};
+
 static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		       struct tcf_result *res)
 {
 	struct cls_fl_head *head = rcu_dereference_bh(tp->root);
-	struct cls_fl_filter *f;
-	struct fl_flow_mask *mask;
-	struct fl_flow_key skb_key;
 	struct fl_flow_key skb_mkey;
+	struct fl_flow_key skb_key;
+	struct fl_flow_mask *mask;
+	struct cls_fl_filter *f;
 
 	list_for_each_entry_rcu(mask, &head->masks, list) {
 		fl_clear_masked_range(&skb_key, mask);
@@ -290,6 +306,9 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		 */
 		skb_key.basic.n_proto = skb->protocol;
 		skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
+		skb_flow_dissect_ct(skb, &mask->dissector, &skb_key,
+				    fl_ct_info_to_flower_map,
+				    ARRAY_SIZE(fl_ct_info_to_flower_map));
 		skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
 
 		fl_set_masked_key(&skb_mkey, &skb_key, mask);
@@ -686,6 +705,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
 	[TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_ENC_OPTS]	= { .type = NLA_NESTED },
 	[TCA_FLOWER_KEY_ENC_OPTS_MASK]	= { .type = NLA_NESTED },
+	[TCA_FLOWER_KEY_CT_STATE]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_CT_STATE_MASK]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_CT_ZONE]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_CT_ZONE_MASK]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_CT_MARK]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_CT_MARK_MASK]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_CT_LABELS]	= { .type = NLA_BINARY,
+					    .len = 128 / BITS_PER_BYTE },
+	[TCA_FLOWER_KEY_CT_LABELS_MASK]	= { .type = NLA_BINARY,
+					    .len = 128 / BITS_PER_BYTE },
 };
 
 static const struct nla_policy
@@ -707,11 +736,11 @@ static void fl_set_key_val(struct nlattr **tb,
 {
 	if (!tb[val_type])
 		return;
-	memcpy(val, nla_data(tb[val_type]), len);
+	nla_memcpy(val, tb[val_type], len);
 	if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type])
 		memset(mask, 0xff, len);
 	else
-		memcpy(mask, nla_data(tb[mask_type]), len);
+		nla_memcpy(mask, tb[mask_type], len);
 }
 
 static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
@@ -997,6 +1026,51 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 	return 0;
 }
 
+static int fl_set_key_ct(struct nlattr **tb,
+			 struct flow_dissector_key_ct *key,
+			 struct flow_dissector_key_ct *mask,
+			 struct netlink_ext_ack *extack)
+{
+	if (tb[TCA_FLOWER_KEY_CT_STATE]) {
+		if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) {
+			NL_SET_ERR_MSG(extack, "Conntrack isn't enabled");
+			return -EOPNOTSUPP;
+		}
+		fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
+			       &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
+			       sizeof(key->ct_state));
+	}
+	if (tb[TCA_FLOWER_KEY_CT_ZONE]) {
+		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
+			NL_SET_ERR_MSG(extack, "Conntrack zones isn't enabled");
+			return -EOPNOTSUPP;
+		}
+		fl_set_key_val(tb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE,
+			       &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK,
+			       sizeof(key->ct_zone));
+	}
+	if (tb[TCA_FLOWER_KEY_CT_MARK]) {
+		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
+			NL_SET_ERR_MSG(extack, "Conntrack mark isn't enabled");
+			return -EOPNOTSUPP;
+		}
+		fl_set_key_val(tb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK,
+			       &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK,
+			       sizeof(key->ct_mark));
+	}
+	if (tb[TCA_FLOWER_KEY_CT_LABELS]) {
+		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
+			NL_SET_ERR_MSG(extack, "Conntrack labels aren't enabled");
+			return -EOPNOTSUPP;
+		}
+		fl_set_key_val(tb, key->ct_labels, TCA_FLOWER_KEY_CT_LABELS,
+			       mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK,
+			       sizeof(key->ct_labels));
+	}
+
+	return 0;
+}
+
 static int fl_set_key(struct net *net, struct nlattr **tb,
 		      struct fl_flow_key *key, struct fl_flow_key *mask,
 		      struct netlink_ext_ack *extack)
@@ -1206,6 +1280,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 			return ret;
 	}
 
+	ret = fl_set_key_ct(tb, &key->ct, &mask->ct, extack);
+	if (ret)
+		return ret;
+
 	if (tb[TCA_FLOWER_KEY_FLAGS])
 		ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
 
@@ -1306,6 +1384,8 @@ static void fl_init_dissector(struct flow_dissector *dissector,
 			     FLOW_DISSECTOR_KEY_ENC_IP, enc_ip);
 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
 			     FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts);
+	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+			     FLOW_DISSECTOR_KEY_CT, ct);
 
 	skb_flow_dissector_init(dissector, keys, cnt);
 }
@@ -2065,6 +2145,40 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static int fl_dump_key_ct(struct sk_buff *skb,
+			  struct flow_dissector_key_ct *key,
+			  struct flow_dissector_key_ct *mask)
+{
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK) &&
+	    fl_dump_key_val(skb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
+			    &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
+			    sizeof(key->ct_state)))
+		goto nla_put_failure;
+
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+	    fl_dump_key_val(skb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE,
+			    &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK,
+			    sizeof(key->ct_zone)))
+		goto nla_put_failure;
+
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+	    fl_dump_key_val(skb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK,
+			    &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK,
+			    sizeof(key->ct_mark)))
+		goto nla_put_failure;
+
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+	    fl_dump_key_val(skb, &key->ct_labels, TCA_FLOWER_KEY_CT_LABELS,
+			    &mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK,
+			    sizeof(key->ct_labels)))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
 			       struct flow_dissector_key_enc_opts *enc_opts)
 {
@@ -2298,6 +2412,9 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
 	    fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts))
 		goto nla_put_failure;
 
+	if (fl_dump_key_ct(skb, &key->ct, &mask->ct))
+		goto nla_put_failure;
+
 	if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From 4e95bc268b915c3a19ec8b9110f61e4ea41a1ed0 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jul 2019 22:55:39 +0200
Subject: net: flow_offload: add flow_block_cb_setup_simple()

Most drivers do the same thing to set up the flow block callbacks; this
patch adds a helper function to do this.

This preparation patch reduces the number of changes to adapt the
existing drivers to use the flow block callback API.

This new helper function takes a flow block list per-driver, which is
set to NULL until this driver list is used.

This patch also introduces the flow_block_command and
flow_block_binder_type enumerations, which are renamed to use
FLOW_BLOCK_* in follow up patches.

There are three temporary definitions (aliases) that reduce the number of
updates in this patch; they go away once drivers are fully adapted to use
this flow block API.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 26 ++++-------------
 drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c     | 28 ++++--------------
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c   | 26 ++++-------------
 drivers/net/ethernet/intel/i40e/i40e_main.c       | 26 ++++-------------
 drivers/net/ethernet/intel/iavf/iavf_main.c       | 35 ++++-------------------
 drivers/net/ethernet/intel/igb/igb_main.c         | 24 +++-------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     | 27 ++++-------------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 27 ++++-------------
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  | 26 ++++-------------
 drivers/net/ethernet/netronome/nfp/abm/cls.c      | 17 ++---------
 drivers/net/ethernet/netronome/nfp/bpf/main.c     | 29 ++++---------------
 drivers/net/ethernet/qlogic/qede/qede_main.c      | 23 ++-------------
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 22 ++------------
 drivers/net/netdevsim/netdev.c                    | 26 ++++-------------
 include/net/flow_offload.h                        | 27 +++++++++++++++++
 include/net/pkt_cls.h                             | 20 ++-----------
 net/core/flow_offload.c                           | 25 ++++++++++++++++
 17 files changed, 117 insertions(+), 317 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 2b5b0ab8961a..06819590f6d0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -9907,32 +9907,16 @@ static int bnxt_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
-static int bnxt_setup_tc_block(struct net_device *dev,
-			       struct tc_block_offload *f)
-{
-	struct bnxt *bp = netdev_priv(dev);
-
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, bnxt_setup_tc_block_cb,
-					     bp, bp, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, bnxt_setup_tc_block_cb, bp);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 static int bnxt_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			 void *type_data)
 {
+	struct bnxt *bp = netdev_priv(dev);
+
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return bnxt_setup_tc_block(dev, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  bnxt_setup_tc_block_cb,
+						  bp, bp, true);
 	case TC_SETUP_QDISC_MQPRIO: {
 		struct tc_mqprio_qopt *mqprio = type_data;
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
index f760921389a3..89398ff011d4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
@@ -161,34 +161,16 @@ static int bnxt_vf_rep_setup_tc_block_cb(enum tc_setup_type type,
 	}
 }
 
-static int bnxt_vf_rep_setup_tc_block(struct net_device *dev,
-				      struct tc_block_offload *f)
-{
-	struct bnxt_vf_rep *vf_rep = netdev_priv(dev);
-
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block,
-					     bnxt_vf_rep_setup_tc_block_cb,
-					     vf_rep, vf_rep, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block,
-					bnxt_vf_rep_setup_tc_block_cb, vf_rep);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 static int bnxt_vf_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
 				void *type_data)
 {
+	struct bnxt_vf_rep *vf_rep = netdev_priv(dev);
+
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return bnxt_vf_rep_setup_tc_block(dev, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  bnxt_vf_rep_setup_tc_block_cb,
+						  vf_rep, vf_rep, true);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index b08efc48d42f..9a486282a32e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3190,32 +3190,16 @@ static int cxgb_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
-static int cxgb_setup_tc_block(struct net_device *dev,
-			       struct tc_block_offload *f)
-{
-	struct port_info *pi = netdev2pinfo(dev);
-
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, cxgb_setup_tc_block_cb,
-					     pi, dev, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, cxgb_setup_tc_block_cb, pi);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 static int cxgb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			 void *type_data)
 {
+	struct port_info *pi = netdev2pinfo(dev);
+
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return cxgb_setup_tc_block(dev, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  cxgb_setup_tc_block_cb,
+						  pi, dev, true);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 5361c08328f7..52f0f14d4207 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -8177,34 +8177,18 @@ static int i40e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
-static int i40e_setup_tc_block(struct net_device *dev,
-			       struct tc_block_offload *f)
-{
-	struct i40e_netdev_priv *np = netdev_priv(dev);
-
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, i40e_setup_tc_block_cb,
-					     np, np, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, i40e_setup_tc_block_cb, np);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 static int __i40e_setup_tc(struct net_device *netdev, enum tc_setup_type type,
 			   void *type_data)
 {
+	struct i40e_netdev_priv *np = netdev_priv(netdev);
+
 	switch (type) {
 	case TC_SETUP_QDISC_MQPRIO:
 		return i40e_setup_tc(netdev, type_data);
 	case TC_SETUP_BLOCK:
-		return i40e_setup_tc_block(netdev, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  i40e_setup_tc_block_cb,
+						  np, np, true);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 881561b36083..fd0e2bcc75e5 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -3113,35 +3113,6 @@ static int iavf_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
-/**
- * iavf_setup_tc_block - register callbacks for tc
- * @netdev: network interface device structure
- * @f: tc offload data
- *
- * This function registers block callbacks for tc
- * offloads
- **/
-static int iavf_setup_tc_block(struct net_device *dev,
-			       struct tc_block_offload *f)
-{
-	struct iavf_adapter *adapter = netdev_priv(dev);
-
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, iavf_setup_tc_block_cb,
-					     adapter, adapter, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, iavf_setup_tc_block_cb,
-					adapter);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 /**
  * iavf_setup_tc - configure multiple traffic classes
  * @netdev: network interface device structure
@@ -3156,11 +3127,15 @@ static int iavf_setup_tc_block(struct net_device *dev,
 static int iavf_setup_tc(struct net_device *netdev, enum tc_setup_type type,
 			 void *type_data)
 {
+	struct iavf_adapter *adapter = netdev_priv(netdev);
+
 	switch (type) {
 	case TC_SETUP_QDISC_MQPRIO:
 		return __iavf_setup_tc(netdev, type_data);
 	case TC_SETUP_BLOCK:
-		return iavf_setup_tc_block(netdev, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  iavf_setup_tc_block_cb,
+						  adapter, adapter, true);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index f66dae72fe37..836f9e1a136c 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2783,25 +2783,6 @@ static int igb_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
-static int igb_setup_tc_block(struct igb_adapter *adapter,
-			      struct tc_block_offload *f)
-{
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, igb_setup_tc_block_cb,
-					     adapter, adapter, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, igb_setup_tc_block_cb,
-					adapter);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 static int igb_offload_txtime(struct igb_adapter *adapter,
 			      struct tc_etf_qopt_offload *qopt)
 {
@@ -2834,7 +2815,10 @@ static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 	case TC_SETUP_QDISC_CBS:
 		return igb_offload_cbs(adapter, type_data);
 	case TC_SETUP_BLOCK:
-		return igb_setup_tc_block(adapter, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  igb_setup_tc_block_cb,
+						  adapter, adapter, true);
+
 	case TC_SETUP_QDISC_ETF:
 		return igb_offload_txtime(adapter, type_data);
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index b613e72c8ee4..b098f5be9c0d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9607,27 +9607,6 @@ static int ixgbe_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
-static int ixgbe_setup_tc_block(struct net_device *dev,
-				struct tc_block_offload *f)
-{
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
-
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, ixgbe_setup_tc_block_cb,
-					     adapter, adapter, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, ixgbe_setup_tc_block_cb,
-					adapter);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 static int ixgbe_setup_tc_mqprio(struct net_device *dev,
 				 struct tc_mqprio_qopt *mqprio)
 {
@@ -9638,9 +9617,13 @@ static int ixgbe_setup_tc_mqprio(struct net_device *dev,
 static int __ixgbe_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			    void *type_data)
 {
+	struct ixgbe_adapter *adapter = netdev_priv(dev);
+
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return ixgbe_setup_tc_block(dev, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  ixgbe_setup_tc_block_cb,
+						  adapter, adapter, true);
 	case TC_SETUP_QDISC_MQPRIO:
 		return ixgbe_setup_tc_mqprio(dev, type_data);
 	default:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 10efd69de7ef..8e5ebdb7c459 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3457,36 +3457,19 @@ static int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 		return -EOPNOTSUPP;
 	}
 }
-
-static int mlx5e_setup_tc_block(struct net_device *dev,
-				struct tc_block_offload *f)
-{
-	struct mlx5e_priv *priv = netdev_priv(dev);
-
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, mlx5e_setup_tc_block_cb,
-					     priv, priv, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, mlx5e_setup_tc_block_cb,
-					priv);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
 #endif
 
 static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			  void *type_data)
 {
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
 	switch (type) {
 #ifdef CONFIG_MLX5_ESWITCH
 	case TC_SETUP_BLOCK:
-		return mlx5e_setup_tc_block(dev, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  mlx5e_setup_tc_block_cb,
+						  priv, priv, true);
 #endif
 	case TC_SETUP_QDISC_MQPRIO:
 		return mlx5e_setup_tc_mqprio(dev, type_data);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 6810b9fa0705..a211cdb5eb8b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1153,32 +1153,16 @@ static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
-static int mlx5e_rep_setup_tc_block(struct net_device *dev,
-				    struct tc_block_offload *f)
-{
-	struct mlx5e_priv *priv = netdev_priv(dev);
-
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, mlx5e_rep_setup_tc_cb,
-					     priv, priv, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, mlx5e_rep_setup_tc_cb, priv);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			      void *type_data)
 {
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return mlx5e_rep_setup_tc_block(dev, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  mlx5e_rep_setup_tc_cb,
+						  priv, priv, true);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/ethernet/netronome/nfp/abm/cls.c b/drivers/net/ethernet/netronome/nfp/abm/cls.c
index ff3913085665..29fb45734962 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/cls.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/cls.c
@@ -265,19 +265,6 @@ static int nfp_abm_setup_tc_block_cb(enum tc_setup_type type,
 int nfp_abm_setup_cls_block(struct net_device *netdev, struct nfp_repr *repr,
 			    struct tc_block_offload *f)
 {
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block,
-					     nfp_abm_setup_tc_block_cb,
-					     repr, repr, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, nfp_abm_setup_tc_block_cb,
-					repr);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
+	return flow_block_cb_setup_simple(f, NULL, nfp_abm_setup_tc_block_cb,
+					  repr, repr, true);
 }
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 9c136da25221..0c93c84a188a 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -160,35 +160,16 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 	return 0;
 }
 
-static int nfp_bpf_setup_tc_block(struct net_device *netdev,
-				  struct tc_block_offload *f)
-{
-	struct nfp_net *nn = netdev_priv(netdev);
-
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block,
-					     nfp_bpf_setup_tc_block_cb,
-					     nn, nn, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block,
-					nfp_bpf_setup_tc_block_cb,
-					nn);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 static int nfp_bpf_setup_tc(struct nfp_app *app, struct net_device *netdev,
 			    enum tc_setup_type type, void *type_data)
 {
+	struct nfp_net *nn = netdev_priv(netdev);
+
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return nfp_bpf_setup_tc_block(netdev, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  nfp_bpf_setup_tc_block_cb,
+						  nn, nn, true);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index d4a29660751d..cba97ed3dd56 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -579,25 +579,6 @@ static int qede_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
-static int qede_setup_tc_block(struct qede_dev *edev,
-			       struct tc_block_offload *f)
-{
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block,
-					     qede_setup_tc_block_cb,
-					     edev, edev, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, qede_setup_tc_block_cb, edev);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 static int
 qede_setup_tc_offload(struct net_device *dev, enum tc_setup_type type,
 		      void *type_data)
@@ -607,7 +588,9 @@ qede_setup_tc_offload(struct net_device *dev, enum tc_setup_type type,
 
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return qede_setup_tc_block(edev, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  qede_setup_tc_block_cb,
+						  edev, edev, true);
 	case TC_SETUP_QDISC_MQPRIO:
 		mqprio = type_data;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index e4b06dc484b7..93ef80c16f07 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3769,24 +3769,6 @@ static int stmmac_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	return ret;
 }
 
-static int stmmac_setup_tc_block(struct stmmac_priv *priv,
-				 struct tc_block_offload *f)
-{
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, stmmac_setup_tc_block_cb,
-				priv, priv, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, stmmac_setup_tc_block_cb, priv);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 static int stmmac_setup_tc(struct net_device *ndev, enum tc_setup_type type,
 			   void *type_data)
 {
@@ -3794,7 +3776,9 @@ static int stmmac_setup_tc(struct net_device *ndev, enum tc_setup_type type,
 
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return stmmac_setup_tc_block(priv, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  stmmac_setup_tc_block_cb,
+						  priv, priv, true);
 	case TC_SETUP_QDISC_CBS:
 		return stmmac_tc_setup_cbs(priv, priv, type_data);
 	default:
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index e5c8aa08e1cd..920dc79e9dc9 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -78,26 +78,6 @@ nsim_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
 	return nsim_bpf_setup_tc_block_cb(type, type_data, cb_priv);
 }
 
-static int
-nsim_setup_tc_block(struct net_device *dev, struct tc_block_offload *f)
-{
-	struct netdevsim *ns = netdev_priv(dev);
-
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		return -EOPNOTSUPP;
-
-	switch (f->command) {
-	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, nsim_setup_tc_block_cb,
-					     ns, ns, f->extack);
-	case TC_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, nsim_setup_tc_block_cb, ns);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 static int nsim_set_vf_mac(struct net_device *dev, int vf, u8 *mac)
 {
 	struct netdevsim *ns = netdev_priv(dev);
@@ -226,9 +206,13 @@ static int nsim_set_vf_link_state(struct net_device *dev, int vf, int state)
 static int
 nsim_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data)
 {
+	struct netdevsim *ns = netdev_priv(dev);
+
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return nsim_setup_tc_block(dev, type_data);
+		return flow_block_cb_setup_simple(type_data, NULL,
+						  nsim_setup_tc_block_cb,
+						  ns, ns, true);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index a09e256d2b27..2430e4907fe9 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -3,6 +3,7 @@
 
 #include <linux/kernel.h>
 #include <net/flow_dissector.h>
+#include <net/sch_generic.h>
 
 struct flow_match {
 	struct flow_dissector	*dissector;
@@ -237,4 +238,30 @@ static inline void flow_stats_update(struct flow_stats *flow_stats,
 	flow_stats->lastused	= max_t(u64, flow_stats->lastused, lastused);
 }
 
+enum flow_block_command {
+	TC_BLOCK_BIND,
+	TC_BLOCK_UNBIND,
+};
+
+enum flow_block_binder_type {
+	TCF_BLOCK_BINDER_TYPE_UNSPEC,
+	TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
+	TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS,
+};
+
+struct tcf_block;
+struct netlink_ext_ack;
+
+struct flow_block_offload {
+	enum flow_block_command command;
+	enum flow_block_binder_type binder_type;
+	struct tcf_block *block;
+	struct list_head *driver_block_list;
+	struct netlink_ext_ack *extack;
+};
+
+int flow_block_cb_setup_simple(struct flow_block_offload *f,
+			       struct list_head *driver_list, tc_setup_cb_t *cb,
+			       void *cb_ident, void *cb_priv, bool ingress_only);
+
 #endif /* _NET_FLOW_OFFLOAD_H */
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 1a7596ba0dbe..b6c306fa9541 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -26,11 +26,9 @@ struct tcf_walker {
 int register_tcf_proto_ops(struct tcf_proto_ops *ops);
 int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
 
-enum tcf_block_binder_type {
-	TCF_BLOCK_BINDER_TYPE_UNSPEC,
-	TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
-	TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS,
-};
+#define tc_block_offload flow_block_offload
+#define tc_block_command flow_block_command
+#define tcf_block_binder_type flow_block_binder_type
 
 struct tcf_block_ext_info {
 	enum tcf_block_binder_type binder_type;
@@ -610,18 +608,6 @@ int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
 		     void *type_data, bool err_stop);
 unsigned int tcf_exts_num_actions(struct tcf_exts *exts);
 
-enum tc_block_command {
-	TC_BLOCK_BIND,
-	TC_BLOCK_UNBIND,
-};
-
-struct tc_block_offload {
-	enum tc_block_command command;
-	enum tcf_block_binder_type binder_type;
-	struct tcf_block *block;
-	struct netlink_ext_ack *extack;
-};
-
 struct tc_cls_common_offload {
 	u32 chain_index;
 	__be16 protocol;
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index f52fe0bc4017..e31c0fdb6b01 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -2,6 +2,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <net/flow_offload.h>
+#include <net/pkt_cls.h>
 
 struct flow_rule *flow_rule_alloc(unsigned int num_actions)
 {
@@ -164,3 +165,27 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule,
 	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_OPTS, out);
 }
 EXPORT_SYMBOL(flow_rule_match_enc_opts);
+
+int flow_block_cb_setup_simple(struct flow_block_offload *f,
+			       struct list_head *driver_block_list,
+			       tc_setup_cb_t *cb, void *cb_ident, void *cb_priv,
+			       bool ingress_only)
+{
+	if (ingress_only &&
+	    f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+		return -EOPNOTSUPP;
+
+	f->driver_block_list = driver_block_list;
+
+	switch (f->command) {
+	case TC_BLOCK_BIND:
+		return tcf_block_cb_register(f->block, cb, cb_ident, cb_priv,
+					     f->extack);
+	case TC_BLOCK_UNBIND:
+		tcf_block_cb_unregister(f->block, cb, cb_ident);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL(flow_block_cb_setup_simple);
-- 
cgit v1.2.3


From 9c0e189ec988f306331036bc3f71085582b24fdc Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jul 2019 22:55:40 +0200
Subject: net: flow_offload: rename TC_BLOCK_{UN}BIND to FLOW_BLOCK_{UN}BIND

Rename from TC_BLOCK_{UN}BIND to FLOW_BLOCK_{UN}BIND and remove
temporary tc_block_command alias.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  4 ++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     |  4 ++--
 drivers/net/ethernet/mscc/ocelot_tc.c              |  4 ++--
 .../net/ethernet/netronome/nfp/flower/offload.c    |  8 ++++----
 include/net/flow_offload.h                         |  4 ++--
 include/net/pkt_cls.h                              |  1 -
 net/core/flow_offload.c                            |  4 ++--
 net/dsa/slave.c                                    |  4 ++--
 net/sched/cls_api.c                                | 22 +++++++++++-----------
 9 files changed, 27 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index a211cdb5eb8b..853aff64ef4b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -705,7 +705,7 @@ mlx5e_rep_indr_setup_tc_block(struct net_device *netdev,
 		return -EOPNOTSUPP;
 
 	switch (f->command) {
-	case TC_BLOCK_BIND:
+	case FLOW_BLOCK_BIND:
 		indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
 		if (indr_priv)
 			return -EEXIST;
@@ -728,7 +728,7 @@ mlx5e_rep_indr_setup_tc_block(struct net_device *netdev,
 		}
 
 		return err;
-	case TC_BLOCK_UNBIND:
+	case FLOW_BLOCK_UNBIND:
 		indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
 		if (!indr_priv)
 			return -ENOENT;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index ce285fbeebd3..9cf61a9d8291 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1679,7 +1679,7 @@ static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port,
 	}
 
 	switch (f->command) {
-	case TC_BLOCK_BIND:
+	case FLOW_BLOCK_BIND:
 		err = tcf_block_cb_register(f->block, cb, mlxsw_sp_port,
 					    mlxsw_sp_port, f->extack);
 		if (err)
@@ -1692,7 +1692,7 @@ static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port,
 			return err;
 		}
 		return 0;
-	case TC_BLOCK_UNBIND:
+	case FLOW_BLOCK_UNBIND:
 		mlxsw_sp_setup_tc_block_flower_unbind(mlxsw_sp_port,
 						      f->block, ingress);
 		tcf_block_cb_unregister(f->block, cb, mlxsw_sp_port);
diff --git a/drivers/net/ethernet/mscc/ocelot_tc.c b/drivers/net/ethernet/mscc/ocelot_tc.c
index 72084306240d..c84942ef8e7b 100644
--- a/drivers/net/ethernet/mscc/ocelot_tc.c
+++ b/drivers/net/ethernet/mscc/ocelot_tc.c
@@ -147,14 +147,14 @@ static int ocelot_setup_tc_block(struct ocelot_port *port,
 	}
 
 	switch (f->command) {
-	case TC_BLOCK_BIND:
+	case FLOW_BLOCK_BIND:
 		ret = tcf_block_cb_register(f->block, cb, port,
 					    port, f->extack);
 		if (ret)
 			return ret;
 
 		return ocelot_setup_tc_block_flower_bind(port, f);
-	case TC_BLOCK_UNBIND:
+	case FLOW_BLOCK_UNBIND:
 		ocelot_setup_tc_block_flower_unbind(port, f);
 		tcf_block_cb_unregister(f->block, cb, port);
 		return 0;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index 6dbe947269c3..7c94f5142076 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -1315,11 +1315,11 @@ static int nfp_flower_setup_tc_block(struct net_device *netdev,
 	repr_priv->block_shared = tcf_block_shared(f->block);
 
 	switch (f->command) {
-	case TC_BLOCK_BIND:
+	case FLOW_BLOCK_BIND:
 		return tcf_block_cb_register(f->block,
 					     nfp_flower_setup_tc_block_cb,
 					     repr, repr, f->extack);
-	case TC_BLOCK_UNBIND:
+	case FLOW_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block,
 					nfp_flower_setup_tc_block_cb,
 					repr);
@@ -1395,7 +1395,7 @@ nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app,
 		return -EOPNOTSUPP;
 
 	switch (f->command) {
-	case TC_BLOCK_BIND:
+	case FLOW_BLOCK_BIND:
 		cb_priv = kmalloc(sizeof(*cb_priv), GFP_KERNEL);
 		if (!cb_priv)
 			return -ENOMEM;
@@ -1413,7 +1413,7 @@ nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app,
 		}
 
 		return err;
-	case TC_BLOCK_UNBIND:
+	case FLOW_BLOCK_UNBIND:
 		cb_priv = nfp_flower_indr_block_cb_priv_lookup(app, netdev);
 		if (!cb_priv)
 			return -ENOENT;
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 2430e4907fe9..7c9f7a2ac7ce 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -239,8 +239,8 @@ static inline void flow_stats_update(struct flow_stats *flow_stats,
 }
 
 enum flow_block_command {
-	TC_BLOCK_BIND,
-	TC_BLOCK_UNBIND,
+	FLOW_BLOCK_BIND,
+	FLOW_BLOCK_UNBIND,
 };
 
 enum flow_block_binder_type {
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index b6c306fa9541..1a96f469164f 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -27,7 +27,6 @@ int register_tcf_proto_ops(struct tcf_proto_ops *ops);
 int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
 
 #define tc_block_offload flow_block_offload
-#define tc_block_command flow_block_command
 #define tcf_block_binder_type flow_block_binder_type
 
 struct tcf_block_ext_info {
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index e31c0fdb6b01..593e73f7593a 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -178,10 +178,10 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
 	f->driver_block_list = driver_block_list;
 
 	switch (f->command) {
-	case TC_BLOCK_BIND:
+	case FLOW_BLOCK_BIND:
 		return tcf_block_cb_register(f->block, cb, cb_ident, cb_priv,
 					     f->extack);
-	case TC_BLOCK_UNBIND:
+	case FLOW_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, cb, cb_ident);
 		return 0;
 	default:
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 99673f6b07f6..58a71ee0747a 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -955,9 +955,9 @@ static int dsa_slave_setup_tc_block(struct net_device *dev,
 		return -EOPNOTSUPP;
 
 	switch (f->command) {
-	case TC_BLOCK_BIND:
+	case FLOW_BLOCK_BIND:
 		return tcf_block_cb_register(f->block, cb, dev, dev, f->extack);
-	case TC_BLOCK_UNBIND:
+	case FLOW_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, cb, dev);
 		return 0;
 	default:
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 4a7331ce830d..ed6f35cc11ea 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -675,7 +675,7 @@ static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb)
 
 static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
 				  struct tc_indr_block_cb *indr_block_cb,
-				  enum tc_block_command command)
+				  enum flow_block_command command)
 {
 	struct tc_block_offload bo = {
 		.command	= command,
@@ -706,7 +706,7 @@ int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
 	if (err)
 		goto err_dev_put;
 
-	tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_BIND);
+	tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_BIND);
 	return 0;
 
 err_dev_put:
@@ -743,7 +743,7 @@ void __tc_indr_block_cb_unregister(struct net_device *dev,
 		return;
 
 	/* Send unbind message if required to free any block cbs. */
-	tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_UNBIND);
+	tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_UNBIND);
 	tc_indr_block_cb_del(indr_block_cb);
 	tc_indr_block_dev_put(indr_dev);
 }
@@ -760,7 +760,7 @@ EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister);
 
 static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
 			       struct tcf_block_ext_info *ei,
-			       enum tc_block_command command,
+			       enum flow_block_command command,
 			       struct netlink_ext_ack *extack)
 {
 	struct tc_indr_block_cb *indr_block_cb;
@@ -776,7 +776,7 @@ static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
 	if (!indr_dev)
 		return;
 
-	indr_dev->block = command == TC_BLOCK_BIND ? block : NULL;
+	indr_dev->block = command == FLOW_BLOCK_BIND ? block : NULL;
 
 	list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
 		indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
@@ -791,7 +791,7 @@ static bool tcf_block_offload_in_use(struct tcf_block *block)
 static int tcf_block_offload_cmd(struct tcf_block *block,
 				 struct net_device *dev,
 				 struct tcf_block_ext_info *ei,
-				 enum tc_block_command command,
+				 enum flow_block_command command,
 				 struct netlink_ext_ack *extack)
 {
 	struct tc_block_offload bo = {};
@@ -821,20 +821,20 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
 		return -EOPNOTSUPP;
 	}
 
-	err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
+	err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_BIND, extack);
 	if (err == -EOPNOTSUPP)
 		goto no_offload_dev_inc;
 	if (err)
 		return err;
 
-	tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
+	tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
 	return 0;
 
 no_offload_dev_inc:
 	if (tcf_block_offload_in_use(block))
 		return -EOPNOTSUPP;
 	block->nooffloaddevcnt++;
-	tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
+	tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
 	return 0;
 }
 
@@ -844,11 +844,11 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
 	struct net_device *dev = q->dev_queue->dev;
 	int err;
 
-	tc_indr_block_call(block, dev, ei, TC_BLOCK_UNBIND, NULL);
+	tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
 
 	if (!dev->netdev_ops->ndo_setup_tc)
 		goto no_offload_dev_dec;
-	err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
+	err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
 	if (err == -EOPNOTSUPP)
 		goto no_offload_dev_dec;
 	return;
-- 
cgit v1.2.3


From 32f8c4093ac353a5f1b36cfed0ce0138faf8e15f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jul 2019 22:55:41 +0200
Subject: net: flow_offload: rename TCF_BLOCK_BINDER_TYPE_* to
 FLOW_BLOCK_BINDER_TYPE_*

Rename from TCF_BLOCK_BINDER_TYPE_* to FLOW_BLOCK_BINDER_TYPE_* and
remove temporary tcf_block_binder_type alias.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c    |  2 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c      |  4 ++--
 drivers/net/ethernet/mscc/ocelot_flower.c           |  2 +-
 drivers/net/ethernet/mscc/ocelot_tc.c               |  4 ++--
 drivers/net/ethernet/netronome/nfp/flower/offload.c |  6 +++---
 include/net/flow_offload.h                          |  6 +++---
 include/net/pkt_cls.h                               |  3 +--
 net/core/flow_offload.c                             |  2 +-
 net/dsa/slave.c                                     |  4 ++--
 net/sched/cls_api.c                                 | 14 +++++++-------
 net/sched/sch_ingress.c                             |  6 +++---
 11 files changed, 26 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 853aff64ef4b..f2ad1ca7ed2a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -701,7 +701,7 @@ mlx5e_rep_indr_setup_tc_block(struct net_device *netdev,
 	struct mlx5e_rep_indr_block_priv *indr_priv;
 	int err = 0;
 
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
 		return -EOPNOTSUPP;
 
 	switch (f->command) {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 9cf61a9d8291..a178d082f061 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1668,10 +1668,10 @@ static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port,
 	bool ingress;
 	int err;
 
-	if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) {
+	if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) {
 		cb = mlxsw_sp_setup_tc_block_cb_matchall_ig;
 		ingress = true;
-	} else if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS) {
+	} else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) {
 		cb = mlxsw_sp_setup_tc_block_cb_matchall_eg;
 		ingress = false;
 	} else {
diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c
index 8778dee5a471..b682f08a93b4 100644
--- a/drivers/net/ethernet/mscc/ocelot_flower.c
+++ b/drivers/net/ethernet/mscc/ocelot_flower.c
@@ -306,7 +306,7 @@ int ocelot_setup_tc_block_flower_bind(struct ocelot_port *port,
 	struct tcf_block_cb *block_cb;
 	int ret;
 
-	if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+	if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
 		return -EOPNOTSUPP;
 
 	block_cb = tcf_block_cb_lookup(f->block,
diff --git a/drivers/net/ethernet/mscc/ocelot_tc.c b/drivers/net/ethernet/mscc/ocelot_tc.c
index c84942ef8e7b..58a0b5f8850c 100644
--- a/drivers/net/ethernet/mscc/ocelot_tc.c
+++ b/drivers/net/ethernet/mscc/ocelot_tc.c
@@ -137,10 +137,10 @@ static int ocelot_setup_tc_block(struct ocelot_port *port,
 	netdev_dbg(port->dev, "tc_block command %d, binder_type %d\n",
 		   f->command, f->binder_type);
 
-	if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) {
+	if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) {
 		cb = ocelot_setup_tc_block_cb_ig;
 		port->tc.block_shared = tcf_block_shared(f->block);
-	} else if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS) {
+	} else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) {
 		cb = ocelot_setup_tc_block_cb_eg;
 	} else {
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index 7c94f5142076..46041e509150 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -1308,7 +1308,7 @@ static int nfp_flower_setup_tc_block(struct net_device *netdev,
 	struct nfp_repr *repr = netdev_priv(netdev);
 	struct nfp_flower_repr_priv *repr_priv;
 
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
 		return -EOPNOTSUPP;
 
 	repr_priv = repr->app_priv;
@@ -1389,8 +1389,8 @@ nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app,
 	struct nfp_flower_priv *priv = app->priv;
 	int err;
 
-	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
-	    !(f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS &&
+	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
+	    !(f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS &&
 	      nfp_flower_internal_port_can_offload(app, netdev)))
 		return -EOPNOTSUPP;
 
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 7c9f7a2ac7ce..f12b905ad95e 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -244,9 +244,9 @@ enum flow_block_command {
 };
 
 enum flow_block_binder_type {
-	TCF_BLOCK_BINDER_TYPE_UNSPEC,
-	TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
-	TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS,
+	FLOW_BLOCK_BINDER_TYPE_UNSPEC,
+	FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
+	FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS,
 };
 
 struct tcf_block;
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 1a96f469164f..e4499526fde8 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -27,10 +27,9 @@ int register_tcf_proto_ops(struct tcf_proto_ops *ops);
 int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
 
 #define tc_block_offload flow_block_offload
-#define tcf_block_binder_type flow_block_binder_type
 
 struct tcf_block_ext_info {
-	enum tcf_block_binder_type binder_type;
+	enum flow_block_binder_type binder_type;
 	tcf_chain_head_change_t *chain_head_change;
 	void *chain_head_change_priv;
 	u32 block_index;
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 593e73f7593a..6d8187e8effc 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -172,7 +172,7 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       bool ingress_only)
 {
 	if (ingress_only &&
-	    f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+	    f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
 		return -EOPNOTSUPP;
 
 	f->driver_block_list = driver_block_list;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 58a71ee0747a..9b5e202c255e 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -947,9 +947,9 @@ static int dsa_slave_setup_tc_block(struct net_device *dev,
 {
 	tc_setup_cb_t *cb;
 
-	if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+	if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
 		cb = dsa_slave_setup_tc_block_cb_ig;
-	else if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+	else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
 		cb = dsa_slave_setup_tc_block_cb_eg;
 	else
 		return -EOPNOTSUPP;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ed6f35cc11ea..49b89c89a8b9 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -679,7 +679,7 @@ static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
 {
 	struct tc_block_offload bo = {
 		.command	= command,
-		.binder_type	= TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
+		.binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
 		.block		= indr_dev->block,
 	};
 
@@ -1341,17 +1341,17 @@ static void tcf_block_release(struct Qdisc *q, struct tcf_block *block,
 struct tcf_block_owner_item {
 	struct list_head list;
 	struct Qdisc *q;
-	enum tcf_block_binder_type binder_type;
+	enum flow_block_binder_type binder_type;
 };
 
 static void
 tcf_block_owner_netif_keep_dst(struct tcf_block *block,
 			       struct Qdisc *q,
-			       enum tcf_block_binder_type binder_type)
+			       enum flow_block_binder_type binder_type)
 {
 	if (block->keep_dst &&
-	    binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
-	    binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+	    binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
+	    binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
 		netif_keep_dst(qdisc_dev(q));
 }
 
@@ -1368,7 +1368,7 @@ EXPORT_SYMBOL(tcf_block_netif_keep_dst);
 
 static int tcf_block_owner_add(struct tcf_block *block,
 			       struct Qdisc *q,
-			       enum tcf_block_binder_type binder_type)
+			       enum flow_block_binder_type binder_type)
 {
 	struct tcf_block_owner_item *item;
 
@@ -1383,7 +1383,7 @@ static int tcf_block_owner_add(struct tcf_block *block,
 
 static void tcf_block_owner_del(struct tcf_block *block,
 				struct Qdisc *q,
-				enum tcf_block_binder_type binder_type)
+				enum flow_block_binder_type binder_type)
 {
 	struct tcf_block_owner_item *item;
 
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 599730f804d7..bf56aa519797 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -83,7 +83,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
 
 	mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress);
 
-	q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+	q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
 	q->block_info.chain_head_change = clsact_chain_head_change;
 	q->block_info.chain_head_change_priv = &q->miniqp;
 
@@ -217,7 +217,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
 
 	mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress);
 
-	q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+	q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
 	q->ingress_block_info.chain_head_change = clsact_chain_head_change;
 	q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress;
 
@@ -228,7 +228,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
 
 	mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress);
 
-	q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
+	q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
 	q->egress_block_info.chain_head_change = clsact_chain_head_change;
 	q->egress_block_info.chain_head_change_priv = &q->miniqp_egress;
 
-- 
cgit v1.2.3


From d63db30c8537ba45208c156d71125db73d0fe522 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jul 2019 22:55:42 +0200
Subject: net: flow_offload: add flow_block_cb_alloc() and flow_block_cb_free()

Add new helper functions to allocate and free flow_block_cb objects.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 14 ++++++++++++++
 net/core/flow_offload.c    | 28 ++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index f12b905ad95e..45d74cb542cd 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -260,6 +260,20 @@ struct flow_block_offload {
 	struct netlink_ext_ack *extack;
 };
 
+struct flow_block_cb {
+	struct list_head	list;
+	tc_setup_cb_t		*cb;
+	void			*cb_ident;
+	void			*cb_priv;
+	void			(*release)(void *cb_priv);
+	unsigned int		refcnt;
+};
+
+struct flow_block_cb *flow_block_cb_alloc(struct net *net, tc_setup_cb_t *cb,
+					  void *cb_ident, void *cb_priv,
+					  void (*release)(void *cb_priv));
+void flow_block_cb_free(struct flow_block_cb *block_cb);
+
 int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       struct list_head *driver_list, tc_setup_cb_t *cb,
 			       void *cb_ident, void *cb_priv, bool ingress_only);
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 6d8187e8effc..d08148cb6953 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -166,6 +166,34 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule,
 }
 EXPORT_SYMBOL(flow_rule_match_enc_opts);
 
+struct flow_block_cb *flow_block_cb_alloc(struct net *net, tc_setup_cb_t *cb,
+					  void *cb_ident, void *cb_priv,
+					  void (*release)(void *cb_priv))
+{
+	struct flow_block_cb *block_cb;
+
+	block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
+	if (!block_cb)
+		return ERR_PTR(-ENOMEM);
+
+	block_cb->cb = cb;
+	block_cb->cb_ident = cb_ident;
+	block_cb->cb_priv = cb_priv;
+	block_cb->release = release;
+
+	return block_cb;
+}
+EXPORT_SYMBOL(flow_block_cb_alloc);
+
+void flow_block_cb_free(struct flow_block_cb *block_cb)
+{
+	if (block_cb->release)
+		block_cb->release(block_cb->cb_priv);
+
+	kfree(block_cb);
+}
+EXPORT_SYMBOL(flow_block_cb_free);
+
 int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       struct list_head *driver_block_list,
 			       tc_setup_cb_t *cb, void *cb_ident, void *cb_priv,
-- 
cgit v1.2.3


From da3eeb904ff432ec22cf7b4db17a47647428873a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jul 2019 22:55:43 +0200
Subject: net: flow_offload: add list handling functions

This patch adds the list handling functions for the flow block API:

* flow_block_cb_lookup() allows drivers to look up existing flow blocks.
* flow_block_cb_add() adds a flow block to the per-driver list to be registered
  by the core.
* flow_block_cb_remove() removes a flow block from the per-driver list of
  existing flow blocks and requests the core to unregister it.

The flow block API also annotates the netns this flow block belongs to.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 19 +++++++++++++++++++
 net/core/flow_offload.c    | 17 +++++++++++++++++
 net/sched/cls_api.c        |  3 +++
 3 files changed, 39 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 45d74cb542cd..563d7dc7afc1 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -256,12 +256,16 @@ struct flow_block_offload {
 	enum flow_block_command command;
 	enum flow_block_binder_type binder_type;
 	struct tcf_block *block;
+	struct net *net;
+	struct list_head cb_list;
 	struct list_head *driver_block_list;
 	struct netlink_ext_ack *extack;
 };
 
 struct flow_block_cb {
+	struct list_head	driver_list;
 	struct list_head	list;
+	struct net		*net;
 	tc_setup_cb_t		*cb;
 	void			*cb_ident;
 	void			*cb_priv;
@@ -274,6 +278,21 @@ struct flow_block_cb *flow_block_cb_alloc(struct net *net, tc_setup_cb_t *cb,
 					  void (*release)(void *cb_priv));
 void flow_block_cb_free(struct flow_block_cb *block_cb);
 
+struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *offload,
+					   tc_setup_cb_t *cb, void *cb_ident);
+
+static inline void flow_block_cb_add(struct flow_block_cb *block_cb,
+				     struct flow_block_offload *offload)
+{
+	list_add_tail(&block_cb->list, &offload->cb_list);
+}
+
+static inline void flow_block_cb_remove(struct flow_block_cb *block_cb,
+					struct flow_block_offload *offload)
+{
+	list_move(&block_cb->list, &offload->cb_list);
+}
+
 int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       struct list_head *driver_list, tc_setup_cb_t *cb,
 			       void *cb_ident, void *cb_priv, bool ingress_only);
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index d08148cb6953..c81a7e0c5e04 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -176,6 +176,7 @@ struct flow_block_cb *flow_block_cb_alloc(struct net *net, tc_setup_cb_t *cb,
 	if (!block_cb)
 		return ERR_PTR(-ENOMEM);
 
+	block_cb->net = net;
 	block_cb->cb = cb;
 	block_cb->cb_ident = cb_ident;
 	block_cb->cb_priv = cb_priv;
@@ -194,6 +195,22 @@ void flow_block_cb_free(struct flow_block_cb *block_cb)
 }
 EXPORT_SYMBOL(flow_block_cb_free);
 
+struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *f,
+					   tc_setup_cb_t *cb, void *cb_ident)
+{
+	struct flow_block_cb *block_cb;
+
+	list_for_each_entry(block_cb, f->driver_block_list, driver_list) {
+		if (block_cb->net == f->net &&
+		    block_cb->cb == cb &&
+		    block_cb->cb_ident == cb_ident)
+			return block_cb;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(flow_block_cb_lookup);
+
 int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       struct list_head *driver_block_list,
 			       tc_setup_cb_t *cb, void *cb_ident, void *cb_priv,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 49b89c89a8b9..ccbd51bed88c 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -680,6 +680,7 @@ static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
 	struct tc_block_offload bo = {
 		.command	= command,
 		.binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
+		.net		= dev_net(indr_dev->dev),
 		.block		= indr_dev->block,
 	};
 
@@ -768,6 +769,7 @@ static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
 	struct tc_block_offload bo = {
 		.command	= command,
 		.binder_type	= ei->binder_type,
+		.net		= dev_net(dev),
 		.block		= block,
 		.extack		= extack,
 	};
@@ -796,6 +798,7 @@ static int tcf_block_offload_cmd(struct tcf_block *block,
 {
 	struct tc_block_offload bo = {};
 
+	bo.net = dev_net(dev);
 	bo.command = command;
 	bo.binder_type = ei->binder_type;
 	bo.block = block;
-- 
cgit v1.2.3


From 67bd0d5ea7974d9dc9c502c7b4096e16a80a553d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jul 2019 22:55:44 +0200
Subject: net: flow_offload: add flow_block_cb_{priv, incref, decref}()

This patch completes the flow block API by introducing:

* flow_block_cb_priv() to access callback private data.
* flow_block_cb_incref() to bump reference counter on this flow block.
* flow_block_cb_decref() to decrement the reference counter.

These functions are taken from the existing tcf_block_cb_priv(),
tcf_block_cb_incref() and tcf_block_cb_decref().

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h |  4 ++++
 net/core/flow_offload.c    | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 563d7dc7afc1..3fb9cc4da63e 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -281,6 +281,10 @@ void flow_block_cb_free(struct flow_block_cb *block_cb);
 struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *offload,
 					   tc_setup_cb_t *cb, void *cb_ident);
 
+void *flow_block_cb_priv(struct flow_block_cb *block_cb);
+void flow_block_cb_incref(struct flow_block_cb *block_cb);
+unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb);
+
 static inline void flow_block_cb_add(struct flow_block_cb *block_cb,
 				     struct flow_block_offload *offload)
 {
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index c81a7e0c5e04..a36a9dc1c6df 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -211,6 +211,24 @@ struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *f,
 }
 EXPORT_SYMBOL(flow_block_cb_lookup);
 
+void *flow_block_cb_priv(struct flow_block_cb *block_cb)
+{
+	return block_cb->cb_priv;
+}
+EXPORT_SYMBOL(flow_block_cb_priv);
+
+void flow_block_cb_incref(struct flow_block_cb *block_cb)
+{
+	block_cb->refcnt++;
+}
+EXPORT_SYMBOL(flow_block_cb_incref);
+
+unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb)
+{
+	return --block_cb->refcnt;
+}
+EXPORT_SYMBOL(flow_block_cb_decref);
+
 int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       struct list_head *driver_block_list,
 			       tc_setup_cb_t *cb, void *cb_ident, void *cb_priv,
-- 
cgit v1.2.3


From 955bcb6ea0df0d9ace89ac475405f1295ced5962 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jul 2019 22:55:46 +0200
Subject: drivers: net: use flow block API

This patch updates flow_block_cb_setup_simple() to use the flow block API.
Several drivers are also adjusted to use it.

This patch introduces the per-driver list of flow blocks to account for
blocks that are already in use.

Remove tc_block_offload alias.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c          |  5 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c      |  5 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    |  5 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c        |  5 +-
 drivers/net/ethernet/intel/iavf/iavf_main.c        |  5 +-
 drivers/net/ethernet/intel/igb/igb_main.c          |  5 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c      |  5 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  5 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   | 42 +++++++---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     | 92 ++++++++++++++--------
 drivers/net/ethernet/mscc/ocelot_ace.h             |  4 +-
 drivers/net/ethernet/mscc/ocelot_flower.c          | 46 ++++++-----
 drivers/net/ethernet/mscc/ocelot_tc.c              | 34 +++++---
 drivers/net/ethernet/netronome/nfp/abm/cls.c       |  7 +-
 drivers/net/ethernet/netronome/nfp/abm/main.h      |  2 +-
 drivers/net/ethernet/netronome/nfp/bpf/main.c      |  5 +-
 .../net/ethernet/netronome/nfp/flower/offload.c    | 68 +++++++++++-----
 drivers/net/ethernet/qlogic/qede/qede_main.c       |  5 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  5 +-
 drivers/net/netdevsim/netdev.c                     |  5 +-
 include/net/flow_offload.h                         |  3 +-
 include/net/pkt_cls.h                              |  2 -
 net/core/flow_offload.c                            | 20 ++++-
 net/dsa/slave.c                                    | 22 +++++-
 net/sched/cls_api.c                                | 14 ++--
 25 files changed, 286 insertions(+), 130 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 06819590f6d0..3f632028eff0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -9907,6 +9907,8 @@ static int bnxt_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
+static LIST_HEAD(bnxt_block_cb_list);
+
 static int bnxt_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			 void *type_data)
 {
@@ -9914,7 +9916,8 @@ static int bnxt_setup_tc(struct net_device *dev, enum tc_setup_type type,
 
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return flow_block_cb_setup_simple(type_data, NULL,
+		return flow_block_cb_setup_simple(type_data,
+						  &bnxt_block_cb_list,
 						  bnxt_setup_tc_block_cb,
 						  bp, bp, true);
 	case TC_SETUP_QDISC_MQPRIO: {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
index 89398ff011d4..f9bf7d7250ab 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
@@ -161,6 +161,8 @@ static int bnxt_vf_rep_setup_tc_block_cb(enum tc_setup_type type,
 	}
 }
 
+static LIST_HEAD(bnxt_vf_block_cb_list);
+
 static int bnxt_vf_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
 				void *type_data)
 {
@@ -168,7 +170,8 @@ static int bnxt_vf_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
 
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return flow_block_cb_setup_simple(type_data, NULL,
+		return flow_block_cb_setup_simple(type_data,
+						  &bnxt_vf_block_cb_list,
 						  bnxt_vf_rep_setup_tc_block_cb,
 						  vf_rep, vf_rep, true);
 	default:
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 9a486282a32e..fdc8ca4f8891 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3190,6 +3190,8 @@ static int cxgb_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
+static LIST_HEAD(cxgb_block_cb_list);
+
 static int cxgb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			 void *type_data)
 {
@@ -3197,7 +3199,8 @@ static int cxgb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return flow_block_cb_setup_simple(type_data, NULL,
+		return flow_block_cb_setup_simple(type_data,
+						  &cxgb_block_cb_list,
 						  cxgb_setup_tc_block_cb,
 						  pi, dev, true);
 	default:
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 52f0f14d4207..7be1080680f5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -8177,6 +8177,8 @@ static int i40e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
+static LIST_HEAD(i40e_block_cb_list);
+
 static int __i40e_setup_tc(struct net_device *netdev, enum tc_setup_type type,
 			   void *type_data)
 {
@@ -8186,7 +8188,8 @@ static int __i40e_setup_tc(struct net_device *netdev, enum tc_setup_type type,
 	case TC_SETUP_QDISC_MQPRIO:
 		return i40e_setup_tc(netdev, type_data);
 	case TC_SETUP_BLOCK:
-		return flow_block_cb_setup_simple(type_data, NULL,
+		return flow_block_cb_setup_simple(type_data,
+						  &i40e_block_cb_list,
 						  i40e_setup_tc_block_cb,
 						  np, np, true);
 	default:
diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index fd0e2bcc75e5..05eca6f2e890 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -3113,6 +3113,8 @@ static int iavf_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
+static LIST_HEAD(iavf_block_cb_list);
+
 /**
  * iavf_setup_tc - configure multiple traffic classes
  * @netdev: network interface device structure
@@ -3133,7 +3135,8 @@ static int iavf_setup_tc(struct net_device *netdev, enum tc_setup_type type,
 	case TC_SETUP_QDISC_MQPRIO:
 		return __iavf_setup_tc(netdev, type_data);
 	case TC_SETUP_BLOCK:
-		return flow_block_cb_setup_simple(type_data, NULL,
+		return flow_block_cb_setup_simple(type_data,
+						  &iavf_block_cb_list,
 						  iavf_setup_tc_block_cb,
 						  adapter, adapter, true);
 	default:
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 836f9e1a136c..00e8186e2c59 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2806,6 +2806,8 @@ static int igb_offload_txtime(struct igb_adapter *adapter,
 	return 0;
 }
 
+static LIST_HEAD(igb_block_cb_list);
+
 static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			void *type_data)
 {
@@ -2815,7 +2817,8 @@ static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 	case TC_SETUP_QDISC_CBS:
 		return igb_offload_cbs(adapter, type_data);
 	case TC_SETUP_BLOCK:
-		return flow_block_cb_setup_simple(type_data, NULL,
+		return flow_block_cb_setup_simple(type_data,
+						  &igb_block_cb_list,
 						  igb_setup_tc_block_cb,
 						  adapter, adapter, true);
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index b098f5be9c0d..cbaf712d6529 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9614,6 +9614,8 @@ static int ixgbe_setup_tc_mqprio(struct net_device *dev,
 	return ixgbe_setup_tc(dev, mqprio->num_tc);
 }
 
+static LIST_HEAD(ixgbe_block_cb_list);
+
 static int __ixgbe_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			    void *type_data)
 {
@@ -9621,7 +9623,8 @@ static int __ixgbe_setup_tc(struct net_device *dev, enum tc_setup_type type,
 
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return flow_block_cb_setup_simple(type_data, NULL,
+		return flow_block_cb_setup_simple(type_data,
+						  &ixgbe_block_cb_list,
 						  ixgbe_setup_tc_block_cb,
 						  adapter, adapter, true);
 	case TC_SETUP_QDISC_MQPRIO:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 8e5ebdb7c459..4c138789c547 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3459,6 +3459,8 @@ static int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 }
 #endif
 
+static LIST_HEAD(mlx5e_block_cb_list);
+
 static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			  void *type_data)
 {
@@ -3467,7 +3469,8 @@ static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
 	switch (type) {
 #ifdef CONFIG_MLX5_ESWITCH
 	case TC_SETUP_BLOCK:
-		return flow_block_cb_setup_simple(type_data, NULL,
+		return flow_block_cb_setup_simple(type_data,
+						  &mlx5e_block_cb_list,
 						  mlx5e_setup_tc_block_cb,
 						  priv, priv, true);
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index f2ad1ca7ed2a..7ca6b6472017 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -693,17 +693,29 @@ static int mlx5e_rep_indr_setup_block_cb(enum tc_setup_type type,
 	}
 }
 
+static void mlx5e_rep_indr_tc_block_unbind(void *cb_priv)
+{
+	struct mlx5e_rep_indr_block_priv *indr_priv = cb_priv;
+
+	list_del(&indr_priv->list);
+	kfree(indr_priv);
+}
+
+static LIST_HEAD(mlx5e_block_cb_list);
+
 static int
 mlx5e_rep_indr_setup_tc_block(struct net_device *netdev,
 			      struct mlx5e_rep_priv *rpriv,
-			      struct tc_block_offload *f)
+			      struct flow_block_offload *f)
 {
 	struct mlx5e_rep_indr_block_priv *indr_priv;
-	int err = 0;
+	struct flow_block_cb *block_cb;
 
 	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
 		return -EOPNOTSUPP;
 
+	f->driver_block_list = &mlx5e_block_cb_list;
+
 	switch (f->command) {
 	case FLOW_BLOCK_BIND:
 		indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
@@ -719,26 +731,32 @@ mlx5e_rep_indr_setup_tc_block(struct net_device *netdev,
 		list_add(&indr_priv->list,
 			 &rpriv->uplink_priv.tc_indr_block_priv_list);
 
-		err = tcf_block_cb_register(f->block,
-					    mlx5e_rep_indr_setup_block_cb,
-					    indr_priv, indr_priv, f->extack);
-		if (err) {
+		block_cb = flow_block_cb_alloc(f->net,
+					       mlx5e_rep_indr_setup_block_cb,
+					       indr_priv, indr_priv,
+					       mlx5e_rep_indr_tc_block_unbind);
+		if (IS_ERR(block_cb)) {
 			list_del(&indr_priv->list);
 			kfree(indr_priv);
+			return PTR_ERR(block_cb);
 		}
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, &mlx5e_block_cb_list);
 
-		return err;
+		return 0;
 	case FLOW_BLOCK_UNBIND:
 		indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
 		if (!indr_priv)
 			return -ENOENT;
 
-		tcf_block_cb_unregister(f->block,
-					mlx5e_rep_indr_setup_block_cb,
-					indr_priv);
-		list_del(&indr_priv->list);
-		kfree(indr_priv);
+		block_cb = flow_block_cb_lookup(f,
+						mlx5e_rep_indr_setup_block_cb,
+						indr_priv);
+		if (!block_cb)
+			return -ENOENT;
 
+		flow_block_cb_remove(block_cb, f);
+		list_del(&block_cb->driver_list);
 		return 0;
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index a178d082f061..65bea6be84d6 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1585,33 +1585,45 @@ static int mlxsw_sp_setup_tc_block_cb_flower(enum tc_setup_type type,
 	}
 }
 
+static void mlxsw_sp_tc_block_flower_release(void *cb_priv)
+{
+	struct mlxsw_sp_acl_block *acl_block = cb_priv;
+
+	mlxsw_sp_acl_block_destroy(acl_block);
+}
+
+static LIST_HEAD(mlxsw_sp_block_cb_list);
+
 static int
 mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port,
-				    struct tcf_block *block, bool ingress,
-				    struct netlink_ext_ack *extack)
+			            struct flow_block_offload *f, bool ingress)
 {
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	struct mlxsw_sp_acl_block *acl_block;
-	struct tcf_block_cb *block_cb;
+	struct flow_block_cb *block_cb;
+	bool register_block = false;
 	int err;
 
-	block_cb = tcf_block_cb_lookup(block, mlxsw_sp_setup_tc_block_cb_flower,
-				       mlxsw_sp);
+	block_cb = flow_block_cb_lookup(f, mlxsw_sp_setup_tc_block_cb_flower,
+					mlxsw_sp);
 	if (!block_cb) {
-		acl_block = mlxsw_sp_acl_block_create(mlxsw_sp, block->net);
+		acl_block = mlxsw_sp_acl_block_create(mlxsw_sp, f->net);
 		if (!acl_block)
 			return -ENOMEM;
-		block_cb = __tcf_block_cb_register(block,
-						   mlxsw_sp_setup_tc_block_cb_flower,
-						   mlxsw_sp, acl_block, extack);
+		block_cb = flow_block_cb_alloc(f->net,
+					       mlxsw_sp_setup_tc_block_cb_flower,
+					       mlxsw_sp, acl_block,
+					       mlxsw_sp_tc_block_flower_release);
 		if (IS_ERR(block_cb)) {
+			mlxsw_sp_acl_block_destroy(acl_block);
 			err = PTR_ERR(block_cb);
 			goto err_cb_register;
 		}
+		register_block = true;
 	} else {
-		acl_block = tcf_block_cb_priv(block_cb);
+		acl_block = flow_block_cb_priv(block_cb);
 	}
-	tcf_block_cb_incref(block_cb);
+	flow_block_cb_incref(block_cb);
 	err = mlxsw_sp_acl_block_bind(mlxsw_sp, acl_block,
 				      mlxsw_sp_port, ingress);
 	if (err)
@@ -1622,28 +1634,31 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port,
 	else
 		mlxsw_sp_port->eg_acl_block = acl_block;
 
+	if (register_block) {
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, &mlxsw_sp_block_cb_list);
+	}
+
 	return 0;
 
 err_block_bind:
-	if (!tcf_block_cb_decref(block_cb)) {
-		__tcf_block_cb_unregister(block, block_cb);
+	if (!flow_block_cb_decref(block_cb))
+		flow_block_cb_free(block_cb);
 err_cb_register:
-		mlxsw_sp_acl_block_destroy(acl_block);
-	}
 	return err;
 }
 
 static void
 mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port,
-				      struct tcf_block *block, bool ingress)
+				      struct flow_block_offload *f, bool ingress)
 {
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	struct mlxsw_sp_acl_block *acl_block;
-	struct tcf_block_cb *block_cb;
+	struct flow_block_cb *block_cb;
 	int err;
 
-	block_cb = tcf_block_cb_lookup(block, mlxsw_sp_setup_tc_block_cb_flower,
-				       mlxsw_sp);
+	block_cb = flow_block_cb_lookup(f, mlxsw_sp_setup_tc_block_cb_flower,
+					mlxsw_sp);
 	if (!block_cb)
 		return;
 
@@ -1652,18 +1667,19 @@ mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port,
 	else
 		mlxsw_sp_port->eg_acl_block = NULL;
 
-	acl_block = tcf_block_cb_priv(block_cb);
+	acl_block = flow_block_cb_priv(block_cb);
 	err = mlxsw_sp_acl_block_unbind(mlxsw_sp, acl_block,
 					mlxsw_sp_port, ingress);
-	if (!err && !tcf_block_cb_decref(block_cb)) {
-		__tcf_block_cb_unregister(block, block_cb);
-		mlxsw_sp_acl_block_destroy(acl_block);
+	if (!err && !flow_block_cb_decref(block_cb)) {
+		flow_block_cb_remove(block_cb, f);
+		list_del(&block_cb->driver_list);
 	}
 }
 
 static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port,
-				   struct tc_block_offload *f)
+				   struct flow_block_offload *f)
 {
+	struct flow_block_cb *block_cb;
 	tc_setup_cb_t *cb;
 	bool ingress;
 	int err;
@@ -1678,24 +1694,32 @@ static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port,
 		return -EOPNOTSUPP;
 	}
 
+	f->driver_block_list = &mlxsw_sp_block_cb_list;
+
 	switch (f->command) {
 	case FLOW_BLOCK_BIND:
-		err = tcf_block_cb_register(f->block, cb, mlxsw_sp_port,
-					    mlxsw_sp_port, f->extack);
-		if (err)
-			return err;
-		err = mlxsw_sp_setup_tc_block_flower_bind(mlxsw_sp_port,
-							  f->block, ingress,
-							  f->extack);
+		block_cb = flow_block_cb_alloc(f->net, cb, mlxsw_sp_port,
+					       mlxsw_sp_port, NULL);
+		if (IS_ERR(block_cb))
+			return PTR_ERR(block_cb);
+		err = mlxsw_sp_setup_tc_block_flower_bind(mlxsw_sp_port, f,
+							  ingress);
 		if (err) {
-			tcf_block_cb_unregister(f->block, cb, mlxsw_sp_port);
+			flow_block_cb_free(block_cb);
 			return err;
 		}
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, &mlxsw_sp_block_cb_list);
 		return 0;
 	case FLOW_BLOCK_UNBIND:
 		mlxsw_sp_setup_tc_block_flower_unbind(mlxsw_sp_port,
-						      f->block, ingress);
-		tcf_block_cb_unregister(f->block, cb, mlxsw_sp_port);
+						      f, ingress);
+		block_cb = flow_block_cb_lookup(f, cb, mlxsw_sp_port);
+		if (!block_cb)
+			return -ENOENT;
+
+		flow_block_cb_remove(block_cb, f);
+		list_del(&block_cb->driver_list);
 		return 0;
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/mscc/ocelot_ace.h b/drivers/net/ethernet/mscc/ocelot_ace.h
index d621683643e1..e98944c87259 100644
--- a/drivers/net/ethernet/mscc/ocelot_ace.h
+++ b/drivers/net/ethernet/mscc/ocelot_ace.h
@@ -225,8 +225,8 @@ int ocelot_ace_init(struct ocelot *ocelot);
 void ocelot_ace_deinit(void);
 
 int ocelot_setup_tc_block_flower_bind(struct ocelot_port *port,
-				      struct tc_block_offload *f);
+				      struct flow_block_offload *f);
 void ocelot_setup_tc_block_flower_unbind(struct ocelot_port *port,
-					 struct tc_block_offload *f);
+					 struct flow_block_offload *f);
 
 #endif /* _MSCC_OCELOT_ACE_H_ */
diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c
index b682f08a93b4..5b92c2a03f3d 100644
--- a/drivers/net/ethernet/mscc/ocelot_flower.c
+++ b/drivers/net/ethernet/mscc/ocelot_flower.c
@@ -299,36 +299,45 @@ static void ocelot_port_block_destroy(struct ocelot_port_block *block)
 	kfree(block);
 }
 
+static void ocelot_tc_block_unbind(void *cb_priv)
+{
+	struct ocelot_port_block *port_block = cb_priv;
+
+	ocelot_port_block_destroy(port_block);
+}
+
 int ocelot_setup_tc_block_flower_bind(struct ocelot_port *port,
-				      struct tc_block_offload *f)
+				      struct flow_block_offload *f)
 {
 	struct ocelot_port_block *port_block;
-	struct tcf_block_cb *block_cb;
+	struct flow_block_cb *block_cb;
 	int ret;
 
 	if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
 		return -EOPNOTSUPP;
 
-	block_cb = tcf_block_cb_lookup(f->block,
-				       ocelot_setup_tc_block_cb_flower, port);
+	block_cb = flow_block_cb_lookup(f, ocelot_setup_tc_block_cb_flower,
+					port);
 	if (!block_cb) {
 		port_block = ocelot_port_block_create(port);
 		if (!port_block)
 			return -ENOMEM;
 
-		block_cb =
-			__tcf_block_cb_register(f->block,
-						ocelot_setup_tc_block_cb_flower,
-						port, port_block, f->extack);
+		block_cb = flow_block_cb_alloc(f->net,
+					       ocelot_setup_tc_block_cb_flower,
+					       port, port_block,
+					       ocelot_tc_block_unbind);
 		if (IS_ERR(block_cb)) {
 			ret = PTR_ERR(block_cb);
 			goto err_cb_register;
 		}
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, f->driver_block_list);
 	} else {
-		port_block = tcf_block_cb_priv(block_cb);
+		port_block = flow_block_cb_priv(block_cb);
 	}
 
-	tcf_block_cb_incref(block_cb);
+	flow_block_cb_incref(block_cb);
 	return 0;
 
 err_cb_register:
@@ -338,20 +347,17 @@ err_cb_register:
 }
 
 void ocelot_setup_tc_block_flower_unbind(struct ocelot_port *port,
-					 struct tc_block_offload *f)
+					 struct flow_block_offload *f)
 {
-	struct ocelot_port_block *port_block;
-	struct tcf_block_cb *block_cb;
+	struct flow_block_cb *block_cb;
 
-	block_cb = tcf_block_cb_lookup(f->block,
-				       ocelot_setup_tc_block_cb_flower, port);
+	block_cb = flow_block_cb_lookup(f, ocelot_setup_tc_block_cb_flower,
+					port);
 	if (!block_cb)
 		return;
 
-	port_block = tcf_block_cb_priv(block_cb);
-	if (!tcf_block_cb_decref(block_cb)) {
-		tcf_block_cb_unregister(f->block,
-					ocelot_setup_tc_block_cb_flower, port);
-		ocelot_port_block_destroy(port_block);
+	if (!flow_block_cb_decref(block_cb)) {
+		flow_block_cb_remove(block_cb, f);
+		list_del(&block_cb->driver_list);
 	}
 }
diff --git a/drivers/net/ethernet/mscc/ocelot_tc.c b/drivers/net/ethernet/mscc/ocelot_tc.c
index 58a0b5f8850c..935a774cb291 100644
--- a/drivers/net/ethernet/mscc/ocelot_tc.c
+++ b/drivers/net/ethernet/mscc/ocelot_tc.c
@@ -128,35 +128,51 @@ static int ocelot_setup_tc_block_cb_eg(enum tc_setup_type type,
 					cb_priv, false);
 }
 
+static LIST_HEAD(ocelot_block_cb_list);
+
 static int ocelot_setup_tc_block(struct ocelot_port *port,
-				 struct tc_block_offload *f)
+				 struct flow_block_offload *f)
 {
+	struct flow_block_cb *block_cb;
 	tc_setup_cb_t *cb;
-	int ret;
+	int err;
 
 	netdev_dbg(port->dev, "tc_block command %d, binder_type %d\n",
 		   f->command, f->binder_type);
 
 	if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) {
 		cb = ocelot_setup_tc_block_cb_ig;
-		port->tc.block_shared = tcf_block_shared(f->block);
+		port->tc.block_shared = f->block_shared;
 	} else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) {
 		cb = ocelot_setup_tc_block_cb_eg;
 	} else {
 		return -EOPNOTSUPP;
 	}
 
+	f->driver_block_list = &ocelot_block_cb_list;
+
 	switch (f->command) {
 	case FLOW_BLOCK_BIND:
-		ret = tcf_block_cb_register(f->block, cb, port,
-					    port, f->extack);
-		if (ret)
-			return ret;
+		block_cb = flow_block_cb_alloc(f->net, cb, port, port, NULL);
+		if (IS_ERR(block_cb))
+			return PTR_ERR(block_cb);
 
-		return ocelot_setup_tc_block_flower_bind(port, f);
+		err = ocelot_setup_tc_block_flower_bind(port, f);
+		if (err < 0) {
+			flow_block_cb_free(block_cb);
+			return err;
+		}
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, f->driver_block_list);
+		return 0;
 	case FLOW_BLOCK_UNBIND:
+		block_cb = flow_block_cb_lookup(f, cb, port);
+		if (!block_cb)
+			return -ENOENT;
+
 		ocelot_setup_tc_block_flower_unbind(port, f);
-		tcf_block_cb_unregister(f->block, cb, port);
+		flow_block_cb_remove(block_cb, f);
+		list_del(&block_cb->driver_list);
 		return 0;
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/netronome/nfp/abm/cls.c b/drivers/net/ethernet/netronome/nfp/abm/cls.c
index 29fb45734962..23ebddfb9532 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/cls.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/cls.c
@@ -262,9 +262,12 @@ static int nfp_abm_setup_tc_block_cb(enum tc_setup_type type,
 	}
 }
 
+static LIST_HEAD(nfp_abm_block_cb_list);
+
 int nfp_abm_setup_cls_block(struct net_device *netdev, struct nfp_repr *repr,
-			    struct tc_block_offload *f)
+			    struct flow_block_offload *f)
 {
-	return flow_block_cb_setup_simple(f, NULL, nfp_abm_setup_tc_block_cb,
+	return flow_block_cb_setup_simple(f, &nfp_abm_block_cb_list,
+					  nfp_abm_setup_tc_block_cb,
 					  repr, repr, true);
 }
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 49749c60885e..48746c9c6224 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -247,7 +247,7 @@ int nfp_abm_setup_tc_mq(struct net_device *netdev, struct nfp_abm_link *alink,
 int nfp_abm_setup_tc_gred(struct net_device *netdev, struct nfp_abm_link *alink,
 			  struct tc_gred_qopt_offload *opt);
 int nfp_abm_setup_cls_block(struct net_device *netdev, struct nfp_repr *repr,
-			    struct tc_block_offload *opt);
+			    struct flow_block_offload *opt);
 
 int nfp_abm_ctrl_read_params(struct nfp_abm_link *alink);
 int nfp_abm_ctrl_find_addrs(struct nfp_abm *abm);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 0c93c84a188a..1c9fb11470df 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -160,6 +160,8 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 	return 0;
 }
 
+static LIST_HEAD(nfp_bpf_block_cb_list);
+
 static int nfp_bpf_setup_tc(struct nfp_app *app, struct net_device *netdev,
 			    enum tc_setup_type type, void *type_data)
 {
@@ -167,7 +169,8 @@ static int nfp_bpf_setup_tc(struct nfp_app *app, struct net_device *netdev,
 
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return flow_block_cb_setup_simple(type_data, NULL,
+		return flow_block_cb_setup_simple(type_data,
+						  &nfp_bpf_block_cb_list,
 						  nfp_bpf_setup_tc_block_cb,
 						  nn, nn, true);
 	default:
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index 46041e509150..ddd6b509f27e 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -1302,27 +1302,41 @@ static int nfp_flower_setup_tc_block_cb(enum tc_setup_type type,
 	}
 }
 
+static LIST_HEAD(nfp_block_cb_list);
+
 static int nfp_flower_setup_tc_block(struct net_device *netdev,
-				     struct tc_block_offload *f)
+				     struct flow_block_offload *f)
 {
 	struct nfp_repr *repr = netdev_priv(netdev);
 	struct nfp_flower_repr_priv *repr_priv;
+	struct flow_block_cb *block_cb;
 
 	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
 		return -EOPNOTSUPP;
 
 	repr_priv = repr->app_priv;
-	repr_priv->block_shared = tcf_block_shared(f->block);
+	repr_priv->block_shared = f->block_shared;
+	f->driver_block_list = &nfp_block_cb_list;
 
 	switch (f->command) {
 	case FLOW_BLOCK_BIND:
-		return tcf_block_cb_register(f->block,
-					     nfp_flower_setup_tc_block_cb,
-					     repr, repr, f->extack);
+		block_cb = flow_block_cb_alloc(f->net,
+					       nfp_flower_setup_tc_block_cb,
+					       repr, repr, NULL);
+		if (IS_ERR(block_cb))
+			return PTR_ERR(block_cb);
+
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, &nfp_block_cb_list);
+		return 0;
 	case FLOW_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block,
-					nfp_flower_setup_tc_block_cb,
-					repr);
+		block_cb = flow_block_cb_lookup(f, nfp_flower_setup_tc_block_cb,
+						repr);
+		if (!block_cb)
+			return -ENOENT;
+
+		flow_block_cb_remove(block_cb, f);
+		list_del(&block_cb->driver_list);
 		return 0;
 	default:
 		return -EOPNOTSUPP;
@@ -1381,13 +1395,21 @@ static int nfp_flower_setup_indr_block_cb(enum tc_setup_type type,
 	}
 }
 
+static void nfp_flower_setup_indr_tc_release(void *cb_priv)
+{
+	struct nfp_flower_indr_block_cb_priv *priv = cb_priv;
+
+	list_del(&priv->list);
+	kfree(priv);
+}
+
 static int
 nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app,
-			       struct tc_block_offload *f)
+			       struct flow_block_offload *f)
 {
 	struct nfp_flower_indr_block_cb_priv *cb_priv;
 	struct nfp_flower_priv *priv = app->priv;
-	int err;
+	struct flow_block_cb *block_cb;
 
 	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
 	    !(f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS &&
@@ -1404,26 +1426,32 @@ nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app,
 		cb_priv->app = app;
 		list_add(&cb_priv->list, &priv->indr_block_cb_priv);
 
-		err = tcf_block_cb_register(f->block,
-					    nfp_flower_setup_indr_block_cb,
-					    cb_priv, cb_priv, f->extack);
-		if (err) {
+		block_cb = flow_block_cb_alloc(f->net,
+					       nfp_flower_setup_indr_block_cb,
+					       cb_priv, cb_priv,
+					       nfp_flower_setup_indr_tc_release);
+		if (IS_ERR(block_cb)) {
 			list_del(&cb_priv->list);
 			kfree(cb_priv);
+			return PTR_ERR(block_cb);
 		}
 
-		return err;
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, &nfp_block_cb_list);
+		return 0;
 	case FLOW_BLOCK_UNBIND:
 		cb_priv = nfp_flower_indr_block_cb_priv_lookup(app, netdev);
 		if (!cb_priv)
 			return -ENOENT;
 
-		tcf_block_cb_unregister(f->block,
-					nfp_flower_setup_indr_block_cb,
-					cb_priv);
-		list_del(&cb_priv->list);
-		kfree(cb_priv);
+		block_cb = flow_block_cb_lookup(f,
+						nfp_flower_setup_indr_block_cb,
+						cb_priv);
+		if (!block_cb)
+			return -ENOENT;
 
+		flow_block_cb_remove(block_cb, f);
+		list_del(&block_cb->driver_list);
 		return 0;
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index cba97ed3dd56..1be593a6e20d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -579,6 +579,8 @@ static int qede_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
+static LIST_HEAD(qede_block_cb_list);
+
 static int
 qede_setup_tc_offload(struct net_device *dev, enum tc_setup_type type,
 		      void *type_data)
@@ -588,7 +590,8 @@ qede_setup_tc_offload(struct net_device *dev, enum tc_setup_type type,
 
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return flow_block_cb_setup_simple(type_data, NULL,
+		return flow_block_cb_setup_simple(type_data,
+						  &qede_block_cb_list,
 						  qede_setup_tc_block_cb,
 						  edev, edev, true);
 	case TC_SETUP_QDISC_MQPRIO:
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 93ef80c16f07..c7c9e5f162e6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3769,6 +3769,8 @@ static int stmmac_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	return ret;
 }
 
+static LIST_HEAD(stmmac_block_cb_list);
+
 static int stmmac_setup_tc(struct net_device *ndev, enum tc_setup_type type,
 			   void *type_data)
 {
@@ -3776,7 +3778,8 @@ static int stmmac_setup_tc(struct net_device *ndev, enum tc_setup_type type,
 
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return flow_block_cb_setup_simple(type_data, NULL,
+		return flow_block_cb_setup_simple(type_data,
+						  &stmmac_block_cb_list,
 						  stmmac_setup_tc_block_cb,
 						  priv, priv, true);
 	case TC_SETUP_QDISC_CBS:
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index 920dc79e9dc9..0740940f41b1 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -203,6 +203,8 @@ static int nsim_set_vf_link_state(struct net_device *dev, int vf, int state)
 	return 0;
 }
 
+static LIST_HEAD(nsim_block_cb_list);
+
 static int
 nsim_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data)
 {
@@ -210,7 +212,8 @@ nsim_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data)
 
 	switch (type) {
 	case TC_SETUP_BLOCK:
-		return flow_block_cb_setup_simple(type_data, NULL,
+		return flow_block_cb_setup_simple(type_data,
+						  &nsim_block_cb_list,
 						  nsim_setup_tc_block_cb,
 						  ns, ns, true);
 	default:
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 3fb9cc4da63e..377ba0004370 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -249,13 +249,12 @@ enum flow_block_binder_type {
 	FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS,
 };
 
-struct tcf_block;
 struct netlink_ext_ack;
 
 struct flow_block_offload {
 	enum flow_block_command command;
 	enum flow_block_binder_type binder_type;
-	struct tcf_block *block;
+	bool block_shared;
 	struct net *net;
 	struct list_head cb_list;
 	struct list_head *driver_block_list;
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index e4499526fde8..9cf606b88526 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -26,8 +26,6 @@ struct tcf_walker {
 int register_tcf_proto_ops(struct tcf_proto_ops *ops);
 int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
 
-#define tc_block_offload flow_block_offload
-
 struct tcf_block_ext_info {
 	enum flow_block_binder_type binder_type;
 	tcf_chain_head_change_t *chain_head_change;
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index a36a9dc1c6df..a1b36b47dd89 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -2,7 +2,6 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <net/flow_offload.h>
-#include <net/pkt_cls.h>
 
 struct flow_rule *flow_rule_alloc(unsigned int num_actions)
 {
@@ -234,6 +233,8 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       tc_setup_cb_t *cb, void *cb_ident, void *cb_priv,
 			       bool ingress_only)
 {
+	struct flow_block_cb *block_cb;
+
 	if (ingress_only &&
 	    f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
 		return -EOPNOTSUPP;
@@ -242,10 +243,21 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
 
 	switch (f->command) {
 	case FLOW_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, cb, cb_ident, cb_priv,
-					     f->extack);
+		block_cb = flow_block_cb_alloc(f->net, cb, cb_ident,
+					       cb_priv, NULL);
+		if (IS_ERR(block_cb))
+			return PTR_ERR(block_cb);
+
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, driver_block_list);
+		return 0;
 	case FLOW_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, cb, cb_ident);
+		block_cb = flow_block_cb_lookup(f, cb, cb_ident);
+		if (!block_cb)
+			return -ENOENT;
+
+		flow_block_cb_remove(block_cb, f);
+		list_del(&block_cb->driver_list);
 		return 0;
 	default:
 		return -EOPNOTSUPP;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 9b5e202c255e..90c32fd680db 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -942,9 +942,12 @@ static int dsa_slave_setup_tc_block_cb_eg(enum tc_setup_type type,
 	return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, false);
 }
 
+static LIST_HEAD(dsa_slave_block_cb_list);
+
 static int dsa_slave_setup_tc_block(struct net_device *dev,
-				    struct tc_block_offload *f)
+				    struct flow_block_offload *f)
 {
+	struct flow_block_cb *block_cb;
 	tc_setup_cb_t *cb;
 
 	if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
@@ -954,11 +957,24 @@ static int dsa_slave_setup_tc_block(struct net_device *dev,
 	else
 		return -EOPNOTSUPP;
 
+	f->driver_block_list = &dsa_slave_block_cb_list;
+
 	switch (f->command) {
 	case FLOW_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, cb, dev, dev, f->extack);
+		block_cb = flow_block_cb_alloc(f->net, cb, dev, dev, NULL);
+		if (IS_ERR(block_cb))
+			return PTR_ERR(block_cb);
+
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, &dsa_slave_block_cb_list);
+		return 0;
 	case FLOW_BLOCK_UNBIND:
-		tcf_block_cb_unregister(f->block, cb, dev);
+		block_cb = flow_block_cb_lookup(f, cb, dev);
+		if (!block_cb)
+			return -ENOENT;
+
+		flow_block_cb_remove(block_cb, f);
+		list_del(&block_cb->driver_list);
 		return 0;
 	default:
 		return -EOPNOTSUPP;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f6602d0000e8..3589ccff5570 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -680,11 +680,11 @@ static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
 				  struct tc_indr_block_cb *indr_block_cb,
 				  enum flow_block_command command)
 {
-	struct tc_block_offload bo = {
+	struct flow_block_offload bo = {
 		.command	= command,
 		.binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
 		.net		= dev_net(indr_dev->dev),
-		.block		= indr_dev->block,
+		.block_shared	= tcf_block_shared(indr_dev->block),
 	};
 	INIT_LIST_HEAD(&bo.cb_list);
 
@@ -771,11 +771,11 @@ static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
 {
 	struct tc_indr_block_cb *indr_block_cb;
 	struct tc_indr_block_dev *indr_dev;
-	struct tc_block_offload bo = {
+	struct flow_block_offload bo = {
 		.command	= command,
 		.binder_type	= ei->binder_type,
 		.net		= dev_net(dev),
-		.block		= block,
+		.block_shared	= tcf_block_shared(block),
 		.extack		= extack,
 	};
 	INIT_LIST_HEAD(&bo.cb_list);
@@ -804,13 +804,13 @@ static int tcf_block_offload_cmd(struct tcf_block *block,
 				 enum flow_block_command command,
 				 struct netlink_ext_ack *extack)
 {
-	struct tc_block_offload bo = {};
+	struct flow_block_offload bo = {};
 	int err;
 
 	bo.net = dev_net(dev);
 	bo.command = command;
 	bo.binder_type = ei->binder_type;
-	bo.block = block;
+	bo.block_shared = tcf_block_shared(block);
 	bo.extack = extack;
 	INIT_LIST_HEAD(&bo.cb_list);
 
@@ -3245,7 +3245,7 @@ EXPORT_SYMBOL(tcf_exts_dump_stats);
 int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
 		     void *type_data, bool err_stop)
 {
-	struct tcf_block_cb *block_cb;
+	struct flow_block_cb *block_cb;
 	int ok_count = 0;
 	int err;
 
-- 
cgit v1.2.3


From 722d36e6e29e50c640c9f5ce186b8d8709cae1a6 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jul 2019 22:55:47 +0200
Subject: net: sched: remove tcf block API

Unused, now replaced by flow block API.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 69 ------------------------------------
 net/sched/cls_api.c   | 98 ---------------------------------------------------
 2 files changed, 167 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 9cf606b88526..17c388090c3c 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -66,22 +66,6 @@ static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
 	return block->q;
 }
 
-void *tcf_block_cb_priv(struct tcf_block_cb *block_cb);
-struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block,
-					 tc_setup_cb_t *cb, void *cb_ident);
-void tcf_block_cb_incref(struct tcf_block_cb *block_cb);
-unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb);
-struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
-					     tc_setup_cb_t *cb, void *cb_ident,
-					     void *cb_priv,
-					     struct netlink_ext_ack *extack);
-int tcf_block_cb_register(struct tcf_block *block,
-			  tc_setup_cb_t *cb, void *cb_ident,
-			  void *cb_priv, struct netlink_ext_ack *extack);
-void __tcf_block_cb_unregister(struct tcf_block *block,
-			       struct tcf_block_cb *block_cb);
-void tcf_block_cb_unregister(struct tcf_block *block,
-			     tc_setup_cb_t *cb, void *cb_ident);
 int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
 				tc_indr_block_bind_cb_t *cb, void *cb_ident);
 int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
@@ -144,59 +128,6 @@ void tc_setup_cb_block_unregister(struct tcf_block *block, tc_setup_cb_t *cb,
 {
 }
 
-static inline
-void *tcf_block_cb_priv(struct tcf_block_cb *block_cb)
-{
-	return NULL;
-}
-
-static inline
-struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block,
-					 tc_setup_cb_t *cb, void *cb_ident)
-{
-	return NULL;
-}
-
-static inline
-void tcf_block_cb_incref(struct tcf_block_cb *block_cb)
-{
-}
-
-static inline
-unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb)
-{
-	return 0;
-}
-
-static inline
-struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
-					     tc_setup_cb_t *cb, void *cb_ident,
-					     void *cb_priv,
-					     struct netlink_ext_ack *extack)
-{
-	return NULL;
-}
-
-static inline
-int tcf_block_cb_register(struct tcf_block *block,
-			  tc_setup_cb_t *cb, void *cb_ident,
-			  void *cb_priv, struct netlink_ext_ack *extack)
-{
-	return 0;
-}
-
-static inline
-void __tcf_block_cb_unregister(struct tcf_block *block,
-			       struct tcf_block_cb *block_cb)
-{
-}
-
-static inline
-void tcf_block_cb_unregister(struct tcf_block *block,
-			     tc_setup_cb_t *cb, void *cb_ident)
-{
-}
-
 static inline
 int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
 				tc_indr_block_bind_cb_t *cb, void *cb_ident)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3589ccff5570..638c1bc1ea1b 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1513,43 +1513,6 @@ void tcf_block_put(struct tcf_block *block)
 
 EXPORT_SYMBOL(tcf_block_put);
 
-struct tcf_block_cb {
-	struct list_head list;
-	tc_setup_cb_t *cb;
-	void *cb_ident;
-	void *cb_priv;
-	unsigned int refcnt;
-};
-
-void *tcf_block_cb_priv(struct tcf_block_cb *block_cb)
-{
-	return block_cb->cb_priv;
-}
-EXPORT_SYMBOL(tcf_block_cb_priv);
-
-struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block,
-					 tc_setup_cb_t *cb, void *cb_ident)
-{	struct tcf_block_cb *block_cb;
-
-	list_for_each_entry(block_cb, &block->cb_list, list)
-		if (block_cb->cb == cb && block_cb->cb_ident == cb_ident)
-			return block_cb;
-	return NULL;
-}
-EXPORT_SYMBOL(tcf_block_cb_lookup);
-
-void tcf_block_cb_incref(struct tcf_block_cb *block_cb)
-{
-	block_cb->refcnt++;
-}
-EXPORT_SYMBOL(tcf_block_cb_incref);
-
-unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb)
-{
-	return --block_cb->refcnt;
-}
-EXPORT_SYMBOL(tcf_block_cb_decref);
-
 static int
 tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb,
 			    void *cb_priv, bool add, bool offload_in_use,
@@ -1591,67 +1554,6 @@ err_playback_remove:
 	return err;
 }
 
-struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
-					     tc_setup_cb_t *cb, void *cb_ident,
-					     void *cb_priv,
-					     struct netlink_ext_ack *extack)
-{
-	struct tcf_block_cb *block_cb;
-	int err;
-
-	/* Replay any already present rules */
-	err = tcf_block_playback_offloads(block, cb, cb_priv, true,
-					  tcf_block_offload_in_use(block),
-					  extack);
-	if (err)
-		return ERR_PTR(err);
-
-	block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
-	if (!block_cb)
-		return ERR_PTR(-ENOMEM);
-	block_cb->cb = cb;
-	block_cb->cb_ident = cb_ident;
-	block_cb->cb_priv = cb_priv;
-	list_add(&block_cb->list, &block->cb_list);
-	return block_cb;
-}
-EXPORT_SYMBOL(__tcf_block_cb_register);
-
-int tcf_block_cb_register(struct tcf_block *block,
-			  tc_setup_cb_t *cb, void *cb_ident,
-			  void *cb_priv, struct netlink_ext_ack *extack)
-{
-	struct tcf_block_cb *block_cb;
-
-	block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv,
-					   extack);
-	return PTR_ERR_OR_ZERO(block_cb);
-}
-EXPORT_SYMBOL(tcf_block_cb_register);
-
-void __tcf_block_cb_unregister(struct tcf_block *block,
-			       struct tcf_block_cb *block_cb)
-{
-	tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv,
-				    false, tcf_block_offload_in_use(block),
-				    NULL);
-	list_del(&block_cb->list);
-	kfree(block_cb);
-}
-EXPORT_SYMBOL(__tcf_block_cb_unregister);
-
-void tcf_block_cb_unregister(struct tcf_block *block,
-			     tc_setup_cb_t *cb, void *cb_ident)
-{
-	struct tcf_block_cb *block_cb;
-
-	block_cb = tcf_block_cb_lookup(block, cb, cb_ident);
-	if (!block_cb)
-		return;
-	__tcf_block_cb_unregister(block, block_cb);
-}
-EXPORT_SYMBOL(tcf_block_cb_unregister);
-
 static int tcf_block_bind(struct tcf_block *block,
 			  struct flow_block_offload *bo)
 {
-- 
cgit v1.2.3


From 0d4fd02e7199fbf57c0d175dd1890c82cd4a6f4f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jul 2019 22:55:48 +0200
Subject: net: flow_offload: add flow_block_cb_is_busy() and use it

This patch adds a function to check whether a flow block callback is already
in use. Call this new function from flow_block_cb_setup_simple() and from
drivers.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c    |  4 ++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c      |  4 ++++
 drivers/net/ethernet/mscc/ocelot_tc.c               |  3 +++
 drivers/net/ethernet/netronome/nfp/flower/offload.c |  4 ++++
 include/net/flow_offload.h                          |  3 +++
 net/core/flow_offload.c                             | 18 ++++++++++++++++++
 net/dsa/slave.c                                     |  3 +++
 7 files changed, 39 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 7ca6b6472017..62cb5408424c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -722,6 +722,10 @@ mlx5e_rep_indr_setup_tc_block(struct net_device *netdev,
 		if (indr_priv)
 			return -EEXIST;
 
+		if (flow_block_cb_is_busy(mlx5e_rep_indr_setup_block_cb,
+					  indr_priv, &mlx5e_block_cb_list))
+			return -EBUSY;
+
 		indr_priv = kmalloc(sizeof(*indr_priv), GFP_KERNEL);
 		if (!indr_priv)
 			return -ENOMEM;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 65bea6be84d6..35adc174f277 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1698,6 +1698,10 @@ static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port,
 
 	switch (f->command) {
 	case FLOW_BLOCK_BIND:
+		if (flow_block_cb_is_busy(cb, mlxsw_sp_port,
+					  &mlxsw_sp_block_cb_list))
+			return -EBUSY;
+
 		block_cb = flow_block_cb_alloc(f->net, cb, mlxsw_sp_port,
 					       mlxsw_sp_port, NULL);
 		if (IS_ERR(block_cb))
diff --git a/drivers/net/ethernet/mscc/ocelot_tc.c b/drivers/net/ethernet/mscc/ocelot_tc.c
index 935a774cb291..9e6464ffae5d 100644
--- a/drivers/net/ethernet/mscc/ocelot_tc.c
+++ b/drivers/net/ethernet/mscc/ocelot_tc.c
@@ -153,6 +153,9 @@ static int ocelot_setup_tc_block(struct ocelot_port *port,
 
 	switch (f->command) {
 	case FLOW_BLOCK_BIND:
+		if (flow_block_cb_is_busy(cb, port, &ocelot_block_cb_list))
+			return -EBUSY;
+
 		block_cb = flow_block_cb_alloc(f->net, cb, port, port, NULL);
 		if (IS_ERR(block_cb))
 			return PTR_ERR(block_cb);
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index ddd6b509f27e..1b38cfeb646c 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -1320,6 +1320,10 @@ static int nfp_flower_setup_tc_block(struct net_device *netdev,
 
 	switch (f->command) {
 	case FLOW_BLOCK_BIND:
+		if (flow_block_cb_is_busy(nfp_flower_setup_tc_block_cb, repr,
+					  &nfp_block_cb_list))
+			return -EBUSY;
+
 		block_cb = flow_block_cb_alloc(f->net,
 					       nfp_flower_setup_tc_block_cb,
 					       repr, repr, NULL);
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 377ba0004370..42a36a346003 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -296,6 +296,9 @@ static inline void flow_block_cb_remove(struct flow_block_cb *block_cb,
 	list_move(&block_cb->list, &offload->cb_list);
 }
 
+bool flow_block_cb_is_busy(tc_setup_cb_t *cb, void *cb_ident,
+			   struct list_head *driver_block_list);
+
 int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       struct list_head *driver_list, tc_setup_cb_t *cb,
 			       void *cb_ident, void *cb_priv, bool ingress_only);
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index a1b36b47dd89..76f8db3841d7 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -228,6 +228,21 @@ unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb)
 }
 EXPORT_SYMBOL(flow_block_cb_decref);
 
+bool flow_block_cb_is_busy(tc_setup_cb_t *cb, void *cb_ident,
+			   struct list_head *driver_block_list)
+{
+	struct flow_block_cb *block_cb;
+
+	list_for_each_entry(block_cb, driver_block_list, driver_list) {
+		if (block_cb->cb == cb &&
+		    block_cb->cb_ident == cb_ident)
+			return true;
+	}
+
+	return false;
+}
+EXPORT_SYMBOL(flow_block_cb_is_busy);
+
 int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       struct list_head *driver_block_list,
 			       tc_setup_cb_t *cb, void *cb_ident, void *cb_priv,
@@ -243,6 +258,9 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
 
 	switch (f->command) {
 	case FLOW_BLOCK_BIND:
+		if (flow_block_cb_is_busy(cb, cb_ident, driver_block_list))
+			return -EBUSY;
+
 		block_cb = flow_block_cb_alloc(f->net, cb, cb_ident,
 					       cb_priv, NULL);
 		if (IS_ERR(block_cb))
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 90c32fd680db..9bcb598fc840 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -961,6 +961,9 @@ static int dsa_slave_setup_tc_block(struct net_device *dev,
 
 	switch (f->command) {
 	case FLOW_BLOCK_BIND:
+		if (flow_block_cb_is_busy(cb, dev, &dsa_slave_block_cb_list))
+			return -EBUSY;
+
 		block_cb = flow_block_cb_alloc(f->net, cb, dev, dev, NULL);
 		if (IS_ERR(block_cb))
 			return PTR_ERR(block_cb);
-- 
cgit v1.2.3


From f9e30088d20016a224d8110d45356da253eaa26a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jul 2019 22:55:49 +0200
Subject: net: flow_offload: rename tc_cls_flower_offload to flow_cls_offload

And any other existing fields in this structure that refer to tc.
Specifically:

* tc_cls_flower_offload_flow_rule() to flow_cls_offload_flow_rule().
* TC_CLSFLOWER_* to FLOW_CLS_*.
* tc_cls_common_offload to flow_cls_common_offload.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c       | 18 ++++-----
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.h       |  4 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    |  8 ++--
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   | 22 +++++------
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h   |  6 +--
 drivers/net/ethernet/intel/i40e/i40e_main.c        | 22 +++++------
 drivers/net/ethernet/intel/iavf/iavf_main.c        | 22 +++++------
 drivers/net/ethernet/intel/igb/igb_main.c          | 16 ++++----
 .../net/ethernet/mellanox/mlx5/core/en/tc_tun.c    |  6 +--
 .../net/ethernet/mellanox/mlx5/core/en/tc_tun.h    |  8 ++--
 .../ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c | 18 ++++-----
 .../ethernet/mellanox/mlx5/core/en/tc_tun_gre.c    |  4 +-
 .../ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c  | 10 ++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  8 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   | 16 ++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    | 34 ++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h    |  6 +--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     | 12 +++---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h     | 10 ++---
 .../net/ethernet/mellanox/mlxsw/spectrum_flower.c  | 34 ++++++++---------
 drivers/net/ethernet/mscc/ocelot_flower.c          | 22 +++++------
 drivers/net/ethernet/netronome/nfp/flower/action.c | 14 +++----
 drivers/net/ethernet/netronome/nfp/flower/main.h   |  6 +--
 drivers/net/ethernet/netronome/nfp/flower/match.c  | 44 +++++++++++-----------
 .../net/ethernet/netronome/nfp/flower/metadata.c   |  2 +-
 .../net/ethernet/netronome/nfp/flower/offload.c    | 30 +++++++--------
 drivers/net/ethernet/qlogic/qede/qede.h            |  2 +-
 drivers/net/ethernet/qlogic/qede/qede_filter.c     |  2 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c       |  8 ++--
 include/net/flow_offload.h                         | 30 +++++++++++++++
 include/net/pkt_cls.h                              | 40 +++-----------------
 net/sched/cls_flower.c                             | 24 ++++++------
 32 files changed, 254 insertions(+), 254 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index 44d6c5743fb9..6fe4a7174271 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -170,10 +170,10 @@ static int bnxt_tc_parse_actions(struct bnxt *bp,
 }
 
 static int bnxt_tc_parse_flow(struct bnxt *bp,
-			      struct tc_cls_flower_offload *tc_flow_cmd,
+			      struct flow_cls_offload *tc_flow_cmd,
 			      struct bnxt_tc_flow *flow)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(tc_flow_cmd);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(tc_flow_cmd);
 	struct flow_dissector *dissector = rule->match.dissector;
 
 	/* KEY_CONTROL and KEY_BASIC are needed for forming a meaningful key */
@@ -1262,7 +1262,7 @@ static void bnxt_tc_set_src_fid(struct bnxt *bp, struct bnxt_tc_flow *flow,
  * The hash-tables are already protected by the rhashtable API.
  */
 static int bnxt_tc_add_flow(struct bnxt *bp, u16 src_fid,
-			    struct tc_cls_flower_offload *tc_flow_cmd)
+			    struct flow_cls_offload *tc_flow_cmd)
 {
 	struct bnxt_tc_flow_node *new_node, *old_node;
 	struct bnxt_tc_info *tc_info = bp->tc_info;
@@ -1348,7 +1348,7 @@ done:
 }
 
 static int bnxt_tc_del_flow(struct bnxt *bp,
-			    struct tc_cls_flower_offload *tc_flow_cmd)
+			    struct flow_cls_offload *tc_flow_cmd)
 {
 	struct bnxt_tc_info *tc_info = bp->tc_info;
 	struct bnxt_tc_flow_node *flow_node;
@@ -1363,7 +1363,7 @@ static int bnxt_tc_del_flow(struct bnxt *bp,
 }
 
 static int bnxt_tc_get_flow_stats(struct bnxt *bp,
-				  struct tc_cls_flower_offload *tc_flow_cmd)
+				  struct flow_cls_offload *tc_flow_cmd)
 {
 	struct bnxt_tc_flow_stats stats, *curr_stats, *prev_stats;
 	struct bnxt_tc_info *tc_info = bp->tc_info;
@@ -1585,14 +1585,14 @@ void bnxt_tc_flow_stats_work(struct bnxt *bp)
 }
 
 int bnxt_tc_setup_flower(struct bnxt *bp, u16 src_fid,
-			 struct tc_cls_flower_offload *cls_flower)
+			 struct flow_cls_offload *cls_flower)
 {
 	switch (cls_flower->command) {
-	case TC_CLSFLOWER_REPLACE:
+	case FLOW_CLS_REPLACE:
 		return bnxt_tc_add_flow(bp, src_fid, cls_flower);
-	case TC_CLSFLOWER_DESTROY:
+	case FLOW_CLS_DESTROY:
 		return bnxt_tc_del_flow(bp, cls_flower);
-	case TC_CLSFLOWER_STATS:
+	case FLOW_CLS_STATS:
 		return bnxt_tc_get_flow_stats(bp, cls_flower);
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.h
index 8a0968967bc5..ffec57d1a5ec 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.h
@@ -196,7 +196,7 @@ struct bnxt_tc_flow_node {
 };
 
 int bnxt_tc_setup_flower(struct bnxt *bp, u16 src_fid,
-			 struct tc_cls_flower_offload *cls_flower);
+			 struct flow_cls_offload *cls_flower);
 int bnxt_init_tc(struct bnxt *bp);
 void bnxt_shutdown_tc(struct bnxt *bp);
 void bnxt_tc_flow_stats_work(struct bnxt *bp);
@@ -209,7 +209,7 @@ static inline bool bnxt_tc_flower_enabled(struct bnxt *bp)
 #else /* CONFIG_BNXT_FLOWER_OFFLOAD */
 
 static inline int bnxt_tc_setup_flower(struct bnxt *bp, u16 src_fid,
-				       struct tc_cls_flower_offload *cls_flower)
+				       struct flow_cls_offload *cls_flower)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index fdc8ca4f8891..67202b6f352e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3135,14 +3135,14 @@ static int cxgb_set_tx_maxrate(struct net_device *dev, int index, u32 rate)
 }
 
 static int cxgb_setup_tc_flower(struct net_device *dev,
-				struct tc_cls_flower_offload *cls_flower)
+				struct flow_cls_offload *cls_flower)
 {
 	switch (cls_flower->command) {
-	case TC_CLSFLOWER_REPLACE:
+	case FLOW_CLS_REPLACE:
 		return cxgb4_tc_flower_replace(dev, cls_flower);
-	case TC_CLSFLOWER_DESTROY:
+	case FLOW_CLS_DESTROY:
 		return cxgb4_tc_flower_destroy(dev, cls_flower);
-	case TC_CLSFLOWER_STATS:
+	case FLOW_CLS_STATS:
 		return cxgb4_tc_flower_stats(dev, cls_flower);
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index cfaf8f618d1f..312599c6b35a 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -80,10 +80,10 @@ static struct ch_tc_flower_entry *ch_flower_lookup(struct adapter *adap,
 }
 
 static void cxgb4_process_flow_match(struct net_device *dev,
-				     struct tc_cls_flower_offload *cls,
+				     struct flow_cls_offload *cls,
 				     struct ch_filter_specification *fs)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(cls);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(cls);
 	u16 addr_type = 0;
 
 	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) {
@@ -223,9 +223,9 @@ static void cxgb4_process_flow_match(struct net_device *dev,
 }
 
 static int cxgb4_validate_flow_match(struct net_device *dev,
-				     struct tc_cls_flower_offload *cls)
+				     struct flow_cls_offload *cls)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(cls);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(cls);
 	struct flow_dissector *dissector = rule->match.dissector;
 	u16 ethtype_mask = 0;
 	u16 ethtype_key = 0;
@@ -378,10 +378,10 @@ static void process_pedit_field(struct ch_filter_specification *fs, u32 val,
 }
 
 static void cxgb4_process_flow_actions(struct net_device *in,
-				       struct tc_cls_flower_offload *cls,
+				       struct flow_cls_offload *cls,
 				       struct ch_filter_specification *fs)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(cls);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(cls);
 	struct flow_action_entry *act;
 	int i;
 
@@ -544,9 +544,9 @@ static bool valid_pedit_action(struct net_device *dev,
 }
 
 static int cxgb4_validate_flow_actions(struct net_device *dev,
-				       struct tc_cls_flower_offload *cls)
+				       struct flow_cls_offload *cls)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(cls);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(cls);
 	struct flow_action_entry *act;
 	bool act_redir = false;
 	bool act_pedit = false;
@@ -633,7 +633,7 @@ static int cxgb4_validate_flow_actions(struct net_device *dev,
 }
 
 int cxgb4_tc_flower_replace(struct net_device *dev,
-			    struct tc_cls_flower_offload *cls)
+			    struct flow_cls_offload *cls)
 {
 	struct adapter *adap = netdev2adap(dev);
 	struct ch_tc_flower_entry *ch_flower;
@@ -709,7 +709,7 @@ free_entry:
 }
 
 int cxgb4_tc_flower_destroy(struct net_device *dev,
-			    struct tc_cls_flower_offload *cls)
+			    struct flow_cls_offload *cls)
 {
 	struct adapter *adap = netdev2adap(dev);
 	struct ch_tc_flower_entry *ch_flower;
@@ -783,7 +783,7 @@ static void ch_flower_stats_cb(struct timer_list *t)
 }
 
 int cxgb4_tc_flower_stats(struct net_device *dev,
-			  struct tc_cls_flower_offload *cls)
+			  struct flow_cls_offload *cls)
 {
 	struct adapter *adap = netdev2adap(dev);
 	struct ch_tc_flower_stats *ofld_stats;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
index 050c8a50ae41..eb4c95248baf 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
@@ -109,11 +109,11 @@ struct ch_tc_pedit_fields {
 #define PEDIT_UDP_SPORT_DPORT		0x0
 
 int cxgb4_tc_flower_replace(struct net_device *dev,
-			    struct tc_cls_flower_offload *cls);
+			    struct flow_cls_offload *cls);
 int cxgb4_tc_flower_destroy(struct net_device *dev,
-			    struct tc_cls_flower_offload *cls);
+			    struct flow_cls_offload *cls);
 int cxgb4_tc_flower_stats(struct net_device *dev,
-			  struct tc_cls_flower_offload *cls);
+			  struct flow_cls_offload *cls);
 
 int cxgb4_init_tc_flower(struct adapter *adap);
 void cxgb4_cleanup_tc_flower(struct adapter *adap);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 7be1080680f5..9ebbe3da61bb 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7759,15 +7759,15 @@ int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi,
 /**
  * i40e_parse_cls_flower - Parse tc flower filters provided by kernel
  * @vsi: Pointer to VSI
- * @cls_flower: Pointer to struct tc_cls_flower_offload
+ * @cls_flower: Pointer to struct flow_cls_offload
  * @filter: Pointer to cloud filter structure
  *
  **/
 static int i40e_parse_cls_flower(struct i40e_vsi *vsi,
-				 struct tc_cls_flower_offload *f,
+				 struct flow_cls_offload *f,
 				 struct i40e_cloud_filter *filter)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct flow_dissector *dissector = rule->match.dissector;
 	u16 n_proto_mask = 0, n_proto_key = 0, addr_type = 0;
 	struct i40e_pf *pf = vsi->back;
@@ -8001,11 +8001,11 @@ static int i40e_handle_tclass(struct i40e_vsi *vsi, u32 tc,
 /**
  * i40e_configure_clsflower - Configure tc flower filters
  * @vsi: Pointer to VSI
- * @cls_flower: Pointer to struct tc_cls_flower_offload
+ * @cls_flower: Pointer to struct flow_cls_offload
  *
  **/
 static int i40e_configure_clsflower(struct i40e_vsi *vsi,
-				    struct tc_cls_flower_offload *cls_flower)
+				    struct flow_cls_offload *cls_flower)
 {
 	int tc = tc_classid_to_hwtc(vsi->netdev, cls_flower->classid);
 	struct i40e_cloud_filter *filter = NULL;
@@ -8097,11 +8097,11 @@ static struct i40e_cloud_filter *i40e_find_cloud_filter(struct i40e_vsi *vsi,
 /**
  * i40e_delete_clsflower - Remove tc flower filters
  * @vsi: Pointer to VSI
- * @cls_flower: Pointer to struct tc_cls_flower_offload
+ * @cls_flower: Pointer to struct flow_cls_offload
  *
  **/
 static int i40e_delete_clsflower(struct i40e_vsi *vsi,
-				 struct tc_cls_flower_offload *cls_flower)
+				 struct flow_cls_offload *cls_flower)
 {
 	struct i40e_cloud_filter *filter = NULL;
 	struct i40e_pf *pf = vsi->back;
@@ -8144,16 +8144,16 @@ static int i40e_delete_clsflower(struct i40e_vsi *vsi,
  * @type_data: offload data
  **/
 static int i40e_setup_tc_cls_flower(struct i40e_netdev_priv *np,
-				    struct tc_cls_flower_offload *cls_flower)
+				    struct flow_cls_offload *cls_flower)
 {
 	struct i40e_vsi *vsi = np->vsi;
 
 	switch (cls_flower->command) {
-	case TC_CLSFLOWER_REPLACE:
+	case FLOW_CLS_REPLACE:
 		return i40e_configure_clsflower(vsi, cls_flower);
-	case TC_CLSFLOWER_DESTROY:
+	case FLOW_CLS_DESTROY:
 		return i40e_delete_clsflower(vsi, cls_flower);
-	case TC_CLSFLOWER_STATS:
+	case FLOW_CLS_STATS:
 		return -EOPNOTSUPP;
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 05eca6f2e890..9d2b50964a08 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -2699,14 +2699,14 @@ exit:
 /**
  * iavf_parse_cls_flower - Parse tc flower filters provided by kernel
  * @adapter: board private structure
- * @cls_flower: pointer to struct tc_cls_flower_offload
+ * @cls_flower: pointer to struct flow_cls_offload
  * @filter: pointer to cloud filter structure
  */
 static int iavf_parse_cls_flower(struct iavf_adapter *adapter,
-				 struct tc_cls_flower_offload *f,
+				 struct flow_cls_offload *f,
 				 struct iavf_cloud_filter *filter)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct flow_dissector *dissector = rule->match.dissector;
 	u16 n_proto_mask = 0;
 	u16 n_proto_key = 0;
@@ -2971,10 +2971,10 @@ static int iavf_handle_tclass(struct iavf_adapter *adapter, u32 tc,
 /**
  * iavf_configure_clsflower - Add tc flower filters
  * @adapter: board private structure
- * @cls_flower: Pointer to struct tc_cls_flower_offload
+ * @cls_flower: Pointer to struct flow_cls_offload
  */
 static int iavf_configure_clsflower(struct iavf_adapter *adapter,
-				    struct tc_cls_flower_offload *cls_flower)
+				    struct flow_cls_offload *cls_flower)
 {
 	int tc = tc_classid_to_hwtc(adapter->netdev, cls_flower->classid);
 	struct iavf_cloud_filter *filter = NULL;
@@ -3050,10 +3050,10 @@ static struct iavf_cloud_filter *iavf_find_cf(struct iavf_adapter *adapter,
 /**
  * iavf_delete_clsflower - Remove tc flower filters
  * @adapter: board private structure
- * @cls_flower: Pointer to struct tc_cls_flower_offload
+ * @cls_flower: Pointer to struct flow_cls_offload
  */
 static int iavf_delete_clsflower(struct iavf_adapter *adapter,
-				 struct tc_cls_flower_offload *cls_flower)
+				 struct flow_cls_offload *cls_flower)
 {
 	struct iavf_cloud_filter *filter = NULL;
 	int err = 0;
@@ -3077,17 +3077,17 @@ static int iavf_delete_clsflower(struct iavf_adapter *adapter,
  * @type_data: offload data
  */
 static int iavf_setup_tc_cls_flower(struct iavf_adapter *adapter,
-				    struct tc_cls_flower_offload *cls_flower)
+				    struct flow_cls_offload *cls_flower)
 {
 	if (cls_flower->common.chain_index)
 		return -EOPNOTSUPP;
 
 	switch (cls_flower->command) {
-	case TC_CLSFLOWER_REPLACE:
+	case FLOW_CLS_REPLACE:
 		return iavf_configure_clsflower(adapter, cls_flower);
-	case TC_CLSFLOWER_DESTROY:
+	case FLOW_CLS_DESTROY:
 		return iavf_delete_clsflower(adapter, cls_flower);
-	case TC_CLSFLOWER_STATS:
+	case FLOW_CLS_STATS:
 		return -EOPNOTSUPP;
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 00e8186e2c59..b4df3e319467 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2578,11 +2578,11 @@ static int igb_offload_cbs(struct igb_adapter *adapter,
 #define VLAN_PRIO_FULL_MASK (0x07)
 
 static int igb_parse_cls_flower(struct igb_adapter *adapter,
-				struct tc_cls_flower_offload *f,
+				struct flow_cls_offload *f,
 				int traffic_class,
 				struct igb_nfc_filter *input)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct flow_dissector *dissector = rule->match.dissector;
 	struct netlink_ext_ack *extack = f->common.extack;
 
@@ -2660,7 +2660,7 @@ static int igb_parse_cls_flower(struct igb_adapter *adapter,
 }
 
 static int igb_configure_clsflower(struct igb_adapter *adapter,
-				   struct tc_cls_flower_offload *cls_flower)
+				   struct flow_cls_offload *cls_flower)
 {
 	struct netlink_ext_ack *extack = cls_flower->common.extack;
 	struct igb_nfc_filter *filter, *f;
@@ -2722,7 +2722,7 @@ err_parse:
 }
 
 static int igb_delete_clsflower(struct igb_adapter *adapter,
-				struct tc_cls_flower_offload *cls_flower)
+				struct flow_cls_offload *cls_flower)
 {
 	struct igb_nfc_filter *filter;
 	int err;
@@ -2752,14 +2752,14 @@ out:
 }
 
 static int igb_setup_tc_cls_flower(struct igb_adapter *adapter,
-				   struct tc_cls_flower_offload *cls_flower)
+				   struct flow_cls_offload *cls_flower)
 {
 	switch (cls_flower->command) {
-	case TC_CLSFLOWER_REPLACE:
+	case FLOW_CLS_REPLACE:
 		return igb_configure_clsflower(adapter, cls_flower);
-	case TC_CLSFLOWER_DESTROY:
+	case FLOW_CLS_DESTROY:
 		return igb_delete_clsflower(adapter, cls_flower);
-	case TC_CLSFLOWER_STATS:
+	case FLOW_CLS_STATS:
 		return -EOPNOTSUPP;
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
index 3739646b653f..a6a52806be45 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
@@ -452,7 +452,7 @@ int mlx5e_tc_tun_init_encap_attr(struct net_device *tunnel_dev,
 int mlx5e_tc_tun_parse(struct net_device *filter_dev,
 		       struct mlx5e_priv *priv,
 		       struct mlx5_flow_spec *spec,
-		       struct tc_cls_flower_offload *f,
+		       struct flow_cls_offload *f,
 		       void *headers_c,
 		       void *headers_v, u8 *match_level)
 {
@@ -489,11 +489,11 @@ out:
 
 int mlx5e_tc_tun_parse_udp_ports(struct mlx5e_priv *priv,
 				 struct mlx5_flow_spec *spec,
-				 struct tc_cls_flower_offload *f,
+				 struct flow_cls_offload *f,
 				 void *headers_c,
 				 void *headers_v)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct netlink_ext_ack *extack = f->common.extack;
 	struct flow_match_ports enc_ports;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h
index 3c48f7e62505..c362b9225dc2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h
@@ -33,12 +33,12 @@ struct mlx5e_tc_tunnel {
 				   struct mlx5e_encap_entry *e);
 	int (*parse_udp_ports)(struct mlx5e_priv *priv,
 			       struct mlx5_flow_spec *spec,
-			       struct tc_cls_flower_offload *f,
+			       struct flow_cls_offload *f,
 			       void *headers_c,
 			       void *headers_v);
 	int (*parse_tunnel)(struct mlx5e_priv *priv,
 			    struct mlx5_flow_spec *spec,
-			    struct tc_cls_flower_offload *f,
+			    struct flow_cls_offload *f,
 			    void *headers_c,
 			    void *headers_v);
 };
@@ -68,13 +68,13 @@ bool mlx5e_tc_tun_device_to_offload(struct mlx5e_priv *priv,
 int mlx5e_tc_tun_parse(struct net_device *filter_dev,
 		       struct mlx5e_priv *priv,
 		       struct mlx5_flow_spec *spec,
-		       struct tc_cls_flower_offload *f,
+		       struct flow_cls_offload *f,
 		       void *headers_c,
 		       void *headers_v, u8 *match_level);
 
 int mlx5e_tc_tun_parse_udp_ports(struct mlx5e_priv *priv,
 				 struct mlx5_flow_spec *spec,
-				 struct tc_cls_flower_offload *f,
+				 struct flow_cls_offload *f,
 				 void *headers_c,
 				 void *headers_v);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c
index 238ae85d07cc..951ea26d96bc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c
@@ -20,9 +20,9 @@ static int mlx5e_tc_tun_calc_hlen_geneve(struct mlx5e_encap_entry *e)
 }
 
 static int mlx5e_tc_tun_check_udp_dport_geneve(struct mlx5e_priv *priv,
-					       struct tc_cls_flower_offload *f)
+					       struct flow_cls_offload *f)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct netlink_ext_ack *extack = f->common.extack;
 	struct flow_match_ports enc_ports;
 
@@ -48,7 +48,7 @@ static int mlx5e_tc_tun_check_udp_dport_geneve(struct mlx5e_priv *priv,
 
 static int mlx5e_tc_tun_parse_udp_ports_geneve(struct mlx5e_priv *priv,
 					       struct mlx5_flow_spec *spec,
-					       struct tc_cls_flower_offload *f,
+					       struct flow_cls_offload *f,
 					       void *headers_c,
 					       void *headers_v)
 {
@@ -122,9 +122,9 @@ static int mlx5e_gen_ip_tunnel_header_geneve(char buf[],
 
 static int mlx5e_tc_tun_parse_geneve_vni(struct mlx5e_priv *priv,
 					 struct mlx5_flow_spec *spec,
-					 struct tc_cls_flower_offload *f)
+					 struct flow_cls_offload *f)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct netlink_ext_ack *extack = f->common.extack;
 	struct flow_match_enc_keyid enc_keyid;
 	void *misc_c, *misc_v;
@@ -154,11 +154,11 @@ static int mlx5e_tc_tun_parse_geneve_vni(struct mlx5e_priv *priv,
 
 static int mlx5e_tc_tun_parse_geneve_options(struct mlx5e_priv *priv,
 					     struct mlx5_flow_spec *spec,
-					     struct tc_cls_flower_offload *f)
+					     struct flow_cls_offload *f)
 {
 	u8 max_tlv_option_data_len = MLX5_CAP_GEN(priv->mdev, max_geneve_tlv_option_data_len);
 	u8 max_tlv_options = MLX5_CAP_GEN(priv->mdev, max_geneve_tlv_options);
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct netlink_ext_ack *extack = f->common.extack;
 	void *misc_c, *misc_v, *misc_3_c, *misc_3_v;
 	struct geneve_opt *option_key, *option_mask;
@@ -277,7 +277,7 @@ static int mlx5e_tc_tun_parse_geneve_options(struct mlx5e_priv *priv,
 
 static int mlx5e_tc_tun_parse_geneve_params(struct mlx5e_priv *priv,
 					    struct mlx5_flow_spec *spec,
-					    struct tc_cls_flower_offload *f)
+					    struct flow_cls_offload *f)
 {
 	void *misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
 	void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,  misc_parameters);
@@ -306,7 +306,7 @@ static int mlx5e_tc_tun_parse_geneve_params(struct mlx5e_priv *priv,
 
 static int mlx5e_tc_tun_parse_geneve(struct mlx5e_priv *priv,
 				     struct mlx5_flow_spec *spec,
-				     struct tc_cls_flower_offload *f,
+				     struct flow_cls_offload *f,
 				     void *headers_c,
 				     void *headers_v)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c
index 06908441d932..58b13192df23 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c
@@ -54,13 +54,13 @@ static int mlx5e_gen_ip_tunnel_header_gretap(char buf[],
 
 static int mlx5e_tc_tun_parse_gretap(struct mlx5e_priv *priv,
 				     struct mlx5_flow_spec *spec,
-				     struct tc_cls_flower_offload *f,
+				     struct flow_cls_offload *f,
 				     void *headers_c,
 				     void *headers_v)
 {
 	void *misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
 	void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 
 	MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_protocol);
 	MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_GRE);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c
index 2857b38527d6..37b176801bcc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c
@@ -16,9 +16,9 @@ static int mlx5e_tc_tun_calc_hlen_vxlan(struct mlx5e_encap_entry *e)
 }
 
 static int mlx5e_tc_tun_check_udp_dport_vxlan(struct mlx5e_priv *priv,
-					      struct tc_cls_flower_offload *f)
+					      struct flow_cls_offload *f)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct netlink_ext_ack *extack = f->common.extack;
 	struct flow_match_ports enc_ports;
 
@@ -44,7 +44,7 @@ static int mlx5e_tc_tun_check_udp_dport_vxlan(struct mlx5e_priv *priv,
 
 static int mlx5e_tc_tun_parse_udp_ports_vxlan(struct mlx5e_priv *priv,
 					      struct mlx5_flow_spec *spec,
-					      struct tc_cls_flower_offload *f,
+					      struct flow_cls_offload *f,
 					      void *headers_c,
 					      void *headers_v)
 {
@@ -100,11 +100,11 @@ static int mlx5e_gen_ip_tunnel_header_vxlan(char buf[],
 
 static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv,
 				    struct mlx5_flow_spec *spec,
-				    struct tc_cls_flower_offload *f,
+				    struct flow_cls_offload *f,
 				    void *headers_c,
 				    void *headers_v)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct netlink_ext_ack *extack = f->common.extack;
 	struct flow_match_enc_keyid enc_keyid;
 	void *misc_c, *misc_v;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 4c138789c547..6d0ae87c8ded 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3426,17 +3426,17 @@ out:
 
 #ifdef CONFIG_MLX5_ESWITCH
 static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv,
-				     struct tc_cls_flower_offload *cls_flower,
+				     struct flow_cls_offload *cls_flower,
 				     int flags)
 {
 	switch (cls_flower->command) {
-	case TC_CLSFLOWER_REPLACE:
+	case FLOW_CLS_REPLACE:
 		return mlx5e_configure_flower(priv->netdev, priv, cls_flower,
 					      flags);
-	case TC_CLSFLOWER_DESTROY:
+	case FLOW_CLS_DESTROY:
 		return mlx5e_delete_flower(priv->netdev, priv, cls_flower,
 					   flags);
-	case TC_CLSFLOWER_STATS:
+	case FLOW_CLS_STATS:
 		return mlx5e_stats_flower(priv->netdev, priv, cls_flower,
 					  flags);
 	default:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 62cb5408424c..10ef90a7bddd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -656,7 +656,7 @@ static void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv)
 
 static int
 mlx5e_rep_indr_offload(struct net_device *netdev,
-		       struct tc_cls_flower_offload *flower,
+		       struct flow_cls_offload *flower,
 		       struct mlx5e_rep_indr_block_priv *indr_priv)
 {
 	struct mlx5e_priv *priv = netdev_priv(indr_priv->rpriv->netdev);
@@ -664,13 +664,13 @@ mlx5e_rep_indr_offload(struct net_device *netdev,
 	int err = 0;
 
 	switch (flower->command) {
-	case TC_CLSFLOWER_REPLACE:
+	case FLOW_CLS_REPLACE:
 		err = mlx5e_configure_flower(netdev, priv, flower, flags);
 		break;
-	case TC_CLSFLOWER_DESTROY:
+	case FLOW_CLS_DESTROY:
 		err = mlx5e_delete_flower(netdev, priv, flower, flags);
 		break;
-	case TC_CLSFLOWER_STATS:
+	case FLOW_CLS_STATS:
 		err = mlx5e_stats_flower(netdev, priv, flower, flags);
 		break;
 	default:
@@ -1144,16 +1144,16 @@ static int mlx5e_rep_close(struct net_device *dev)
 
 static int
 mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv,
-			      struct tc_cls_flower_offload *cls_flower, int flags)
+			      struct flow_cls_offload *cls_flower, int flags)
 {
 	switch (cls_flower->command) {
-	case TC_CLSFLOWER_REPLACE:
+	case FLOW_CLS_REPLACE:
 		return mlx5e_configure_flower(priv->netdev, priv, cls_flower,
 					      flags);
-	case TC_CLSFLOWER_DESTROY:
+	case FLOW_CLS_DESTROY:
 		return mlx5e_delete_flower(priv->netdev, priv, cls_flower,
 					   flags);
-	case TC_CLSFLOWER_STATS:
+	case FLOW_CLS_STATS:
 		return mlx5e_stats_flower(priv->netdev, priv, cls_flower,
 					  flags);
 	default:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 3ac9b1e423ee..2d6436257f9d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1350,7 +1350,7 @@ static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
 
 static int parse_tunnel_attr(struct mlx5e_priv *priv,
 			     struct mlx5_flow_spec *spec,
-			     struct tc_cls_flower_offload *f,
+			     struct flow_cls_offload *f,
 			     struct net_device *filter_dev, u8 *match_level)
 {
 	struct netlink_ext_ack *extack = f->common.extack;
@@ -1358,7 +1358,7 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv,
 				       outer_headers);
 	void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
 				       outer_headers);
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	int err;
 
 	err = mlx5e_tc_tun_parse(filter_dev, priv, spec, f,
@@ -1478,7 +1478,7 @@ static void *get_match_headers_value(u32 flags,
 
 static int __parse_cls_flower(struct mlx5e_priv *priv,
 			      struct mlx5_flow_spec *spec,
-			      struct tc_cls_flower_offload *f,
+			      struct flow_cls_offload *f,
 			      struct net_device *filter_dev,
 			      u8 *match_level, u8 *tunnel_match_level)
 {
@@ -1491,7 +1491,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 				    misc_parameters);
 	void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
 				    misc_parameters);
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct flow_dissector *dissector = rule->match.dissector;
 	u16 addr_type = 0;
 	u8 ip_proto = 0;
@@ -1831,7 +1831,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 static int parse_cls_flower(struct mlx5e_priv *priv,
 			    struct mlx5e_tc_flow *flow,
 			    struct mlx5_flow_spec *spec,
-			    struct tc_cls_flower_offload *f,
+			    struct flow_cls_offload *f,
 			    struct net_device *filter_dev)
 {
 	struct netlink_ext_ack *extack = f->common.extack;
@@ -3115,7 +3115,7 @@ static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow)
 
 static int
 mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size,
-		 struct tc_cls_flower_offload *f, u16 flow_flags,
+		 struct flow_cls_offload *f, u16 flow_flags,
 		 struct mlx5e_tc_flow_parse_attr **__parse_attr,
 		 struct mlx5e_tc_flow **__flow)
 {
@@ -3149,7 +3149,7 @@ static void
 mlx5e_flow_esw_attr_init(struct mlx5_esw_flow_attr *esw_attr,
 			 struct mlx5e_priv *priv,
 			 struct mlx5e_tc_flow_parse_attr *parse_attr,
-			 struct tc_cls_flower_offload *f,
+			 struct flow_cls_offload *f,
 			 struct mlx5_eswitch_rep *in_rep,
 			 struct mlx5_core_dev *in_mdev)
 {
@@ -3171,13 +3171,13 @@ mlx5e_flow_esw_attr_init(struct mlx5_esw_flow_attr *esw_attr,
 
 static struct mlx5e_tc_flow *
 __mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
-		     struct tc_cls_flower_offload *f,
+		     struct flow_cls_offload *f,
 		     u16 flow_flags,
 		     struct net_device *filter_dev,
 		     struct mlx5_eswitch_rep *in_rep,
 		     struct mlx5_core_dev *in_mdev)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct netlink_ext_ack *extack = f->common.extack;
 	struct mlx5e_tc_flow_parse_attr *parse_attr;
 	struct mlx5e_tc_flow *flow;
@@ -3221,7 +3221,7 @@ out:
 	return ERR_PTR(err);
 }
 
-static int mlx5e_tc_add_fdb_peer_flow(struct tc_cls_flower_offload *f,
+static int mlx5e_tc_add_fdb_peer_flow(struct flow_cls_offload *f,
 				      struct mlx5e_tc_flow *flow,
 				      u16 flow_flags)
 {
@@ -3273,7 +3273,7 @@ out:
 
 static int
 mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
-		   struct tc_cls_flower_offload *f,
+		   struct flow_cls_offload *f,
 		   u16 flow_flags,
 		   struct net_device *filter_dev,
 		   struct mlx5e_tc_flow **__flow)
@@ -3307,12 +3307,12 @@ out:
 
 static int
 mlx5e_add_nic_flow(struct mlx5e_priv *priv,
-		   struct tc_cls_flower_offload *f,
+		   struct flow_cls_offload *f,
 		   u16 flow_flags,
 		   struct net_device *filter_dev,
 		   struct mlx5e_tc_flow **__flow)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct netlink_ext_ack *extack = f->common.extack;
 	struct mlx5e_tc_flow_parse_attr *parse_attr;
 	struct mlx5e_tc_flow *flow;
@@ -3358,7 +3358,7 @@ out:
 
 static int
 mlx5e_tc_add_flow(struct mlx5e_priv *priv,
-		  struct tc_cls_flower_offload *f,
+		  struct flow_cls_offload *f,
 		  int flags,
 		  struct net_device *filter_dev,
 		  struct mlx5e_tc_flow **flow)
@@ -3383,7 +3383,7 @@ mlx5e_tc_add_flow(struct mlx5e_priv *priv,
 }
 
 int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv,
-			   struct tc_cls_flower_offload *f, int flags)
+			   struct flow_cls_offload *f, int flags)
 {
 	struct netlink_ext_ack *extack = f->common.extack;
 	struct rhashtable *tc_ht = get_tc_ht(priv, flags);
@@ -3430,7 +3430,7 @@ static bool same_flow_direction(struct mlx5e_tc_flow *flow, int flags)
 }
 
 int mlx5e_delete_flower(struct net_device *dev, struct mlx5e_priv *priv,
-			struct tc_cls_flower_offload *f, int flags)
+			struct flow_cls_offload *f, int flags)
 {
 	struct rhashtable *tc_ht = get_tc_ht(priv, flags);
 	struct mlx5e_tc_flow *flow;
@@ -3449,7 +3449,7 @@ int mlx5e_delete_flower(struct net_device *dev, struct mlx5e_priv *priv,
 }
 
 int mlx5e_stats_flower(struct net_device *dev, struct mlx5e_priv *priv,
-		       struct tc_cls_flower_offload *f, int flags)
+		       struct flow_cls_offload *f, int flags)
 {
 	struct mlx5_devcom *devcom = priv->mdev->priv.devcom;
 	struct rhashtable *tc_ht = get_tc_ht(priv, flags);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
index 8f288cc53cee..3ab39275ca7d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
@@ -54,12 +54,12 @@ int mlx5e_tc_esw_init(struct rhashtable *tc_ht);
 void mlx5e_tc_esw_cleanup(struct rhashtable *tc_ht);
 
 int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv,
-			   struct tc_cls_flower_offload *f, int flags);
+			   struct flow_cls_offload *f, int flags);
 int mlx5e_delete_flower(struct net_device *dev, struct mlx5e_priv *priv,
-			struct tc_cls_flower_offload *f, int flags);
+			struct flow_cls_offload *f, int flags);
 
 int mlx5e_stats_flower(struct net_device *dev, struct mlx5e_priv *priv,
-		       struct tc_cls_flower_offload *f, int flags);
+		       struct flow_cls_offload *f, int flags);
 
 struct mlx5e_encap_entry;
 void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 35adc174f277..4d34d42b3b0e 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1508,21 +1508,21 @@ static int mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
 
 static int
 mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_acl_block *acl_block,
-			     struct tc_cls_flower_offload *f)
+			     struct flow_cls_offload *f)
 {
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_acl_block_mlxsw_sp(acl_block);
 
 	switch (f->command) {
-	case TC_CLSFLOWER_REPLACE:
+	case FLOW_CLS_REPLACE:
 		return mlxsw_sp_flower_replace(mlxsw_sp, acl_block, f);
-	case TC_CLSFLOWER_DESTROY:
+	case FLOW_CLS_DESTROY:
 		mlxsw_sp_flower_destroy(mlxsw_sp, acl_block, f);
 		return 0;
-	case TC_CLSFLOWER_STATS:
+	case FLOW_CLS_STATS:
 		return mlxsw_sp_flower_stats(mlxsw_sp, acl_block, f);
-	case TC_CLSFLOWER_TMPLT_CREATE:
+	case FLOW_CLS_TMPLT_CREATE:
 		return mlxsw_sp_flower_tmplt_create(mlxsw_sp, acl_block, f);
-	case TC_CLSFLOWER_TMPLT_DESTROY:
+	case FLOW_CLS_TMPLT_DESTROY:
 		mlxsw_sp_flower_tmplt_destroy(mlxsw_sp, acl_block, f);
 		return 0;
 	default:
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index abbb563db440..a252b080dda9 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -807,19 +807,19 @@ extern const struct mlxsw_afk_ops mlxsw_sp2_afk_ops;
 /* spectrum_flower.c */
 int mlxsw_sp_flower_replace(struct mlxsw_sp *mlxsw_sp,
 			    struct mlxsw_sp_acl_block *block,
-			    struct tc_cls_flower_offload *f);
+			    struct flow_cls_offload *f);
 void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp,
 			     struct mlxsw_sp_acl_block *block,
-			     struct tc_cls_flower_offload *f);
+			     struct flow_cls_offload *f);
 int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp,
 			  struct mlxsw_sp_acl_block *block,
-			  struct tc_cls_flower_offload *f);
+			  struct flow_cls_offload *f);
 int mlxsw_sp_flower_tmplt_create(struct mlxsw_sp *mlxsw_sp,
 				 struct mlxsw_sp_acl_block *block,
-				 struct tc_cls_flower_offload *f);
+				 struct flow_cls_offload *f);
 void mlxsw_sp_flower_tmplt_destroy(struct mlxsw_sp *mlxsw_sp,
 				   struct mlxsw_sp_acl_block *block,
-				   struct tc_cls_flower_offload *f);
+				   struct flow_cls_offload *f);
 
 /* spectrum_qdisc.c */
 int mlxsw_sp_tc_qdisc_init(struct mlxsw_sp_port *mlxsw_sp_port);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
index a83e1a986ef1..202e9a246019 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
@@ -121,10 +121,10 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp,
 }
 
 static int mlxsw_sp_flower_parse_meta(struct mlxsw_sp_acl_rule_info *rulei,
-				      struct tc_cls_flower_offload *f,
+				      struct flow_cls_offload *f,
 				      struct mlxsw_sp_acl_block *block)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct mlxsw_sp_port *mlxsw_sp_port;
 	struct net_device *ingress_dev;
 	struct flow_match_meta match;
@@ -164,7 +164,7 @@ static int mlxsw_sp_flower_parse_meta(struct mlxsw_sp_acl_rule_info *rulei,
 }
 
 static void mlxsw_sp_flower_parse_ipv4(struct mlxsw_sp_acl_rule_info *rulei,
-				       struct tc_cls_flower_offload *f)
+				       struct flow_cls_offload *f)
 {
 	struct flow_match_ipv4_addrs match;
 
@@ -179,7 +179,7 @@ static void mlxsw_sp_flower_parse_ipv4(struct mlxsw_sp_acl_rule_info *rulei,
 }
 
 static void mlxsw_sp_flower_parse_ipv6(struct mlxsw_sp_acl_rule_info *rulei,
-				       struct tc_cls_flower_offload *f)
+				       struct flow_cls_offload *f)
 {
 	struct flow_match_ipv6_addrs match;
 
@@ -213,10 +213,10 @@ static void mlxsw_sp_flower_parse_ipv6(struct mlxsw_sp_acl_rule_info *rulei,
 
 static int mlxsw_sp_flower_parse_ports(struct mlxsw_sp *mlxsw_sp,
 				       struct mlxsw_sp_acl_rule_info *rulei,
-				       struct tc_cls_flower_offload *f,
+				       struct flow_cls_offload *f,
 				       u8 ip_proto)
 {
-	const struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	const struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct flow_match_ports match;
 
 	if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_PORTS))
@@ -240,10 +240,10 @@ static int mlxsw_sp_flower_parse_ports(struct mlxsw_sp *mlxsw_sp,
 
 static int mlxsw_sp_flower_parse_tcp(struct mlxsw_sp *mlxsw_sp,
 				     struct mlxsw_sp_acl_rule_info *rulei,
-				     struct tc_cls_flower_offload *f,
+				     struct flow_cls_offload *f,
 				     u8 ip_proto)
 {
-	const struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	const struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct flow_match_tcp match;
 
 	if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_TCP))
@@ -265,10 +265,10 @@ static int mlxsw_sp_flower_parse_tcp(struct mlxsw_sp *mlxsw_sp,
 
 static int mlxsw_sp_flower_parse_ip(struct mlxsw_sp *mlxsw_sp,
 				    struct mlxsw_sp_acl_rule_info *rulei,
-				    struct tc_cls_flower_offload *f,
+				    struct flow_cls_offload *f,
 				    u16 n_proto)
 {
-	const struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	const struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct flow_match_ip match;
 
 	if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_IP))
@@ -299,9 +299,9 @@ static int mlxsw_sp_flower_parse_ip(struct mlxsw_sp *mlxsw_sp,
 static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp,
 				 struct mlxsw_sp_acl_block *block,
 				 struct mlxsw_sp_acl_rule_info *rulei,
-				 struct tc_cls_flower_offload *f)
+				 struct flow_cls_offload *f)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct flow_dissector *dissector = rule->match.dissector;
 	u16 n_proto_mask = 0;
 	u16 n_proto_key = 0;
@@ -426,7 +426,7 @@ static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp,
 
 int mlxsw_sp_flower_replace(struct mlxsw_sp *mlxsw_sp,
 			    struct mlxsw_sp_acl_block *block,
-			    struct tc_cls_flower_offload *f)
+			    struct flow_cls_offload *f)
 {
 	struct mlxsw_sp_acl_rule_info *rulei;
 	struct mlxsw_sp_acl_ruleset *ruleset;
@@ -473,7 +473,7 @@ err_rule_create:
 
 void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp,
 			     struct mlxsw_sp_acl_block *block,
-			     struct tc_cls_flower_offload *f)
+			     struct flow_cls_offload *f)
 {
 	struct mlxsw_sp_acl_ruleset *ruleset;
 	struct mlxsw_sp_acl_rule *rule;
@@ -495,7 +495,7 @@ void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp,
 
 int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp,
 			  struct mlxsw_sp_acl_block *block,
-			  struct tc_cls_flower_offload *f)
+			  struct flow_cls_offload *f)
 {
 	struct mlxsw_sp_acl_ruleset *ruleset;
 	struct mlxsw_sp_acl_rule *rule;
@@ -531,7 +531,7 @@ err_rule_get_stats:
 
 int mlxsw_sp_flower_tmplt_create(struct mlxsw_sp *mlxsw_sp,
 				 struct mlxsw_sp_acl_block *block,
-				 struct tc_cls_flower_offload *f)
+				 struct flow_cls_offload *f)
 {
 	struct mlxsw_sp_acl_ruleset *ruleset;
 	struct mlxsw_sp_acl_rule_info rulei;
@@ -552,7 +552,7 @@ int mlxsw_sp_flower_tmplt_create(struct mlxsw_sp *mlxsw_sp,
 
 void mlxsw_sp_flower_tmplt_destroy(struct mlxsw_sp *mlxsw_sp,
 				   struct mlxsw_sp_acl_block *block,
-				   struct tc_cls_flower_offload *f)
+				   struct flow_cls_offload *f)
 {
 	struct mlxsw_sp_acl_ruleset *ruleset;
 
diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c
index 5b92c2a03f3d..7aaddc09c185 100644
--- a/drivers/net/ethernet/mscc/ocelot_flower.c
+++ b/drivers/net/ethernet/mscc/ocelot_flower.c
@@ -19,7 +19,7 @@ static u16 get_prio(u32 prio)
 	return prio >> 16;
 }
 
-static int ocelot_flower_parse_action(struct tc_cls_flower_offload *f,
+static int ocelot_flower_parse_action(struct flow_cls_offload *f,
 				      struct ocelot_ace_rule *rule)
 {
 	const struct flow_action_entry *a;
@@ -44,10 +44,10 @@ static int ocelot_flower_parse_action(struct tc_cls_flower_offload *f,
 	return 0;
 }
 
-static int ocelot_flower_parse(struct tc_cls_flower_offload *f,
+static int ocelot_flower_parse(struct flow_cls_offload *f,
 			       struct ocelot_ace_rule *ocelot_rule)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct flow_dissector *dissector = rule->match.dissector;
 
 	if (dissector->used_keys &
@@ -174,7 +174,7 @@ finished_key_parsing:
 }
 
 static
-struct ocelot_ace_rule *ocelot_ace_rule_create(struct tc_cls_flower_offload *f,
+struct ocelot_ace_rule *ocelot_ace_rule_create(struct flow_cls_offload *f,
 					       struct ocelot_port_block *block)
 {
 	struct ocelot_ace_rule *rule;
@@ -188,7 +188,7 @@ struct ocelot_ace_rule *ocelot_ace_rule_create(struct tc_cls_flower_offload *f,
 	return rule;
 }
 
-static int ocelot_flower_replace(struct tc_cls_flower_offload *f,
+static int ocelot_flower_replace(struct flow_cls_offload *f,
 				 struct ocelot_port_block *port_block)
 {
 	struct ocelot_ace_rule *rule;
@@ -212,7 +212,7 @@ static int ocelot_flower_replace(struct tc_cls_flower_offload *f,
 	return 0;
 }
 
-static int ocelot_flower_destroy(struct tc_cls_flower_offload *f,
+static int ocelot_flower_destroy(struct flow_cls_offload *f,
 				 struct ocelot_port_block *port_block)
 {
 	struct ocelot_ace_rule rule;
@@ -230,7 +230,7 @@ static int ocelot_flower_destroy(struct tc_cls_flower_offload *f,
 	return 0;
 }
 
-static int ocelot_flower_stats_update(struct tc_cls_flower_offload *f,
+static int ocelot_flower_stats_update(struct flow_cls_offload *f,
 				      struct ocelot_port_block *port_block)
 {
 	struct ocelot_ace_rule rule;
@@ -247,15 +247,15 @@ static int ocelot_flower_stats_update(struct tc_cls_flower_offload *f,
 	return 0;
 }
 
-static int ocelot_setup_tc_cls_flower(struct tc_cls_flower_offload *f,
+static int ocelot_setup_tc_cls_flower(struct flow_cls_offload *f,
 				      struct ocelot_port_block *port_block)
 {
 	switch (f->command) {
-	case TC_CLSFLOWER_REPLACE:
+	case FLOW_CLS_REPLACE:
 		return ocelot_flower_replace(f, port_block);
-	case TC_CLSFLOWER_DESTROY:
+	case FLOW_CLS_DESTROY:
 		return ocelot_flower_destroy(f, port_block);
-	case TC_CLSFLOWER_STATS:
+	case FLOW_CLS_STATS:
 		return ocelot_flower_stats_update(f, port_block);
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index b6bd31fe44b2..5a54fe848de4 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -171,7 +171,7 @@ nfp_fl_output(struct nfp_app *app, struct nfp_fl_output *output,
 }
 
 static bool
-nfp_flower_tun_is_gre(struct tc_cls_flower_offload *flow, int start_idx)
+nfp_flower_tun_is_gre(struct flow_cls_offload *flow, int start_idx)
 {
 	struct flow_action_entry *act = flow->rule->action.entries;
 	int num_act = flow->rule->action.num_entries;
@@ -188,7 +188,7 @@ nfp_flower_tun_is_gre(struct tc_cls_flower_offload *flow, int start_idx)
 
 static enum nfp_flower_tun_type
 nfp_fl_get_tun_from_act(struct nfp_app *app,
-			struct tc_cls_flower_offload *flow,
+			struct flow_cls_offload *flow,
 			const struct flow_action_entry *act, int act_idx)
 {
 	const struct ip_tunnel_info *tun = act->tunnel;
@@ -669,11 +669,11 @@ struct nfp_flower_pedit_acts {
 };
 
 static int
-nfp_fl_commit_mangle(struct tc_cls_flower_offload *flow, char *nfp_action,
+nfp_fl_commit_mangle(struct flow_cls_offload *flow, char *nfp_action,
 		     int *a_len, struct nfp_flower_pedit_acts *set_act,
 		     u32 *csum_updated)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(flow);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
 	size_t act_size = 0;
 	u8 ip_proto = 0;
 
@@ -771,7 +771,7 @@ nfp_fl_commit_mangle(struct tc_cls_flower_offload *flow, char *nfp_action,
 
 static int
 nfp_fl_pedit(const struct flow_action_entry *act,
-	     struct tc_cls_flower_offload *flow, char *nfp_action, int *a_len,
+	     struct flow_cls_offload *flow, char *nfp_action, int *a_len,
 	     u32 *csum_updated, struct nfp_flower_pedit_acts *set_act,
 	     struct netlink_ext_ack *extack)
 {
@@ -858,7 +858,7 @@ nfp_flower_output_action(struct nfp_app *app,
 
 static int
 nfp_flower_loop_action(struct nfp_app *app, const struct flow_action_entry *act,
-		       struct tc_cls_flower_offload *flow,
+		       struct flow_cls_offload *flow,
 		       struct nfp_fl_payload *nfp_fl, int *a_len,
 		       struct net_device *netdev,
 		       enum nfp_flower_tun_type *tun_type, int *tun_out_cnt,
@@ -1021,7 +1021,7 @@ static bool nfp_fl_check_mangle_end(struct flow_action *flow_act,
 }
 
 int nfp_flower_compile_action(struct nfp_app *app,
-			      struct tc_cls_flower_offload *flow,
+			      struct flow_cls_offload *flow,
 			      struct net_device *netdev,
 			      struct nfp_fl_payload *nfp_flow,
 			      struct netlink_ext_ack *extack)
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 1f165d89582d..af9441d5787f 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -343,19 +343,19 @@ int nfp_flower_merge_offloaded_flows(struct nfp_app *app,
 				     struct nfp_fl_payload *sub_flow1,
 				     struct nfp_fl_payload *sub_flow2);
 int nfp_flower_compile_flow_match(struct nfp_app *app,
-				  struct tc_cls_flower_offload *flow,
+				  struct flow_cls_offload *flow,
 				  struct nfp_fl_key_ls *key_ls,
 				  struct net_device *netdev,
 				  struct nfp_fl_payload *nfp_flow,
 				  enum nfp_flower_tun_type tun_type,
 				  struct netlink_ext_ack *extack);
 int nfp_flower_compile_action(struct nfp_app *app,
-			      struct tc_cls_flower_offload *flow,
+			      struct flow_cls_offload *flow,
 			      struct net_device *netdev,
 			      struct nfp_fl_payload *nfp_flow,
 			      struct netlink_ext_ack *extack);
 int nfp_compile_flow_metadata(struct nfp_app *app,
-			      struct tc_cls_flower_offload *flow,
+			      struct flow_cls_offload *flow,
 			      struct nfp_fl_payload *nfp_flow,
 			      struct net_device *netdev,
 			      struct netlink_ext_ack *extack);
diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c
index c1690de19172..9cc3ba17ff69 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/match.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/match.c
@@ -10,9 +10,9 @@
 static void
 nfp_flower_compile_meta_tci(struct nfp_flower_meta_tci *ext,
 			    struct nfp_flower_meta_tci *msk,
-			    struct tc_cls_flower_offload *flow, u8 key_type)
+			    struct flow_cls_offload *flow, u8 key_type)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(flow);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
 	u16 tmp_tci;
 
 	memset(ext, 0, sizeof(struct nfp_flower_meta_tci));
@@ -78,9 +78,9 @@ nfp_flower_compile_port(struct nfp_flower_in_port *frame, u32 cmsg_port,
 static void
 nfp_flower_compile_mac(struct nfp_flower_mac_mpls *ext,
 		       struct nfp_flower_mac_mpls *msk,
-		       struct tc_cls_flower_offload *flow)
+		       struct flow_cls_offload *flow)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(flow);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
 
 	memset(ext, 0, sizeof(struct nfp_flower_mac_mpls));
 	memset(msk, 0, sizeof(struct nfp_flower_mac_mpls));
@@ -130,9 +130,9 @@ nfp_flower_compile_mac(struct nfp_flower_mac_mpls *ext,
 static void
 nfp_flower_compile_tport(struct nfp_flower_tp_ports *ext,
 			 struct nfp_flower_tp_ports *msk,
-			 struct tc_cls_flower_offload *flow)
+			 struct flow_cls_offload *flow)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(flow);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
 
 	memset(ext, 0, sizeof(struct nfp_flower_tp_ports));
 	memset(msk, 0, sizeof(struct nfp_flower_tp_ports));
@@ -151,9 +151,9 @@ nfp_flower_compile_tport(struct nfp_flower_tp_ports *ext,
 static void
 nfp_flower_compile_ip_ext(struct nfp_flower_ip_ext *ext,
 			  struct nfp_flower_ip_ext *msk,
-			  struct tc_cls_flower_offload *flow)
+			  struct flow_cls_offload *flow)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(flow);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
 
 	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) {
 		struct flow_match_basic match;
@@ -225,9 +225,9 @@ nfp_flower_compile_ip_ext(struct nfp_flower_ip_ext *ext,
 static void
 nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *ext,
 			struct nfp_flower_ipv4 *msk,
-			struct tc_cls_flower_offload *flow)
+			struct flow_cls_offload *flow)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(flow);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
 	struct flow_match_ipv4_addrs match;
 
 	memset(ext, 0, sizeof(struct nfp_flower_ipv4));
@@ -247,9 +247,9 @@ nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *ext,
 static void
 nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *ext,
 			struct nfp_flower_ipv6 *msk,
-			struct tc_cls_flower_offload *flow)
+			struct flow_cls_offload *flow)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(flow);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
 
 	memset(ext, 0, sizeof(struct nfp_flower_ipv6));
 	memset(msk, 0, sizeof(struct nfp_flower_ipv6));
@@ -269,7 +269,7 @@ nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *ext,
 
 static int
 nfp_flower_compile_geneve_opt(void *ext, void *msk,
-			      struct tc_cls_flower_offload *flow)
+			      struct flow_cls_offload *flow)
 {
 	struct flow_match_enc_opts match;
 
@@ -283,9 +283,9 @@ nfp_flower_compile_geneve_opt(void *ext, void *msk,
 static void
 nfp_flower_compile_tun_ipv4_addrs(struct nfp_flower_tun_ipv4 *ext,
 				  struct nfp_flower_tun_ipv4 *msk,
-				  struct tc_cls_flower_offload *flow)
+				  struct flow_cls_offload *flow)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(flow);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
 
 	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
 		struct flow_match_ipv4_addrs match;
@@ -301,9 +301,9 @@ nfp_flower_compile_tun_ipv4_addrs(struct nfp_flower_tun_ipv4 *ext,
 static void
 nfp_flower_compile_tun_ip_ext(struct nfp_flower_tun_ip_ext *ext,
 			      struct nfp_flower_tun_ip_ext *msk,
-			      struct tc_cls_flower_offload *flow)
+			      struct flow_cls_offload *flow)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(flow);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
 
 	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IP)) {
 		struct flow_match_ip match;
@@ -319,9 +319,9 @@ nfp_flower_compile_tun_ip_ext(struct nfp_flower_tun_ip_ext *ext,
 static void
 nfp_flower_compile_ipv4_gre_tun(struct nfp_flower_ipv4_gre_tun *ext,
 				struct nfp_flower_ipv4_gre_tun *msk,
-				struct tc_cls_flower_offload *flow)
+				struct flow_cls_offload *flow)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(flow);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
 
 	memset(ext, 0, sizeof(struct nfp_flower_ipv4_gre_tun));
 	memset(msk, 0, sizeof(struct nfp_flower_ipv4_gre_tun));
@@ -348,9 +348,9 @@ nfp_flower_compile_ipv4_gre_tun(struct nfp_flower_ipv4_gre_tun *ext,
 static void
 nfp_flower_compile_ipv4_udp_tun(struct nfp_flower_ipv4_udp_tun *ext,
 				struct nfp_flower_ipv4_udp_tun *msk,
-				struct tc_cls_flower_offload *flow)
+				struct flow_cls_offload *flow)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(flow);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
 
 	memset(ext, 0, sizeof(struct nfp_flower_ipv4_udp_tun));
 	memset(msk, 0, sizeof(struct nfp_flower_ipv4_udp_tun));
@@ -371,7 +371,7 @@ nfp_flower_compile_ipv4_udp_tun(struct nfp_flower_ipv4_udp_tun *ext,
 }
 
 int nfp_flower_compile_flow_match(struct nfp_app *app,
-				  struct tc_cls_flower_offload *flow,
+				  struct flow_cls_offload *flow,
 				  struct nfp_fl_key_ls *key_ls,
 				  struct net_device *netdev,
 				  struct nfp_fl_payload *nfp_flow,
diff --git a/drivers/net/ethernet/netronome/nfp/flower/metadata.c b/drivers/net/ethernet/netronome/nfp/flower/metadata.c
index dae60961c1eb..7c4a15e967df 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/metadata.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/metadata.c
@@ -290,7 +290,7 @@ nfp_check_mask_remove(struct nfp_app *app, char *mask_data, u32 mask_len,
 }
 
 int nfp_compile_flow_metadata(struct nfp_app *app,
-			      struct tc_cls_flower_offload *flow,
+			      struct flow_cls_offload *flow,
 			      struct nfp_fl_payload *nfp_flow,
 			      struct net_device *netdev,
 			      struct netlink_ext_ack *extack)
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index 1b38cfeb646c..7e725fa60347 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -121,9 +121,9 @@ nfp_flower_xmit_flow(struct nfp_app *app, struct nfp_fl_payload *nfp_flow,
 	return 0;
 }
 
-static bool nfp_flower_check_higher_than_mac(struct tc_cls_flower_offload *f)
+static bool nfp_flower_check_higher_than_mac(struct flow_cls_offload *f)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 
 	return flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS) ||
 	       flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_IPV6_ADDRS) ||
@@ -131,9 +131,9 @@ static bool nfp_flower_check_higher_than_mac(struct tc_cls_flower_offload *f)
 	       flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ICMP);
 }
 
-static bool nfp_flower_check_higher_than_l3(struct tc_cls_flower_offload *f)
+static bool nfp_flower_check_higher_than_l3(struct flow_cls_offload *f)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(f);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 
 	return flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_PORTS) ||
 	       flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ICMP);
@@ -212,11 +212,11 @@ static int
 nfp_flower_calculate_key_layers(struct nfp_app *app,
 				struct net_device *netdev,
 				struct nfp_fl_key_ls *ret_key_ls,
-				struct tc_cls_flower_offload *flow,
+				struct flow_cls_offload *flow,
 				enum nfp_flower_tun_type *tun_type,
 				struct netlink_ext_ack *extack)
 {
-	struct flow_rule *rule = tc_cls_flower_offload_flow_rule(flow);
+	struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
 	struct flow_dissector *dissector = rule->match.dissector;
 	struct flow_match_basic basic = { NULL, NULL};
 	struct nfp_flower_priv *priv = app->priv;
@@ -866,7 +866,7 @@ int nfp_flower_merge_offloaded_flows(struct nfp_app *app,
 				     struct nfp_fl_payload *sub_flow1,
 				     struct nfp_fl_payload *sub_flow2)
 {
-	struct tc_cls_flower_offload merge_tc_off;
+	struct flow_cls_offload merge_tc_off;
 	struct nfp_flower_priv *priv = app->priv;
 	struct netlink_ext_ack *extack = NULL;
 	struct nfp_fl_payload *merge_flow;
@@ -962,7 +962,7 @@ err_destroy_merge_flow:
  */
 static int
 nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev,
-		       struct tc_cls_flower_offload *flow)
+		       struct flow_cls_offload *flow)
 {
 	enum nfp_flower_tun_type tun_type = NFP_FL_TUNNEL_NONE;
 	struct nfp_flower_priv *priv = app->priv;
@@ -1125,7 +1125,7 @@ nfp_flower_del_linked_merge_flows(struct nfp_app *app,
  */
 static int
 nfp_flower_del_offload(struct nfp_app *app, struct net_device *netdev,
-		       struct tc_cls_flower_offload *flow)
+		       struct flow_cls_offload *flow)
 {
 	struct nfp_flower_priv *priv = app->priv;
 	struct netlink_ext_ack *extack = NULL;
@@ -1232,7 +1232,7 @@ nfp_flower_update_merge_stats(struct nfp_app *app,
  */
 static int
 nfp_flower_get_stats(struct nfp_app *app, struct net_device *netdev,
-		     struct tc_cls_flower_offload *flow)
+		     struct flow_cls_offload *flow)
 {
 	struct nfp_flower_priv *priv = app->priv;
 	struct netlink_ext_ack *extack = NULL;
@@ -1265,17 +1265,17 @@ nfp_flower_get_stats(struct nfp_app *app, struct net_device *netdev,
 
 static int
 nfp_flower_repr_offload(struct nfp_app *app, struct net_device *netdev,
-			struct tc_cls_flower_offload *flower)
+			struct flow_cls_offload *flower)
 {
 	if (!eth_proto_is_802_3(flower->common.protocol))
 		return -EOPNOTSUPP;
 
 	switch (flower->command) {
-	case TC_CLSFLOWER_REPLACE:
+	case FLOW_CLS_REPLACE:
 		return nfp_flower_add_offload(app, netdev, flower);
-	case TC_CLSFLOWER_DESTROY:
+	case FLOW_CLS_DESTROY:
 		return nfp_flower_del_offload(app, netdev, flower);
-	case TC_CLSFLOWER_STATS:
+	case FLOW_CLS_STATS:
 		return nfp_flower_get_stats(app, netdev, flower);
 	default:
 		return -EOPNOTSUPP;
@@ -1385,7 +1385,7 @@ static int nfp_flower_setup_indr_block_cb(enum tc_setup_type type,
 					  void *type_data, void *cb_priv)
 {
 	struct nfp_flower_indr_block_cb_priv *priv = cb_priv;
-	struct tc_cls_flower_offload *flower = type_data;
+	struct flow_cls_offload *flower = type_data;
 
 	if (flower->common.chain_index)
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index b972ab07c18b..0e931c04fecf 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -551,7 +551,7 @@ int qede_txq_has_work(struct qede_tx_queue *txq);
 void qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq, u8 count);
 void qede_update_rx_prod(struct qede_dev *edev, struct qede_rx_queue *rxq);
 int qede_add_tc_flower_fltr(struct qede_dev *edev, __be16 proto,
-			    struct tc_cls_flower_offload *f);
+			    struct flow_cls_offload *f);
 
 #define RX_RING_SIZE_POW	13
 #define RX_RING_SIZE		((u16)BIT(RX_RING_SIZE_POW))
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index add922b93d2c..9a6a9a008714 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -1943,7 +1943,7 @@ qede_parse_flow_attr(struct qede_dev *edev, __be16 proto,
 }
 
 int qede_add_tc_flower_fltr(struct qede_dev *edev, __be16 proto,
-			    struct tc_cls_flower_offload *f)
+			    struct flow_cls_offload *f)
 {
 	struct qede_arfs_fltr_node *n;
 	int min_hlen, rc = -EINVAL;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 1be593a6e20d..8d1c208f778f 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -548,13 +548,13 @@ static int qede_setup_tc(struct net_device *ndev, u8 num_tc)
 }
 
 static int
-qede_set_flower(struct qede_dev *edev, struct tc_cls_flower_offload *f,
+qede_set_flower(struct qede_dev *edev, struct flow_cls_offload *f,
 		__be16 proto)
 {
 	switch (f->command) {
-	case TC_CLSFLOWER_REPLACE:
+	case FLOW_CLS_REPLACE:
 		return qede_add_tc_flower_fltr(edev, proto, f);
-	case TC_CLSFLOWER_DESTROY:
+	case FLOW_CLS_DESTROY:
 		return qede_delete_flow_filter(edev, f->cookie);
 	default:
 		return -EOPNOTSUPP;
@@ -564,7 +564,7 @@ qede_set_flower(struct qede_dev *edev, struct tc_cls_flower_offload *f,
 static int qede_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 				  void *cb_priv)
 {
-	struct tc_cls_flower_offload *f;
+	struct flow_cls_offload *f;
 	struct qede_dev *edev = cb_priv;
 
 	if (!tc_cls_can_offload_and_chain0(edev->ndev, type_data))
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 42a36a346003..db337299e81e 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -303,4 +303,34 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       struct list_head *driver_list, tc_setup_cb_t *cb,
 			       void *cb_ident, void *cb_priv, bool ingress_only);
 
+enum flow_cls_command {
+	FLOW_CLS_REPLACE,
+	FLOW_CLS_DESTROY,
+	FLOW_CLS_STATS,
+	FLOW_CLS_TMPLT_CREATE,
+	FLOW_CLS_TMPLT_DESTROY,
+};
+
+struct flow_cls_common_offload {
+	u32 chain_index;
+	__be16 protocol;
+	u32 prio;
+	struct netlink_ext_ack *extack;
+};
+
+struct flow_cls_offload {
+	struct flow_cls_common_offload common;
+	enum flow_cls_command command;
+	unsigned long cookie;
+	struct flow_rule *rule;
+	struct flow_stats stats;
+	u32 classid;
+};
+
+static inline struct flow_rule *
+flow_cls_offload_flow_rule(struct flow_cls_offload *flow_cmd)
+{
+	return flow_cmd->rule;
+}
+
 #endif /* _NET_FLOW_OFFLOAD_H */
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 17c388090c3c..b03d466182db 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -535,13 +535,6 @@ int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
 		     void *type_data, bool err_stop);
 unsigned int tcf_exts_num_actions(struct tcf_exts *exts);
 
-struct tc_cls_common_offload {
-	u32 chain_index;
-	__be16 protocol;
-	u32 prio;
-	struct netlink_ext_ack *extack;
-};
-
 struct tc_cls_u32_knode {
 	struct tcf_exts *exts;
 	struct tcf_result *res;
@@ -569,7 +562,7 @@ enum tc_clsu32_command {
 };
 
 struct tc_cls_u32_offload {
-	struct tc_cls_common_offload common;
+	struct flow_cls_common_offload common;
 	/* knode values */
 	enum tc_clsu32_command command;
 	union {
@@ -596,7 +589,7 @@ static inline bool tc_can_offload_extack(const struct net_device *dev,
 
 static inline bool
 tc_cls_can_offload_and_chain0(const struct net_device *dev,
-			      struct tc_cls_common_offload *common)
+			      struct flow_cls_common_offload *common)
 {
 	if (!tc_can_offload_extack(dev, common->extack))
 		return false;
@@ -638,7 +631,7 @@ static inline bool tc_in_hw(u32 flags)
 }
 
 static inline void
-tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
+tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common,
 			   const struct tcf_proto *tp, u32 flags,
 			   struct netlink_ext_ack *extack)
 {
@@ -649,29 +642,6 @@ tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
 		cls_common->extack = extack;
 }
 
-enum tc_fl_command {
-	TC_CLSFLOWER_REPLACE,
-	TC_CLSFLOWER_DESTROY,
-	TC_CLSFLOWER_STATS,
-	TC_CLSFLOWER_TMPLT_CREATE,
-	TC_CLSFLOWER_TMPLT_DESTROY,
-};
-
-struct tc_cls_flower_offload {
-	struct tc_cls_common_offload common;
-	enum tc_fl_command command;
-	unsigned long cookie;
-	struct flow_rule *rule;
-	struct flow_stats stats;
-	u32 classid;
-};
-
-static inline struct flow_rule *
-tc_cls_flower_offload_flow_rule(struct tc_cls_flower_offload *tc_flow_cmd)
-{
-	return tc_flow_cmd->rule;
-}
-
 enum tc_matchall_command {
 	TC_CLSMATCHALL_REPLACE,
 	TC_CLSMATCHALL_DESTROY,
@@ -679,7 +649,7 @@ enum tc_matchall_command {
 };
 
 struct tc_cls_matchall_offload {
-	struct tc_cls_common_offload common;
+	struct flow_cls_common_offload common;
 	enum tc_matchall_command command;
 	struct flow_rule *rule;
 	struct flow_stats stats;
@@ -692,7 +662,7 @@ enum tc_clsbpf_command {
 };
 
 struct tc_cls_bpf_offload {
-	struct tc_cls_common_offload common;
+	struct flow_cls_common_offload common;
 	enum tc_clsbpf_command command;
 	struct tcf_exts *exts;
 	struct bpf_prog *prog;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index bec37e16347f..38d6e85693fc 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -409,14 +409,14 @@ static void fl_destroy_filter_work(struct work_struct *work)
 static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
 				 bool rtnl_held, struct netlink_ext_ack *extack)
 {
-	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
+	struct flow_cls_offload cls_flower = {};
 
 	if (!rtnl_held)
 		rtnl_lock();
 
 	tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
-	cls_flower.command = TC_CLSFLOWER_DESTROY;
+	cls_flower.command = FLOW_CLS_DESTROY;
 	cls_flower.cookie = (unsigned long) f;
 
 	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
@@ -434,8 +434,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 				struct netlink_ext_ack *extack)
 {
 	struct cls_fl_head *head = fl_head_dereference(tp);
-	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
+	struct flow_cls_offload cls_flower = {};
 	bool skip_sw = tc_skip_sw(f->flags);
 	int err = 0;
 
@@ -449,7 +449,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	}
 
 	tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
-	cls_flower.command = TC_CLSFLOWER_REPLACE;
+	cls_flower.command = FLOW_CLS_REPLACE;
 	cls_flower.cookie = (unsigned long) f;
 	cls_flower.rule->match.dissector = &f->mask->dissector;
 	cls_flower.rule->match.mask = &f->mask->key;
@@ -498,14 +498,14 @@ errout:
 static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
 			       bool rtnl_held)
 {
-	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
+	struct flow_cls_offload cls_flower = {};
 
 	if (!rtnl_held)
 		rtnl_lock();
 
 	tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
-	cls_flower.command = TC_CLSFLOWER_STATS;
+	cls_flower.command = FLOW_CLS_STATS;
 	cls_flower.cookie = (unsigned long) f;
 	cls_flower.classid = f->res.classid;
 
@@ -1803,8 +1803,8 @@ fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
 static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
 			void *cb_priv, struct netlink_ext_ack *extack)
 {
-	struct tc_cls_flower_offload cls_flower = {};
 	struct tcf_block *block = tp->chain->block;
+	struct flow_cls_offload cls_flower = {};
 	struct cls_fl_filter *f = NULL;
 	int err;
 
@@ -1825,7 +1825,7 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
 		tc_cls_common_offload_init(&cls_flower.common, tp, f->flags,
 					   extack);
 		cls_flower.command = add ?
-			TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY;
+			FLOW_CLS_REPLACE : FLOW_CLS_DESTROY;
 		cls_flower.cookie = (unsigned long)f;
 		cls_flower.rule->match.dissector = &f->mask->dissector;
 		cls_flower.rule->match.mask = &f->mask->key;
@@ -1869,7 +1869,7 @@ next_flow:
 static int fl_hw_create_tmplt(struct tcf_chain *chain,
 			      struct fl_flow_tmplt *tmplt)
 {
-	struct tc_cls_flower_offload cls_flower = {};
+	struct flow_cls_offload cls_flower = {};
 	struct tcf_block *block = chain->block;
 
 	cls_flower.rule = flow_rule_alloc(0);
@@ -1877,7 +1877,7 @@ static int fl_hw_create_tmplt(struct tcf_chain *chain,
 		return -ENOMEM;
 
 	cls_flower.common.chain_index = chain->index;
-	cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE;
+	cls_flower.command = FLOW_CLS_TMPLT_CREATE;
 	cls_flower.cookie = (unsigned long) tmplt;
 	cls_flower.rule->match.dissector = &tmplt->dissector;
 	cls_flower.rule->match.mask = &tmplt->mask;
@@ -1895,11 +1895,11 @@ static int fl_hw_create_tmplt(struct tcf_chain *chain,
 static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
 				struct fl_flow_tmplt *tmplt)
 {
-	struct tc_cls_flower_offload cls_flower = {};
+	struct flow_cls_offload cls_flower = {};
 	struct tcf_block *block = chain->block;
 
 	cls_flower.common.chain_index = chain->index;
-	cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY;
+	cls_flower.command = FLOW_CLS_TMPLT_DESTROY;
 	cls_flower.cookie = (unsigned long) tmplt;
 
 	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
-- 
cgit v1.2.3


From c9626a2cbdb20e26587b3fad99960520a023432b Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jul 2019 23:00:43 +0200
Subject: netfilter: nf_tables: add hardware offload support

This patch adds hardware offload support for nftables through the
existing netdev_ops->ndo_setup_tc() interface, the TC_SETUP_CLSFLOWER
classifier and the flow rule API. This hardware offload support is
available for the NFPROTO_NETDEV family and the ingress hook.

Each nftables expression has a new ->offload interface that is used to
populate the flow rule object, which is attached to the transaction
object.

There is a new per-table NFT_TABLE_F_HW flag that, when set, offloads
an entire table, including all of its chains.

This patch supports basic metadata (layer 3 and 4 protocol numbers),
5-tuple payload matching and the accept/drop actions; hardware offload
is supported for basechains only.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netfilter/nf_tables.h         |  14 ++
 include/net/netfilter/nf_tables_offload.h |  76 +++++++++
 include/uapi/linux/netfilter/nf_tables.h  |   2 +
 net/netfilter/Makefile                    |   2 +-
 net/netfilter/nf_tables_api.c             |  39 ++++-
 net/netfilter/nf_tables_offload.c         | 267 ++++++++++++++++++++++++++++++
 net/netfilter/nft_cmp.c                   |  53 ++++++
 net/netfilter/nft_immediate.c             |  31 ++++
 net/netfilter/nft_meta.c                  |  27 +++
 net/netfilter/nft_payload.c               | 187 +++++++++++++++++++++
 10 files changed, 691 insertions(+), 7 deletions(-)
 create mode 100644 include/net/netfilter/nf_tables_offload.h
 create mode 100644 net/netfilter/nf_tables_offload.c

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 9e8493aad49d..35dfdd9f69b3 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -161,6 +161,7 @@ struct nft_ctx {
 	const struct nlattr * const 	*nla;
 	u32				portid;
 	u32				seq;
+	u16				flags;
 	u8				family;
 	u8				level;
 	bool				report;
@@ -735,6 +736,9 @@ enum nft_trans_phase {
 	NFT_TRANS_RELEASE
 };
 
+struct nft_flow_rule;
+struct nft_offload_ctx;
+
 /**
  *	struct nft_expr_ops - nf_tables expression operations
  *
@@ -777,6 +781,10 @@ struct nft_expr_ops {
 						    const struct nft_data **data);
 	bool				(*gc)(struct net *net,
 					      const struct nft_expr *expr);
+	int				(*offload)(struct nft_offload_ctx *ctx,
+						   struct nft_flow_rule *flow,
+						   const struct nft_expr *expr);
+	u32				offload_flags;
 	const struct nft_expr_type	*type;
 	void				*data;
 };
@@ -859,6 +867,7 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule)
 
 enum nft_chain_flags {
 	NFT_BASE_CHAIN			= 0x1,
+	NFT_CHAIN_HW_OFFLOAD		= 0x2,
 };
 
 /**
@@ -942,6 +951,7 @@ struct nft_stats {
  *	@stats: per-cpu chain stats
  *	@chain: the chain
  *	@dev_name: device name that this base chain is attached to (if any)
+ *	@cb_list: list of flow block callbacks (for hardware offload)
  */
 struct nft_base_chain {
 	struct nf_hook_ops		ops;
@@ -951,6 +961,7 @@ struct nft_base_chain {
 	struct nft_stats __percpu	*stats;
 	struct nft_chain		chain;
 	char 				dev_name[IFNAMSIZ];
+	struct list_head		cb_list;
 };
 
 static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain)
@@ -1322,11 +1333,14 @@ struct nft_trans {
 
 struct nft_trans_rule {
 	struct nft_rule			*rule;
+	struct nft_flow_rule		*flow;
 	u32				rule_id;
 };
 
 #define nft_trans_rule(trans)	\
 	(((struct nft_trans_rule *)trans->data)->rule)
+#define nft_trans_flow_rule(trans)	\
+	(((struct nft_trans_rule *)trans->data)->flow)
 #define nft_trans_rule_id(trans)	\
 	(((struct nft_trans_rule *)trans->data)->rule_id)
 
diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h
new file mode 100644
index 000000000000..3196663a10e3
--- /dev/null
+++ b/include/net/netfilter/nf_tables_offload.h
@@ -0,0 +1,76 @@
+#ifndef _NET_NF_TABLES_OFFLOAD_H
+#define _NET_NF_TABLES_OFFLOAD_H
+
+#include <net/flow_offload.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_offload_reg {
+	u32		key;
+	u32		len;
+	u32		base_offset;
+	u32		offset;
+	struct nft_data	mask;
+};
+
+enum nft_offload_dep_type {
+	NFT_OFFLOAD_DEP_UNSPEC	= 0,
+	NFT_OFFLOAD_DEP_NETWORK,
+	NFT_OFFLOAD_DEP_TRANSPORT,
+};
+
+struct nft_offload_ctx {
+	struct {
+		enum nft_offload_dep_type	type;
+		__be16				l3num;
+		u8				protonum;
+	} dep;
+	unsigned int				num_actions;
+	struct nft_offload_reg			regs[NFT_REG32_15 + 1];
+};
+
+void nft_offload_set_dependency(struct nft_offload_ctx *ctx,
+				enum nft_offload_dep_type type);
+void nft_offload_update_dependency(struct nft_offload_ctx *ctx,
+				   const void *data, u32 len);
+
+struct nft_flow_key {
+	struct flow_dissector_key_basic			basic;
+	union {
+		struct flow_dissector_key_ipv4_addrs	ipv4;
+		struct flow_dissector_key_ipv6_addrs	ipv6;
+	};
+	struct flow_dissector_key_ports			tp;
+	struct flow_dissector_key_ip			ip;
+	struct flow_dissector_key_vlan			vlan;
+	struct flow_dissector_key_eth_addrs		eth_addrs;
+} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
+
+struct nft_flow_match {
+	struct flow_dissector	dissector;
+	struct nft_flow_key	key;
+	struct nft_flow_key	mask;
+};
+
+struct nft_flow_rule {
+	__be16			proto;
+	struct nft_flow_match	match;
+	struct flow_rule	*rule;
+};
+
+#define NFT_OFFLOAD_F_ACTION	(1 << 0)
+
+struct nft_rule;
+struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule);
+void nft_flow_rule_destroy(struct nft_flow_rule *flow);
+int nft_flow_rule_offload_commit(struct net *net);
+
+#define NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg)		\
+	(__reg)->base_offset	=					\
+		offsetof(struct nft_flow_key, __base);			\
+	(__reg)->offset		=					\
+		offsetof(struct nft_flow_key, __base.__field);		\
+	(__reg)->len		= __len;				\
+	(__reg)->key		= __key;				\
+	memset(&(__reg)->mask, 0xff, (__reg)->len);
+
+#endif
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 0e3462dfb182..82abaa183fc3 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -192,6 +192,7 @@ enum nft_table_attributes {
  * @NFTA_CHAIN_USE: number of references to this chain (NLA_U32)
  * @NFTA_CHAIN_TYPE: type name of the string (NLA_NUL_STRING)
  * @NFTA_CHAIN_COUNTERS: counter specification of the chain (NLA_NESTED: nft_counter_attributes)
+ * @NFTA_CHAIN_FLAGS: chain flags
  */
 enum nft_chain_attributes {
 	NFTA_CHAIN_UNSPEC,
@@ -204,6 +205,7 @@ enum nft_chain_attributes {
 	NFTA_CHAIN_TYPE,
 	NFTA_CHAIN_COUNTERS,
 	NFTA_CHAIN_PAD,
+	NFTA_CHAIN_FLAGS,
 	__NFTA_CHAIN_MAX
 };
 #define NFTA_CHAIN_MAX		(__NFTA_CHAIN_MAX - 1)
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index deada20975ff..9270a7fae484 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -78,7 +78,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
 		  nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
 		  nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
 		  nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \
-		  nft_chain_route.o
+		  nft_chain_route.o nf_tables_offload.o
 
 nf_tables_set-objs := nf_tables_set_core.o \
 		      nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d22d00ca78c1..ed17a7c29b86 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -18,6 +18,7 @@
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 
@@ -97,6 +98,7 @@ static void nft_ctx_init(struct nft_ctx *ctx,
 	ctx->nla   	= nla;
 	ctx->portid	= NETLINK_CB(skb).portid;
 	ctx->report	= nlmsg_report(nlh);
+	ctx->flags	= nlh->nlmsg_flags;
 	ctx->seq	= nlh->nlmsg_seq;
 }
 
@@ -1169,6 +1171,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
 	[NFTA_CHAIN_POLICY]	= { .type = NLA_U32 },
 	[NFTA_CHAIN_TYPE]	= { .type = NLA_STRING },
 	[NFTA_CHAIN_COUNTERS]	= { .type = NLA_NESTED },
+	[NFTA_CHAIN_FLAGS]	= { .type = NLA_U32 },
 };
 
 static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
@@ -1603,7 +1606,7 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha
 }
 
 static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
-			      u8 policy)
+			      u8 policy, u32 flags)
 {
 	const struct nlattr * const *nla = ctx->nla;
 	struct nft_table *table = ctx->table;
@@ -1657,8 +1660,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		ops->hook	= hook.type->hooks[ops->hooknum];
 		ops->dev	= hook.dev;
 
-		chain->flags |= NFT_BASE_CHAIN;
+		chain->flags |= NFT_BASE_CHAIN | flags;
 		basechain->policy = NF_ACCEPT;
+		INIT_LIST_HEAD(&basechain->cb_list);
 	} else {
 		chain = kzalloc(sizeof(*chain), GFP_KERNEL);
 		if (chain == NULL)
@@ -1718,7 +1722,8 @@ err1:
 	return err;
 }
 
-static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy)
+static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
+			      u32 flags)
 {
 	const struct nlattr * const *nla = ctx->nla;
 	struct nft_table *table = ctx->table;
@@ -1730,6 +1735,9 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy)
 	struct nft_trans *trans;
 	int err;
 
+	if (chain->flags ^ flags)
+		return -EOPNOTSUPP;
+
 	if (nla[NFTA_CHAIN_HOOK]) {
 		if (!nft_is_base_chain(chain))
 			return -EBUSY;
@@ -1835,6 +1843,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 	u8 policy = NF_ACCEPT;
 	struct nft_ctx ctx;
 	u64 handle = 0;
+	u32 flags = 0;
 
 	lockdep_assert_held(&net->nft.commit_mutex);
 
@@ -1889,6 +1898,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 		}
 	}
 
+	if (nla[NFTA_CHAIN_FLAGS])
+		flags = ntohl(nla_get_be32(nla[NFTA_CHAIN_FLAGS]));
+
 	nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
 
 	if (chain != NULL) {
@@ -1899,10 +1911,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
-		return nf_tables_updchain(&ctx, genmask, policy);
+		return nf_tables_updchain(&ctx, genmask, policy, flags);
 	}
 
-	return nf_tables_addchain(&ctx, family, genmask, policy);
+	return nf_tables_addchain(&ctx, family, genmask, policy, flags);
 }
 
 static int nf_tables_delchain(struct net *net, struct sock *nlsk,
@@ -2658,6 +2670,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 	u8 genmask = nft_genmask_next(net);
 	struct nft_expr_info *info = NULL;
 	int family = nfmsg->nfgen_family;
+	struct nft_flow_rule *flow;
 	struct nft_table *table;
 	struct nft_chain *chain;
 	struct nft_rule *rule, *old_rule = NULL;
@@ -2804,7 +2817,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 
 		list_add_tail_rcu(&rule->list, &old_rule->list);
 	} else {
-		if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
+		trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
+		if (!trans) {
 			err = -ENOMEM;
 			goto err2;
 		}
@@ -2827,6 +2841,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 	if (net->nft.validate_state == NFT_VALIDATE_DO)
 		return nft_table_validate(net, table);
 
+	if (chain->flags & NFT_CHAIN_HW_OFFLOAD) {
+		flow = nft_flow_rule_create(rule);
+		if (IS_ERR(flow))
+			return PTR_ERR(flow);
+
+		nft_trans_flow_rule(trans) = flow;
+	}
+
 	return 0;
 err2:
 	nf_tables_rule_release(&ctx, rule);
@@ -6624,6 +6646,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 	struct nft_trans_elem *te;
 	struct nft_chain *chain;
 	struct nft_table *table;
+	int err;
 
 	if (list_empty(&net->nft.commit_list)) {
 		mutex_unlock(&net->nft.commit_mutex);
@@ -6634,6 +6657,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 	if (nf_tables_validate(net) < 0)
 		return -EAGAIN;
 
+	err = nft_flow_rule_offload_commit(net);
+	if (err < 0)
+		return err;
+
 	/* 1.  Allocate space for next generation rules_gen_X[] */
 	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
 		int ret;
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
new file mode 100644
index 000000000000..2c3302845f67
--- /dev/null
+++ b/net/netfilter/nf_tables_offload.c
@@ -0,0 +1,267 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <net/flow_offload.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
+#include <net/pkt_cls.h>
+
+static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions)
+{
+	struct nft_flow_rule *flow;
+
+	flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL);
+	if (!flow)
+		return NULL;
+
+	flow->rule = flow_rule_alloc(num_actions);
+	if (!flow->rule) {
+		kfree(flow);
+		return NULL;
+	}
+
+	flow->rule->match.dissector	= &flow->match.dissector;
+	flow->rule->match.mask		= &flow->match.mask;
+	flow->rule->match.key		= &flow->match.key;
+
+	return flow;
+}
+
+struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule)
+{
+	struct nft_offload_ctx ctx = {
+		.dep	= {
+			.type	= NFT_OFFLOAD_DEP_UNSPEC,
+		},
+	};
+	struct nft_flow_rule *flow;
+	int num_actions = 0, err;
+	struct nft_expr *expr;
+
+	expr = nft_expr_first(rule);
+	while (expr->ops && expr != nft_expr_last(rule)) {
+		if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION)
+			num_actions++;
+
+		expr = nft_expr_next(expr);
+	}
+
+	flow = nft_flow_rule_alloc(num_actions);
+	if (!flow)
+		return ERR_PTR(-ENOMEM);
+
+	expr = nft_expr_first(rule);
+	while (expr->ops && expr != nft_expr_last(rule)) {
+		if (!expr->ops->offload) {
+			err = -EOPNOTSUPP;
+			goto err_out;
+		}
+		err = expr->ops->offload(&ctx, flow, expr);
+		if (err < 0)
+			goto err_out;
+
+		expr = nft_expr_next(expr);
+	}
+	flow->proto = ctx.dep.l3num;
+
+	return flow;
+err_out:
+	nft_flow_rule_destroy(flow);
+
+	return ERR_PTR(err);
+}
+
+void nft_flow_rule_destroy(struct nft_flow_rule *flow)
+{
+	kfree(flow->rule);
+	kfree(flow);
+}
+
+void nft_offload_set_dependency(struct nft_offload_ctx *ctx,
+				enum nft_offload_dep_type type)
+{
+	ctx->dep.type = type;
+}
+
+void nft_offload_update_dependency(struct nft_offload_ctx *ctx,
+				   const void *data, u32 len)
+{
+	switch (ctx->dep.type) {
+	case NFT_OFFLOAD_DEP_NETWORK:
+		WARN_ON(len != sizeof(__u16));
+		memcpy(&ctx->dep.l3num, data, sizeof(__u16));
+		break;
+	case NFT_OFFLOAD_DEP_TRANSPORT:
+		WARN_ON(len != sizeof(__u8));
+		memcpy(&ctx->dep.protonum, data, sizeof(__u8));
+		break;
+	default:
+		break;
+	}
+	ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC;
+}
+
+static void nft_flow_offload_common_init(struct flow_cls_common_offload *common,
+					 __be16 proto,
+					struct netlink_ext_ack *extack)
+{
+	common->protocol = proto;
+	common->extack = extack;
+}
+
+static int nft_setup_cb_call(struct nft_base_chain *basechain,
+			     enum tc_setup_type type, void *type_data)
+{
+	struct flow_block_cb *block_cb;
+	int err;
+
+	list_for_each_entry(block_cb, &basechain->cb_list, list) {
+		err = block_cb->cb(type, type_data, block_cb->cb_priv);
+		if (err < 0)
+			return err;
+	}
+	return 0;
+}
+
+static int nft_flow_offload_rule(struct nft_trans *trans,
+				 enum flow_cls_command command)
+{
+	struct nft_flow_rule *flow = nft_trans_flow_rule(trans);
+	struct nft_rule *rule = nft_trans_rule(trans);
+	struct flow_cls_offload cls_flow = {};
+	struct nft_base_chain *basechain;
+	struct netlink_ext_ack extack;
+	__be16 proto = ETH_P_ALL;
+
+	if (!nft_is_base_chain(trans->ctx.chain))
+		return -EOPNOTSUPP;
+
+	basechain = nft_base_chain(trans->ctx.chain);
+
+	if (flow)
+		proto = flow->proto;
+
+	nft_flow_offload_common_init(&cls_flow.common, proto, &extack);
+	cls_flow.command = command;
+	cls_flow.cookie = (unsigned long) rule;
+	if (flow)
+		cls_flow.rule = flow->rule;
+
+	return nft_setup_cb_call(basechain, TC_SETUP_CLSFLOWER, &cls_flow);
+}
+
+static int nft_flow_offload_bind(struct flow_block_offload *bo,
+				 struct nft_base_chain *basechain)
+{
+	list_splice(&bo->cb_list, &basechain->cb_list);
+	return 0;
+}
+
+static int nft_flow_offload_unbind(struct flow_block_offload *bo,
+				   struct nft_base_chain *basechain)
+{
+	struct flow_block_cb *block_cb, *next;
+
+	list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+		list_del(&block_cb->list);
+		flow_block_cb_free(block_cb);
+	}
+
+	return 0;
+}
+
+#define FLOW_SETUP_BLOCK TC_SETUP_BLOCK
+
+static int nft_flow_offload_chain(struct nft_trans *trans,
+				  enum flow_block_command cmd)
+{
+	struct nft_chain *chain = trans->ctx.chain;
+	struct netlink_ext_ack extack = {};
+	struct flow_block_offload bo = {};
+	struct nft_base_chain *basechain;
+	struct net_device *dev;
+	int err;
+
+	if (!nft_is_base_chain(chain))
+		return -EOPNOTSUPP;
+
+	basechain = nft_base_chain(chain);
+	dev = basechain->ops.dev;
+	if (!dev || !dev->netdev_ops->ndo_setup_tc)
+		return -EOPNOTSUPP;
+
+	/* Only default policy to accept is supported for now. */
+	if (cmd == FLOW_BLOCK_BIND &&
+	    nft_trans_chain_policy(trans) != -1 &&
+	    nft_trans_chain_policy(trans) != NF_ACCEPT)
+		return -EOPNOTSUPP;
+
+	bo.command = cmd;
+	bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+	bo.extack = &extack;
+	INIT_LIST_HEAD(&bo.cb_list);
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, FLOW_SETUP_BLOCK, &bo);
+	if (err < 0)
+		return err;
+
+	switch (cmd) {
+	case FLOW_BLOCK_BIND:
+		err = nft_flow_offload_bind(&bo, basechain);
+		break;
+	case FLOW_BLOCK_UNBIND:
+		err = nft_flow_offload_unbind(&bo, basechain);
+		break;
+	}
+
+	return err;
+}
+
+int nft_flow_rule_offload_commit(struct net *net)
+{
+	struct nft_trans *trans;
+	int err = 0;
+
+	list_for_each_entry(trans, &net->nft.commit_list, list) {
+		if (trans->ctx.family != NFPROTO_NETDEV)
+			continue;
+
+		switch (trans->msg_type) {
+		case NFT_MSG_NEWCHAIN:
+			if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+				continue;
+
+			err = nft_flow_offload_chain(trans, FLOW_BLOCK_BIND);
+			break;
+		case NFT_MSG_DELCHAIN:
+			if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+				continue;
+
+			err = nft_flow_offload_chain(trans, FLOW_BLOCK_UNBIND);
+			break;
+		case NFT_MSG_NEWRULE:
+			if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+				continue;
+
+			if (trans->ctx.flags & NLM_F_REPLACE ||
+			    !(trans->ctx.flags & NLM_F_APPEND))
+				return -EOPNOTSUPP;
+
+			err = nft_flow_offload_rule(trans, FLOW_CLS_REPLACE);
+			nft_flow_rule_destroy(nft_trans_flow_rule(trans));
+			break;
+		case NFT_MSG_DELRULE:
+			if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+				continue;
+
+			err = nft_flow_offload_rule(trans, FLOW_CLS_DESTROY);
+			break;
+		}
+
+		if (err)
+			return err;
+	}
+
+	return err;
+}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 411c0cf741e3..bd173b1824c6 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -12,6 +12,7 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables_offload.h>
 #include <net/netfilter/nf_tables.h>
 
 struct nft_cmp_expr {
@@ -107,12 +108,44 @@ nla_put_failure:
 	return -1;
 }
 
+static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
+			     struct nft_flow_rule *flow,
+			     const struct nft_cmp_expr *priv)
+{
+	struct nft_offload_reg *reg = &ctx->regs[priv->sreg];
+	u8 *mask = (u8 *)&flow->match.mask;
+	u8 *key = (u8 *)&flow->match.key;
+
+	if (priv->op != NFT_CMP_EQ)
+		return -EOPNOTSUPP;
+
+	memcpy(key + reg->offset, &priv->data, priv->len);
+	memcpy(mask + reg->offset, &reg->mask, priv->len);
+
+	flow->match.dissector.used_keys |= BIT(reg->key);
+	flow->match.dissector.offset[reg->key] = reg->base_offset;
+
+	nft_offload_update_dependency(ctx, &priv->data, priv->len);
+
+	return 0;
+}
+
+static int nft_cmp_offload(struct nft_offload_ctx *ctx,
+			   struct nft_flow_rule *flow,
+			   const struct nft_expr *expr)
+{
+	const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+
+	return __nft_cmp_offload(ctx, flow, priv);
+}
+
 static const struct nft_expr_ops nft_cmp_ops = {
 	.type		= &nft_cmp_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)),
 	.eval		= nft_cmp_eval,
 	.init		= nft_cmp_init,
 	.dump		= nft_cmp_dump,
+	.offload	= nft_cmp_offload,
 };
 
 static int nft_cmp_fast_init(const struct nft_ctx *ctx,
@@ -143,6 +176,25 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx,
 	return 0;
 }
 
+static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx,
+				struct nft_flow_rule *flow,
+				const struct nft_expr *expr)
+{
+	const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+	struct nft_cmp_expr cmp = {
+		.data	= {
+			.data	= {
+				[0] = priv->data,
+			},
+		},
+		.sreg	= priv->sreg,
+		.len	= priv->len / BITS_PER_BYTE,
+		.op	= NFT_CMP_EQ,
+	};
+
+	return __nft_cmp_offload(ctx, flow, &cmp);
+}
+
 static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
@@ -169,6 +221,7 @@ const struct nft_expr_ops nft_cmp_fast_ops = {
 	.eval		= NULL,	/* inlined */
 	.init		= nft_cmp_fast_init,
 	.dump		= nft_cmp_fast_dump,
+	.offload	= nft_cmp_fast_offload,
 };
 
 static const struct nft_expr_ops *
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index cb8547f97220..ca2ae4b95a8d 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -13,6 +13,7 @@
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
 
 void nft_immediate_eval(const struct nft_expr *expr,
 			struct nft_regs *regs,
@@ -124,6 +125,34 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
 	return 0;
 }
 
+static int nft_immediate_offload(struct nft_offload_ctx *ctx,
+				 struct nft_flow_rule *flow,
+				 const struct nft_expr *expr)
+{
+	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+	struct flow_action_entry *entry;
+	const struct nft_data *data;
+
+	if (priv->dreg != NFT_REG_VERDICT)
+		return -EOPNOTSUPP;
+
+	entry = &flow->rule->action.entries[ctx->num_actions++];
+
+	data = &priv->data;
+	switch (data->verdict.code) {
+	case NF_ACCEPT:
+		entry->id = FLOW_ACTION_ACCEPT;
+		break;
+	case NF_DROP:
+		entry->id = FLOW_ACTION_DROP;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 static const struct nft_expr_ops nft_imm_ops = {
 	.type		= &nft_imm_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
@@ -133,6 +162,8 @@ static const struct nft_expr_ops nft_imm_ops = {
 	.deactivate	= nft_immediate_deactivate,
 	.dump		= nft_immediate_dump,
 	.validate	= nft_immediate_validate,
+	.offload	= nft_immediate_offload,
+	.offload_flags	= NFT_OFFLOAD_F_ACTION,
 };
 
 struct nft_expr_type nft_imm_type __read_mostly = {
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 417f8d32e9a3..76866f77e343 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -22,6 +22,7 @@
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nft_meta.h>
+#include <net/netfilter/nf_tables_offload.h>
 
 #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
 
@@ -490,6 +491,31 @@ void nft_meta_set_destroy(const struct nft_ctx *ctx,
 }
 EXPORT_SYMBOL_GPL(nft_meta_set_destroy);
 
+static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
+				struct nft_flow_rule *flow,
+				const struct nft_expr *expr)
+{
+	const struct nft_meta *priv = nft_expr_priv(expr);
+	struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+	switch (priv->key) {
+	case NFT_META_PROTOCOL:
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto,
+				  sizeof(__u16), reg);
+		nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
+		break;
+	case NFT_META_L4PROTO:
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
+				  sizeof(__u8), reg);
+		nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 static const struct nft_expr_ops nft_meta_get_ops = {
 	.type		= &nft_meta_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
@@ -497,6 +523,7 @@ static const struct nft_expr_ops nft_meta_get_ops = {
 	.init		= nft_meta_get_init,
 	.dump		= nft_meta_get_dump,
 	.validate	= nft_meta_get_validate,
+	.offload	= nft_meta_get_offload,
 };
 
 static const struct nft_expr_ops nft_meta_set_ops = {
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 1260f78a034d..22a80eb60222 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -15,10 +15,13 @@
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
 /* For layer 4 checksum field offset. */
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/icmpv6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
 
 /* add vlan header into the user buffer for if tag was removed by offloads */
 static bool
@@ -150,12 +153,195 @@ nla_put_failure:
 	return -1;
 }
 
+static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
+				  struct nft_flow_rule *flow,
+				  const struct nft_payload *priv)
+{
+	struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+	switch (priv->offset) {
+	case offsetof(struct ethhdr, h_source):
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
+				  src, ETH_ALEN, reg);
+		break;
+	case offsetof(struct ethhdr, h_dest):
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
+				  dst, ETH_ALEN, reg);
+		break;
+	}
+
+	return 0;
+}
+
+static int nft_payload_offload_ip(struct nft_offload_ctx *ctx,
+				  struct nft_flow_rule *flow,
+				  const struct nft_payload *priv)
+{
+	struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+	switch (priv->offset) {
+	case offsetof(struct iphdr, saddr):
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src,
+				  sizeof(struct in_addr), reg);
+		break;
+	case offsetof(struct iphdr, daddr):
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst,
+				  sizeof(struct in_addr), reg);
+		break;
+	case offsetof(struct iphdr, protocol):
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
+				  sizeof(__u8), reg);
+		nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx,
+				  struct nft_flow_rule *flow,
+				  const struct nft_payload *priv)
+{
+	struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+	switch (priv->offset) {
+	case offsetof(struct ipv6hdr, saddr):
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src,
+				  sizeof(struct in6_addr), reg);
+		break;
+	case offsetof(struct ipv6hdr, daddr):
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst,
+				  sizeof(struct in6_addr), reg);
+		break;
+	case offsetof(struct ipv6hdr, nexthdr):
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
+				  sizeof(__u8), reg);
+		nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int nft_payload_offload_nh(struct nft_offload_ctx *ctx,
+				  struct nft_flow_rule *flow,
+				  const struct nft_payload *priv)
+{
+	int err;
+
+	switch (ctx->dep.l3num) {
+	case htons(ETH_P_IP):
+		err = nft_payload_offload_ip(ctx, flow, priv);
+		break;
+	case htons(ETH_P_IPV6):
+		err = nft_payload_offload_ip6(ctx, flow, priv);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return err;
+}
+
+static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx,
+				   struct nft_flow_rule *flow,
+				   const struct nft_payload *priv)
+{
+	struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+	switch (priv->offset) {
+	case offsetof(struct tcphdr, source):
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
+				  sizeof(__be16), reg);
+		break;
+	case offsetof(struct tcphdr, dest):
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
+				  sizeof(__be16), reg);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int nft_payload_offload_udp(struct nft_offload_ctx *ctx,
+				   struct nft_flow_rule *flow,
+				   const struct nft_payload *priv)
+{
+	struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+	switch (priv->offset) {
+	case offsetof(struct udphdr, source):
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
+				  sizeof(__be16), reg);
+		break;
+	case offsetof(struct udphdr, dest):
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
+				  sizeof(__be16), reg);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int nft_payload_offload_th(struct nft_offload_ctx *ctx,
+				  struct nft_flow_rule *flow,
+				  const struct nft_payload *priv)
+{
+	int err;
+
+	switch (ctx->dep.protonum) {
+	case IPPROTO_TCP:
+		err = nft_payload_offload_tcp(ctx, flow, priv);
+		break;
+	case IPPROTO_UDP:
+		err = nft_payload_offload_udp(ctx, flow, priv);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return err;
+}
+
+static int nft_payload_offload(struct nft_offload_ctx *ctx,
+			       struct nft_flow_rule *flow,
+			       const struct nft_expr *expr)
+{
+	const struct nft_payload *priv = nft_expr_priv(expr);
+	int err;
+
+	switch (priv->base) {
+	case NFT_PAYLOAD_LL_HEADER:
+		err = nft_payload_offload_ll(ctx, flow, priv);
+		break;
+	case NFT_PAYLOAD_NETWORK_HEADER:
+		err = nft_payload_offload_nh(ctx, flow, priv);
+		break;
+	case NFT_PAYLOAD_TRANSPORT_HEADER:
+		err = nft_payload_offload_th(ctx, flow, priv);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+	return err;
+}
+
 static const struct nft_expr_ops nft_payload_ops = {
 	.type		= &nft_payload_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_payload)),
 	.eval		= nft_payload_eval,
 	.init		= nft_payload_init,
 	.dump		= nft_payload_dump,
+	.offload	= nft_payload_offload,
 };
 
 const struct nft_expr_ops nft_payload_fast_ops = {
@@ -164,6 +350,7 @@ const struct nft_expr_ops nft_payload_fast_ops = {
 	.eval		= nft_payload_eval,
 	.init		= nft_payload_init,
 	.dump		= nft_payload_dump,
+	.offload	= nft_payload_offload,
 };
 
 static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
-- 
cgit v1.2.3


From fbc697796e358d1ed8ed25758b19bdb3a1f8e9f9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 9 Jul 2019 14:45:17 -0700
Subject: pkt_sched: Include const.h

Commit 9903c8dc7342 changed TC_ETF defines to use _BITUL instead of BIT
but did not add the dependency on linux/const.h. As a consequence,
importing the uapi headers into iproute2 causes builds to fail. Add
the dependency.

Fixes: 9903c8dc7342 ("etf: Don't use BIT() in UAPI headers.")
Cc: Vedang Patel <vedang.patel@intel.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 390efb54b2e0..1f623252abe8 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_PKT_SCHED_H
 #define __LINUX_PKT_SCHED_H
 
+#include <linux/const.h>
 #include <linux/types.h>
 
 /* Logical priority bands not depending on specific packet scheduler.
-- 
cgit v1.2.3