summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-05-11 19:31:40 -0400
committerDavid S. Miller <davem@davemloft.net>2016-05-11 19:31:56 -0400
commitc3f1010b30f7fc611139cfb702a8685741aa6827 (patch)
treeb68c8a9b5148687c7bce7c6a5ab14e26a5c47050
parentca4aa976f04d14bc7da60dce0e2afc34c9f0f1d2 (diff)
parent0b922b7a829c06e3b0790c58cd9ca026de86096e (diff)
downloadlinux-c3f1010b30f7fc611139cfb702a8685741aa6827.tar.bz2
Merge branch 'vrf-pktinfo'
David Ahern says: ==================== net: vrf: Fixup PKTINFO to return enslaved device index Applications such as OSPF and BFD need the original ingress device not the VRF device; the latter can be derived from the former. To that end move the packet intercept from an rx handler that is invoked by __netif_receive_skb_core to the ipv4 and ipv6 receive processing. IPv6 already saves the skb_iif to the control buffer in ipv6_rcv. Since the skb->dev has not been switched the cb has the enslaved device. Make the same happen for IPv4 by adding the skb_iif to inet_skb_parm and set it in ipv4 code after clearing the skb control buffer similar to IPv6. From there the pktinfo can just pull it from cb with the PKTINFO_SKB_CB cast. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/vrf.c189
-rw-r--r--include/linux/ipv6.h17
-rw-r--r--include/linux/netdevice.h2
-rw-r--r--include/net/ip.h1
-rw-r--r--include/net/l3mdev.h42
-rw-r--r--include/net/tcp.h4
-rw-r--r--net/core/dev.c3
-rw-r--r--net/ipv4/ip_input.c8
-rw-r--r--net/ipv4/ip_sockglue.c7
-rw-r--r--net/ipv6/ip6_input.c7
10 files changed, 178 insertions, 102 deletions
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index c8db55aa8280..0ea29345eb2e 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -42,9 +42,6 @@
#define DRV_NAME "vrf"
#define DRV_VERSION "1.0"
-#define vrf_master_get_rcu(dev) \
- ((struct net_device *)rcu_dereference(dev->rx_handler_data))
-
struct net_vrf {
struct rtable *rth;
struct rt6_info *rt6;
@@ -60,90 +57,12 @@ struct pcpu_dstats {
struct u64_stats_sync syncp;
};
-/* neighbor handling is done with actual device; do not want
- * to flip skb->dev for those ndisc packets. This really fails
- * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
- * a start.
- */
-#if IS_ENABLED(CONFIG_IPV6)
-static bool check_ipv6_frame(const struct sk_buff *skb)
-{
- const struct ipv6hdr *ipv6h;
- struct ipv6hdr _ipv6h;
- bool rc = true;
-
- ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h);
- if (!ipv6h)
- goto out;
-
- if (ipv6h->nexthdr == NEXTHDR_ICMP) {
- const struct icmp6hdr *icmph;
- struct icmp6hdr _icmph;
-
- icmph = skb_header_pointer(skb, sizeof(_ipv6h),
- sizeof(_icmph), &_icmph);
- if (!icmph)
- goto out;
-
- switch (icmph->icmp6_type) {
- case NDISC_ROUTER_SOLICITATION:
- case NDISC_ROUTER_ADVERTISEMENT:
- case NDISC_NEIGHBOUR_SOLICITATION:
- case NDISC_NEIGHBOUR_ADVERTISEMENT:
- case NDISC_REDIRECT:
- rc = false;
- break;
- }
- }
-
-out:
- return rc;
-}
-#else
-static bool check_ipv6_frame(const struct sk_buff *skb)
-{
- return false;
-}
-#endif
-
-static bool is_ip_rx_frame(struct sk_buff *skb)
-{
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- return true;
- case htons(ETH_P_IPV6):
- return check_ipv6_frame(skb);
- }
- return false;
-}
-
static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
{
vrf_dev->stats.tx_errors++;
kfree_skb(skb);
}
-/* note: already called with rcu_read_lock */
-static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb)
-{
- struct sk_buff *skb = *pskb;
-
- if (is_ip_rx_frame(skb)) {
- struct net_device *dev = vrf_master_get_rcu(skb->dev);
- struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
-
- u64_stats_update_begin(&dstats->syncp);
- dstats->rx_pkts++;
- dstats->rx_bytes += skb->len;
- u64_stats_update_end(&dstats->syncp);
-
- skb->dev = dev;
-
- return RX_HANDLER_ANOTHER;
- }
- return RX_HANDLER_PASS;
-}
-
static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
struct rtnl_link_stats64 *stats)
{
@@ -506,28 +425,14 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
{
int ret;
- /* register the packet handler for slave ports */
- ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev);
- if (ret) {
- netdev_err(port_dev,
- "Device %s failed to register rx_handler\n",
- port_dev->name);
- goto out_fail;
- }
-
ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL);
if (ret < 0)
- goto out_unregister;
+ return ret;
port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
cycle_netdev(port_dev);
return 0;
-
-out_unregister:
- netdev_rx_handler_unregister(port_dev);
-out_fail:
- return ret;
}
static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
@@ -544,8 +449,6 @@ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
netdev_upper_dev_unlink(port_dev, dev);
port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
- netdev_rx_handler_unregister(port_dev);
-
cycle_netdev(port_dev);
return 0;
@@ -670,6 +573,95 @@ static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
}
#if IS_ENABLED(CONFIG_IPV6)
+/* neighbor handling is done with actual device; do not want
+ * to flip skb->dev for those ndisc packets. This really fails
+ * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
+ * a start.
+ */
+static bool ipv6_ndisc_frame(const struct sk_buff *skb)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ bool rc = false;
+
+ if (iph->nexthdr == NEXTHDR_ICMP) {
+ const struct icmp6hdr *icmph;
+ struct icmp6hdr _icmph;
+
+ icmph = skb_header_pointer(skb, sizeof(*iph),
+ sizeof(_icmph), &_icmph);
+ if (!icmph)
+ goto out;
+
+ switch (icmph->icmp6_type) {
+ case NDISC_ROUTER_SOLICITATION:
+ case NDISC_ROUTER_ADVERTISEMENT:
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ case NDISC_REDIRECT:
+ rc = true;
+ break;
+ }
+ }
+
+out:
+ return rc;
+}
+
+static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
+ struct sk_buff *skb)
+{
+ /* if packet is NDISC keep the ingress interface */
+ if (!ipv6_ndisc_frame(skb)) {
+ skb->dev = vrf_dev;
+ skb->skb_iif = vrf_dev->ifindex;
+
+ skb_push(skb, skb->mac_len);
+ dev_queue_xmit_nit(skb, vrf_dev);
+ skb_pull(skb, skb->mac_len);
+
+ IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
+ }
+
+ return skb;
+}
+
+#else
+static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
+ struct sk_buff *skb)
+{
+ return skb;
+}
+#endif
+
+static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
+ struct sk_buff *skb)
+{
+ skb->dev = vrf_dev;
+ skb->skb_iif = vrf_dev->ifindex;
+
+ skb_push(skb, skb->mac_len);
+ dev_queue_xmit_nit(skb, vrf_dev);
+ skb_pull(skb, skb->mac_len);
+
+ return skb;
+}
+
+/* called with rcu lock held */
+static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
+ struct sk_buff *skb,
+ u16 proto)
+{
+ switch (proto) {
+ case AF_INET:
+ return vrf_ip_rcv(vrf_dev, skb);
+ case AF_INET6:
+ return vrf_ip6_rcv(vrf_dev, skb);
+ }
+
+ return skb;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
const struct flowi6 *fl6)
{
@@ -690,6 +682,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = {
.l3mdev_fib_table = vrf_fib_table,
.l3mdev_get_rtable = vrf_get_rtable,
.l3mdev_get_saddr = vrf_get_saddr,
+ .l3mdev_l3_rcv = vrf_l3_rcv,
#if IS_ENABLED(CONFIG_IPV6)
.l3mdev_get_rt6_dst = vrf_get_rt6_dst,
#endif
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 58d6e158755f..5c91b0b055d4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -118,14 +118,29 @@ struct inet6_skb_parm {
#define IP6SKB_ROUTERALERT 8
#define IP6SKB_FRAGMENTED 16
#define IP6SKB_HOPBYHOP 32
+#define IP6SKB_L3SLAVE 64
};
+#if defined(CONFIG_NET_L3_MASTER_DEV)
+static inline bool skb_l3mdev_slave(__u16 flags)
+{
+ return flags & IP6SKB_L3SLAVE;
+}
+#else
+static inline bool skb_l3mdev_slave(__u16 flags)
+{
+ return false;
+}
+#endif
+
#define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb))
#define IP6CBMTU(skb) ((struct ip6_mtuinfo *)((skb)->cb))
static inline int inet6_iif(const struct sk_buff *skb)
{
- return IP6CB(skb)->iif;
+ bool l3_slave = skb_l3mdev_slave(IP6CB(skb)->flags);
+
+ return l3_slave ? skb->skb_iif : IP6CB(skb)->iif;
}
struct tcp6_request_sock {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 63580e6d0df4..c2f5112f08f7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3258,6 +3258,8 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
bool is_skb_forwardable(const struct net_device *dev,
const struct sk_buff *skb);
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
+
extern int netdev_budget;
/* Called by rtnetlink.c:rtnl_unlock() */
diff --git a/include/net/ip.h b/include/net/ip.h
index 247ac82e9cf2..37165fba3741 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -36,6 +36,7 @@
struct sock;
struct inet_skb_parm {
+ int iif;
struct ip_options opt; /* Compiled IP options */
unsigned char flags;
diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index 78872bd1dc2c..374388dc01c8 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -25,6 +25,8 @@
struct l3mdev_ops {
u32 (*l3mdev_fib_table)(const struct net_device *dev);
+ struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev,
+ struct sk_buff *skb, u16 proto);
/* IPv4 ops */
struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev,
@@ -134,6 +136,34 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4);
struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6);
+static inline
+struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto)
+{
+ struct net_device *master = NULL;
+
+ if (netif_is_l3_slave(skb->dev))
+ master = netdev_master_upper_dev_get_rcu(skb->dev);
+ else if (netif_is_l3_master(skb->dev))
+ master = skb->dev;
+
+ if (master && master->l3mdev_ops->l3mdev_l3_rcv)
+ skb = master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto);
+
+ return skb;
+}
+
+static inline
+struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
+{
+ return l3mdev_l3_rcv(skb, AF_INET);
+}
+
+static inline
+struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
+{
+ return l3mdev_l3_rcv(skb, AF_INET6);
+}
+
#else
static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev)
@@ -194,6 +224,18 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6)
{
return NULL;
}
+
+static inline
+struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
+{
+ return skb;
+}
+
+static inline
+struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
+{
+ return skb;
+}
#endif
#endif /* _NET_L3MDEV_H_ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c9ab561387c4..0bcc70f4e1fb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -786,7 +786,9 @@ struct tcp_skb_cb {
*/
static inline int tcp_v6_iif(const struct sk_buff *skb)
{
- return TCP_SKB_CB(skb)->header.h6.iif;
+ bool l3_slave = skb_l3mdev_slave(TCP_SKB_CB(skb)->header.h6.flags);
+
+ return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
}
#endif
diff --git a/net/core/dev.c b/net/core/dev.c
index c7490339315c..12436d1312ca 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1850,7 +1850,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
* taps currently in use.
*/
-static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
struct sk_buff *skb2 = NULL;
@@ -1907,6 +1907,7 @@ out_unlock:
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
/**
* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 751c0658e194..4b351af3e67b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -313,6 +313,13 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_rcv(skb);
+ if (!skb)
+ return NET_RX_SUCCESS;
+
if (net->ipv4.sysctl_ip_early_demux &&
!skb_dst(skb) &&
!skb->sk &&
@@ -471,6 +478,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ IPCB(skb)->iif = skb->skb_iif;
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index bdb222c0c6a2..5805762d7fc7 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1193,7 +1193,12 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
ipv6_sk_rxinfo(sk);
if (prepare && skb_rtable(skb)) {
- pktinfo->ipi_ifindex = inet_iif(skb);
+ /* skb->cb is overloaded: prior to this point it is IP{6}CB
+ * which has interface index (iif) as the first member of the
+ * underlying inet{6}_skb_parm struct. This code then overlays
+ * PKTINFO_SKB_CB and in_pktinfo also has iif as the first
+ * element so the iif is picked up from the prior IPCB
+ */
pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
} else {
pktinfo->ipi_ifindex = 0;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6ed56012005d..f185cbcda114 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -49,6 +49,13 @@
int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_rcv(skb);
+ if (!skb)
+ return NET_RX_SUCCESS;
+
if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
const struct inet6_protocol *ipprot;