From 635c223cfa05af9523146b2f37e119d945f449ae Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Wed, 31 Aug 2016 14:15:05 +0800 Subject: rps: flow_dissector: Fix uninitialized flow_keys used in __skb_get_hash possibly The original codes depend on that the function parameters are evaluated from left to right. But the parameter's evaluation order is not defined in C standard actually. When flow_keys_have_l4(&keys) is invoked before ___skb_get_hash(skb, &keys, hashrnd) with some compilers or environment, the keys passed to flow_keys_have_l4 is not initialized. Fixes: 6db61d79c1e1 ("flow_dissector: Ignore flow dissector return value from ___skb_get_hash") Acked-by: Eric Dumazet Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 61ad43f61c5e..52742a02814f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -680,11 +680,13 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; + u32 hash; __flow_hash_secret_init(); - __skb_set_sw_hash(skb, ___skb_get_hash(skb, &keys, hashrnd), - flow_keys_have_l4(&keys)); + hash = ___skb_get_hash(skb, &keys, hashrnd); + + __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash); -- cgit v1.2.3 From 24b27fc4cdf9e10c5e79e5923b6b7c2c5c95096c Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Thu, 1 Sep 2016 22:18:34 -0700 Subject: bonding: Fix bonding crash Following few steps will crash kernel - (a) Create bonding master > modprobe bonding miimon=50 (b) Create macvlan bridge on eth2 > ip link add link eth2 dev mvl0 address aa:0:0:0:0:01 \ type macvlan (c) Now try adding eth2 into the bond > echo +eth2 > /sys/class/net/bond0/bonding/slaves Bonding does lots of things before checking if the device enslaved is busy or not. In this case when the notifier call-chain sends notifications, the bond_netdev_event() assumes that the rx_handler /rx_handler_data is registered while the bond_enslave() hasn't progressed far enough to register rx_handler for the new slave. This patch adds a rx_handler check that can be performed right at the beginning of the enslave code to avoid getting into this situation. Signed-off-by: Mahesh Bandewar Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 7 ++++--- include/linux/netdevice.h | 1 + net/core/dev.c | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 217e8da0628c..9599ed6f1213 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1341,9 +1341,10 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) slave_dev->name); } - /* already enslaved */ - if (slave_dev->flags & IFF_SLAVE) { - netdev_dbg(bond_dev, "Error: Device was already enslaved\n"); + /* already in-use? */ + if (netdev_is_rx_handler_busy(slave_dev)) { + netdev_err(bond_dev, + "Error: Device is in use and cannot be enslaved\n"); return -EBUSY; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3a788bf0affd..e8d79d4ebcfe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3267,6 +3267,7 @@ static inline void napi_free_frags(struct napi_struct *napi) napi->skb = NULL; } +bool netdev_is_rx_handler_busy(struct net_device *dev); int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data); diff --git a/net/core/dev.c b/net/core/dev.c index dd6ce598de89..ea6312057a71 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3974,6 +3974,22 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, return skb; } +/** + * netdev_is_rx_handler_busy - check if receive handler is registered + * @dev: device to check + * + * Check if a receive handler is already registered for a given device. + * Return true if there one. + * + * The caller must hold the rtnl_mutex. + */ +bool netdev_is_rx_handler_busy(struct net_device *dev) +{ + ASSERT_RTNL(); + return dev && rtnl_dereference(dev->rx_handler); +} +EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); + /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for -- cgit v1.2.3 From 9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a Mon Sep 17 00:00:00 2001 From: Yaogong Wang Date: Wed, 7 Sep 2016 14:49:28 -0700 Subject: tcp: use an RB tree for ooo receive queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Over the years, TCP BDP has increased by several orders of magnitude, and some people are considering to reach the 2 Gbytes limit. Even with current window scale limit of 14, ~1 Gbytes maps to ~740,000 MSS. In presence of packet losses (or reorders), TCP stores incoming packets into an out of order queue, and number of skbs sitting there waiting for the missing packets to be received can be in the 10^5 range. Most packets are appended to the tail of this queue, and when packets can finally be transferred to receive queue, we scan the queue from its head. However, in presence of heavy losses, we might have to find an arbitrary point in this queue, involving a linear scan for every incoming packet, throwing away cpu caches. This patch converts it to a RB tree, to get bounded latencies. Yaogong wrote a preliminary patch about 2 years ago. Eric did the rebase, added ofo_last_skb cache, polishing and tests. Tested with network dropping between 1 and 10 % packets, with good success (about 30 % increase of throughput in stress tests) Next step would be to also use an RB tree for the write queue at sender side ;) Signed-off-by: Yaogong Wang Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Ilpo Järvinen Acked-By: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 + include/linux/tcp.h | 7 +- include/net/tcp.h | 2 +- net/core/skbuff.c | 19 +++ net/ipv4/tcp.c | 4 +- net/ipv4/tcp_input.c | 330 +++++++++++++++++++++++++++-------------------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 1 - 8 files changed, 218 insertions(+), 149 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cfb7219be665..4c5662f05bda 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2402,6 +2402,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } +void skb_rbtree_purge(struct rb_root *root); + void *netdev_alloc_frag(unsigned int fragsz); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 7be9b1242354..c723a465125d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -281,10 +281,9 @@ struct tcp_sock { struct sk_buff* lost_skb_hint; struct sk_buff *retransmit_skb_hint; - /* OOO segments go in this list. Note that socket lock must be held, - * as we do not use sk_buff_head lock. - */ - struct sk_buff_head out_of_order_queue; + /* OOO segments go in this rbtree. Socket lock must be held. */ + struct rb_root out_of_order_queue; + struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d6ae36512429..fdfbedd61c67 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -640,7 +640,7 @@ static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (skb_queue_empty(&tp->out_of_order_queue) && + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3864b4b68fa1..1e329d411242 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2444,6 +2444,25 @@ void skb_queue_purge(struct sk_buff_head *list) } EXPORT_SYMBOL(skb_queue_purge); +/** + * skb_rbtree_purge - empty a skb rbtree + * @root: root of the rbtree to empty + * + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from + * the list and one reference dropped. This function does not take + * any lock. Synchronization should be handled by the caller (e.g., TCP + * out-of-order queue is protected by the socket lock). + */ +void skb_rbtree_purge(struct rb_root *root) +{ + struct sk_buff *skb, *next; + + rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) + kfree_skb(skb); + + *root = RB_ROOT; +} + /** * skb_queue_head - queue a buffer at the list head * @list: list to use diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 77311a92275c..a13fcb369f52 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -380,7 +380,7 @@ void tcp_init_sock(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - __skb_queue_head_init(&tp->out_of_order_queue); + tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); @@ -2243,7 +2243,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8cd02c0b056c..a5934c4c8cd4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4108,7 +4108,7 @@ void tcp_fin(struct sock *sk) /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_mem_reclaim(sk); @@ -4268,7 +4268,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) int this_sack; /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ - if (skb_queue_empty(&tp->out_of_order_queue)) { + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; return; } @@ -4344,10 +4344,13 @@ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; + bool fin, fragstolen, eaten; struct sk_buff *skb, *tail; - bool fragstolen, eaten; + struct rb_node *p; - while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { + p = rb_first(&tp->out_of_order_queue); + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4357,9 +4360,10 @@ static void tcp_ofo_queue(struct sock *sk) dsack_high = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } + p = rb_next(p); + rb_erase(&skb->rbnode, &tp->out_of_order_queue); - __skb_unlink(skb, &tp->out_of_order_queue); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); tcp_drop(sk, skb); continue; @@ -4371,12 +4375,19 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); + fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - tcp_fin(sk); - if (eaten) + else kfree_skb_partial(skb, fragstolen); + + if (unlikely(fin)) { + tcp_fin(sk); + /* tcp_fin() purges tp->out_of_order_queue, + * so we must end this loop right now. + */ + break; + } } } @@ -4403,8 +4414,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct rb_node **p, *q, *parent; struct sk_buff *skb1; u32 seq, end_seq; + bool fragstolen; tcp_ecn_check_ce(tp, skb); @@ -4419,88 +4432,85 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); + seq = TCP_SKB_CB(skb)->seq; + end_seq = TCP_SKB_CB(skb)->end_seq; SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + tp->rcv_nxt, seq, end_seq); - skb1 = skb_peek_tail(&tp->out_of_order_queue); - if (!skb1) { + p = &tp->out_of_order_queue.rb_node; + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; - tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; - tp->selective_acks[0].end_seq = - TCP_SKB_CB(skb)->end_seq; + tp->selective_acks[0].start_seq = seq; + tp->selective_acks[0].end_seq = end_seq; } - __skb_queue_head(&tp->out_of_order_queue, skb); + rb_link_node(&skb->rbnode, NULL, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + tp->ooo_last_skb = skb; goto end; } - seq = TCP_SKB_CB(skb)->seq; - end_seq = TCP_SKB_CB(skb)->end_seq; - - if (seq == TCP_SKB_CB(skb1)->end_seq) { - bool fragstolen; - - if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); - } else { - tcp_grow_window(sk, skb); - kfree_skb_partial(skb, fragstolen); - skb = NULL; + /* In the typical case, we are adding an skb to the end of the list. + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. + */ + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { +coalesce_done: + tcp_grow_window(sk, skb); + kfree_skb_partial(skb, fragstolen); + skb = NULL; + goto add_sack; + } + + /* Find place to insert this segment. Handle overlaps on the way. */ + parent = NULL; + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(seq, TCP_SKB_CB(skb1)->seq)) { + p = &parent->rb_left; + continue; } - - if (!tp->rx_opt.num_sacks || - tp->selective_acks[0].end_seq != seq) - goto add_sack; - - /* Common case: data arrive in order after hole. */ - tp->selective_acks[0].end_seq = end_seq; - goto end; - } - - /* Find place to insert this segment. */ - while (1) { - if (!after(TCP_SKB_CB(skb1)->seq, seq)) - break; - if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { - skb1 = NULL; - break; + if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; + } + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); + } else { + /* skb's seq == skb1's seq and skb covers skb1. + * Replace skb1 with skb. + */ + rb_replace_node(&skb1->rbnode, &skb->rbnode, + &tp->out_of_order_queue); + tcp_dsack_extend(sk, + TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb1); + goto add_sack; + } + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + goto coalesce_done; } - skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); + p = &parent->rb_right; } - /* Do skb overlap to previous one? */ - if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { - if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - /* All the bits are present. Drop. */ - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb); - skb = NULL; - tcp_dsack_set(sk, seq, end_seq); - goto add_sack; - } - if (after(seq, TCP_SKB_CB(skb1)->seq)) { - /* Partial overlap. */ - tcp_dsack_set(sk, seq, - TCP_SKB_CB(skb1)->end_seq); - } else { - if (skb_queue_is_first(&tp->out_of_order_queue, - skb1)) - skb1 = NULL; - else - skb1 = skb_queue_prev( - &tp->out_of_order_queue, - skb1); - } - } - if (!skb1) - __skb_queue_head(&tp->out_of_order_queue, skb); - else - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + /* Insert segment into RB tree. */ + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); - /* And clean segments covered by new one as whole. */ - while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { - skb1 = skb_queue_next(&tp->out_of_order_queue, skb); + /* Remove other segments covered by skb. */ + while ((q = rb_next(&skb->rbnode)) != NULL) { + skb1 = rb_entry(q, struct sk_buff, rbnode); if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; @@ -4509,12 +4519,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq); break; } - __skb_unlink(skb1, &tp->out_of_order_queue); + rb_erase(&skb1->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop(sk, skb1); } + /* If there is no skb after us, we are the last_skb ! */ + if (!q) + tp->ooo_last_skb = skb; add_sack: if (tcp_is_sack(tp)) @@ -4651,13 +4664,13 @@ queue_and_out: if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); - if (!skb_queue_empty(&tp->out_of_order_queue)) { + if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC2581. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) inet_csk(sk)->icsk_ack.pingpong = 0; } @@ -4711,48 +4724,76 @@ drop: tcp_data_queue_ofo(sk, skb); } +static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) +{ + if (list) + return !skb_queue_is_last(list, skb) ? skb->next : NULL; + + return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); +} + static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *list) + struct sk_buff_head *list, + struct rb_root *root) { - struct sk_buff *next = NULL; + struct sk_buff *next = tcp_skb_next(skb, list); - if (!skb_queue_is_last(list, skb)) - next = skb_queue_next(list, skb); + if (list) + __skb_unlink(skb, list); + else + rb_erase(&skb->rbnode, root); - __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); return next; } +/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ +static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sk_buff *skb1; + + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, root); +} + /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * - * If tail is NULL, this means until the end of the list. + * If tail is NULL, this means until the end of the queue. * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ static void -tcp_collapse(struct sock *sk, struct sk_buff_head *list, - struct sk_buff *head, struct sk_buff *tail, - u32 start, u32 end) +tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, + struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { - struct sk_buff *skb, *n; + struct sk_buff *skb = head, *n; + struct sk_buff_head tmp; bool end_of_skbs; /* First, check that queue is collapsible and find - * the point where collapsing can be useful. */ - skb = head; + * the point where collapsing can be useful. + */ restart: - end_of_skbs = true; - skb_queue_walk_from_safe(list, skb, n) { - if (skb == tail) - break; + for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { + n = tcp_skb_next(skb, list); + /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb) break; goto restart; @@ -4770,13 +4811,10 @@ restart: break; } - if (!skb_queue_is_last(list, skb)) { - struct sk_buff *next = skb_queue_next(list, skb); - if (next != tail && - TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { - end_of_skbs = false; - break; - } + if (n && n != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { + end_of_skbs = false; + break; } /* Decided to skip this, advance start seq. */ @@ -4786,17 +4824,22 @@ restart: (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; + __skb_queue_head_init(&tmp); + while (before(start, end)) { int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) - return; + break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_queue_before(list, skb, nskb); + if (list) + __skb_queue_before(list, skb, nskb); + else + __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. */ @@ -4814,14 +4857,17 @@ restart: start += size; } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) - return; + goto end; } } } +end: + skb_queue_walk_safe(&tmp, skb, n) + tcp_rbtree_insert(root, skb); } /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs @@ -4830,43 +4876,43 @@ restart: static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); - struct sk_buff *head; + struct sk_buff *skb, *head; + struct rb_node *p; u32 start, end; - if (!skb) + p = rb_first(&tp->out_of_order_queue); + skb = rb_entry_safe(p, struct sk_buff, rbnode); +new_range: + if (!skb) { + p = rb_last(&tp->out_of_order_queue); + /* Note: This is possible p is NULL here. We do not + * use rb_entry_safe(), as ooo_last_skb is valid only + * if rbtree is not empty. + */ + tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); return; - + } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; - head = skb; - - for (;;) { - struct sk_buff *next = NULL; - if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) - next = skb_queue_next(&tp->out_of_order_queue, skb); - skb = next; + for (head = skb;;) { + skb = tcp_skb_next(skb, NULL); - /* Segment is terminated when we see gap or when - * we are at the end of all the queue. */ + /* Range is terminated when we see a gap or when + * we are at the queue end. + */ if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, &tp->out_of_order_queue, + tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); - head = skb; - if (!skb) - break; - /* Start new segment */ + goto new_range; + } + + if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) end = TCP_SKB_CB(skb)->end_seq; - } else { - if (before(TCP_SKB_CB(skb)->seq, start)) - start = TCP_SKB_CB(skb)->seq; - if (after(TCP_SKB_CB(skb)->end_seq, end)) - end = TCP_SKB_CB(skb)->end_seq; - } } } @@ -4883,20 +4929,24 @@ static void tcp_collapse_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + struct rb_node *node, *prev; - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); - - while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) { - tcp_drop(sk, skb); + node = &tp->ooo_last_skb->rbnode; + do { + prev = rb_prev(node); + rb_erase(node, &tp->out_of_order_queue); + tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; - } + node = prev; + } while (node); + tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection @@ -4930,7 +4980,7 @@ static int tcp_prune_queue(struct sock *sk) tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) - tcp_collapse(sk, &sk->sk_receive_queue, + tcp_collapse(sk, &sk->sk_receive_queue, NULL, skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); @@ -5035,7 +5085,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ - (ofo_possible && skb_peek(&tp->out_of_order_queue))) { + (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { /* Then ack it now */ tcp_send_ack(sk); } else { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a75bf48d7950..04b989328558 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1845,7 +1845,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_write_queue_purge(sk); /* Cleans up our, hopefully empty, out_of_order_queue. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); #ifdef CONFIG_TCP_MD5SIG /* Clean up the MD5 key list, if any */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4b95ec4ed2c8..f63c73dc0acb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -488,7 +488,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd_cnt = 0; tcp_init_xmit_timers(newsk); - __skb_queue_head_init(&newtp->out_of_order_queue); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; -- cgit v1.2.3 From b8b867e132d2c32f16a49b3ce5c11ee289a92c4e Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 7 Sep 2016 13:57:36 -0700 Subject: rtnetlink: remove unused ifla_stats_policy This structure is defined but never used. Flagged with W=1 Signed-off-by: Stephen Hemminger Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1dfca1c3f8f5..937e459bdaa9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3669,10 +3669,6 @@ nla_put_failure: return -EMSGSIZE; } -static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = { - [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) }, -}; - static size_t if_nlmsg_stats_size(const struct net_device *dev, u32 filter_mask) { -- cgit v1.2.3 From 6088b5823b4cb132a838878747384cbfb5ce6646 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:28 +0200 Subject: bpf: minor cleanups in helpers Some minor misc cleanups, f.e. use sizeof(__u32) instead of hardcoding and in __bpf_skb_max_len(), I missed that we always have skb->dev valid anyway, so we can drop the unneeded test for dev; also few more other misc bits addressed here. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/helpers.c | 6 +++--- net/core/filter.c | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1ea3afba1a4f..6df73bd1ba34 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -126,7 +126,7 @@ static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { struct task_struct *task = current; - if (!task) + if (unlikely(!task)) return -EINVAL; return (u64) task->tgid << 32 | task->pid; @@ -144,12 +144,12 @@ static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) kuid_t uid; kgid_t gid; - if (!task) + if (unlikely(!task)) return -EINVAL; current_uid_gid(&uid, &gid); return (u64) from_kgid(&init_user_ns, gid) << 32 | - from_kuid(&init_user_ns, uid); + from_kuid(&init_user_ns, uid); } const struct bpf_func_proto bpf_get_current_uid_gid_proto = { diff --git a/net/core/filter.c b/net/core/filter.c index a83766be1ad2..628ed8c7d38d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2010,8 +2010,7 @@ static u32 __bpf_skb_min_len(const struct sk_buff *skb) static u32 __bpf_skb_max_len(const struct sk_buff *skb) { - return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len : - 65536; + return skb->dev->mtu + skb->dev->hard_header_len; } static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) @@ -2605,7 +2604,7 @@ static bool __is_valid_xdp_access(int off, int size, return false; if (off % size != 0) return false; - if (size != 4) + if (size != sizeof(__u32)) return false; return true; @@ -2727,7 +2726,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, dst_reg, src_reg, insn); case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); prog->cb_access = 1; -- cgit v1.2.3 From f035a51536af9802f55d8c79bd87f184ebffb093 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:29 +0200 Subject: bpf: add BPF_SIZEOF and BPF_FIELD_SIZEOF macros Add BPF_SIZEOF() and BPF_FIELD_SIZEOF() macros to improve the code a bit which otherwise often result in overly long bytes_to_bpf_size(sizeof()) and bytes_to_bpf_size(FIELD_SIZEOF()) lines. So place them into a macro helper instead. Moreover, we currently have a BUILD_BUG_ON(BPF_FIELD_SIZEOF()) check in convert_bpf_extensions(), but we should rather make that generic as well and add a BUILD_BUG_ON() test in all BPF_SIZEOF()/BPF_FIELD_SIZEOF() users to detect any rewriter size issues at compile time. Note, there are currently none, but we want to assert that it stays this way. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 14 ++++++++++++++ kernel/trace/bpf_trace.c | 12 ++++++------ net/core/filter.c | 15 +++++++-------- 3 files changed, 27 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/include/linux/filter.h b/include/linux/filter.h index a16439b99fd9..7fabad8dc3fc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -314,6 +314,20 @@ struct bpf_prog_aux; bpf_size; \ }) +#define BPF_SIZEOF(type) \ + ({ \ + const int __size = bytes_to_bpf_size(sizeof(type)); \ + BUILD_BUG_ON(__size < 0); \ + __size; \ + }) + +#define BPF_FIELD_SIZEOF(type, field) \ + ({ \ + const int __size = bytes_to_bpf_size(FIELD_SIZEOF(type, field)); \ + BUILD_BUG_ON(__size < 0); \ + __size; \ + }) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d3869b03d9fe..e63d7d435796 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -583,18 +583,18 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, switch (ctx_off) { case offsetof(struct bpf_perf_event_data, sample_period): BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)), - dst_reg, src_reg, + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), dst_reg, src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, offsetof(struct perf_sample_data, period)); break; default: - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)), - dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + regs), dst_reg, src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)), - dst_reg, dst_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); break; } diff --git a/net/core/filter.c b/net/core/filter.c index 628ed8c7d38d..120c813ef030 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -233,9 +233,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); - BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ @@ -2685,7 +2684,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, case offsetof(struct __sk_buff, ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), dst_reg, src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); @@ -2750,7 +2749,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, break; case offsetof(struct __sk_buff, data): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), dst_reg, src_reg, offsetof(struct sk_buff, data)); break; @@ -2759,8 +2758,8 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, ctx_off -= offsetof(struct __sk_buff, data_end); ctx_off += offsetof(struct sk_buff, cb); ctx_off += offsetof(struct bpf_skb_data_end, data_end); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)), - dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, + ctx_off); break; case offsetof(struct __sk_buff, tc_index): @@ -2795,12 +2794,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, switch (ctx_off) { case offsetof(struct xdp_md, data): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), dst_reg, src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_end): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), dst_reg, src_reg, offsetof(struct xdp_buff, data_end)); break; -- cgit v1.2.3 From 374fb54eeaaa6b2cb82bca73a11273687bb2a96a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:30 +0200 Subject: bpf: add own ctx rewriter on ifindex for clsact progs When fetching ifindex, we don't need to test dev for being NULL since we're always guaranteed to have a valid dev for clsact programs. Thus, avoid this test in fast path. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 120c813ef030..d6d9bb89ce3a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2634,10 +2634,10 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, - struct bpf_insn *insn_buf, - struct bpf_prog *prog) +static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -2785,6 +2785,31 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct __sk_buff, ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), + dst_reg, src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + offsetof(struct net_device, ifindex)); + break; + default: + return sk_filter_convert_ctx_access(type, dst_reg, src_reg, + ctx_off, insn_buf, prog); + } + + return insn - insn_buf; +} + static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -2811,13 +2836,13 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .convert_ctx_access = sk_filter_convert_ctx_access, }; static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .convert_ctx_access = tc_cls_act_convert_ctx_access, }; static const struct bpf_verifier_ops xdp_ops = { -- cgit v1.2.3 From f3694e00123802d688180e7ae90b240669910e3c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:31 +0200 Subject: bpf: add BPF_CALL_x macros for declaring helpers This work adds BPF_CALL_() macros and converts all the eBPF helper functions to use them, in a similar fashion like we do with SYSCALL_DEFINE() macros that are used today. Motivation for this is to hide all the register handling and all necessary casts from the user, so that it is done automatically in the background when adding a BPF_CALL_() call. This makes current helpers easier to review, eases to write future helpers, avoids getting the casting mess wrong, and allows for extending all helpers at once (f.e. build time checks, etc). It also helps detecting more easily in code reviews that unused registers are not instrumented in the code by accident, breaking compatibility with existing programs. BPF_CALL_() internals are quite similar to SYSCALL_DEFINE() ones with some fundamental differences, for example, for generating the actual helper function that carries all u64 regs, we need to fill unused regs, so that we always end up with 5 u64 regs as an argument. I reviewed several 0-5 generated BPF_CALL_() variants of the .i results and they look all as expected. No sparse issue spotted. We let this also sit for a few days with Fengguang's kbuild test robot, and there were no issues seen. On s390, it barked on the "uses dynamic stack allocation" notice, which is an old one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion to the call wrapper, just telling that the perf raw record/frag sits on stack (gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests and they were fine as well. All eBPF helpers are now converted to use these macros, getting rid of a good chunk of all the raw castings. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 50 ++++++++++++++++++ kernel/bpf/core.c | 2 +- kernel/bpf/helpers.c | 46 +++++------------ kernel/bpf/stackmap.c | 5 +- kernel/trace/bpf_trace.c | 75 +++++++++++++-------------- net/core/filter.c | 129 ++++++++++++++++++----------------------------- 6 files changed, 149 insertions(+), 158 deletions(-) (limited to 'net/core') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7fabad8dc3fc..1f09c521adfe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -328,6 +328,56 @@ struct bpf_prog_aux; __size; \ }) +#define __BPF_MAP_0(m, v, ...) v +#define __BPF_MAP_1(m, v, t, a, ...) m(t, a) +#define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) +#define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__) +#define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__) +#define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__) + +#define __BPF_REG_0(...) __BPF_PAD(5) +#define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4) +#define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3) +#define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2) +#define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1) +#define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__) + +#define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__) +#define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__) + +#define __BPF_CAST(t, a) \ + (__force t) \ + (__force \ + typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long), \ + (unsigned long)0, (t)0))) a +#define __BPF_V void +#define __BPF_N + +#define __BPF_DECL_ARGS(t, a) t a +#define __BPF_DECL_REGS(t, a) u64 a + +#define __BPF_PAD(n) \ + __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ + u64, __ur_3, u64, __ur_4, u64, __ur_5) + +#define BPF_CALL_x(x, name, ...) \ + static __always_inline \ + u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ + u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ + u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ + { \ + return ____##name(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ + } \ + static __always_inline \ + u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) + +#define BPF_CALL_0(name, ...) BPF_CALL_x(0, name, __VA_ARGS__) +#define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) +#define BPF_CALL_2(name, ...) BPF_CALL_x(2, name, __VA_ARGS__) +#define BPF_CALL_3(name, ...) BPF_CALL_x(3, name, __VA_ARGS__) +#define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) +#define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 03fd23d4d587..7b7baaed9ed4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1018,7 +1018,7 @@ void bpf_user_rnd_init_once(void) prandom_init_once(&bpf_user_rnd_state); } -u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_user_rnd_u32) { /* Should someone ever have the rather unwise idea to use some * of the registers passed into this function, then note that diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6df73bd1ba34..a5b8bf8cfcfd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -16,6 +16,7 @@ #include #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -26,24 +27,10 @@ * if program is allowed to access maps, so check rcu_read_lock_held in * all three functions. */ -static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - /* verifier checked that R1 contains a valid pointer to bpf_map - * and R2 points to a program stack and map->key_size bytes were - * initialized - */ - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value; - WARN_ON_ONCE(!rcu_read_lock_held()); - - value = map->ops->map_lookup_elem(map, key); - - /* lookup() returns either pointer to element value or NULL - * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type - */ - return (unsigned long) value; + return (unsigned long) map->ops->map_lookup_elem(map, key); } const struct bpf_func_proto bpf_map_lookup_elem_proto = { @@ -54,15 +41,11 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; -static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, + void *, value, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value = (void *) (unsigned long) r3; - WARN_ON_ONCE(!rcu_read_lock_held()); - - return map->ops->map_update_elem(map, key, value, r4); + return map->ops->map_update_elem(map, key, value, flags); } const struct bpf_func_proto bpf_map_update_elem_proto = { @@ -75,13 +58,9 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - WARN_ON_ONCE(!rcu_read_lock_held()); - return map->ops->map_delete_elem(map, key); } @@ -99,7 +78,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_smp_processor_id) { return smp_processor_id(); } @@ -110,7 +89,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); @@ -122,7 +101,7 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -138,7 +117,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_uid_gid) { struct task_struct *task = current; kuid_t uid; @@ -158,10 +137,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) { struct task_struct *task = current; - char *buf = (char *) (long) r1; if (unlikely(!task)) goto err_clear; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index bf4495fcd25d..732ae16d12b7 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -116,10 +116,9 @@ free_smap: return ERR_PTR(err); } -u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e63d7d435796..5dcb99281259 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -61,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); -static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { - void *dst = (void *) (long) r1; - int ret, size = (int) r2; - void *unsafe_ptr = (void *) (long) r3; + int ret; ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) @@ -83,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, + u32, size) { - void *unsafe_ptr = (void *) (long) r1; - void *src = (void *) (long) r2; - int size = (int) r3; - /* * Ensure we're in user context which is safe for the helper to * run. This helper has no business in a kthread. @@ -130,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ -static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) { - char *fmt = (char *) (long) r1; bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; @@ -178,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) switch (fmt_cnt) { case 1: - unsafe_addr = r3; - r3 = (long) buf; + unsafe_addr = arg1; + arg1 = (long) buf; break; case 2: - unsafe_addr = r4; - r4 = (long) buf; + unsafe_addr = arg2; + arg2 = (long) buf; break; case 3: - unsafe_addr = r5; - r5 = (long) buf; + unsafe_addr = arg3; + arg3 = (long) buf; break; } buf[0] = 0; @@ -209,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) } return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, - mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, - mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); + mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, + mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, + mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); } static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -233,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; @@ -312,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, return 0; } -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { - struct pt_regs *regs = (struct pt_regs *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; - void *data = (void *)(long) r4; struct perf_raw_record raw = { .frag = { .size = size, @@ -367,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, return __bpf_perf_event_output(regs, map, flags, &raw); } -static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_task) { return (long) current; } @@ -378,16 +370,13 @@ static const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { - struct bpf_map *map = (struct bpf_map *)(long)r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; - u32 idx = (u32)r2; if (unlikely(in_interrupt())) return -EINVAL; - if (unlikely(idx >= array->map.max_entries)) return -E2BIG; @@ -481,16 +470,17 @@ static struct bpf_prog_type_list kprobe_tl = { .type = BPF_PROG_TYPE_KPROBE, }; -static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + /* * r1 points to perf tracepoint buffer where first 8 bytes are hidden * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it - * from there and call the same bpf_perf_event_output() helper + * from there and call the same bpf_perf_event_output() helper inline. */ - u64 ctx = *(long *)(uintptr_t)r1; - - return bpf_perf_event_output(ctx, r2, index, r4, size); + return ____bpf_perf_event_output(regs, map, flags, data, size); } static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { @@ -504,11 +494,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg5_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags) { - u64 ctx = *(long *)(uintptr_t)r1; + struct pt_regs *regs = *(struct pt_regs **)tp_buff; - return bpf_get_stackid(ctx, r2, r3, r4, r5); + /* + * Same comment as in bpf_perf_event_output_tp(), only that this time + * the other helper's function body cannot be inlined due to being + * external, thus we need to call raw helper function. + */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); } static const struct bpf_func_proto bpf_get_stackid_proto_tp = { diff --git a/net/core/filter.c b/net/core/filter.c index d6d9bb89ce3a..298b146b47e7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -94,14 +94,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) } EXPORT_SYMBOL(sk_filter_trim_cap); -static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) { - return skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff(skb); } -static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -120,9 +119,8 @@ static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -145,7 +143,7 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_0(__get_raw_cpu_id) { return raw_smp_processor_id(); } @@ -1376,12 +1374,9 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); } -static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) +BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; - void *from = (void *) (long) r3; - unsigned int len = (unsigned int) r4; void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) @@ -1416,12 +1411,9 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, + void *, to, u32, len) { - const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1; - unsigned int offset = (unsigned int) r2; - void *to = (void *)(unsigned long) r3; - unsigned int len = (unsigned int) r4; void *ptr; if (unlikely(offset > 0xffff)) @@ -1449,10 +1441,9 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) @@ -1494,12 +1485,11 @@ static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; - unsigned int offset = (unsigned int) r2; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | @@ -1547,12 +1537,11 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) +BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, + __be32 *, to, u32, to_size, __wsum, seed) { struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); - u64 diff_size = from_size + to_size; - __be32 *from = (__be32 *) (long) r1; - __be32 *to = (__be32 *) (long) r3; + u32 diff_size = from_size + to_size; int i, j = 0; /* This is quite flexible, some examples: @@ -1610,9 +1599,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) return ret; } -static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; struct net_device *dev; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1648,7 +1636,7 @@ struct redirect_info { static DEFINE_PER_CPU(struct redirect_info, redirect_info); -static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); @@ -1687,9 +1675,9 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { - return task_get_classid((struct sk_buff *) (unsigned long) r1); + return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { @@ -1699,9 +1687,9 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { - return dst_tclassid((struct sk_buff *) (unsigned long) r1); + return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { @@ -1711,14 +1699,14 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. Later access to hash * can then use the inline skb->hash via context directly * instead of calling this helper again. */ - return skb_get_hash((struct sk_buff *) (unsigned long) r1); + return skb_get_hash(skb); } static const struct bpf_func_proto bpf_get_hash_recalc_proto = { @@ -1728,10 +1716,9 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, + u16, vlan_tci) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 vlan_proto = (__force __be16) r2; int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && @@ -1756,9 +1743,8 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { }; EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); -static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; int ret; bpf_push_mac_rcsum(skb); @@ -1933,10 +1919,9 @@ static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) return -ENOTSUPP; } -static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, + u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 proto = (__force __be16) r2; int ret; if (unlikely(flags)) @@ -1973,11 +1958,8 @@ static const struct bpf_func_proto bpf_skb_change_proto_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u32 pkt_type = r2; - /* We only allow a restricted subset to be changed for now. */ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || !skb_pkt_type_ok(pkt_type))) @@ -2028,12 +2010,11 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) return __skb_trim_rcsum(skb, new_len); } -static u64 bpf_skb_change_tail(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) { - struct sk_buff *skb = (struct sk_buff *)(long) r1; u32 max_len = __bpf_skb_max_len(skb); u32 min_len = __bpf_skb_min_len(skb); - u32 new_len = (u32) r2; int ret; if (unlikely(flags || new_len > max_len || new_len < min_len)) @@ -2113,13 +2094,10 @@ static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, return 0; } -static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, - u64 meta_size) +BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) { - struct sk_buff *skb = (struct sk_buff *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; - void *meta = (void *)(long) r4; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; @@ -2146,10 +2124,9 @@ static unsigned short bpf_tunnel_key_af(u64 flags) return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; } -static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, + u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; void *to_orig = to; @@ -2214,10 +2191,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u8 *to = (u8 *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); int err; @@ -2252,10 +2227,9 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { static struct metadata_dst __percpu *md_dst; -static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, + const struct bpf_tunnel_key *, from, u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; struct metadata_dst *md = this_cpu_ptr(md_dst); u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; @@ -2273,7 +2247,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) */ memcpy(compat, from, size); memset(compat + size, 0, sizeof(compat) - size); - from = (struct bpf_tunnel_key *)compat; + from = (const struct bpf_tunnel_key *) compat; break; default: return -EINVAL; @@ -2323,10 +2297,9 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, + const u8 *, from, u32, size) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u8 *from = (u8 *) (long) r2; struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); @@ -2372,23 +2345,20 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } -static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, + u32, idx) { - struct sk_buff *skb = (struct sk_buff *)(long)r1; - struct bpf_map *map = (struct bpf_map *)(long)r2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; struct sock *sk; - u32 i = (u32)r3; sk = skb->sk; if (!sk || !sk_fullsock(sk)) return -ENOENT; - - if (unlikely(i >= array->map.max_entries)) + if (unlikely(idx >= array->map.max_entries)) return -E2BIG; - cgrp = READ_ONCE(array->ptrs[i]); + cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; @@ -2411,13 +2381,10 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, return 0; } -static u64 bpf_xdp_event_output(u64 r1, u64 r2, u64 flags, u64 r4, - u64 meta_size) +BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) { - struct xdp_buff *xdp = (struct xdp_buff *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; - void *meta = (void *)(long) r4; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; -- cgit v1.2.3 From 181402a5c7899fad945485130ded47ca2bf1161e Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 9 Sep 2016 08:43:15 -0400 Subject: net: use IS_ENABLED() instead of checking for built-in or module The IS_ENABLED() macro checks if a Kconfig symbol has been enabled either built-in or as a module, use that macro instead of open coding the same. Using the macro makes the code more readable by helping abstract away some of the Kconfig built-in and module enable details. Signed-off-by: Javier Martinez Canillas Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 34b5322bc081..b0d307b6af19 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3904,8 +3904,7 @@ static void net_tx_action(struct softirq_action *h) } } -#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ - (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) +#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; -- cgit v1.2.3 From 69ae6ad2ff37911903a90256e216d7e7ae460002 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Fri, 16 Sep 2016 15:05:37 +0200 Subject: net: core: Add offload stats to if_stats_msg Add a nested attribute of offload stats to if_stats_msg named IFLA_STATS_LINK_OFFLOAD_XSTATS. Under it, add SW stats, meaning stats only per packets that went via slowpath to the cpu, named IFLA_OFFLOAD_XSTATS_CPU_HIT. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 9 ++++ net/core/rtnetlink.c | 111 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 116 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 9bf3aecfe05b..2351776a724f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -826,6 +826,7 @@ enum { IFLA_STATS_LINK_64, IFLA_STATS_LINK_XSTATS, IFLA_STATS_LINK_XSTATS_SLAVE, + IFLA_STATS_LINK_OFFLOAD_XSTATS, __IFLA_STATS_MAX, }; @@ -845,6 +846,14 @@ enum { }; #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) +/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ +enum { + IFLA_OFFLOAD_XSTATS_UNSPEC, + IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + __IFLA_OFFLOAD_XSTATS_MAX +}; +#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) + /* XDP section */ enum { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 937e459bdaa9..0dbae4244a89 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3577,6 +3577,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) (!idxattr || idxattr == attrid); } +#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1) +static int rtnl_get_offload_stats_attr_size(int attr_id) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return sizeof(struct rtnl_link_stats64); + } + + return 0; +} + +static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, + int *prividx) +{ + struct nlattr *attr = NULL; + int attr_id, size; + void *attr_data; + int err; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return -ENODATA; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (attr_id < *prividx) + continue; + + size = rtnl_get_offload_stats_attr_size(attr_id); + if (!size) + continue; + + if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + continue; + + attr = nla_reserve_64bit(skb, attr_id, size, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + goto nla_put_failure; + + attr_data = nla_data(attr); + memset(attr_data, 0, size); + err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, + attr_data); + if (err) + goto get_offload_stats_failure; + } + + if (!attr) + return -ENODATA; + + *prividx = 0; + return 0; + +nla_put_failure: + err = -EMSGSIZE; +get_offload_stats_failure: + *prividx = attr_id; + return err; +} + +static int rtnl_get_offload_stats_size(const struct net_device *dev) +{ + int nla_size = 0; + int attr_id; + int size; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return 0; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + continue; + size = rtnl_get_offload_stats_attr_size(attr_id); + nla_size += nla_total_size_64bit(size); + } + + if (nla_size != 0) + nla_size += nla_total_size(0); + + return nla_size; +} + static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, unsigned int filter_mask, @@ -3586,6 +3671,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; + int err; ASSERT_RTNL(); @@ -3614,8 +3700,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS); @@ -3639,8 +3723,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS_SLAVE); @@ -3655,6 +3737,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, + *idxattr)) { + *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; + attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); + if (!attr) + goto nla_put_failure; + + err = rtnl_get_offload_stats(skb, dev, prividx); + if (err == -ENODATA) + nla_nest_cancel(skb, attr); + else + nla_nest_end(skb, attr); + + if (err && err != -ENODATA) + goto nla_put_failure; + *idxattr = 0; + } + nlmsg_end(skb, nlh); return 0; @@ -3708,6 +3808,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) + size += rtnl_get_offload_stats_size(dev); + return size; } -- cgit v1.2.3 From d979a39d7242e0601bf9b60e89628fb8ac577179 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 19 Sep 2016 14:44:38 -0700 Subject: cgroup: duplicate cgroup reference when cloning sockets When a socket is cloned, the associated sock_cgroup_data is duplicated but not its reference on the cgroup. As a result, the cgroup reference count will underflow when both sockets are destroyed later on. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Link: http://lkml.kernel.org/r/20160914194846.11153-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Cc: Michal Hocko Cc: Vladimir Davydov Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 ++++++ net/core/sock.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..5e8dab5bf9ad 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6270,6 +6270,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) if (cgroup_sk_alloc_disabled) return; + /* Socket clone path */ + if (skcd->val) { + cgroup_get(sock_cgroup_ptr(skcd)); + return; + } + rcu_read_lock(); while (true) { diff --git a/net/core/sock.c b/net/core/sock.c index 25dab8b60223..fd7b41edf1ce 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1362,7 +1362,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); - cgroup_sk_alloc(&sk->sk_cgrp_data); } return sk; @@ -1422,6 +1421,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); + cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); } @@ -1566,6 +1566,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + + cgroup_sk_alloc(&newsk->sk_cgrp_data); + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) -- cgit v1.2.3 From 07b26c9454a2a19fff86d6fcf2aba6bc801eb8d8 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 19 Sep 2016 12:58:47 +0200 Subject: gso: Support partial splitting at the frag_list pointer Since commit 8a29111c7 ("net: gro: allow to build full sized skb") gro may build buffers with a frag_list. This can hurt forwarding because most NICs can't offload such packets, they need to be segmented in software. This patch splits buffers with a frag_list at the frag_list pointer into buffers that can be TSO offloaded. Signed-off-by: Steffen Klassert Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/skbuff.c | 51 +++++++++++++++++++++++++++++++++++++++----------- net/ipv4/af_inet.c | 14 ++++++++++---- net/ipv4/gre_offload.c | 6 ++++-- net/ipv4/tcp_offload.c | 13 +++++++------ net/ipv4/udp_offload.c | 6 ++++-- net/ipv6/ip6_offload.c | 5 ++++- 6 files changed, 69 insertions(+), 26 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1e329d411242..7bf82a28e10a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3097,11 +3097,31 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); - /* GSO partial only requires that we trim off any excess that - * doesn't fit into an MSS sized block, so take care of that - * now. - */ - if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) { + if (sg && csum && (mss != GSO_BY_FRAGS)) { + if (!(features & NETIF_F_GSO_PARTIAL)) { + struct sk_buff *iter; + + if (!list_skb || + !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) + goto normal; + + /* Split the buffer at the frag_list pointer. + * This is based on the assumption that all + * buffers in the chain excluding the last + * containing the same amount of data. + */ + skb_walk_frags(head_skb, iter) { + if (skb_headlen(iter)) + goto normal; + + len -= iter->len; + } + } + + /* GSO partial only requires that we trim off any excess that + * doesn't fit into an MSS sized block, so take care of that + * now. + */ partial_segs = len / mss; if (partial_segs > 1) mss *= partial_segs; @@ -3109,6 +3129,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, partial_segs = 0; } +normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -3300,21 +3321,29 @@ perform_csum_check: */ segs->prev = tail; - /* Update GSO info on first skb in partial sequence. */ if (partial_segs) { + struct sk_buff *iter; int type = skb_shinfo(head_skb)->gso_type; + unsigned short gso_size = skb_shinfo(head_skb)->gso_size; /* Update type to add partial and then remove dodgy if set */ - type |= SKB_GSO_PARTIAL; + type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; type &= ~SKB_GSO_DODGY; /* Update GSO info and prepare to start updating headers on * our way back down the stack of protocols. */ - skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size; - skb_shinfo(segs)->gso_segs = partial_segs; - skb_shinfo(segs)->gso_type = type; - SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset; + for (iter = segs; iter; iter = iter->next) { + skb_shinfo(iter)->gso_size = gso_size; + skb_shinfo(iter)->gso_segs = partial_segs; + skb_shinfo(iter)->gso_type = type; + SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; + } + + if (tail->len - doffset <= gso_size) + skb_shinfo(tail)->gso_size = 0; + else if (tail != segs) + skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); } /* Following permits correct backpressure, for protocols diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e94b47be0019..1effc986739e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1192,7 +1192,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { - bool udpfrag = false, fixedid = false, encap; + bool udpfrag = false, fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; unsigned int offset = 0; @@ -1245,6 +1245,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (IS_ERR_OR_NULL(segs)) goto out; + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); @@ -1259,9 +1261,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; } - tot_len = skb_shinfo(skb)->gso_size + - SKB_GSO_CB(skb)->data_offset + - skb->head - (unsigned char *)iph; + + if (gso_partial) + tot_len = skb_shinfo(skb)->gso_size + + SKB_GSO_CB(skb)->data_offset + + skb->head - (unsigned char *)iph; + else + tot_len = skb->len - nhoff; } else { if (!fixedid) iph->id = htons(id++); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index ecd1e09dbbf1..96e0efecefa6 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; - bool need_csum, ufo; + bool need_csum, ufo, gso_partial; if (!skb->encapsulation) goto out; @@ -69,6 +69,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, goto out; } + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + outer_hlen = skb_tnl_header_len(skb); gre_offset = outer_hlen - tnl_hlen; skb = segs; @@ -96,7 +98,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); - if (skb_is_gso(skb)) { + if (gso_partial) { unsigned int partial_adj; /* Adjust checksum to account for the fact that diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 5c5964962d0c..bc68da38ea86 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -90,12 +90,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, goto out; } - /* GSO partial only requires splitting the frame into an MSS - * multiple and possibly a remainder. So update the mss now. - */ - if (features & NETIF_F_GSO_PARTIAL) - mss = skb->len - (skb->len % mss); - copy_destructor = gso_skb->destructor == tcp_wfree; ooo_okay = gso_skb->ooo_okay; /* All segments but the first should have ooo_okay cleared */ @@ -108,6 +102,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, /* Only first segment might have ooo_okay set */ segs->ooo_okay = ooo_okay; + /* GSO partial and frag_list segmentation only requires splitting + * the frame into an MSS multiple and possibly a remainder, both + * cases return a GSO skb. So update the mss now. + */ + if (skb_is_gso(segs)) + mss *= skb_shinfo(segs)->gso_segs; + delta = htonl(oldlen + (thlen + mss)); skb = segs; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 81f253b6ff36..f9333c963607 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, __be16 new_protocol, bool is_ipv6) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - bool remcsum, need_csum, offload_csum, ufo; + bool remcsum, need_csum, offload_csum, ufo, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; @@ -88,6 +88,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, goto out; } + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + outer_hlen = skb_tnl_header_len(skb); udp_offset = outer_hlen - tnl_hlen; skb = segs; @@ -117,7 +119,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * will be using a length value equal to only one MSS sized * segment instead of the entire frame. */ - if (skb_is_gso(skb)) { + if (gso_partial) { uh->len = htons(skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)uh); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 22e90e56b5a9..e7bfd55899a3 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int offset = 0; bool encap, udpfrag; int nhoff; + bool gso_partial; skb_reset_network_header(skb); nhoff = skb_network_header(skb) - skb_mac_header(skb); @@ -101,9 +102,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (IS_ERR(segs)) goto out; + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - if (skb_is_gso(skb)) + if (gso_partial) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); -- cgit v1.2.3 From 36bbef52c7eb646ed6247055a2acd3851e317857 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Sep 2016 00:26:13 +0200 Subject: bpf: direct packet write and access for helpers for clsact progs This work implements direct packet access for helpers and direct packet write in a similar fashion as already available for XDP types via commits 4acf6c0b84c9 ("bpf: enable direct packet data write for xdp progs") and 6841de8b0d03 ("bpf: allow helpers access the packet directly"), and as a complementary feature to the already available direct packet read for tc (cls/act) programs. For enabling this, we need to introduce two helpers, bpf_skb_pull_data() and bpf_csum_update(). The first is generally needed for both, read and write, because they would otherwise only be limited to the current linear skb head. Usually, when the data_end test fails, programs just bail out, or, in the direct read case, use bpf_skb_load_bytes() as an alternative to overcome this limitation. If such data sits in non-linear parts, we can just pull them in once with the new helper, retest and eventually access them. At the same time, this also makes sure the skb is uncloned, which is, of course, a necessary condition for direct write. As this needs to be an invariant for the write part only, the verifier detects writes and adds a prologue that is calling bpf_skb_pull_data() to effectively unclone the skb from the very beginning in case it is indeed cloned. The heuristic makes use of a similar trick that was done in 233577a22089 ("net: filter: constify detection of pkt_type_offset"). This comes at zero cost for other programs that do not use the direct write feature. Should a program use this feature only sparsely and has read access for the most parts with, for example, drop return codes, then such write action can be delegated to a tail called program for mitigating this cost of potential uncloning to a late point in time where it would have been paid similarly with the bpf_skb_store_bytes() as well. Advantage of direct write is that the writes are inlined whereas the helper cannot make any length assumptions and thus needs to generate a call to memcpy() also for small sizes, as well as cost of helper call itself with sanity checks are avoided. Plus, when direct read is already used, we don't need to cache or perform rechecks on the data boundaries (due to verifier invalidating previous checks for helpers that change skb->data), so more complex programs using rewrites can benefit from switching to direct read plus write. For direct packet access to helpers, we save the otherwise needed copy into a temp struct sitting on stack memory when use-case allows. Both facilities are enabled via may_access_direct_pkt_data() in verifier. For now, we limit this to map helpers and csum_diff, and can successively enable other helpers where we find it makes sense. Helpers that definitely cannot be allowed for this are those part of bpf_helper_changes_skb_data() since they can change underlying data, and those that write into memory as this could happen for packet typed args when still cloned. bpf_csum_update() helper accommodates for the fact that we need to fixup checksum_complete when using direct write instead of bpf_skb_store_bytes(), meaning the programs can use available helpers like bpf_csum_diff(), and implement csum_add(), csum_sub(), csum_block_add(), csum_block_sub() equivalents in eBPF together with the new helper. A usage example will be provided for iproute2's examples/bpf/ directory. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 +- include/linux/skbuff.h | 14 ++++- include/uapi/linux/bpf.h | 21 ++++++++ kernel/bpf/helpers.c | 3 ++ kernel/bpf/verifier.c | 54 ++++++++++++++----- net/core/filter.c | 134 +++++++++++++++++++++++++++++++++++++++++------ 6 files changed, 196 insertions(+), 34 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9a904f63f8c1..5691fdc83819 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -96,6 +96,7 @@ enum bpf_return_type { struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; + bool pkt_access; enum bpf_return_type ret_type; enum bpf_arg_type arg1_type; enum bpf_arg_type arg2_type; @@ -151,7 +152,8 @@ struct bpf_verifier_ops { */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type); - + int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, + const struct bpf_prog *prog); u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn, struct bpf_prog *prog); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4c5662f05bda..c6dab3f7457c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -676,13 +676,23 @@ struct sk_buff { */ kmemcheck_bitfield_begin(flags1); __u16 queue_mapping; + +/* if you move cloned around you also must adapt those constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define CLONED_MASK (1 << 7) +#else +#define CLONED_MASK 1 +#endif +#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset) + + __u8 __cloned_offset[0]; __u8 cloned:1, nohdr:1, fclone:2, peeked:1, head_frag:1, - xmit_more:1; - /* one bit hole */ + xmit_more:1, + __unused:1; /* one bit hole */ kmemcheck_bitfield_end(flags1); /* fields enclosed in headers_start/headers_end are copied diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f896dfac4ac0..e07432b9f8b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -398,6 +398,27 @@ enum bpf_func_id { */ BPF_FUNC_skb_change_tail, + /** + * bpf_skb_pull_data(skb, len) + * The helper will pull in non-linear data in case the + * skb is non-linear and not all of len are part of the + * linear section. Only needed for read/write with direct + * packet access. + * @skb: pointer to skb + * @len: len to make read/writeable + * Return: 0 on success or negative error + */ + BPF_FUNC_skb_pull_data, + + /** + * bpf_csum_update(skb, csum) + * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. + * @skb: pointer to skb + * @csum: csum to add + * Return: csum on success or negative error + */ + BPF_FUNC_csum_update, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a5b8bf8cfcfd..39918402e6e9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -36,6 +36,7 @@ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -51,6 +52,7 @@ BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -67,6 +69,7 @@ BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bc138f34e38c..3a75ee3bdcd1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -196,6 +196,7 @@ struct verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool seen_direct_write; }; #define BPF_COMPLEXITY_LIMIT_INSNS 65536 @@ -204,6 +205,7 @@ struct verifier_env { struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; + bool pkt_access; int regno; int access_size; }; @@ -654,10 +656,17 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff -static bool may_write_pkt_data(enum bpf_prog_type type) +static bool may_access_direct_pkt_data(struct verifier_env *env, + const struct bpf_call_arg_meta *meta) { - switch (type) { + switch (env->prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + if (meta) + return meta->pkt_access; + + env->seen_direct_write = true; return true; default: return false; @@ -817,7 +826,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -950,8 +959,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && !may_write_pkt_data(env->prog->type)) { - verbose("helper access to the packet is not allowed for clsact\n"); + if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + verbose("helper access to the packet is not allowed\n"); return -EACCES; } @@ -1191,6 +1200,7 @@ static int check_call(struct verifier_env *env, int func_id) changes_data = bpf_helper_changes_skb_data(fn->func); memset(&meta, 0, sizeof(meta)); + meta.pkt_access = fn->pkt_access; /* We only support one arg being in raw mode at the moment, which * is sufficient for the helper functions we have right now. @@ -2675,18 +2685,35 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) */ static int convert_ctx_accesses(struct verifier_env *env) { - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - struct bpf_insn insn_buf[16]; + const struct bpf_verifier_ops *ops = env->prog->aux->ops; + struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i; + int i, insn_cnt, cnt; - if (!env->prog->aux->ops->convert_ctx_access) + if (ops->gen_prologue) { + cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, + env->prog); + if (cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } else if (cnt) { + new_prog = bpf_patch_insn_single(env->prog, 0, + insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + } + } + + if (!ops->convert_ctx_access) return 0; + insn_cnt = env->prog->len; + insn = env->prog->insnsi; + for (i = 0; i < insn_cnt; i++, insn++) { - u32 insn_delta, cnt; + u32 insn_delta; if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) @@ -2703,9 +2730,8 @@ static int convert_ctx_accesses(struct verifier_env *env) continue; } - cnt = env->prog->aux->ops-> - convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, + insn->off, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; diff --git a/net/core/filter.c b/net/core/filter.c index 298b146b47e7..0920c2ac1d00 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1362,6 +1362,11 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, return err; } +static int bpf_try_make_head_writable(struct sk_buff *skb) +{ + return bpf_try_make_writable(skb, skb_headlen(skb)); +} + static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) @@ -1441,6 +1446,28 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; +BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto bpf_skb_pull_data_proto = { + .func = bpf_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -1567,6 +1594,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_STACK, .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, @@ -1575,6 +1603,26 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) +{ + /* The interface is to be used in combination with bpf_csum_diff() + * for direct packet writes. csum rotation for alignment as well + * as emulating csum_sub() can be done from the eBPF program. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + return (skb->csum = csum_add(skb->csum, csum)); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_csum_update_proto = { + .func = bpf_csum_update, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -1602,6 +1650,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; + struct sk_buff *clone; + int ret; if (unlikely(flags & ~(BPF_F_INGRESS))) return -EINVAL; @@ -1610,14 +1660,25 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) if (unlikely(!dev)) return -EINVAL; - skb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb)) + clone = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!clone)) return -ENOMEM; - bpf_push_mac_rcsum(skb); + /* For direct write, we need to keep the invariant that the skbs + * we're dealing with need to be uncloned. Should uncloning fail + * here, we need to free the just generated clone to unclone once + * again. + */ + ret = bpf_try_make_head_writable(skb); + if (unlikely(ret)) { + kfree_skb(clone); + return -ENOMEM; + } + + bpf_push_mac_rcsum(clone); return flags & BPF_F_INGRESS ? - __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); + __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -2063,19 +2124,14 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { bool bpf_helper_changes_skb_data(void *func) { - if (func == bpf_skb_vlan_push) - return true; - if (func == bpf_skb_vlan_pop) - return true; - if (func == bpf_skb_store_bytes) - return true; - if (func == bpf_skb_change_proto) - return true; - if (func == bpf_skb_change_tail) - return true; - if (func == bpf_l3_csum_replace) - return true; - if (func == bpf_l4_csum_replace) + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_tail || + func == bpf_skb_pull_data || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace) return true; return false; @@ -2440,8 +2496,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -2533,6 +2593,45 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) + */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return TC_ACT_SHOT; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); + *insn++ = BPF_EXIT_INSN(); + + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) @@ -2810,6 +2909,7 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, }; static const struct bpf_verifier_ops xdp_ops = { -- cgit v1.2.3 From bfca4c520f7ea78138ddccea2de18dc062b0fefd Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:09 +0300 Subject: net: skbuff: Export __skb_vlan_pop This exports the functionality of extracting the tag from the payload, without moving next vlan tag into hw accel tag. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c6dab3f7457c..9bf60b556bd2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3085,6 +3085,7 @@ bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, int write_len); +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7bf82a28e10a..6c22351bd519 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4522,8 +4522,10 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len) } EXPORT_SYMBOL(skb_ensure_writable); -/* remove VLAN header from packet and update csum accordingly. */ -static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +/* remove VLAN header from packet and update csum accordingly. + * expects a non skb_vlan_tag_present skb with a vlan tag payload + */ +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr; unsigned int offset = skb->data - skb_mac_header(skb); @@ -4554,6 +4556,7 @@ pull: return err; } +EXPORT_SYMBOL(__skb_vlan_pop); int skb_vlan_pop(struct sk_buff *skb) { -- cgit v1.2.3 From 636c2628086e40c86dac7ddc84a1c4b4fcccc6e3 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Tue, 20 Sep 2016 12:48:36 +0300 Subject: net: skbuff: Remove errornous length validation in skb_vlan_pop() In 93515d53b1 "net: move vlan pop/push functions into common code" skb_vlan_pop was moved from its private location in openvswitch to skbuff common code. In case skb has non hw-accel vlan tag, the original 'pop_vlan()' assured that skb->len is sufficient (if skb->len < VLAN_ETH_HLEN then pop was considered a no-op). This validation was moved as is into the new common 'skb_vlan_pop'. Alas, in its original location (openvswitch), there was a guarantee that 'data' points to the mac_header, therefore the 'skb->len < VLAN_ETH_HLEN' condition made sense. However there's no such guarantee in the generic 'skb_vlan_pop'. For short packets received in rx path going through 'skb_vlan_pop', this causes 'skb_vlan_pop' to fail pop-ing a valid vlan hdr (in the non hw-accel case) or to fail moving next tag into hw-accel tag. Remove the 'skb->len < VLAN_ETH_HLEN' condition entirely: It is superfluous since inner '__skb_vlan_pop' already verifies there are VLAN_ETH_HLEN writable bytes at the mac_header. Note this presents a slight change to skb_vlan_pop() users: In case total length is smaller than VLAN_ETH_HLEN, skb_vlan_pop() now returns an error, as opposed to previous "no-op" behavior. Existing callers (e.g. tc act vlan, ovs) usually drop the packet if 'skb_vlan_pop' fails. Fixes: 93515d53b1 ("net: move vlan pop/push functions into common code") Signed-off-by: Shmulik Ladkani Cc: Pravin Shelar Reviewed-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/skbuff.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6c22351bd519..b2a51bf1b0f9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4567,9 +4567,8 @@ int skb_vlan_pop(struct sk_buff *skb) if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { - if (unlikely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (unlikely(skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); @@ -4577,9 +4576,8 @@ int skb_vlan_pop(struct sk_buff *skb) return err; } /* move next vlan tag to hw accel tag */ - if (likely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (likely(skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD))) return 0; vlan_proto = skb->protocol; -- cgit v1.2.3 From ecf4ee41d25832a6ec52f8b54dfaa46c08b949d5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Tue, 20 Sep 2016 12:48:37 +0300 Subject: net: skbuff: Coding: Use eth_type_vlan() instead of open coding it Fix 'skb_vlan_pop' to use eth_type_vlan instead of directly comparing skb->protocol to ETH_P_8021Q or ETH_P_8021AD. Signed-off-by: Shmulik Ladkani Reviewed-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/skbuff.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b2a51bf1b0f9..d36c7548952f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4567,8 +4567,7 @@ int skb_vlan_pop(struct sk_buff *skb) if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { - if (unlikely(skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD))) + if (unlikely(!eth_type_vlan(skb->protocol))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); @@ -4576,8 +4575,7 @@ int skb_vlan_pop(struct sk_buff *skb) return err; } /* move next vlan tag to hw accel tag */ - if (likely(skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD))) + if (likely(!eth_type_vlan(skb->protocol))) return 0; vlan_proto = skb->protocol; -- cgit v1.2.3 From 2d48c5f9335e48ddac7a52db10bf3bfd01986b9c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Sep 2016 01:28:35 +0200 Subject: bpf: use skb_to_full_sk helper in bpf_skb_under_cgroup We need to use skb_to_full_sk() helper introduced in commit bd5eb35f16a9 ("xfrm: take care of request sockets") as otherwise we miss tcp synack messages, since ownership is on request socket and therefore it would miss the sk_fullsock() check. Use skb_to_full_sk() as also done similarly in the bpf_get_cgroup_classid() helper via 2309236c13fe ("cls_cgroup: get sk_classid only from full sockets") fix to not let this fall through. Fixes: 4a482f34afcc ("cgroup: bpf: Add bpf_skb_in_cgroup_proto") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 0920c2ac1d00..e5d997759d5e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2408,7 +2408,7 @@ BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, struct cgroup *cgrp; struct sock *sk; - sk = skb->sk; + sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return -ENOENT; if (unlikely(idx >= array->map.max_entries)) -- cgit v1.2.3 From 669dc4d76d0ecc2d795df735839f43cfddf9f617 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Sep 2016 01:28:36 +0200 Subject: bpf: use bpf_get_smp_processor_id_proto instead of raw one Same motivation as in commit 80b48c445797 ("bpf: don't use raw processor id in generic helper"), but this time for XDP typed programs. Thus, allow for preemption checks when we have DEBUG_PREEMPT enabled, and otherwise use the raw variant. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index e5d997759d5e..acf84fbfb043 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2551,6 +2551,8 @@ xdp_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_xdp_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 7a4b28c6cc9ffac50f791b99cc7e46106436e5d8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Sep 2016 01:28:37 +0200 Subject: bpf: add helper to invalidate hash Add a small helper that complements 36bbef52c7eb ("bpf: direct packet write and access for helpers for clsact progs") for invalidating the current skb->hash after mangling on headers via direct packet write. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ net/core/filter.c | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e07432b9f8b8..f09c70b97eca 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -419,6 +419,13 @@ enum bpf_func_id { */ BPF_FUNC_csum_update, + /** + * bpf_set_hash_invalid(skb) + * Invalidate current skb>hash. + * @skb: pointer to skb + */ + BPF_FUNC_set_hash_invalid, + __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index acf84fbfb043..00351cdf7d0c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1777,6 +1777,22 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) +{ + /* After all direct packet write, this can be used once for + * triggering a lazy recalc on next skb_get_hash() invocation. + */ + skb_clear_hash(skb); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_invalid_proto = { + .func = bpf_set_hash_invalid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { @@ -2534,6 +2550,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: -- cgit v1.2.3 From 79aab093a0b5370d7fc4e99df75996f4744dc03f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:15 +0300 Subject: net: Update API for VF vlan protocol 802.1ad support Introduce new rtnl UAPI that exposes a list of vlans per VF, giving the ability for user-space application to specify it for the VF, as an option to support 802.1ad. We adjusted IP Link tool to support this option. For future use cases, the new UAPI supports multiple vlans. For now we limit the list size to a single vlan in kernel. Add IFLA_VF_VLAN_LIST in addition to IFLA_VF_VLAN to keep backward compatibility with older versions of IP Link tool. Add a vlan protocol parameter to the ndo_set_vf_vlan callback. We kept 802.1Q as the drivers' default vlan protocol. Suitable ip link tool command examples: Set vf vlan protocol 802.1ad: ip link set eth0 vf 1 vlan 100 proto 802.1ad Set vf to VST (802.1Q) mode: ip link set eth0 vf 1 vlan 100 proto 802.1Q Or by omitting the new parameter ip link set eth0 vf 1 vlan 100 Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 3 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 9 ++- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 6 +- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h | 2 +- drivers/net/ethernet/emulex/benet/be_main.c | 6 +- drivers/net/ethernet/intel/fm10k/fm10k.h | 2 +- drivers/net/ethernet/intel/fm10k/fm10k_iov.c | 6 +- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 11 ++- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 4 +- drivers/net/ethernet/intel/igb/igb_main.c | 9 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 5 +- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 6 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 6 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h | 2 +- .../net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c | 5 +- drivers/net/ethernet/sfc/sriov.c | 5 +- drivers/net/ethernet/sfc/sriov.h | 2 +- include/linux/if_link.h | 1 + include/linux/netdevice.h | 6 +- include/uapi/linux/if_link.h | 19 ++++- net/core/rtnetlink.c | 80 ++++++++++++++++++---- 23 files changed, 161 insertions(+), 42 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index 0e68fadecfdb..243cb9748d35 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -492,7 +492,8 @@ int __bnx2x_setup_tc(struct net_device *dev, u32 handle, __be16 proto, int bnx2x_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi); int bnx2x_set_vf_mac(struct net_device *dev, int queue, u8 *mac); -int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos); +int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto); /* select_queue callback */ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 6c586b045d1d..3f77d0863543 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -2521,7 +2521,8 @@ void bnx2x_pf_set_vfs_vlan(struct bnx2x *bp) for_each_vf(bp, vfidx) { bulletin = BP_VF_BULLETIN(bp, vfidx); if (bulletin->valid_bitmap & (1 << VLAN_VALID)) - bnx2x_set_vf_vlan(bp->dev, vfidx, bulletin->vlan, 0); + bnx2x_set_vf_vlan(bp->dev, vfidx, bulletin->vlan, 0, + htons(ETH_P_8021Q)); } } @@ -2781,7 +2782,8 @@ static int bnx2x_set_vf_vlan_filter(struct bnx2x *bp, struct bnx2x_virtf *vf, return 0; } -int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos) +int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos, + __be16 vlan_proto) { struct pf_vf_bulletin_content *bulletin = NULL; struct bnx2x *bp = netdev_priv(dev); @@ -2796,6 +2798,9 @@ int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos) return -EINVAL; } + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + DP(BNX2X_MSG_IOV, "configuring VF %d with VLAN %d qos %d\n", vfidx, vlan, 0); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index 8be718508600..ec6cd18842c3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -174,7 +174,8 @@ int bnxt_set_vf_mac(struct net_device *dev, int vf_id, u8 *mac) return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); } -int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos) +int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos, + __be16 vlan_proto) { struct hwrm_func_cfg_input req = {0}; struct bnxt *bp = netdev_priv(dev); @@ -185,6 +186,9 @@ int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos) if (bp->hwrm_spec_code < 0x10201) return -ENOTSUPP; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + rc = bnxt_vf_ndo_prep(bp, vf_id); if (rc) return rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h index 0392670ab49c..1ab72e4820af 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h @@ -12,7 +12,7 @@ int bnxt_get_vf_config(struct net_device *, int, struct ifla_vf_info *); int bnxt_set_vf_mac(struct net_device *, int, u8 *); -int bnxt_set_vf_vlan(struct net_device *, int, u16, u8); +int bnxt_set_vf_vlan(struct net_device *, int, u16, u8, __be16); int bnxt_set_vf_bw(struct net_device *, int, int, int); int bnxt_set_vf_link_state(struct net_device *, int, int); int bnxt_set_vf_spoofchk(struct net_device *, int, bool); diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 9a94840c5757..ac513e6627d1 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1895,7 +1895,8 @@ static int be_clear_vf_tvt(struct be_adapter *adapter, int vf) return 0; } -static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) +static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct be_adapter *adapter = netdev_priv(netdev); struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; @@ -1907,6 +1908,9 @@ static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) if (vf >= adapter->num_vfs || vlan > 4095 || qos > 7) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + if (vlan || qos) { vlan |= qos << VLAN_PRIO_SHIFT; status = be_set_vf_tvt(adapter, vf, vlan); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h index 67ff01aeb11a..4d19e46f7c55 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h @@ -507,7 +507,7 @@ int fm10k_iov_configure(struct pci_dev *pdev, int num_vfs); s32 fm10k_iov_update_pvid(struct fm10k_intfc *interface, u16 glort, u16 pvid); int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac); int fm10k_ndo_set_vf_vlan(struct net_device *netdev, - int vf_idx, u16 vid, u8 qos); + int vf_idx, u16 vid, u8 qos, __be16 vlan_proto); int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, int rate, int unused); int fm10k_ndo_get_vf_config(struct net_device *netdev, diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c index d9dec81f6b6d..5f4dac0d36ef 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c @@ -445,7 +445,7 @@ int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac) } int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid, - u8 qos) + u8 qos, __be16 vlan_proto) { struct fm10k_intfc *interface = netdev_priv(netdev); struct fm10k_iov_data *iov_data = interface->iov_data; @@ -460,6 +460,10 @@ int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid, if (qos || (vid > (VLAN_VID_MASK - 1))) return -EINVAL; + /* VF VLAN Protocol part to default is unsupported */ + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + vf_info = &iov_data->vf_info[vf_idx]; /* exit if there is nothing to do */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index da3423561b3a..724d8740d4cc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2747,11 +2747,12 @@ error_param: * @vf_id: VF identifier * @vlan_id: mac address * @qos: priority setting + * @vlan_proto: vlan protocol * * program VF vlan id and/or qos **/ -int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, - int vf_id, u16 vlan_id, u8 qos) +int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, + u16 vlan_id, u8 qos, __be16 vlan_proto) { u16 vlanprio = vlan_id | (qos << I40E_VLAN_PRIORITY_SHIFT); struct i40e_netdev_priv *np = netdev_priv(netdev); @@ -2774,6 +2775,12 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, goto error_pvid; } + if (vlan_proto != htons(ETH_P_8021Q)) { + dev_err(&pf->pdev->dev, "VF VLAN protocol is not supported\n"); + ret = -EPROTONOSUPPORT; + goto error_pvid; + } + vf = &(pf->vf[vf_id]); vsi = pf->vsi[vf->lan_vsi_idx]; if (!test_bit(I40E_VF_STAT_INIT, &vf->vf_states)) { diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 875174141451..4012d069939a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -129,8 +129,8 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf); /* VF configuration related iplink handlers */ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac); -int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, - int vf_id, u16 vlan_id, u8 qos); +int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, + u16 vlan_id, u8 qos, __be16 vlan_proto); int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate, int max_tx_rate); int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index af75eac5fa16..a83aa13a5bf4 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -169,7 +169,7 @@ static int igb_set_vf_mac(struct igb_adapter *, int, unsigned char *); static void igb_restore_vf_multicasts(struct igb_adapter *adapter); static int igb_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac); static int igb_ndo_set_vf_vlan(struct net_device *netdev, - int vf, u16 vlan, u8 qos); + int vf, u16 vlan, u8 qos, __be16 vlan_proto); static int igb_ndo_set_vf_bw(struct net_device *, int, int, int); static int igb_ndo_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting); @@ -6222,14 +6222,17 @@ static int igb_disable_port_vlan(struct igb_adapter *adapter, int vf) return 0; } -static int igb_ndo_set_vf_vlan(struct net_device *netdev, - int vf, u16 vlan, u8 qos) +static int igb_ndo_set_vf_vlan(struct net_device *netdev, int vf, + u16 vlan, u8 qos, __be16 vlan_proto) { struct igb_adapter *adapter = netdev_priv(netdev); if ((vf >= adapter->vfs_allocated_count) || (vlan > 4095) || (qos > 7)) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return (vlan || qos) ? igb_enable_port_vlan(adapter, vf, vlan, qos) : igb_disable_port_vlan(adapter, vf); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index 8618599dfd6f..b18590a995db 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -1354,13 +1354,16 @@ static int ixgbe_disable_port_vlan(struct ixgbe_adapter *adapter, int vf) return err; } -int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) +int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, + u8 qos, __be16 vlan_proto) { int err = 0; struct ixgbe_adapter *adapter = netdev_priv(netdev); if ((vf >= adapter->num_vfs) || (vlan > 4095) || (qos > 7)) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; if (vlan || qos) { /* Check if there is already a port VLAN set, if so * we have to delete the old one first before we diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h index 47e65e2f886a..0c7977d27b71 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h @@ -43,7 +43,7 @@ void ixgbe_disable_tx_rx(struct ixgbe_adapter *adapter); void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter); int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int queue, u8 *mac); int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int queue, u16 vlan, - u8 qos); + u8 qos, __be16 vlan_proto); int ixgbe_link_mbps(struct ixgbe_adapter *adapter); int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int min_tx_rate, int max_tx_rate); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index a94f8a3f026c..132eeeafcdc4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2400,11 +2400,15 @@ static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac) return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac_u64); } -static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos) +static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct mlx4_en_priv *en_priv = netdev_priv(dev); struct mlx4_en_dev *mdev = en_priv->mdev; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c12792314be7..b58cfe37dead 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2917,11 +2917,15 @@ static int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac) return mlx5_eswitch_set_vport_mac(mdev->priv.eswitch, vf + 1, mac); } -static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos) +static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return mlx5_eswitch_set_vport_vlan(mdev->priv.eswitch, vf + 1, vlan, qos); } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index cd23a2946db7..0e198fe89d1a 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -100,7 +100,8 @@ static int qede_alloc_rx_buffer(struct qede_dev *edev, static void qede_link_update(void *dev, struct qed_link_output *link); #ifdef CONFIG_QED_SRIOV -static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos) +static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct qede_dev *edev = netdev_priv(ndev); @@ -109,6 +110,9 @@ static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos) return -EINVAL; } + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + DP_VERBOSE(edev, QED_MSG_IOV, "Setting Vlan 0x%04x to VF [%d]\n", vlan, vf); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h index 24061b9b92e8..5f327659efa7 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h @@ -238,7 +238,7 @@ int qlcnic_sriov_set_vf_mac(struct net_device *, int, u8 *); int qlcnic_sriov_set_vf_tx_rate(struct net_device *, int, int, int); int qlcnic_sriov_get_vf_config(struct net_device *, int , struct ifla_vf_info *); -int qlcnic_sriov_set_vf_vlan(struct net_device *, int, u16, u8); +int qlcnic_sriov_set_vf_vlan(struct net_device *, int, u16, u8, __be16); int qlcnic_sriov_set_vf_spoofchk(struct net_device *, int, bool); #else static inline void qlcnic_sriov_pf_disable(struct qlcnic_adapter *adapter) {} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c index afd687e5e779..50eaafa3eaba 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c @@ -1915,7 +1915,7 @@ int qlcnic_sriov_set_vf_tx_rate(struct net_device *netdev, int vf, } int qlcnic_sriov_set_vf_vlan(struct net_device *netdev, int vf, - u16 vlan, u8 qos) + u16 vlan, u8 qos, __be16 vlan_proto) { struct qlcnic_adapter *adapter = netdev_priv(netdev); struct qlcnic_sriov *sriov = adapter->ahw->sriov; @@ -1928,6 +1928,9 @@ int qlcnic_sriov_set_vf_vlan(struct net_device *netdev, int vf, if (vf >= sriov->num_vfs || qos > 7) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + if (vlan > MAX_VLAN_ID) { netdev_err(netdev, "Invalid VLAN ID, allowed range is [0 - %d]\n", diff --git a/drivers/net/ethernet/sfc/sriov.c b/drivers/net/ethernet/sfc/sriov.c index 816c44689e67..9abcf4aded30 100644 --- a/drivers/net/ethernet/sfc/sriov.c +++ b/drivers/net/ethernet/sfc/sriov.c @@ -22,7 +22,7 @@ int efx_sriov_set_vf_mac(struct net_device *net_dev, int vf_i, u8 *mac) } int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan, - u8 qos) + u8 qos, __be16 vlan_proto) { struct efx_nic *efx = netdev_priv(net_dev); @@ -31,6 +31,9 @@ int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan, (qos & ~(VLAN_PRIO_MASK >> VLAN_PRIO_SHIFT))) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return efx->type->sriov_set_vf_vlan(efx, vf_i, vlan, qos); } else { return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/sfc/sriov.h b/drivers/net/ethernet/sfc/sriov.h index 400df526586d..ba1762e7f216 100644 --- a/drivers/net/ethernet/sfc/sriov.h +++ b/drivers/net/ethernet/sfc/sriov.h @@ -16,7 +16,7 @@ int efx_sriov_set_vf_mac(struct net_device *net_dev, int vf_i, u8 *mac); int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan, - u8 qos); + u8 qos, __be16 vlan_proto); int efx_sriov_set_vf_spoofchk(struct net_device *net_dev, int vf_i, bool spoofchk); int efx_sriov_get_vf_config(struct net_device *net_dev, int vf_i, diff --git a/include/linux/if_link.h b/include/linux/if_link.h index f923d15b432c..0b17c585b5cd 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -25,5 +25,6 @@ struct ifla_vf_info { __u32 max_tx_rate; __u32 rss_query_en; __u32 trusted; + __be16 vlan_proto; }; #endif /* _LINUX_IF_LINK_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69f242c71865..1e8a5c734d72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -946,7 +946,8 @@ struct netdev_xdp { * * SR-IOV management functions. * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac); - * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos); + * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, + * u8 qos, __be16 proto); * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, * int max_tx_rate); * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); @@ -1187,7 +1188,8 @@ struct net_device_ops { int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); int (*ndo_set_vf_vlan)(struct net_device *dev, - int queue, u16 vlan, u8 qos); + int queue, u16 vlan, + u8 qos, __be16 proto); int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7ec9e99d5491..b4fba662cd32 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -619,7 +619,7 @@ enum { enum { IFLA_VF_UNSPEC, IFLA_VF_MAC, /* Hardware queue specific attributes */ - IFLA_VF_VLAN, + IFLA_VF_VLAN, /* VLAN ID and QoS */ IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ @@ -631,6 +631,7 @@ enum { IFLA_VF_TRUST, /* Trust VF */ IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ + IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ __IFLA_VF_MAX, }; @@ -647,6 +648,22 @@ struct ifla_vf_vlan { __u32 qos; }; +enum { + IFLA_VF_VLAN_INFO_UNSPEC, + IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ + __IFLA_VF_VLAN_INFO_MAX, +}; + +#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) +#define MAX_VLAN_LIST_LEN 1 + +struct ifla_vf_vlan_info { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; + __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ +}; + struct ifla_vf_tx_rate { __u32 vf; __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0dbae4244a89..3ac8946bf244 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -843,7 +843,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, size += nla_total_size(num_vfs * sizeof(struct nlattr)); size += num_vfs * (nla_total_size(sizeof(struct ifla_vf_mac)) + - nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct nlattr)) + + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + @@ -1111,14 +1114,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct nlattr *vfinfo) { struct ifla_vf_rss_query_en vf_rss_query_en; + struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; - struct nlattr *vf, *vfstats; struct ifla_vf_mac vf_mac; struct ifla_vf_info ivi; @@ -1135,11 +1139,14 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; + /* VLAN Protocol by default is 802.1Q */ + ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; vf_mac.vf = vf_vlan.vf = + vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = @@ -1150,6 +1157,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; + vf_vlan_info.vlan = ivi.vlan; + vf_vlan_info.qos = ivi.qos; + vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; @@ -1158,10 +1168,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start(skb, IFLA_VF_INFO); - if (!vf) { - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vf) + goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), @@ -1177,17 +1185,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) - return -EMSGSIZE; + goto nla_put_vf_failure; + vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST); + if (!vfvlanlist) + goto nla_put_vf_failure; + if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), + &vf_vlan_info)) { + nla_nest_cancel(skb, vfvlanlist); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfvlanlist); memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start(skb, IFLA_VF_STATS); - if (!vfstats) { - nla_nest_cancel(skb, vf); - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vfstats) + goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, @@ -1199,11 +1213,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) - return -EMSGSIZE; + vf_stats.multicast, IFLA_VF_STATS_PAD)) { + nla_nest_cancel(skb, vfstats); + goto nla_put_vf_failure; + } nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); return 0; + +nla_put_vf_failure: + nla_nest_cancel(skb, vf); +nla_put_vfinfo_failure: + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) @@ -1448,6 +1470,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, @@ -1704,7 +1727,34 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, - ivv->qos); + ivv->qos, + htons(ETH_P_8021Q)); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_VLAN_LIST]) { + struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; + struct nlattr *attr; + int rem, len = 0; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_vlan) + return err; + + nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { + if (nla_type(attr) != IFLA_VF_VLAN_INFO || + nla_len(attr) < NLA_HDRLEN) { + return -EINVAL; + } + if (len >= MAX_VLAN_LIST_LEN) + return -EOPNOTSUPP; + ivvl[len] = nla_data(attr); + + len++; + } + err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, + ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } -- cgit v1.2.3