diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/datagram.c | 204 | ||||
-rw-r--r-- | net/core/dev.c | 230 | ||||
-rw-r--r-- | net/core/dev_addr_lists.c | 100 | ||||
-rw-r--r-- | net/core/dev_ioctl.c | 4 | ||||
-rw-r--r-- | net/core/devlink.c | 5 | ||||
-rw-r--r-- | net/core/ethtool.c | 12 | ||||
-rw-r--r-- | net/core/filter.c | 485 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 13 | ||||
-rw-r--r-- | net/core/gro_cells.c | 1 | ||||
-rw-r--r-- | net/core/neighbour.c | 455 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 2 | ||||
-rw-r--r-- | net/core/net_namespace.c | 162 | ||||
-rw-r--r-- | net/core/netpoll.c | 9 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 527 | ||||
-rw-r--r-- | net/core/skbuff.c | 318 | ||||
-rw-r--r-- | net/core/skmsg.c | 28 | ||||
-rw-r--r-- | net/core/sock.c | 30 | ||||
-rw-r--r-- | net/core/sock_reuseport.c | 1 | ||||
-rw-r--r-- | net/core/stream.c | 2 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 20 |
20 files changed, 1989 insertions, 619 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c index 57f3a6fcfc1e..b2651bb6d2a3 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -408,27 +408,20 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) } EXPORT_SYMBOL(skb_kill_datagram); -/** - * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: iovec iterator to copy to - * @len: amount of data to copy from buffer to iovec - */ -int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len) +int __skb_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, bool fault_short, + size_t (*cb)(const void *, size_t, void *, struct iov_iter *), + void *data) { int start = skb_headlen(skb); int i, copy = start - offset, start_off = offset, n; struct sk_buff *frag_iter; - trace_skb_copy_datagram_iovec(skb, len); - /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; - n = copy_to_iter(skb->data + offset, copy, to); + n = cb(skb->data + offset, copy, data, to); offset += n; if (n != copy) goto short_copy; @@ -445,11 +438,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { + struct page *page = skb_frag_page(frag); + u8 *vaddr = kmap(page); + if (copy > len) copy = len; - n = copy_page_to_iter(skb_frag_page(frag), - frag->page_offset + offset - - start, copy, to); + n = cb(vaddr + frag->page_offset + + offset - start, copy, data, to); + kunmap(page); offset += n; if (n != copy) goto short_copy; @@ -468,8 +464,8 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, if ((copy = end - offset) > 0) { if (copy > len) copy = len; - if (skb_copy_datagram_iter(frag_iter, offset - start, - to, copy)) + if (__skb_datagram_iter(frag_iter, offset - start, + to, copy, fault_short, cb, data)) goto fault; if ((len -= copy) == 0) return 0; @@ -490,11 +486,50 @@ fault: return -EFAULT; short_copy: - if (iov_iter_count(to)) + if (fault_short || iov_iter_count(to)) goto fault; return 0; } + +/** + * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator + * and update a hash. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + * @hash: hash request to update + */ +int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, + struct ahash_request *hash) +{ + return __skb_datagram_iter(skb, offset, to, len, true, + hash_and_copy_to_iter, hash); +} +EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); + +static size_t simple_copy_to_iter(const void *addr, size_t bytes, + void *data __always_unused, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + +/** + * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + */ +int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len) +{ + trace_skb_copy_datagram_iovec(skb, len); + return __skb_datagram_iter(skb, offset, to, len, false, + simple_copy_to_iter, NULL); +} EXPORT_SYMBOL(skb_copy_datagram_iter); /** @@ -645,131 +680,22 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) } EXPORT_SYMBOL(zerocopy_sg_from_iter); +/** + * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator + * and update a checksum. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + * @csump: checksum pointer + */ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, __wsum *csump) { - int start = skb_headlen(skb); - int i, copy = start - offset, start_off = offset; - struct sk_buff *frag_iter; - int pos = 0; - int n; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to); - offset += n; - if (n != copy) - goto fault; - if ((len -= copy) == 0) - return 0; - pos = copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - __wsum csum2 = 0; - struct page *page = skb_frag_page(frag); - u8 *vaddr = kmap(page); - - if (copy > len) - copy = len; - n = csum_and_copy_to_iter(vaddr + frag->page_offset + - offset - start, copy, - &csum2, to); - kunmap(page); - offset += n; - if (n != copy) - goto fault; - *csump = csum_block_add(*csump, csum2, pos); - if (!(len -= copy)) - return 0; - pos += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - __wsum csum2 = 0; - if (copy > len) - copy = len; - if (skb_copy_and_csum_datagram(frag_iter, - offset - start, - to, copy, - &csum2)) - goto fault; - *csump = csum_block_add(*csump, csum2, pos); - if ((len -= copy) == 0) - return 0; - offset += copy; - pos += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - iov_iter_revert(to, offset - start_off); - return -EFAULT; -} - -__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) -{ - __sum16 sum; - - sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - if (!skb_shared(skb)) - skb->csum_valid = !sum; - return sum; -} -EXPORT_SYMBOL(__skb_checksum_complete_head); - -__sum16 __skb_checksum_complete(struct sk_buff *skb) -{ - __wsum csum; - __sum16 sum; - - csum = skb_checksum(skb, 0, skb->len, 0); - - /* skb->csum holds pseudo checksum */ - sum = csum_fold(csum_add(skb->csum, csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - - if (!skb_shared(skb)) { - /* Save full packet checksum */ - skb->csum = csum; - skb->ip_summed = CHECKSUM_COMPLETE; - skb->csum_complete_sw = 1; - skb->csum_valid = !sum; - } - - return sum; + return __skb_datagram_iter(skb, offset, to, len, true, + csum_and_copy_to_iter, csump); } -EXPORT_SYMBOL(__skb_checksum_complete); /** * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. @@ -810,7 +736,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(NULL); + netdev_rx_csum_fault(NULL, skb); } return 0; fault: diff --git a/net/core/dev.c b/net/core/dev.c index 77d43ae2a7bb..1b5a4410be0e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -145,6 +145,7 @@ #include <linux/sctp.h> #include <net/udp_tunnel.h> #include <linux/net_namespace.h> +#include <linux/indirect_call_wrapper.h> #include "net-sysfs.h" @@ -162,6 +163,9 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack); static struct napi_struct *napi_by_id(unsigned int napi_id); /* @@ -1361,7 +1365,7 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); -static int __dev_open(struct net_device *dev) +static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; @@ -1377,7 +1381,7 @@ static int __dev_open(struct net_device *dev) */ netpoll_poll_disable(dev); - ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; @@ -1406,7 +1410,8 @@ static int __dev_open(struct net_device *dev) /** * dev_open - prepare an interface for use. - * @dev: device to open + * @dev: device to open + * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally @@ -1416,14 +1421,14 @@ static int __dev_open(struct net_device *dev) * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. */ -int dev_open(struct net_device *dev) +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; - ret = __dev_open(dev); + ret = __dev_open(dev, extack); if (ret < 0) return ret; @@ -1585,6 +1590,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) + N(PRE_CHANGEADDR) } #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -1733,6 +1739,18 @@ static int call_netdevice_notifiers_info(unsigned long val, return raw_notifier_call_chain(&netdev_chain, val, info); } +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_info info = { + .dev = dev, + .extack = extack, + }; + + return call_netdevice_notifiers_info(val, &info); +} + /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -1744,11 +1762,7 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info = { - .dev = dev, - }; - - return call_netdevice_notifiers_info(val, &info); + return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -2175,6 +2189,20 @@ static bool remove_xps_queue_cpu(struct net_device *dev, return active; } +static void reset_xps_maps(struct net_device *dev, + struct xps_dev_maps *dev_maps, + bool is_rxqs_map) +{ + if (is_rxqs_map) { + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + } else { + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + } + static_key_slow_dec_cpuslocked(&xps_needed); + kfree_rcu(dev_maps, rcu); +} + static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, struct xps_dev_maps *dev_maps, unsigned int nr_ids, u16 offset, u16 count, bool is_rxqs_map) @@ -2186,18 +2214,15 @@ static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, j < nr_ids;) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); - if (!active) { - if (is_rxqs_map) { - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - } else { - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + if (!active) + reset_xps_maps(dev, dev_maps, is_rxqs_map); - for (i = offset + (count - 1); count--; i--) - netdev_queue_numa_node_write( - netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); + if (!is_rxqs_map) { + for (i = offset + (count - 1); count--; i--) { + netdev_queue_numa_node_write( + netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); } - kfree_rcu(dev_maps, rcu); } } @@ -2234,10 +2259,6 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, false); out_no_maps: - if (static_key_enabled(&xps_rxqs_needed)) - static_key_slow_dec_cpuslocked(&xps_rxqs_needed); - - static_key_slow_dec_cpuslocked(&xps_needed); mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } @@ -2355,9 +2376,12 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!new_dev_maps) goto out_no_new_maps; - static_key_slow_inc_cpuslocked(&xps_needed); - if (is_rxqs_map) - static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + if (!dev_maps) { + /* Increment static keys at most once per type */ + static_key_slow_inc_cpuslocked(&xps_needed); + if (is_rxqs_map) + static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + } for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { @@ -2455,13 +2479,8 @@ out_no_new_maps: } /* free map if not active */ - if (!active) { - if (is_rxqs_map) - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - else - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - kfree_rcu(dev_maps, rcu); - } + if (!active) + reset_xps_maps(dev, dev_maps, is_rxqs_map); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -3091,10 +3110,17 @@ EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG -void netdev_rx_csum_fault(struct net_device *dev) +void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { if (net_ratelimit()) { pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); + if (dev) + pr_err("dev features: %pNF\n", &dev->features); + pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n", + skb->len, skb->data_len, skb->pkt_type, + skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, + skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level); dump_stack(); } } @@ -3272,7 +3298,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de } skb = next; - if (netif_xmit_stopped(txq) && skb) { + if (netif_tx_queue_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } @@ -4520,9 +4546,14 @@ static int netif_rx_internal(struct sk_buff *skb) int netif_rx(struct sk_buff *skb) { + int ret; + trace_netif_rx_entry(skb); - return netif_rx_internal(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_rx); @@ -4537,6 +4568,7 @@ int netif_rx_ni(struct sk_buff *skb) if (local_softirq_pending()) do_softirq(); preempt_enable(); + trace_netif_rx_ni_exit(err); return err; } @@ -4889,7 +4921,7 @@ skip_classify: * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; @@ -5009,7 +5041,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; - list_del(&skb->list); + skb_list_del_init(skb); __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; @@ -5165,7 +5197,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(netdev_tstamp_prequeue, skb); - list_del(&skb->list); + skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); } @@ -5176,7 +5208,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) rcu_read_lock(); list_for_each_entry_safe(skb, next, head, list) { xdp_prog = rcu_dereference(skb->dev->xdp_prog); - list_del(&skb->list); + skb_list_del_init(skb); if (do_xdp_generic(xdp_prog, skb) == XDP_PASS) list_add_tail(&skb->list, &sublist); } @@ -5195,7 +5227,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) if (cpu >= 0) { /* Will be handled, remove from list */ - list_del(&skb->list); + skb_list_del_init(skb); enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } @@ -5222,9 +5254,14 @@ static void netif_receive_skb_list_internal(struct list_head *head) */ int netif_receive_skb(struct sk_buff *skb) { + int ret; + trace_netif_receive_skb_entry(skb); - return netif_receive_skb_internal(skb); + ret = netif_receive_skb_internal(skb); + trace_netif_receive_skb_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_receive_skb); @@ -5244,9 +5281,12 @@ void netif_receive_skb_list(struct list_head *head) if (list_empty(head)) return; - list_for_each_entry(skb, head, list) - trace_netif_receive_skb_list_entry(skb); + if (trace_netif_receive_skb_list_entry_enabled()) { + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); + } netif_receive_skb_list_internal(head); + trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); @@ -5299,6 +5339,8 @@ static void flush_all_backlogs(void) put_online_cpus(); } +INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); static int napi_gro_complete(struct sk_buff *skb) { struct packet_offload *ptype; @@ -5318,7 +5360,9 @@ static int napi_gro_complete(struct sk_buff *skb) if (ptype->type != type || !ptype->callbacks.gro_complete) continue; - err = ptype->callbacks.gro_complete(skb, 0); + err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, 0); break; } rcu_read_unlock(); @@ -5357,11 +5401,13 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - u32 i; + unsigned long bitmask = napi->gro_bitmask; + unsigned int i, base = ~0U; - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - if (test_bit(i, &napi->gro_bitmask)) - __napi_gro_flush_chain(napi, i, flush_old); + while ((i = ffs(bitmask)) != 0) { + bitmask >>= i; + base += i; + __napi_gro_flush_chain(napi, base, flush_old); } } EXPORT_SYMBOL(napi_gro_flush); @@ -5386,7 +5432,9 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, } diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); + if (skb_vlan_tag_present(p)) + diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) @@ -5461,6 +5509,10 @@ static void gro_flush_oldest(struct list_head *head) napi_gro_complete(oldest); } +INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, + struct sk_buff *)); static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); @@ -5510,7 +5562,9 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->csum_valid = 0; } - pp = ptype->callbacks.gro_receive(gro_head, skb); + pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + gro_head, skb); break; } rcu_read_unlock(); @@ -5634,12 +5688,17 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + gro_result_t ret; + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); - return napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + trace_napi_gro_receive_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_receive); @@ -5652,9 +5711,13 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); skb->dev = napi->dev; skb->skb_iif = 0; + + /* eth_type_trans() assumes pkt_type is PACKET_HOST */ + skb->pkt_type = PACKET_HOST; + skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); @@ -5753,6 +5816,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) gro_result_t napi_gro_frags(struct napi_struct *napi) { + gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); if (!skb) @@ -5760,7 +5824,10 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) trace_napi_gro_frags_entry(skb); - return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_frags_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_frags); @@ -5776,10 +5843,11 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); } NAPI_GRO_CB(skb)->csum = wsum; @@ -5966,11 +6034,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (work_done) timeout = n->dev->gro_flush_timeout; + /* When the NAPI instance uses a timeout and keeps postponing + * it, we need to bound somehow the time packets are kept in + * the GRO layer + */ + napi_gro_flush(n, !!timeout); if (timeout) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); - else - napi_gro_flush(n, false); } if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ @@ -6197,8 +6268,8 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) - pr_err_once("netif_napi_add() called with weight %d on device %s\n", - weight, dev->name); + netdev_err_once(dev, "%s() called with weight %d\n", __func__, + weight); napi->weight = weight; list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; @@ -7455,7 +7526,8 @@ unsigned int dev_get_flags(const struct net_device *dev) } EXPORT_SYMBOL(dev_get_flags); -int __dev_change_flags(struct net_device *dev, unsigned int flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; @@ -7492,7 +7564,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) if (old_flags & IFF_UP) __dev_close(dev); else - ret = __dev_open(dev); + ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { @@ -7552,16 +7624,18 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, * dev_change_flags - change device settings * @dev: device * @flags: device state flags + * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. */ -int dev_change_flags(struct net_device *dev, unsigned int flags) +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; - ret = __dev_change_flags(dev, flags); + ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; @@ -7694,13 +7768,36 @@ void dev_set_group(struct net_device *dev, int new_group) EXPORT_SYMBOL(dev_set_group); /** + * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. + * @dev: device + * @addr: new address + * @extack: netlink extended ack + */ +int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_pre_changeaddr_info info = { + .info.dev = dev, + .info.extack = extack, + .dev_addr = addr, + }; + int rc; + + rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); + return notifier_to_errno(rc); +} +EXPORT_SYMBOL(dev_pre_changeaddr_notify); + +/** * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address + * @extack: netlink extended ack * * Change the hardware (MAC) address of the device */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; @@ -7711,6 +7808,9 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; + err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); + if (err) + return err; err = ops->ndo_set_mac_address(dev, sa); if (err) return err; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index d884d8f5f0e5..a6723b306717 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -278,6 +278,103 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, EXPORT_SYMBOL(__hw_addr_sync_dev); /** + * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking + * into account references + * @list: address list to synchronize + * @dev: device to sync + * @sync: function to call if address or reference on it should be added + * @unsync: function to call if address or some reference on it should removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address or references on it + * add/remove notifications. The unsync function may be NULL in which case + * the addresses or references on it requiring removal will simply be + * removed without any notification to the device. That is responsibility of + * the driver to identify and distribute address or references on it between + * internal address tables. + **/ +int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, + const unsigned char *, int), + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + int err, ref_cnt; + + /* first go through and flush out any unsynced/stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address is not used */ + if ((ha->sync_cnt << 1) <= ha->refcount) + continue; + + /* if fails defer unsyncing address */ + ref_cnt = ha->refcount - ha->sync_cnt; + if (unsync && unsync(dev, ha->addr, ref_cnt)) + continue; + + ha->refcount = (ref_cnt << 1) + 1; + ha->sync_cnt = ref_cnt; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync updated/new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address added or reused */ + if ((ha->sync_cnt << 1) >= ha->refcount) + continue; + + ref_cnt = ha->refcount - ha->sync_cnt; + err = sync(dev, ha->addr, ref_cnt); + if (err) + return err; + + ha->refcount = ref_cnt << 1; + ha->sync_cnt = ref_cnt; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_ref_sync_dev); + +/** + * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on + * it from device + * @list: address list to remove synchronized addresses (references on it) from + * @dev: device to sync + * @unsync: function to call if address and references on it should be removed + * + * Remove all addresses that were added to the device by + * __hw_addr_ref_sync_dev(). This function is intended to be called from the + * ndo_stop or ndo_open functions on devices that require explicit address (or + * references on it) add/remove notifications. If the unsync function pointer + * is NULL then this function can be used to just reset the sync_cnt for the + * addresses in the list. + **/ +void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) + continue; + + ha->refcount -= ha->sync_cnt - 1; + ha->sync_cnt = 0; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); + +/** * __hw_addr_unsync_dev - Remove synchronized addresses from device * @list: address list to remove synchronized addresses from * @dev: device to sync @@ -401,6 +498,9 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, ASSERT_RTNL(); + err = dev_pre_changeaddr_notify(dev, addr, NULL); + if (err) + return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 90e8aa36881e..31380fd5a4e2 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -234,7 +234,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) switch (cmd) { case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); + return dev_change_flags(dev, ifr->ifr_flags, NULL); case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ @@ -246,7 +246,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) case SIOCSIFHWADDR: if (dev->addr_len > sizeof(struct sockaddr)) return -EINVAL; - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) diff --git a/net/core/devlink.c b/net/core/devlink.c index 3a4b29a13d31..abb0da9d7b4b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2692,6 +2692,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, + .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, + .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d05402868575..158264f7cfaf 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -793,8 +793,13 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, if (rc >= 0) info.n_priv_flags = rc; } - if (ops->get_regs_len) - info.regdump_len = ops->get_regs_len(dev); + if (ops->get_regs_len) { + int ret = ops->get_regs_len(dev); + + if (ret > 0) + info.regdump_len = ret; + } + if (ops->get_eeprom_len) info.eedump_len = ops->get_eeprom_len(dev); @@ -1337,6 +1342,9 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) return -EFAULT; reglen = ops->get_regs_len(dev); + if (reglen <= 0) + return reglen; + if (regs.len > reglen) regs.len = reglen; diff --git a/net/core/filter.c b/net/core/filter.c index e521c5ebc7d1..447dd1bad31f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -296,22 +296,18 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, break; case SKF_AD_VLAN_TAG: - case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); - if (skb_field == SKF_AD_VLAN_TAG) { - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, - ~VLAN_TAG_PRESENT); - } else { - /* dst_reg >>= 12 */ - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); - /* dst_reg &= 1 */ + break; + case SKF_AD_VLAN_TAG_PRESENT: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); - } break; } @@ -467,7 +463,8 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); - *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + if (offset) + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { @@ -2428,6 +2425,174 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { .arg4_type = ARG_ANYTHING, }; +static void sk_msg_shift_left(struct sk_msg *msg, int i) +{ + int prev; + + do { + prev = i; + sk_msg_iter_var_next(i); + msg->sg.data[prev] = msg->sg.data[i]; + } while (i != msg->sg.end); + + sk_msg_iter_prev(msg, end); +} + +static void sk_msg_shift_right(struct sk_msg *msg, int i) +{ + struct scatterlist tmp, sge; + + sk_msg_iter_next(msg, end); + sge = sk_msg_elem_cpy(msg, i); + sk_msg_iter_var_next(i); + tmp = sk_msg_elem_cpy(msg, i); + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sk_msg_iter_var_next(i); + sge = tmp; + tmp = sk_msg_elem_cpy(msg, i); + } +} + +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + u32 i = 0, l, space, offset = 0; + u64 last = start + len; + int pop; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + offset += l; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + /* Bounds checks: start and pop must be inside message */ + if (start >= offset + l || last >= msg->sg.size) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + pop = len; + /* --------------| offset + * -| start |-------- len -------| + * + * |----- a ----|-------- pop -------|----- b ----| + * |______________________________________________| length + * + * + * a: region at front of scatter element to save + * b: region at back of scatter element to save when length > A + pop + * pop: region to pop from element, same as input 'pop' here will be + * decremented below per iteration. + * + * Two top-level cases to handle when start != offset, first B is non + * zero and second B is zero corresponding to when a pop includes more + * than one element. + * + * Then if B is non-zero AND there is no space allocate space and + * compact A, B regions into page. If there is space shift ring to + * the rigth free'ing the next element in ring to place B, leaving + * A untouched except to reduce length. + */ + if (start != offset) { + struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); + int a = start; + int b = sge->length - pop - a; + + sk_msg_iter_var_next(i); + + if (pop < sge->length - a) { + if (space) { + sge->length = a; + sk_msg_shift_right(msg, i); + nsge = sk_msg_elem(msg, i); + get_page(sg_page(sge)); + sg_set_page(nsge, + sg_page(sge), + b, sge->offset + pop + a); + } else { + struct page *page, *orig; + u8 *to, *from; + + page = alloc_pages(__GFP_NOWARN | + __GFP_COMP | GFP_ATOMIC, + get_order(a + b)); + if (unlikely(!page)) + return -ENOMEM; + + sge->length = a; + orig = sg_page(sge); + from = sg_virt(sge); + to = page_address(page); + memcpy(to, from, a); + memcpy(to + a, from + a + pop, b); + sg_set_page(sge, page, a + b, 0); + put_page(orig); + } + pop = 0; + } else if (pop >= sge->length - a) { + sge->length = a; + pop -= (sge->length - a); + } + } + + /* From above the current layout _must_ be as follows, + * + * -| offset + * -| start + * + * |---- pop ---|---------------- b ------------| + * |____________________________________________| length + * + * Offset and start of the current msg elem are equal because in the + * previous case we handled offset != start and either consumed the + * entire element and advanced to the next element OR pop == 0. + * + * Two cases to handle here are first pop is less than the length + * leaving some remainder b above. Simply adjust the element's layout + * in this case. Or pop >= length of the element so that b = 0. In this + * case advance to next element decrementing pop. + */ + while (pop) { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (pop < sge->length) { + sge->length -= pop; + sge->offset += pop; + pop = 0; + } else { + pop -= sge->length; + sk_msg_shift_left(msg, i); + } + sk_msg_iter_var_next(i); + } + + sk_mem_uncharge(msg->sk, len - pop); + msg->sg.size -= (len - pop); + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_pop_data_proto = { + .func = bpf_msg_pop_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3908,6 +4073,26 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +static const struct bpf_func_proto bpf_sockopt_event_output_proto = { + .func = bpf_sockopt_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -4825,47 +5010,40 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, - struct sk_buff *skb, u8 family, u8 proto) + int dif, int sdif, u8 family, u8 proto) { bool refcounted = false; struct sock *sk = NULL; - int dif = 0; - - if (skb->dev) - dif = skb->dev->ifindex; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; - int sdif = inet_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, - dif, sdif, &udp_table, skb); + dif, sdif, &udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; - u16 hnum = ntohs(tuple->ipv6.dport); - int sdif = inet6_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); else if (likely(ipv6_bpf_stub)) sk = ipv6_bpf_stub->udp6_lib_lookup(net, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, tuple->ipv6.dport, dif, sdif, - &udp_table, skb); + &udp_table, NULL); #endif } @@ -4882,31 +5060,34 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, * callers to satisfy BPF_CALL declarations. */ static unsigned long -bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - u8 proto, u64 netns_id, u64 flags) +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) { - struct net *caller_net; struct sock *sk = NULL; u8 family = AF_UNSPEC; struct net *net; + int sdif; family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; - if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) + if (unlikely(family == AF_UNSPEC || flags || + !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; - if (skb->dev) - caller_net = dev_net(skb->dev); + if (family == AF_INET) + sdif = inet_sdif(skb); else - caller_net = sock_net(skb->sk); - if (netns_id) { + sdif = inet6_sdif(skb); + + if ((s32)netns_id < 0) { + net = caller_net; + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); + } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); - } else { - net = caller_net; - sk = sk_lookup(net, tuple, skb, family, proto); } if (sk) @@ -4915,6 +5096,25 @@ out: return (unsigned long) sk; } +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + int ifindex; + + if (skb->dev) { + caller_net = dev_net(skb->dev); + ifindex = skb->dev->ifindex; + } else { + caller_net = sock_net(skb->sk); + ifindex = 0; + } + + return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, + proto, netns_id, flags); +} + BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { @@ -4964,6 +5164,87 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_SOCKET, }; + +BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { + .func = bpf_xdp_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { + .func = bpf_xdp_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { + .func = bpf_sock_addr_sk_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { + .func = bpf_sock_addr_sk_lookup_udp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -4986,6 +5267,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_msg_push_data || + func == bpf_msg_pop_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || @@ -5070,6 +5352,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sock_addr_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sock_addr_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); } @@ -5214,6 +5504,14 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_udp: + return &bpf_xdp_sk_lookup_udp_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_xdp_sk_lookup_tcp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -5240,6 +5538,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_perf_event_output: + return &bpf_sockopt_event_output_proto; default: return bpf_base_func_proto(func_id); } @@ -5264,6 +5564,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; + case BPF_FUNC_msg_pop_data: + return &bpf_msg_pop_data_proto; default: return bpf_base_func_proto(func_id); } @@ -5436,8 +5738,12 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != size_default) return false; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): - if (size != sizeof(struct bpf_flow_keys *)) + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + if (size != sizeof(__u64)) + return false; + break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (size != sizeof(__u64)) return false; break; default: @@ -5465,8 +5771,10 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5490,7 +5798,8 @@ static bool cg_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): @@ -5505,6 +5814,10 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (!capable(CAP_SYS_ADMIN)) + return false; + break; default: return false; } @@ -5531,7 +5844,9 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5741,6 +6056,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + case bpf_ctx_range(struct __sk_buff, tstamp): break; default: return false; @@ -5757,7 +6073,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5959,7 +6275,9 @@ static bool sk_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5995,6 +6313,9 @@ static bool sk_msg_is_valid_access(int off, int size, if (type == BPF_WRITE) return false; + if (off % size != 0) + return false; + switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; @@ -6006,16 +6327,20 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; - default: + case bpf_ctx_range(struct sk_msg_md, family): + case bpf_ctx_range(struct sk_msg_md, remote_ip4): + case bpf_ctx_range(struct sk_msg_md, local_ip4): + case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct sk_msg_md, remote_port): + case bpf_ctx_range(struct sk_msg_md, local_port): + case bpf_ctx_range(struct sk_msg_md, size): if (size != sizeof(__u32)) return false; - } - - if (off < 0 || off >= sizeof(struct sk_msg_md)) - return false; - if (off % size != 0) + break; + default: return false; - + } return true; } @@ -6040,12 +6365,14 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): info->reg_type = PTR_TO_FLOW_KEYS; break; case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -6140,19 +6467,19 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, vlan_present): - case offsetof(struct __sk_buff, vlan_tci): - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + break; + case offsetof(struct __sk_buff, vlan_tci): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); - if (si->off == offsetof(struct __sk_buff, vlan_tci)) { - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, - ~VLAN_TAG_PRESENT); - } else { - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); - } break; case offsetof(struct __sk_buff, cb[0]) ... @@ -6355,6 +6682,33 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; + + case offsetof(struct __sk_buff, tstamp): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); + break; + + case offsetof(struct __sk_buff, wire_len): + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, wire_len); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, pkt_len); + *target_size = 4; + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); } return insn - insn_buf; @@ -7071,6 +7425,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, int off; #endif + /* convert ctx uses the fact sg element is first in struct */ + BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); + switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), @@ -7183,6 +7540,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct sk_msg_md, size): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_sg, size)); + break; } return insn - insn_buf; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 676f3ad629f9..9f2840510e63 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -783,6 +783,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, /* Pass parameters to the BPF program */ cb->qdisc_cb.flow_keys = &flow_keys; flow_keys.nhoff = nhoff; + flow_keys.thoff = nhoff; bpf_compute_data_pointers((struct sk_buff *)skb); result = BPF_PROG_RUN(attached, skb); @@ -790,9 +791,12 @@ bool __skb_flow_dissect(const struct sk_buff *skb, /* Restore state */ memcpy(cb, &cb_saved, sizeof(cb_saved)); + flow_keys.nhoff = clamp_t(u16, flow_keys.nhoff, 0, skb->len); + flow_keys.thoff = clamp_t(u16, flow_keys.thoff, + flow_keys.nhoff, skb->len); + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); - key_control->thoff = min_t(u16, key_control->thoff, skb->len); rcu_read_unlock(); return result == BPF_OK; } @@ -952,8 +956,7 @@ proto_again: if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); - key_vlan->vlan_priority = - (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); + key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); } else { key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; @@ -1166,8 +1169,8 @@ ip_proto_again: break; } - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS)) { + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS) && + !(key_control->flags & FLOW_DIS_IS_FRAGMENT)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index 4b54e5f107c6..acf45ddbe924 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -84,6 +84,7 @@ void gro_cells_destroy(struct gro_cells *gcells) for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); + napi_disable(&cell->napi); netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 41954e42a2de..763a7b08df67 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -118,21 +118,77 @@ unsigned long neigh_rand_reach_time(unsigned long base) } EXPORT_SYMBOL(neigh_rand_reach_time); +static void neigh_mark_dead(struct neighbour *n) +{ + n->dead = 1; + if (!list_empty(&n->gc_list)) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } +} + +static void neigh_update_gc_list(struct neighbour *n) +{ + bool on_gc_list, exempt_from_gc; + + write_lock_bh(&n->tbl->lock); + write_lock(&n->lock); + + /* remove from the gc list if new state is permanent or if neighbor + * is externally learned; otherwise entry should be on the gc list + */ + exempt_from_gc = n->nud_state & NUD_PERMANENT || + n->flags & NTF_EXT_LEARNED; + on_gc_list = !list_empty(&n->gc_list); + + if (exempt_from_gc && on_gc_list) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } else if (!exempt_from_gc && !on_gc_list) { + /* add entries to the tail; cleaning removes from the front */ + list_add_tail(&n->gc_list, &n->tbl->gc_list); + atomic_inc(&n->tbl->gc_entries); + } + + write_unlock(&n->lock); + write_unlock_bh(&n->tbl->lock); +} -static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, - struct neighbour __rcu **np, struct neigh_table *tbl) +static bool neigh_update_ext_learned(struct neighbour *neigh, u32 flags, + int *notify) +{ + bool rc = false; + u8 ndm_flags; + + if (!(flags & NEIGH_UPDATE_F_ADMIN)) + return rc; + + ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; + if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { + if (ndm_flags & NTF_EXT_LEARNED) + neigh->flags |= NTF_EXT_LEARNED; + else + neigh->flags &= ~NTF_EXT_LEARNED; + rc = true; + *notify = 1; + } + + return rc; +} + +static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, + struct neigh_table *tbl) { bool retval = false; write_lock(&n->lock); - if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) && - !(n->flags & flags)) { + if (refcount_read(&n->refcnt) == 1) { struct neighbour *neigh; neigh = rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock)); rcu_assign_pointer(*np, neigh); - n->dead = 1; + neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); @@ -158,7 +214,7 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) while ((n = rcu_dereference_protected(*np, lockdep_is_held(&tbl->lock)))) { if (n == ndel) - return neigh_del(n, 0, 0, np, tbl); + return neigh_del(n, np, tbl); np = &n->next; } return false; @@ -166,32 +222,29 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) static int neigh_forced_gc(struct neigh_table *tbl) { + int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2; + unsigned long tref = jiffies - 5 * HZ; + struct neighbour *n, *tmp; int shrunk = 0; - int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour *n; - struct neighbour __rcu **np; - np = &nht->hash_buckets[i]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { - /* Neighbour record may be discarded if: - * - nobody refers to it. - * - it is not permanent - */ - if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np, - tbl)) { - shrunk = 1; - continue; - } - np = &n->next; + list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { + if (refcount_read(&n->refcnt) == 1) { + bool remove = false; + + write_lock(&n->lock); + if ((n->nud_state == NUD_FAILED) || + time_after(tref, n->updated)) + remove = true; + write_unlock(&n->lock); + + if (remove && neigh_remove_one(n, tbl)) + shrunk++; + if (shrunk >= max_clean) + break; } } @@ -260,8 +313,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); - n->dead = 1; - + neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. We must destroy neighbour entry, @@ -321,13 +373,18 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) } EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, + struct net_device *dev, + bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries; - entries = atomic_inc_return(&tbl->entries) - 1; + if (exempt_from_gc) + goto do_alloc; + + entries = atomic_inc_return(&tbl->gc_entries) - 1; if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { @@ -340,6 +397,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device } } +do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; @@ -358,11 +416,15 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; + INIT_LIST_HEAD(&n->gc_list); + + atomic_inc(&tbl->entries); out: return n; out_entries: - atomic_dec(&tbl->entries); + if (!exempt_from_gc) + atomic_dec(&tbl->gc_entries); goto out; } @@ -505,13 +567,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev, bool want_ref) +static struct neighbour *___neigh_create(struct neigh_table *tbl, + const void *pkey, + struct net_device *dev, + bool exempt_from_gc, bool want_ref) { + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc); u32 hash_val; unsigned int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; if (!n) { @@ -574,6 +638,9 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, } n->dead = 0; + if (!exempt_from_gc) + list_add_tail(&n->gc_list, &n->tbl->gc_list); + if (want_ref) neigh_hold(n); rcu_assign_pointer(n->next, @@ -591,6 +658,12 @@ out_neigh_release: neigh_release(n); goto out; } + +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) +{ + return ___neigh_create(tbl, pkey, dev, false, want_ref); +} EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) @@ -652,6 +725,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, if (!n) goto out; + n->protocol = 0; write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; @@ -854,7 +928,7 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; - n->dead = 1; + neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; @@ -1137,9 +1211,11 @@ static void neigh_update_hhs(struct neighbour *neigh) Caller MUST hold reference count on the entry. */ -int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, - u32 flags, u32 nlmsg_pid) +static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, + u8 new, u32 flags, u32 nlmsg_pid, + struct netlink_ext_ack *extack) { + bool ext_learn_change = false; u8 old; int err; int notify = 0; @@ -1155,10 +1231,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; - if (neigh->dead) + if (neigh->dead) { + NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); goto out; + } - neigh_update_ext_learned(neigh, flags, ¬ify); + ext_learn_change = neigh_update_ext_learned(neigh, flags, ¬ify); if (!(new & NUD_VALID)) { neigh_del_timer(neigh); @@ -1193,8 +1271,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, use it, otherwise discard the request. */ err = -EINVAL; - if (!(old & NUD_VALID)) + if (!(old & NUD_VALID)) { + NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; + } lladdr = neigh->ha; } @@ -1302,11 +1382,20 @@ out: neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); + if (((new ^ old) & NUD_PERMANENT) || ext_learn_change) + neigh_update_gc_list(neigh); + if (notify) neigh_update_notify(neigh, nlmsg_pid); return err; } + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags, u32 nlmsg_pid) +{ + return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); +} EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is @@ -1571,6 +1660,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); + INIT_LIST_HEAD(&tbl->gc_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); @@ -1662,6 +1752,19 @@ static struct neigh_table *neigh_find_table(int family) return tbl; } +const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, + [NDA_PROBES] = { .type = NLA_U32 }, + [NDA_VLAN] = { .type = NLA_U16 }, + [NDA_PORT] = { .type = NLA_U16 }, + [NDA_VNI] = { .type = NLA_U32 }, + [NDA_IFINDEX] = { .type = NLA_U32 }, + [NDA_MASTER] = { .type = NLA_U32 }, + [NDA_PROTOCOL] = { .type = NLA_U8 }, +}; + static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1678,8 +1781,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); - if (dst_attr == NULL) + if (!dst_attr) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1694,8 +1799,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(dst_attr) < (int)tbl->key_len) + if (nla_len(dst_attr) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); @@ -1711,10 +1818,9 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = neigh_update(neigh, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, + NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh, tbl); @@ -1736,16 +1842,19 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev = NULL; struct neighbour *neigh; void *dst, *lladdr; + u8 protocol = 0; int err; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nda_policy, extack); if (err < 0) goto out; err = -EINVAL; - if (tb[NDA_DST] == NULL) + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1755,19 +1864,27 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) + if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { + NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; + } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } + dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + if (tb[NDA_PROTOCOL]) + protocol = nla_get_u8(tb[NDA_PROTOCOL]); + if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; @@ -1775,22 +1892,30 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { pn->flags = ndm->ndm_flags; + if (protocol) + pn->protocol = protocol; err = 0; } goto out; } - if (dev == NULL) + if (!dev) { + NL_SET_ERR_MSG(extack, "Device not specified"); goto out; + } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { + bool exempt_from_gc; + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } - neigh = __neigh_lookup_errno(tbl, dst, dev); + exempt_from_gc = ndm->ndm_state & NUD_PERMANENT || + ndm->ndm_flags & NTF_EXT_LEARNED; + neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; @@ -1817,8 +1942,12 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh_event_send(neigh, NULL); err = 0; } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid, extack); + + if (protocol) + neigh->protocol = protocol; + neigh_release(neigh); out: @@ -2312,6 +2441,9 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; + if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -2343,6 +2475,9 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; + if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -2494,16 +2629,21 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || - ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { + ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); return -EINVAL; } + if (ndm->ndm_flags & ~NTF_PROXY) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request"); + return -EINVAL; + } + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - NULL, extack); + nda_policy, extack); } else { err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - NULL, extack); + nda_policy, extack); } if (err < 0) return err; @@ -2515,17 +2655,9 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, /* all new attributes should require strict_check */ switch (i) { case NDA_IFINDEX: - if (nla_len(tb[i]) != sizeof(u32)) { - NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in neighbor dump request"); - return -EINVAL; - } filter->dev_idx = nla_get_u32(tb[i]); break; case NDA_MASTER: - if (nla_len(tb[i]) != sizeof(u32)) { - NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in neighbor dump request"); - return -EINVAL; - } filter->master_idx = nla_get_u32(tb[i]); break; default: @@ -2585,6 +2717,186 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int neigh_valid_get_req(const struct nlmsghdr *nlh, + struct neigh_table **tbl, + void **dst, int *dev_idx, u8 *ndm_flags, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~NTF_PROXY) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + nda_policy, extack); + if (err < 0) + return err; + + *ndm_flags = ndm->ndm_flags; + *dev_idx = ndm->ndm_ifindex; + *tbl = neigh_find_table(ndm->ndm_family); + if (*tbl == NULL) { + NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); + return -EAFNOSUPPORT; + } + + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_DST: + if (nla_len(tb[i]) != (int)(*tbl)->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); + return -EINVAL; + } + *dst = nla_data(tb[i]); + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); + return -EINVAL; + } + } + + return 0; +} + +static inline size_t neigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(4) /* NDA_PROBES */ + + nla_total_size(1); /* NDA_PROTOCOL */ +} + +static int neigh_get_reply(struct net *net, struct neighbour *neigh, + u32 pid, u32 seq) +{ + struct sk_buff *skb; + int err = 0; + + skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); + if (err) { + kfree_skb(skb); + goto errout; + } + + err = rtnl_unicast(skb, net, pid); +errout: + return err; +} + +static inline size_t pneigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(1); /* NDA_PROTOCOL */ +} + +static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, + u32 pid, u32 seq, struct neigh_table *tbl) +{ + struct sk_buff *skb; + int err = 0; + + skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); + if (err) { + kfree_skb(skb); + goto errout; + } + + err = rtnl_unicast(skb, net, pid); +errout: + return err; +} + +static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct net_device *dev = NULL; + struct neigh_table *tbl = NULL; + struct neighbour *neigh; + void *dst = NULL; + u8 ndm_flags = 0; + int dev_idx = 0; + int err; + + err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, + extack); + if (err < 0) + return err; + + if (dev_idx) { + dev = __dev_get_by_index(net, dev_idx); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + return -ENODEV; + } + } + + if (!dst) { + NL_SET_ERR_MSG(extack, "Network address not specified"); + return -EINVAL; + } + + if (ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; + + pn = pneigh_lookup(tbl, net, dst, dev, 0); + if (!pn) { + NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); + return -ENOENT; + } + return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, tbl); + } + + if (!dev) { + NL_SET_ERR_MSG(extack, "No device specified"); + return -EINVAL; + } + + neigh = neigh_lookup(tbl, dst, dev); + if (!neigh) { + NL_SET_ERR_MSG(extack, "Neighbour entry not found"); + return -ENOENT; + } + + err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq); + + neigh_release(neigh); + + return err; +} + void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; @@ -2631,7 +2943,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); - n->dead = 1; + neigh_mark_dead(n); } else np = &n->next; write_unlock(&n->lock); @@ -2992,15 +3304,6 @@ static const struct seq_operations neigh_stat_seq_ops = { }; #endif /* CONFIG_PROC_FS */ -static inline size_t neigh_nlmsg_size(void) -{ - return NLMSG_ALIGN(sizeof(struct ndmsg)) - + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ - + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ - + nla_total_size(sizeof(struct nda_cacheinfo)) - + nla_total_size(4); /* NDA_PROBES */ -} - static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { @@ -3384,7 +3687,7 @@ static int __init neigh_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, 0); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bd67c4d0fcfd..ff9fd2bb4ce4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -337,7 +337,7 @@ NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(dev, (unsigned int)new_flags); + return dev_change_flags(dev, (unsigned int)new_flags, NULL); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index fefe72774aeb..b02fb19df2cc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -669,6 +669,7 @@ static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, [NETNSA_FD] = { .type = NLA_U32 }, + [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, }; static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -735,23 +736,38 @@ static int rtnl_net_get_size(void) { return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ ; } -static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, int nsid) +struct net_fill_args { + u32 portid; + u32 seq; + int flags; + int cmd; + int nsid; + bool add_ref; + int ref_nsid; +}; + +static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) { struct nlmsghdr *nlh; struct rtgenmsg *rth; - nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), + args->flags); if (!nlh) return -EMSGSIZE; rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nla_put_s32(skb, NETNSA_NSID, nsid)) + if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) + goto nla_put_failure; + + if (args->add_ref && + nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -767,10 +783,15 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + struct net_fill_args fillargs = { + .portid = NETLINK_CB(skb).portid, + .seq = nlh->nlmsg_seq, + .cmd = RTM_NEWNSID, + }; + struct net *peer, *target = net; struct nlattr *nla; struct sk_buff *msg; - struct net *peer; - int err, id; + int err; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); @@ -782,6 +803,11 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; + } else if (tb[NETNSA_NSID]) { + peer = get_net_ns_by_id(net, nla_get_u32(tb[NETNSA_NSID])); + if (!peer) + peer = ERR_PTR(-ENOENT); + nla = tb[NETNSA_NSID]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; @@ -793,15 +819,29 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } + if (tb[NETNSA_TARGET_NSID]) { + int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); + + target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); + if (IS_ERR(target)) { + NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); + NL_SET_ERR_MSG(extack, + "Target netns reference is invalid"); + err = PTR_ERR(target); + goto out; + } + fillargs.add_ref = true; + fillargs.ref_nsid = peernet2id(net, peer); + } + msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } - id = peernet2id(net, peer); - err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_NEWNSID, net, id); + fillargs.nsid = peernet2id(target, peer); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; @@ -811,14 +851,17 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, err_out: nlmsg_free(msg); out: + if (fillargs.add_ref) + put_net(target); put_net(peer); return err; } struct rtnl_net_dump_cb { - struct net *net; + struct net *tgt_net; + struct net *ref_net; struct sk_buff *skb; - struct netlink_callback *cb; + struct net_fill_args fillargs; int idx; int s_idx; }; @@ -831,9 +874,10 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) if (net_cb->idx < net_cb->s_idx) goto cont; - ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, - net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, net_cb->net, id); + net_cb->fillargs.nsid = id; + if (net_cb->fillargs.add_ref) + net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); + ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; @@ -842,33 +886,96 @@ cont: return 0; } +static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, + struct rtnl_net_dump_cb *net_cb, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[NETNSA_MAX + 1]; + int err, i; + + err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, + rtnl_net_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= NETNSA_MAX; i++) { + if (!tb[i]) + continue; + + if (i == NETNSA_TARGET_NSID) { + struct net *net; + + net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); + if (IS_ERR(net)) { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Invalid target network namespace id"); + return PTR_ERR(net); + } + net_cb->fillargs.add_ref = true; + net_cb->ref_net = net_cb->tgt_net; + net_cb->tgt_net = net; + } else { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); struct rtnl_net_dump_cb net_cb = { - .net = net, + .tgt_net = sock_net(skb->sk), .skb = skb, - .cb = cb, + .fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .flags = NLM_F_MULTI, + .cmd = RTM_NEWNSID, + }, .idx = 0, .s_idx = cb->args[0], }; + int err = 0; - if (cb->strict_check && - nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) { - NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request"); - return -EINVAL; + if (cb->strict_check) { + err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); + if (err < 0) + goto end; } - spin_lock_bh(&net->nsid_lock); - idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); - spin_unlock_bh(&net->nsid_lock); + spin_lock_bh(&net_cb.tgt_net->nsid_lock); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net) && + !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) { + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); + err = -EAGAIN; + goto end; + } + idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net)) + spin_unlock_bh(&net_cb.ref_net->nsid_lock); + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); cb->args[0] = net_cb.idx; - return skb->len; +end: + if (net_cb.fillargs.add_ref) + put_net(net_cb.tgt_net); + return err < 0 ? err : skb->len; } static void rtnl_net_notifyid(struct net *net, int cmd, int id) { + struct net_fill_args fillargs = { + .cmd = cmd, + .nsid = id, + }; struct sk_buff *msg; int err = -ENOMEM; @@ -876,7 +983,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; @@ -917,7 +1024,8 @@ static int __init net_ns_init(void) init_net_initialized = true; up_write(&pernet_ops_rwsem); - register_pernet_subsys(&net_ns_ops); + if (register_pernet_subsys(&net_ns_ops)) + panic("Could not register network namespace subsystems"); rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, RTNL_FLAG_DOIT_UNLOCKED); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 5da9552b186b..361aabffb8c0 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -663,7 +663,7 @@ int netpoll_setup(struct netpoll *np) np_info(np, "device %s not up yet, forcing it\n", np->dev_name); - err = dev_open(ndev); + err = dev_open(ndev, NULL); if (err) { np_err(np, "failed to open %s\n", ndev->name); @@ -717,7 +717,8 @@ int netpoll_setup(struct netpoll *np) read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { - if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) + if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != + !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) continue; np->local_ip.in6 = ifp->addr; err = 0; @@ -800,7 +801,7 @@ void __netpoll_cleanup(struct netpoll *np) ops->ndo_netpoll_cleanup(np->dev); RCU_INIT_POINTER(np->dev->npinfo, NULL); - call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); + call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); } else RCU_INIT_POINTER(np->dev->npinfo, NULL); } @@ -811,7 +812,7 @@ void __netpoll_free(struct netpoll *np) ASSERT_RTNL(); /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu_bh(); + synchronize_rcu(); __netpoll_cleanup(np); kfree(np); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e01274bd5e3e..5ea1bed08ede 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,7 +59,7 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> -#define RTNL_MAX_TYPE 49 +#define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 36 struct rtnl_link { @@ -2444,7 +2444,7 @@ static int do_setlink(const struct sk_buff *skb, sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); - err = dev_set_mac_address(dev, sa); + err = dev_set_mac_address(dev, sa, extack); kfree(sa); if (err) goto errout; @@ -2489,7 +2489,8 @@ static int do_setlink(const struct sk_buff *skb, } if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); if (err < 0) goto errout; } @@ -2870,7 +2871,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { - err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + NULL); if (err < 0) return err; } @@ -2885,9 +2887,11 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } EXPORT_SYMBOL(rtnl_configure_link); -struct net_device *rtnl_create_link(struct net *net, - const char *ifname, unsigned char name_assign_type, - const struct rtnl_link_ops *ops, struct nlattr *tb[]) +struct net_device *rtnl_create_link(struct net *net, const char *ifname, + unsigned char name_assign_type, + const struct rtnl_link_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) { struct net_device *dev; unsigned int num_tx_queues = 1; @@ -2903,11 +2907,15 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); - if (num_tx_queues < 1 || num_tx_queues > 4096) + if (num_tx_queues < 1 || num_tx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); return ERR_PTR(-EINVAL); + } - if (num_rx_queues < 1 || num_rx_queues > 4096) + if (num_rx_queues < 1 || num_rx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); return ERR_PTR(-EINVAL); + } dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); @@ -2965,20 +2973,24 @@ static int rtnl_group_changelink(const struct sk_buff *skb, return 0; } -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attr, struct netlink_ext_ack *extack) { + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; + unsigned char name_assign_type = NET_NAME_USER; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + const struct rtnl_link_ops *m_ops = NULL; + struct net_device *master_dev = NULL; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - const struct rtnl_link_ops *m_ops = NULL; + struct nlattr *tb[IFLA_MAX + 1]; + struct net *dest_net, *link_net; + struct nlattr **slave_data; + char kind[MODULE_NAME_LEN]; struct net_device *dev; - struct net_device *master_dev = NULL; struct ifinfomsg *ifm; - char kind[MODULE_NAME_LEN]; char ifname[IFNAMSIZ]; - struct nlattr *tb[IFLA_MAX+1]; - struct nlattr *linkinfo[IFLA_INFO_MAX+1]; - unsigned char name_assign_type = NET_NAME_USER; + struct nlattr **data; int err; #ifdef CONFIG_MODULES @@ -3034,193 +3046,200 @@ replay: ops = NULL; } - if (1) { - struct nlattr *attr[RTNL_MAX_TYPE + 1]; - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; - struct nlattr **data = NULL; - struct nlattr **slave_data = NULL; - struct net *dest_net, *link_net = NULL; - - if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; + data = NULL; + if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) + return -EINVAL; - if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested(attr, ops->maxtype, - linkinfo[IFLA_INFO_DATA], - ops->policy, NULL); - if (err < 0) - return err; - data = attr; - } - if (ops->validate) { - err = ops->validate(tb, data, extack); - if (err < 0) - return err; - } + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + err = nla_parse_nested(attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy, extack); + if (err < 0) + return err; + data = attr; + } + if (ops->validate) { + err = ops->validate(tb, data, extack); + if (err < 0) + return err; } + } - if (m_ops) { - if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) - return -EINVAL; + slave_data = NULL; + if (m_ops) { + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; - if (m_ops->slave_maxtype && - linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested(slave_attr, - m_ops->slave_maxtype, - linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy, - NULL); - if (err < 0) - return err; - slave_data = slave_attr; - } + if (m_ops->slave_maxtype && + linkinfo[IFLA_INFO_SLAVE_DATA]) { + err = nla_parse_nested(slave_attr, m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, extack); + if (err < 0) + return err; + slave_data = slave_attr; } + } - if (dev) { - int status = 0; - - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; + if (dev) { + int status = 0; - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; - err = ops->changelink(dev, tb, data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || + !ops->changelink) + return -EOPNOTSUPP; - if (linkinfo[IFLA_INFO_SLAVE_DATA]) { - if (!m_ops || !m_ops->slave_changelink) - return -EOPNOTSUPP; + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; + } - err = m_ops->slave_changelink(master_dev, dev, - tb, slave_data, - extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; - return do_setlink(skb, dev, ifm, extack, tb, ifname, - status); + err = m_ops->slave_changelink(master_dev, dev, tb, + slave_data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) - return rtnl_group_changelink(skb, net, + return do_setlink(skb, dev, ifm, extack, tb, ifname, status); + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) + return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); - return -ENODEV; - } + return -ENODEV; + } - if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) - return -EOPNOTSUPP; + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; - if (!ops) { + if (!ops) { #ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; - } -#endif - return -EOPNOTSUPP; + if (kind[0]) { + __rtnl_unlock(); + request_module("rtnl-link-%s", kind); + rtnl_lock(); + ops = rtnl_link_ops_get(kind); + if (ops) + goto replay; } +#endif + NL_SET_ERR_MSG(extack, "Unknown device type"); + return -EOPNOTSUPP; + } - if (!ops->setup) - return -EOPNOTSUPP; + if (!ops->setup) + return -EOPNOTSUPP; - if (!ifname[0]) { - snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - name_assign_type = NET_NAME_ENUM; - } + if (!ifname[0]) { + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - link_net = get_net_ns_by_id(dest_net, id); - if (!link_net) { - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } - - dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb); - if (IS_ERR(dev)) { - err = PTR_ERR(dev); + link_net = get_net_ns_by_id(dest_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + err = -EINVAL; goto out; } + err = -EPERM; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) + goto out; + } else { + link_net = NULL; + } + + dev = rtnl_create_link(link_net ? : dest_net, ifname, + name_assign_type, ops, tb, extack); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out; + } - dev->ifindex = ifm->ifi_index; + dev->ifindex = ifm->ifi_index; - if (ops->newlink) { - err = ops->newlink(link_net ? : net, dev, tb, data, - extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. - */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED) - free_netdev(dev); - goto out; - } - } else { - err = register_netdevice(dev); - if (err < 0) { + if (ops->newlink) { + err = ops->newlink(link_net ? : net, dev, tb, data, extack); + /* Drivers should call free_netdev() in ->destructor + * and unregister it on failure after registration + * so that device could be finally freed in rtnl_unlock. + */ + if (err < 0) { + /* If device is not registered at all, free it now */ + if (dev->reg_state == NETREG_UNINITIALIZED) free_netdev(dev); - goto out; - } + goto out; } - err = rtnl_configure_link(dev, ifm); + } else { + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + goto out; + } + } + err = rtnl_configure_link(dev, ifm); + if (err < 0) + goto out_unregister; + if (link_net) { + err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); - if (err < 0) - goto out_unregister; - } - if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), - extack); - if (err) - goto out_unregister; - } + } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto out_unregister; + } out: - if (link_net) - put_net(link_net); - put_net(dest_net); - return err; + if (link_net) + put_net(link_net); + put_net(dest_net); + return err; out_unregister: - if (ops->newlink) { - LIST_HEAD(list_kill); + if (ops->newlink) { + LIST_HEAD(list_kill); - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - } else { - unregister_netdevice(dev); - } - goto out; + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); } + goto out; +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr **attr; + int ret; + + attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + ret = __rtnl_newlink(skb, nlh, attr, extack); + kfree(attr); + return ret; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3367,7 +3386,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = 0; } ret = dumpit(skb, cb); - if (ret < 0) + if (ret) break; } cb->family = idx; @@ -3800,6 +3819,9 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, { int err; + if (dev->type != ARPHRD_ETHER) + return -EINVAL; + netif_addr_lock_bh(dev); err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) @@ -3999,6 +4021,165 @@ out: return skb->len; } +static int valid_fdb_get_strict(const struct nlmsghdr *nlh, + struct nlattr **tb, u8 *ndm_flags, + int *br_idx, int *brport_idx, u8 **addr, + u16 *vid, struct netlink_ext_ack *extack) +{ + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + nda_policy, extack); + if (err < 0) + return err; + + *ndm_flags = ndm->ndm_flags; + *brport_idx = ndm->ndm_ifindex; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_MASTER: + *br_idx = nla_get_u32(tb[i]); + break; + case NDA_LLADDR: + if (nla_len(tb[i]) != ETH_ALEN) { + NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); + return -EINVAL; + } + *addr = nla_data(tb[i]); + break; + case NDA_VLAN: + err = fdb_vid_parse(tb[i], vid, extack); + if (err) + return err; + break; + case NDA_VNI: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request"); + return -EINVAL; + } + } + + return 0; +} + +static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = NULL, *br_dev = NULL; + const struct net_device_ops *ops = NULL; + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[NDA_MAX + 1]; + struct sk_buff *skb; + int brport_idx = 0; + u8 ndm_flags = 0; + int br_idx = 0; + u8 *addr = NULL; + u16 vid = 0; + int err; + + err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, + &brport_idx, &addr, &vid, extack); + if (err < 0) + return err; + + if (!addr) { + NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request"); + return -EINVAL; + } + + if (brport_idx) { + dev = __dev_get_by_index(net, brport_idx); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + return -ENODEV; + } + } + + if (br_idx) { + if (dev) { + NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); + return -EINVAL; + } + + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Invalid master ifindex"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } + + if (dev) { + if (!ndm_flags || (ndm_flags & NTF_MASTER)) { + if (!(dev->priv_flags & IFF_BRIDGE_PORT)) { + NL_SET_ERR_MSG(extack, "Device is not a bridge port"); + return -EINVAL; + } + br_dev = netdev_master_upper_dev_get(dev); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Master of device not found"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } else { + if (!(ndm_flags & NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); + return -EINVAL; + } + ops = dev->netdev_ops; + } + } + + if (!br_dev && !dev) { + NL_SET_ERR_MSG(extack, "No device specified"); + return -ENODEV; + } + + if (!ops || !ops->ndo_fdb_get) { + NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); + return -EOPNOTSUPP; + } + + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (br_dev) + dev = br_dev; + err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, extack); + if (err) + goto out; + + return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +out: + kfree_skb(skb); + return err; +} + static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, unsigned int attrnum, unsigned int flag) { @@ -4310,7 +4491,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags); + err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, + extack); if (err) goto out; @@ -4322,7 +4504,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, - flags); + flags, + extack); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; @@ -5057,7 +5240,7 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, 0); + rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 946de0e24c87..37317ffec146 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,9 @@ struct kmem_cache *skbuff_head_cache __ro_after_init; static struct kmem_cache *skbuff_fclone_cache __ro_after_init; +#ifdef CONFIG_SKB_EXTENSIONS +static struct kmem_cache *skbuff_ext_cache __ro_after_init; +#endif int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -606,7 +609,6 @@ fastpath: void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); - secpath_reset(skb); if (skb->destructor) { WARN_ON(in_irq()); skb->destructor(skb); @@ -614,9 +616,7 @@ void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb_nfct(skb)); #endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(skb->nf_bridge); -#endif + skb_ext_put(skb); } /* Free everything but the sk_buff shell. */ @@ -796,9 +796,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->dev = old->dev; memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); -#ifdef CONFIG_XFRM - new->sp = secpath_get(old->sp); -#endif + __skb_ext_copy(new, old); __nf_copy(new, old, false); /* Note : this field could be in headers_start/headers_end section @@ -1089,7 +1087,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg) } EXPORT_SYMBOL_GPL(sock_zerocopy_put); -void sock_zerocopy_put_abort(struct ubuf_info *uarg) +void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { struct sock *sk = skb_from_uarg(uarg)->sk; @@ -1097,7 +1095,8 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) atomic_dec(&sk->sk_zckey); uarg->len--; - sock_zerocopy_put(uarg); + if (have_uref) + sock_zerocopy_put(uarg); } } EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); @@ -1105,6 +1104,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); +int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) +{ + return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) @@ -1131,7 +1136,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return err; } - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); @@ -1151,7 +1156,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, if (skb_copy_ubufs(nskb, GFP_ATOMIC)) return -EIO; } - skb_zcopy_set(nskb, skb_uarg(orig)); + skb_zcopy_set(nskb, skb_uarg(orig), NULL); } return 0; } @@ -1925,8 +1930,6 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) struct sk_buff *insp = NULL; do { - BUG_ON(!list); - if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; @@ -2366,19 +2369,6 @@ error: } EXPORT_SYMBOL_GPL(skb_send_sock_locked); -/* Send skb data on a socket. */ -int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) -{ - int ret = 0; - - lock_sock(sk); - ret = skb_send_sock_locked(sk, skb, offset, len); - release_sock(sk); - - return ret; -} -EXPORT_SYMBOL_GPL(skb_send_sock); - /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer @@ -2645,6 +2635,65 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) +{ + __sum16 sum; + + sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + /* See comments in __skb_checksum_complete(). */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + if (!skb_shared(skb)) + skb->csum_valid = !sum; + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete_head); + +/* This function assumes skb->csum already holds pseudo header's checksum, + * which has been changed from the hardware checksum, for example, by + * __skb_checksum_validate_complete(). And, the original skb->csum must + * have been validated unsuccessfully for CHECKSUM_COMPLETE case. + * + * It returns non-zero if the recomputed checksum is still invalid, otherwise + * zero. The new checksum is stored back into skb->csum unless the skb is + * shared. + */ +__sum16 __skb_checksum_complete(struct sk_buff *skb) +{ + __wsum csum; + __sum16 sum; + + csum = skb_checksum(skb, 0, skb->len, 0); + + sum = csum_fold(csum_add(skb->csum, csum)); + /* This check is inverted, because we already knew the hardware + * checksum is invalid before calling this function. So, if the + * re-computed checksum is valid instead, then we have a mismatch + * between the original skb->csum and skb_checksum(). This means either + * the original hardware checksum is incorrect or we screw up skb->csum + * when moving skb->data around. + */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + + if (!skb_shared(skb)) { + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + } + + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) { net_warn_ratelimited( @@ -2962,28 +3011,6 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head } EXPORT_SYMBOL(skb_append); -/** - * skb_insert - insert a buffer - * @old: buffer to insert before - * @newsk: buffer to insert - * @list: list to use - * - * Place a packet before a given packet in a list. The list locks are - * taken and this function is atomic with respect to other list locked - * calls. - * - * A buffer cannot be placed on two lists at the same time. - */ -void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_insert(newsk, old->prev, old, list); - spin_unlock_irqrestore(&list->lock, flags); -} -EXPORT_SYMBOL(skb_insert); - static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, const int pos) @@ -3873,6 +3900,46 @@ done: } EXPORT_SYMBOL_GPL(skb_gro_receive); +#ifdef CONFIG_SKB_EXTENSIONS +#define SKB_EXT_ALIGN_VALUE 8 +#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) + +static const u8 skb_ext_type_len[] = { +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), +#endif +#ifdef CONFIG_XFRM + [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), +#endif +}; + +static __always_inline unsigned int skb_ext_total_length(void) +{ + return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + skb_ext_type_len[SKB_EXT_BRIDGE_NF] + +#endif +#ifdef CONFIG_XFRM + skb_ext_type_len[SKB_EXT_SEC_PATH] + +#endif + 0; +} + +static void skb_extensions_init(void) +{ + BUILD_BUG_ON(SKB_EXT_NUM >= 8); + BUILD_BUG_ON(skb_ext_total_length() > 255); + + skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", + SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); +} +#else +static void skb_extensions_init(void) {} +#endif + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", @@ -3887,6 +3954,7 @@ void __init skb_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + skb_extensions_init(); } static int @@ -4854,6 +4922,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) nf_reset(skb); nf_reset_trace(skb); +#ifdef CONFIG_NET_SWITCHDEV + skb->offload_fwd_mark = 0; + skb->offload_l3_fwd_mark = 0; +#endif + if (!xnet) return; @@ -4944,6 +5017,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) * * This is a helper to do that correctly considering GSO_BY_FRAGS. * + * @skb: GSO skb + * * @seg_len: The segmented length (from skb_gso_*_seglen). In the * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. * @@ -5121,7 +5196,7 @@ int skb_vlan_pop(struct sk_buff *skb) int err; if (likely(skb_vlan_tag_present(skb))) { - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } else { if (unlikely(!eth_type_vlan(skb->protocol))) return 0; @@ -5518,3 +5593,148 @@ void skb_condense(struct sk_buff *skb) */ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); } + +#ifdef CONFIG_SKB_EXTENSIONS +static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) +{ + return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); +} + +static struct skb_ext *skb_ext_alloc(void) +{ + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + + if (new) { + memset(new->offset, 0, sizeof(new->offset)); + refcount_set(&new->refcnt, 1); + } + + return new; +} + +static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, + unsigned int old_active) +{ + struct skb_ext *new; + + if (refcount_read(&old->refcnt) == 1) + return old; + + new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + if (!new) + return NULL; + + memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); + refcount_set(&new->refcnt, 1); + +#ifdef CONFIG_XFRM + if (old_active & (1 << SKB_EXT_SEC_PATH)) { + struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_hold(sp->xvec[i]); + } +#endif + __skb_ext_put(old); + return new; +} + +/** + * skb_ext_add - allocate space for given extension, COW if needed + * @skb: buffer + * @id: extension to allocate space for + * + * Allocates enough space for the given extension. + * If the extension is already present, a pointer to that extension + * is returned. + * + * If the skb was cloned, COW applies and the returned memory can be + * modified without changing the extension space of clones buffers. + * + * Returns pointer to the extension or NULL on allocation failure. + */ +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *new, *old = NULL; + unsigned int newlen, newoff; + + if (skb->active_extensions) { + old = skb->extensions; + + new = skb_ext_maybe_cow(old, skb->active_extensions); + if (!new) + return NULL; + + if (__skb_ext_exist(new, id)) + goto set_active; + + newoff = new->chunks; + } else { + newoff = SKB_EXT_CHUNKSIZEOF(*new); + + new = skb_ext_alloc(); + if (!new) + return NULL; + } + + newlen = newoff + skb_ext_type_len[id]; + new->chunks = newlen; + new->offset[id] = newoff; +set_active: + skb->extensions = new; + skb->active_extensions |= 1 << id; + return skb_ext_get_ptr(new, id); +} +EXPORT_SYMBOL(skb_ext_add); + +#ifdef CONFIG_XFRM +static void skb_ext_put_sp(struct sec_path *sp) +{ + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_put(sp->xvec[i]); +} +#endif + +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *ext = skb->extensions; + + skb->active_extensions &= ~(1 << id); + if (skb->active_extensions == 0) { + skb->extensions = NULL; + __skb_ext_put(ext); +#ifdef CONFIG_XFRM + } else if (id == SKB_EXT_SEC_PATH && + refcount_read(&ext->refcnt) == 1) { + struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); + + skb_ext_put_sp(sp); + sp->len = 0; +#endif + } +} +EXPORT_SYMBOL(__skb_ext_del); + +void __skb_ext_put(struct skb_ext *ext) +{ + /* If this is last clone, nothing can increment + * it after check passes. Avoids one atomic op. + */ + if (refcount_read(&ext->refcnt) == 1) + goto free_now; + + if (!refcount_dec_and_test(&ext->refcnt)) + return; +free_now: +#ifdef CONFIG_XFRM + if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) + skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); +#endif + + kmem_cache_free(skbuff_ext_cache, ext); +} +EXPORT_SYMBOL(__skb_ext_put); +#endif /* CONFIG_SKB_EXTENSIONS */ diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 56a99d0c9aa0..d6d5c20d7044 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -94,6 +94,9 @@ int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, } while (len) { + if (sk_msg_full(dst)) + return -ENOSPC; + sge_len = sge->length - off; sge_off = sge->offset + off; if (sge_len > len) @@ -403,7 +406,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) msg->skb = skb; sk_psock_queue_msg(psock, msg); - sk->sk_data_ready(sk); + sk_psock_data_ready(sk, psock); return copied; } @@ -572,6 +575,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { rcu_assign_sk_user_data(sk, NULL); sk_psock_cork_free(psock); + sk_psock_zap_ingress(psock); sk_psock_restore_proto(sk, psock); write_lock_bh(&sk->sk_callback_lock); @@ -580,7 +584,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - call_rcu_sched(&psock->rcu, sk_psock_destroy); + call_rcu(&psock->rcu, sk_psock_destroy); } EXPORT_SYMBOL_GPL(sk_psock_drop); @@ -669,6 +673,22 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, bool ingress; switch (verdict) { + case __SK_PASS: + sk_other = psock->sk; + if (sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + goto out_free; + } + if (atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf) { + struct tcp_skb_cb *tcp = TCP_SKB_CB(skb); + + tcp->bpf.flags |= BPF_F_INGRESS; + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + break; + } + goto out_free; case __SK_REDIRECT: sk_other = tcp_skb_bpf_redirect_fetch(skb); if (unlikely(!sk_other)) @@ -735,7 +755,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) } /* Called with socket lock held. */ -static void sk_psock_data_ready(struct sock *sk) +static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; @@ -783,7 +803,7 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) return; parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; parser->enabled = true; } diff --git a/net/core/sock.c b/net/core/sock.c index 6fcc4bc07d19..6aa2e7e0b4fb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, lock_sock(sk); sk->sk_bound_dev_if = index; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); sk_dst_reset(sk); release_sock(sk); @@ -698,6 +700,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); @@ -950,10 +953,12 @@ set_rcvbuf: clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; - else + } else if (val != sk->sk_mark) { sk->sk_mark = val; + sk_dst_reset(sk); + } break; case SO_RXQ_OVFL: @@ -1014,7 +1019,10 @@ set_rcvbuf: case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!((sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) || + (sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP))) ret = -ENOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; @@ -2743,6 +2751,9 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; +#if BITS_PER_LONG==32 + seqlock_init(&sk->sk_stamp_seq); +#endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL @@ -2842,12 +2853,13 @@ int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) struct timeval tv; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sk->sk_stamp); + tv = ktime_to_timeval(sock_read_timestamp(sk)); if (tv.tv_sec == -1) return -ENOENT; if (tv.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - tv = ktime_to_timeval(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + tv = ktime_to_timeval(kt); } return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; } @@ -2858,11 +2870,12 @@ int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) struct timespec ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sk->sk_stamp); + ts = ktime_to_timespec(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); ts = ktime_to_timespec(sk->sk_stamp); } return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; @@ -3279,6 +3292,7 @@ int sock_load_diag_module(int family, int protocol) #ifdef CONFIG_INET if (family == AF_INET && + protocol != IPPROTO_RAW && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index ba5cba56f574..d8fe3e549373 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } +EXPORT_SYMBOL(reuseport_add_sock); void reuseport_detach_sock(struct sock *sk) { diff --git a/net/core/stream.c b/net/core/stream.c index 7d329fb1f553..e94bb02a5629 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -32,7 +32,7 @@ void sk_stream_write_space(struct sock *sk) struct socket *sock = sk->sk_socket; struct socket_wq *wq; - if (sk_stream_is_writeable(sk) && sock) { + if (__sk_stream_is_writeable(sk, 1) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 37b4667128a3..d67ec17f2cc8 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -28,6 +28,8 @@ static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; +static long long_one __maybe_unused = 1; +static long long_max __maybe_unused = LONG_MAX; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -289,6 +291,17 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } + +static int +proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} #endif static struct ctl_table net_core_table[] = { @@ -398,10 +411,11 @@ static struct ctl_table net_core_table[] = { { .procname = "bpf_jit_limit", .data = &bpf_jit_limit, - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0600, - .proc_handler = proc_dointvec_minmax_bpf_restricted, - .extra1 = &one, + .proc_handler = proc_dolongvec_minmax_bpf_restricted, + .extra1 = &long_one, + .extra2 = &long_max, }, #endif { |