summaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile4
-rw-r--r--net/core/dev.c723
-rw-r--r--net/core/dev_addr_lists.c93
-rw-r--r--net/core/dev_addr_lists_test.c236
-rw-r--r--net/core/dev_ioctl.c5
-rw-r--r--net/core/devlink.c70
-rw-r--r--net/core/drop_monitor.c6
-rw-r--r--net/core/dst.c8
-rw-r--r--net/core/failover.c4
-rw-r--r--net/core/filter.c23
-rw-r--r--net/core/flow_dissector.c2
-rw-r--r--net/core/gro.c766
-rw-r--r--net/core/link_watch.c8
-rw-r--r--net/core/neighbour.c18
-rw-r--r--net/core/net-sysfs.c34
-rw-r--r--net/core/netpoll.c4
-rw-r--r--net/core/pktgen.c8
-rw-r--r--net/core/rtnetlink.c15
-rw-r--r--net/core/secure_seq.c4
-rw-r--r--net/core/skbuff.c162
-rw-r--r--net/core/sock.c61
21 files changed, 1240 insertions, 1014 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 4268846f2f47..a8e4f737692b 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -11,7 +11,9 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
- fib_notifier.o xdp.o flow_offload.o
+ fib_notifier.o xdp.o flow_offload.o gro.o
+
+obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 2a352e668d10..4420086f3aeb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -153,16 +153,10 @@
#include "net-sysfs.h"
-#define MAX_GRO_SKBS 8
-
-/* This should be increased if a protocol with a bigger head is added. */
-#define GRO_MAX_HEAD (MAX_HEADER + 128)
static DEFINE_SPINLOCK(ptype_lock);
-static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly; /* Taps */
-static struct list_head offload_base __read_mostly;
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
@@ -371,12 +365,12 @@ static void list_netdevice(struct net_device *dev)
ASSERT_RTNL();
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
netdev_name_node_add(net, dev->name_node);
hlist_add_head_rcu(&dev->index_hlist,
dev_index_hash(net, dev->ifindex));
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
dev_base_seq_inc(net);
}
@@ -389,11 +383,11 @@ static void unlist_netdevice(struct net_device *dev)
ASSERT_RTNL();
/* Unlink dev from the device chain */
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
list_del_rcu(&dev->dev_list);
netdev_name_node_del(dev->name_node);
hlist_del_rcu(&dev->index_hlist);
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
dev_base_seq_inc(dev_net(dev));
}
@@ -604,84 +598,6 @@ void dev_remove_pack(struct packet_type *pt)
EXPORT_SYMBOL(dev_remove_pack);
-/**
- * dev_add_offload - register offload handlers
- * @po: protocol offload declaration
- *
- * Add protocol offload handlers to the networking stack. The passed
- * &proto_offload is linked into kernel lists and may not be freed until
- * it has been removed from the kernel lists.
- *
- * This call does not sleep therefore it can not
- * guarantee all CPU's that are in middle of receiving packets
- * will see the new offload handlers (until the next received packet).
- */
-void dev_add_offload(struct packet_offload *po)
-{
- struct packet_offload *elem;
-
- spin_lock(&offload_lock);
- list_for_each_entry(elem, &offload_base, list) {
- if (po->priority < elem->priority)
- break;
- }
- list_add_rcu(&po->list, elem->list.prev);
- spin_unlock(&offload_lock);
-}
-EXPORT_SYMBOL(dev_add_offload);
-
-/**
- * __dev_remove_offload - remove offload handler
- * @po: packet offload declaration
- *
- * Remove a protocol offload handler that was previously added to the
- * kernel offload handlers by dev_add_offload(). The passed &offload_type
- * is removed from the kernel lists and can be freed or reused once this
- * function returns.
- *
- * The packet type might still be in use by receivers
- * and must not be freed until after all the CPU's have gone
- * through a quiescent state.
- */
-static void __dev_remove_offload(struct packet_offload *po)
-{
- struct list_head *head = &offload_base;
- struct packet_offload *po1;
-
- spin_lock(&offload_lock);
-
- list_for_each_entry(po1, head, list) {
- if (po == po1) {
- list_del_rcu(&po->list);
- goto out;
- }
- }
-
- pr_warn("dev_remove_offload: %p not found\n", po);
-out:
- spin_unlock(&offload_lock);
-}
-
-/**
- * dev_remove_offload - remove packet offload handler
- * @po: packet offload declaration
- *
- * Remove a packet offload handler that was previously added to the kernel
- * offload handlers by dev_add_offload(). The passed &offload_type is
- * removed from the kernel lists and can be freed or reused once this
- * function returns.
- *
- * This call sleeps to guarantee that no CPU is looking at the packet
- * type after return.
- */
-void dev_remove_offload(struct packet_offload *po)
-{
- __dev_remove_offload(po);
-
- synchronize_net();
-}
-EXPORT_SYMBOL(dev_remove_offload);
-
/*******************************************************************************
*
* Device Interface Subroutines
@@ -1272,15 +1188,15 @@ rollback:
netdev_adjacent_rename_links(dev, oldname);
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
netdev_name_node_del(dev->name_node);
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
synchronize_rcu();
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
netdev_name_node_add(net, dev->name_node);
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
ret = notifier_to_errno(ret);
@@ -1461,6 +1377,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
int ret;
ASSERT_RTNL();
+ dev_addr_check(dev);
if (!netif_device_present(dev)) {
/* may be detached because parent is runtime-suspended */
@@ -3315,40 +3232,6 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
return __vlan_get_protocol(skb, type, depth);
}
-/**
- * skb_mac_gso_segment - mac layer segmentation handler.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- */
-struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
-{
- struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
- struct packet_offload *ptype;
- int vlan_depth = skb->mac_len;
- __be16 type = skb_network_protocol(skb, &vlan_depth);
-
- if (unlikely(!type))
- return ERR_PTR(-EINVAL);
-
- __skb_pull(skb, vlan_depth);
-
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, &offload_base, list) {
- if (ptype->type == type && ptype->callbacks.gso_segment) {
- segs = ptype->callbacks.gso_segment(skb, features);
- break;
- }
- }
- rcu_read_unlock();
-
- __skb_push(skb, skb->data - skb_mac_header(skb));
-
- return segs;
-}
-EXPORT_SYMBOL(skb_mac_gso_segment);
-
-
/* openvswitch calls this on rx path, so we need a different check.
*/
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
@@ -3513,7 +3396,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
{
u16 gso_segs = skb_shinfo(skb)->gso_segs;
- if (gso_segs > dev->gso_max_segs)
+ if (gso_segs > READ_ONCE(dev->gso_max_segs))
return features & ~NETIF_F_GSO_MASK;
if (!skb_shinfo(skb)->gso_type) {
@@ -4323,8 +4206,6 @@ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
int dev_rx_weight __read_mostly = 64;
int dev_tx_weight __read_mostly = 64;
-/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
-int gro_normal_batch __read_mostly = 8;
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
@@ -5667,7 +5548,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
return ret;
}
-static void netif_receive_skb_list_internal(struct list_head *head)
+void netif_receive_skb_list_internal(struct list_head *head)
{
struct sk_buff *skb, *next;
struct list_head sublist;
@@ -5845,550 +5726,6 @@ static void flush_all_backlogs(void)
cpus_read_unlock();
}
-/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
-static void gro_normal_list(struct napi_struct *napi)
-{
- if (!napi->rx_count)
- return;
- netif_receive_skb_list_internal(&napi->rx_list);
- INIT_LIST_HEAD(&napi->rx_list);
- napi->rx_count = 0;
-}
-
-/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
- * pass the whole batch up to the stack.
- */
-static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
-{
- list_add_tail(&skb->list, &napi->rx_list);
- napi->rx_count += segs;
- if (napi->rx_count >= gro_normal_batch)
- gro_normal_list(napi);
-}
-
-static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
-{
- struct packet_offload *ptype;
- __be16 type = skb->protocol;
- struct list_head *head = &offload_base;
- int err = -ENOENT;
-
- BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
-
- if (NAPI_GRO_CB(skb)->count == 1) {
- skb_shinfo(skb)->gso_size = 0;
- goto out;
- }
-
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_complete)
- continue;
-
- err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
- ipv6_gro_complete, inet_gro_complete,
- skb, 0);
- break;
- }
- rcu_read_unlock();
-
- if (err) {
- WARN_ON(&ptype->list == head);
- kfree_skb(skb);
- return;
- }
-
-out:
- gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
-}
-
-static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
- bool flush_old)
-{
- struct list_head *head = &napi->gro_hash[index].list;
- struct sk_buff *skb, *p;
-
- list_for_each_entry_safe_reverse(skb, p, head, list) {
- if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
- return;
- skb_list_del_init(skb);
- napi_gro_complete(napi, skb);
- napi->gro_hash[index].count--;
- }
-
- if (!napi->gro_hash[index].count)
- __clear_bit(index, &napi->gro_bitmask);
-}
-
-/* napi->gro_hash[].list contains packets ordered by age.
- * youngest packets at the head of it.
- * Complete skbs in reverse order to reduce latencies.
- */
-void napi_gro_flush(struct napi_struct *napi, bool flush_old)
-{
- unsigned long bitmask = napi->gro_bitmask;
- unsigned int i, base = ~0U;
-
- while ((i = ffs(bitmask)) != 0) {
- bitmask >>= i;
- base += i;
- __napi_gro_flush_chain(napi, base, flush_old);
- }
-}
-EXPORT_SYMBOL(napi_gro_flush);
-
-static void gro_list_prepare(const struct list_head *head,
- const struct sk_buff *skb)
-{
- unsigned int maclen = skb->dev->hard_header_len;
- u32 hash = skb_get_hash_raw(skb);
- struct sk_buff *p;
-
- list_for_each_entry(p, head, list) {
- unsigned long diffs;
-
- NAPI_GRO_CB(p)->flush = 0;
-
- if (hash != skb_get_hash_raw(p)) {
- NAPI_GRO_CB(p)->same_flow = 0;
- continue;
- }
-
- diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
- diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
- if (skb_vlan_tag_present(p))
- diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
- diffs |= skb_metadata_differs(p, skb);
- if (maclen == ETH_HLEN)
- diffs |= compare_ether_header(skb_mac_header(p),
- skb_mac_header(skb));
- else if (!diffs)
- diffs = memcmp(skb_mac_header(p),
- skb_mac_header(skb),
- maclen);
-
- /* in most common scenarions 'slow_gro' is 0
- * otherwise we are already on some slower paths
- * either skip all the infrequent tests altogether or
- * avoid trying too hard to skip each of them individually
- */
- if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) {
-#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- struct tc_skb_ext *skb_ext;
- struct tc_skb_ext *p_ext;
-#endif
-
- diffs |= p->sk != skb->sk;
- diffs |= skb_metadata_dst_cmp(p, skb);
- diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
-
-#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- skb_ext = skb_ext_find(skb, TC_SKB_EXT);
- p_ext = skb_ext_find(p, TC_SKB_EXT);
-
- diffs |= (!!p_ext) ^ (!!skb_ext);
- if (!diffs && unlikely(skb_ext))
- diffs |= p_ext->chain ^ skb_ext->chain;
-#endif
- }
-
- NAPI_GRO_CB(p)->same_flow = !diffs;
- }
-}
-
-static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
-{
- const struct skb_shared_info *pinfo = skb_shinfo(skb);
- const skb_frag_t *frag0 = &pinfo->frags[0];
-
- NAPI_GRO_CB(skb)->data_offset = 0;
- NAPI_GRO_CB(skb)->frag0 = NULL;
- NAPI_GRO_CB(skb)->frag0_len = 0;
-
- if (!skb_headlen(skb) && pinfo->nr_frags &&
- !PageHighMem(skb_frag_page(frag0)) &&
- (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
- NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
- NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
- skb_frag_size(frag0),
- skb->end - skb->tail);
- }
-}
-
-static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
-{
- struct skb_shared_info *pinfo = skb_shinfo(skb);
-
- BUG_ON(skb->end - skb->tail < grow);
-
- memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
-
- skb->data_len -= grow;
- skb->tail += grow;
-
- skb_frag_off_add(&pinfo->frags[0], grow);
- skb_frag_size_sub(&pinfo->frags[0], grow);
-
- if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
- skb_frag_unref(skb, 0);
- memmove(pinfo->frags, pinfo->frags + 1,
- --pinfo->nr_frags * sizeof(pinfo->frags[0]));
- }
-}
-
-static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
-{
- struct sk_buff *oldest;
-
- oldest = list_last_entry(head, struct sk_buff, list);
-
- /* We are called with head length >= MAX_GRO_SKBS, so this is
- * impossible.
- */
- if (WARN_ON_ONCE(!oldest))
- return;
-
- /* Do not adjust napi->gro_hash[].count, caller is adding a new
- * SKB to the chain.
- */
- skb_list_del_init(oldest);
- napi_gro_complete(napi, oldest);
-}
-
-static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
-{
- u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
- struct gro_list *gro_list = &napi->gro_hash[bucket];
- struct list_head *head = &offload_base;
- struct packet_offload *ptype;
- __be16 type = skb->protocol;
- struct sk_buff *pp = NULL;
- enum gro_result ret;
- int same_flow;
- int grow;
-
- if (netif_elide_gro(skb->dev))
- goto normal;
-
- gro_list_prepare(&gro_list->list, skb);
-
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_receive)
- continue;
-
- skb_set_network_header(skb, skb_gro_offset(skb));
- skb_reset_mac_len(skb);
- NAPI_GRO_CB(skb)->same_flow = 0;
- NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
- NAPI_GRO_CB(skb)->free = 0;
- NAPI_GRO_CB(skb)->encap_mark = 0;
- NAPI_GRO_CB(skb)->recursion_counter = 0;
- NAPI_GRO_CB(skb)->is_fou = 0;
- NAPI_GRO_CB(skb)->is_atomic = 1;
- NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
-
- /* Setup for GRO checksum validation */
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- NAPI_GRO_CB(skb)->csum = skb->csum;
- NAPI_GRO_CB(skb)->csum_valid = 1;
- NAPI_GRO_CB(skb)->csum_cnt = 0;
- break;
- case CHECKSUM_UNNECESSARY:
- NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
- NAPI_GRO_CB(skb)->csum_valid = 0;
- break;
- default:
- NAPI_GRO_CB(skb)->csum_cnt = 0;
- NAPI_GRO_CB(skb)->csum_valid = 0;
- }
-
- pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
- ipv6_gro_receive, inet_gro_receive,
- &gro_list->list, skb);
- break;
- }
- rcu_read_unlock();
-
- if (&ptype->list == head)
- goto normal;
-
- if (PTR_ERR(pp) == -EINPROGRESS) {
- ret = GRO_CONSUMED;
- goto ok;
- }
-
- same_flow = NAPI_GRO_CB(skb)->same_flow;
- ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
-
- if (pp) {
- skb_list_del_init(pp);
- napi_gro_complete(napi, pp);
- gro_list->count--;
- }
-
- if (same_flow)
- goto ok;
-
- if (NAPI_GRO_CB(skb)->flush)
- goto normal;
-
- if (unlikely(gro_list->count >= MAX_GRO_SKBS))
- gro_flush_oldest(napi, &gro_list->list);
- else
- gro_list->count++;
-
- NAPI_GRO_CB(skb)->count = 1;
- NAPI_GRO_CB(skb)->age = jiffies;
- NAPI_GRO_CB(skb)->last = skb;
- skb_shinfo(skb)->gso_size = skb_gro_len(skb);
- list_add(&skb->list, &gro_list->list);
- ret = GRO_HELD;
-
-pull:
- grow = skb_gro_offset(skb) - skb_headlen(skb);
- if (grow > 0)
- gro_pull_from_frag0(skb, grow);
-ok:
- if (gro_list->count) {
- if (!test_bit(bucket, &napi->gro_bitmask))
- __set_bit(bucket, &napi->gro_bitmask);
- } else if (test_bit(bucket, &napi->gro_bitmask)) {
- __clear_bit(bucket, &napi->gro_bitmask);
- }
-
- return ret;
-
-normal:
- ret = GRO_NORMAL;
- goto pull;
-}
-
-struct packet_offload *gro_find_receive_by_type(__be16 type)
-{
- struct list_head *offload_head = &offload_base;
- struct packet_offload *ptype;
-
- list_for_each_entry_rcu(ptype, offload_head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_receive)
- continue;
- return ptype;
- }
- return NULL;
-}
-EXPORT_SYMBOL(gro_find_receive_by_type);
-
-struct packet_offload *gro_find_complete_by_type(__be16 type)
-{
- struct list_head *offload_head = &offload_base;
- struct packet_offload *ptype;
-
- list_for_each_entry_rcu(ptype, offload_head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_complete)
- continue;
- return ptype;
- }
- return NULL;
-}
-EXPORT_SYMBOL(gro_find_complete_by_type);
-
-static gro_result_t napi_skb_finish(struct napi_struct *napi,
- struct sk_buff *skb,
- gro_result_t ret)
-{
- switch (ret) {
- case GRO_NORMAL:
- gro_normal_one(napi, skb, 1);
- break;
-
- case GRO_MERGED_FREE:
- if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
- napi_skb_free_stolen_head(skb);
- else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
- __kfree_skb(skb);
- else
- __kfree_skb_defer(skb);
- break;
-
- case GRO_HELD:
- case GRO_MERGED:
- case GRO_CONSUMED:
- break;
- }
-
- return ret;
-}
-
-gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
-{
- gro_result_t ret;
-
- skb_mark_napi_id(skb, napi);
- trace_napi_gro_receive_entry(skb);
-
- skb_gro_reset_offset(skb, 0);
-
- ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
- trace_napi_gro_receive_exit(ret);
-
- return ret;
-}
-EXPORT_SYMBOL(napi_gro_receive);
-
-static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
-{
- if (unlikely(skb->pfmemalloc)) {
- consume_skb(skb);
- return;
- }
- __skb_pull(skb, skb_headlen(skb));
- /* restore the reserve we had after netdev_alloc_skb_ip_align() */
- skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
- __vlan_hwaccel_clear_tag(skb);
- skb->dev = napi->dev;
- skb->skb_iif = 0;
-
- /* eth_type_trans() assumes pkt_type is PACKET_HOST */
- skb->pkt_type = PACKET_HOST;
-
- skb->encapsulation = 0;
- skb_shinfo(skb)->gso_type = 0;
- skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
- if (unlikely(skb->slow_gro)) {
- skb_orphan(skb);
- skb_ext_reset(skb);
- nf_reset_ct(skb);
- skb->slow_gro = 0;
- }
-
- napi->skb = skb;
-}
-
-struct sk_buff *napi_get_frags(struct napi_struct *napi)
-{
- struct sk_buff *skb = napi->skb;
-
- if (!skb) {
- skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
- if (skb) {
- napi->skb = skb;
- skb_mark_napi_id(skb, napi);
- }
- }
- return skb;
-}
-EXPORT_SYMBOL(napi_get_frags);
-
-static gro_result_t napi_frags_finish(struct napi_struct *napi,
- struct sk_buff *skb,
- gro_result_t ret)
-{
- switch (ret) {
- case GRO_NORMAL:
- case GRO_HELD:
- __skb_push(skb, ETH_HLEN);
- skb->protocol = eth_type_trans(skb, skb->dev);
- if (ret == GRO_NORMAL)
- gro_normal_one(napi, skb, 1);
- break;
-
- case GRO_MERGED_FREE:
- if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
- napi_skb_free_stolen_head(skb);
- else
- napi_reuse_skb(napi, skb);
- break;
-
- case GRO_MERGED:
- case GRO_CONSUMED:
- break;
- }
-
- return ret;
-}
-
-/* Upper GRO stack assumes network header starts at gro_offset=0
- * Drivers could call both napi_gro_frags() and napi_gro_receive()
- * We copy ethernet header into skb->data to have a common layout.
- */
-static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
-{
- struct sk_buff *skb = napi->skb;
- const struct ethhdr *eth;
- unsigned int hlen = sizeof(*eth);
-
- napi->skb = NULL;
-
- skb_reset_mac_header(skb);
- skb_gro_reset_offset(skb, hlen);
-
- if (unlikely(skb_gro_header_hard(skb, hlen))) {
- eth = skb_gro_header_slow(skb, hlen, 0);
- if (unlikely(!eth)) {
- net_warn_ratelimited("%s: dropping impossible skb from %s\n",
- __func__, napi->dev->name);
- napi_reuse_skb(napi, skb);
- return NULL;
- }
- } else {
- eth = (const struct ethhdr *)skb->data;
- gro_pull_from_frag0(skb, hlen);
- NAPI_GRO_CB(skb)->frag0 += hlen;
- NAPI_GRO_CB(skb)->frag0_len -= hlen;
- }
- __skb_pull(skb, hlen);
-
- /*
- * This works because the only protocols we care about don't require
- * special handling.
- * We'll fix it up properly in napi_frags_finish()
- */
- skb->protocol = eth->h_proto;
-
- return skb;
-}
-
-gro_result_t napi_gro_frags(struct napi_struct *napi)
-{
- gro_result_t ret;
- struct sk_buff *skb = napi_frags_skb(napi);
-
- trace_napi_gro_frags_entry(skb);
-
- ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
- trace_napi_gro_frags_exit(ret);
-
- return ret;
-}
-EXPORT_SYMBOL(napi_gro_frags);
-
-/* Compute the checksum from gro_offset and return the folded value
- * after adding in any pseudo checksum.
- */
-__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
-{
- __wsum wsum;
- __sum16 sum;
-
- wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
-
- /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
- sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
- /* See comments in __skb_checksum_complete(). */
- if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
- !skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev, skb);
- }
-
- NAPI_GRO_CB(skb)->csum = wsum;
- NAPI_GRO_CB(skb)->csum_valid = 1;
-
- return sum;
-}
-EXPORT_SYMBOL(__skb_gro_checksum_complete);
-
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
@@ -7200,6 +6537,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
struct netdev_adjacent {
struct net_device *dev;
+ netdevice_tracker dev_tracker;
/* upper master flag, there can only be one master device per list */
bool master;
@@ -7964,7 +7302,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj->ref_nr = 1;
adj->private = private;
adj->ignore = false;
- dev_hold(adj_dev);
+ dev_hold_track(adj_dev, &adj->dev_tracker, GFP_KERNEL);
pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
@@ -7993,8 +7331,8 @@ remove_symlinks:
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
+ dev_put_track(adj_dev, &adj->dev_tracker);
kfree(adj);
- dev_put(adj_dev);
return ret;
}
@@ -8035,7 +7373,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
list_del_rcu(&adj->list);
pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
- dev_put(adj_dev);
+ dev_put_track(adj_dev, &adj->dev_tracker);
kfree_rcu(adj, rcu);
}
@@ -9224,35 +8562,17 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
EXPORT_SYMBOL(netdev_port_same_parent_id);
/**
- * dev_change_proto_down - update protocol port state information
+ * dev_change_proto_down - set carrier according to proto_down.
+ *
* @dev: device
* @proto_down: new value
- *
- * This info can be used by switch drivers to set the phys state of the
- * port.
*/
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
- const struct net_device_ops *ops = dev->netdev_ops;
-
- if (!ops->ndo_change_proto_down)
+ if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
- return ops->ndo_change_proto_down(dev, proto_down);
-}
-EXPORT_SYMBOL(dev_change_proto_down);
-
-/**
- * dev_change_proto_down_generic - generic implementation for
- * ndo_change_proto_down that sets carrier according to
- * proto_down.
- *
- * @dev: device
- * @proto_down: new value
- */
-int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
-{
if (proto_down)
netif_carrier_off(dev);
else
@@ -9260,7 +8580,7 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
dev->proto_down = proto_down;
return 0;
}
-EXPORT_SYMBOL(dev_change_proto_down_generic);
+EXPORT_SYMBOL(dev_change_proto_down);
/**
* dev_change_proto_down_reason - proto down reason
@@ -10545,6 +9865,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
netdev_unregister_timeout_secs * HZ)) {
pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
dev->name, refcnt);
+ ref_tracker_dir_print(&dev->refcnt_tracker, 10);
warning_time = jiffies;
}
}
@@ -10835,6 +10156,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
+ ref_tracker_dir_init(&dev->refcnt_tracker, 128);
#ifdef CONFIG_PCPU_DEV_REFCNT
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
@@ -10951,6 +10273,7 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+ ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
@@ -11643,8 +10966,6 @@ static int __init net_dev_init(void)
for (i = 0; i < PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&ptype_base[i]);
- INIT_LIST_HEAD(&offload_base);
-
if (register_pernet_subsys(&netdev_net_ops))
goto out;
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index f0cb38344126..bead38ca50bd 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -16,6 +16,35 @@
* General list handling functions
*/
+static int __hw_addr_insert(struct netdev_hw_addr_list *list,
+ struct netdev_hw_addr *new, int addr_len)
+{
+ struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL;
+ struct netdev_hw_addr *ha;
+
+ while (*ins_point) {
+ int diff;
+
+ ha = rb_entry(*ins_point, struct netdev_hw_addr, node);
+ diff = memcmp(new->addr, ha->addr, addr_len);
+ if (diff == 0)
+ diff = memcmp(&new->type, &ha->type, sizeof(new->type));
+
+ parent = *ins_point;
+ if (diff < 0)
+ ins_point = &parent->rb_left;
+ else if (diff > 0)
+ ins_point = &parent->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node_rcu(&new->node, parent, ins_point);
+ rb_insert_color(&new->node, &list->tree);
+
+ return 0;
+}
+
static struct netdev_hw_addr*
__hw_addr_create(const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global, bool sync)
@@ -50,11 +79,6 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
if (addr_len > MAX_ADDR_LEN)
return -EINVAL;
- ha = list_first_entry(&list->list, struct netdev_hw_addr, list);
- if (ha && !memcmp(addr, ha->addr, addr_len) &&
- (!addr_type || addr_type == ha->type))
- goto found_it;
-
while (*ins_point) {
int diff;
@@ -69,7 +93,6 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
} else if (diff > 0) {
ins_point = &parent->rb_right;
} else {
-found_it:
if (exclusive)
return -EEXIST;
if (global) {
@@ -94,16 +117,8 @@ found_it:
if (!ha)
return -ENOMEM;
- /* The first address in dev->dev_addrs is pointed to by dev->dev_addr
- * and mutated freely by device drivers and netdev ops, so if we insert
- * it into the tree we'll end up with an invalid rbtree.
- */
- if (list->count > 0) {
- rb_link_node(&ha->node, parent, ins_point);
- rb_insert_color(&ha->node, &list->tree);
- } else {
- RB_CLEAR_NODE(&ha->node);
- }
+ rb_link_node(&ha->node, parent, ins_point);
+ rb_insert_color(&ha->node, &list->tree);
list_add_tail_rcu(&ha->list, &list->list);
list->count++;
@@ -138,8 +153,7 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
if (--ha->refcount)
return 0;
- if (!RB_EMPTY_NODE(&ha->node))
- rb_erase(&ha->node, &list->tree);
+ rb_erase(&ha->node, &list->tree);
list_del_rcu(&ha->list);
kfree_rcu(ha, rcu_head);
@@ -151,18 +165,8 @@ static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type)
{
- struct netdev_hw_addr *ha;
struct rb_node *node;
- /* The first address isn't inserted into the tree because in the dev->dev_addrs
- * list it's the address pointed to by dev->dev_addr which is freely mutated
- * in place, so we need to check it separately.
- */
- ha = list_first_entry(&list->list, struct netdev_hw_addr, list);
- if (ha && !memcmp(addr, ha->addr, addr_len) &&
- (!addr_type || addr_type == ha->type))
- return ha;
-
node = list->tree.rb_node;
while (node) {
@@ -498,6 +502,21 @@ EXPORT_SYMBOL(__hw_addr_init);
* Device addresses handling functions
*/
+/* Check that netdev->dev_addr is not written to directly as this would
+ * break the rbtree layout. All changes should go thru dev_addr_set() and co.
+ * Remove this check in mid-2024.
+ */
+void dev_addr_check(struct net_device *dev)
+{
+ if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN))
+ return;
+
+ netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr);
+ netdev_warn(dev, "Expected addr: %*ph\n",
+ MAX_ADDR_LEN, dev->dev_addr_shadow);
+ netdev_WARN(dev, "Incorrect netdev->dev_addr\n");
+}
+
/**
* dev_addr_flush - Flush device address list
* @dev: device
@@ -509,11 +528,11 @@ EXPORT_SYMBOL(__hw_addr_init);
void dev_addr_flush(struct net_device *dev)
{
/* rtnl_mutex must be held here */
+ dev_addr_check(dev);
__hw_addr_flush(&dev->dev_addrs);
dev->dev_addr = NULL;
}
-EXPORT_SYMBOL(dev_addr_flush);
/**
* dev_addr_init - Init device address list
@@ -547,7 +566,21 @@ int dev_addr_init(struct net_device *dev)
}
return err;
}
-EXPORT_SYMBOL(dev_addr_init);
+
+void dev_addr_mod(struct net_device *dev, unsigned int offset,
+ const void *addr, size_t len)
+{
+ struct netdev_hw_addr *ha;
+
+ dev_addr_check(dev);
+
+ ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]);
+ rb_erase(&ha->node, &dev->dev_addrs.tree);
+ memcpy(&ha->addr[offset], addr, len);
+ memcpy(&dev->dev_addr_shadow[offset], addr, len);
+ WARN_ON(__hw_addr_insert(&dev->dev_addrs, ha, dev->addr_len));
+}
+EXPORT_SYMBOL(dev_addr_mod);
/**
* dev_addr_add - Add a device address
diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c
new file mode 100644
index 000000000000..049cfbc58aa9
--- /dev/null
+++ b/net/core/dev_addr_lists_test.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <kunit/test.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+
+static const struct net_device_ops dummy_netdev_ops = {
+};
+
+struct dev_addr_test_priv {
+ u32 addr_seen;
+};
+
+static int dev_addr_test_sync(struct net_device *netdev, const unsigned char *a)
+{
+ struct dev_addr_test_priv *datp = netdev_priv(netdev);
+
+ if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+ datp->addr_seen |= 1 << a[0];
+ return 0;
+}
+
+static int dev_addr_test_unsync(struct net_device *netdev,
+ const unsigned char *a)
+{
+ struct dev_addr_test_priv *datp = netdev_priv(netdev);
+
+ if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+ datp->addr_seen &= ~(1 << a[0]);
+ return 0;
+}
+
+static int dev_addr_test_init(struct kunit *test)
+{
+ struct dev_addr_test_priv *datp;
+ struct net_device *netdev;
+ int err;
+
+ netdev = alloc_etherdev(sizeof(*datp));
+ KUNIT_ASSERT_TRUE(test, !!netdev);
+
+ test->priv = netdev;
+ netdev->netdev_ops = &dummy_netdev_ops;
+
+ err = register_netdev(netdev);
+ if (err) {
+ free_netdev(netdev);
+ KUNIT_FAIL(test, "Can't register netdev %d", err);
+ }
+
+ rtnl_lock();
+ return 0;
+}
+
+static void dev_addr_test_exit(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+
+ rtnl_unlock();
+ unregister_netdev(netdev);
+ free_netdev(netdev);
+}
+
+static void dev_addr_test_basic(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ u8 addr[ETH_ALEN];
+
+ KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr);
+
+ memset(addr, 2, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+ KUNIT_EXPECT_EQ(test, 0, memcmp(netdev->dev_addr, addr, sizeof(addr)));
+
+ memset(addr, 3, sizeof(addr));
+ dev_addr_set(netdev, addr);
+ KUNIT_EXPECT_EQ(test, 0, memcmp(netdev->dev_addr, addr, sizeof(addr)));
+}
+
+static void dev_addr_test_sync_one(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ u8 addr[ETH_ALEN];
+
+ datp = netdev_priv(netdev);
+
+ memset(addr, 1, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 2, datp->addr_seen);
+
+ memset(addr, 2, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ datp->addr_seen = 0;
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ /* It's not going to sync anything because the main address is
+ * considered synced and we overwrite in place.
+ */
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_seen);
+}
+
+static void dev_addr_test_add_del(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ u8 addr[ETH_ALEN];
+ int i;
+
+ datp = netdev_priv(netdev);
+
+ for (i = 1; i < 4; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ }
+ /* Add 3 again */
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen);
+
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen);
+
+ for (i = 1; i < 4; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ }
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1, datp->addr_seen);
+}
+
+static void dev_addr_test_del_main(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ u8 addr[ETH_ALEN];
+
+ memset(addr, 1, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+}
+
+static void dev_addr_test_add_set(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ u8 addr[ETH_ALEN];
+ int i;
+
+ datp = netdev_priv(netdev);
+
+ /* There is no external API like dev_addr_add_excl(),
+ * so shuffle the tree a little bit and exploit aliasing.
+ */
+ for (i = 1; i < 16; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ }
+
+ memset(addr, i, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ memset(addr, 0, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen);
+}
+
+static void dev_addr_test_add_excl(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ u8 addr[ETH_ALEN];
+ int i;
+
+ for (i = 0; i < 10; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr));
+ }
+ KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr));
+
+ for (i = 0; i < 10; i += 2) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+ }
+ for (i = 1; i < 10; i += 2) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr));
+ }
+}
+
+static struct kunit_case dev_addr_test_cases[] = {
+ KUNIT_CASE(dev_addr_test_basic),
+ KUNIT_CASE(dev_addr_test_sync_one),
+ KUNIT_CASE(dev_addr_test_add_del),
+ KUNIT_CASE(dev_addr_test_del_main),
+ KUNIT_CASE(dev_addr_test_add_set),
+ KUNIT_CASE(dev_addr_test_add_excl),
+ {}
+};
+
+static struct kunit_suite dev_addr_test_suite = {
+ .name = "dev-addr-list-test",
+ .test_cases = dev_addr_test_cases,
+ .init = dev_addr_test_init,
+ .exit = dev_addr_test_exit,
+};
+kunit_test_suite(dev_addr_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index cbab5fec64b1..1d309a666932 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -313,6 +313,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
int err;
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
const struct net_device_ops *ops;
+ netdevice_tracker dev_tracker;
if (!dev)
return -ENODEV;
@@ -381,10 +382,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return -ENODEV;
if (!netif_is_bridge_master(dev))
return -EOPNOTSUPP;
- dev_hold(dev);
+ dev_hold_track(dev, &dev_tracker, GFP_KERNEL);
rtnl_unlock();
err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL);
- dev_put(dev);
+ dev_put_track(dev, &dev_tracker);
rtnl_lock();
return err;
diff --git a/net/core/devlink.c b/net/core/devlink.c
index c06c9ba6e8c5..0a9349a02cad 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -69,6 +69,35 @@ struct devlink {
char priv[] __aligned(NETDEV_ALIGN);
};
+/**
+ * struct devlink_resource - devlink resource
+ * @name: name of the resource
+ * @id: id, per devlink instance
+ * @size: size of the resource
+ * @size_new: updated size of the resource, reload is needed
+ * @size_valid: valid in case the total size of the resource is valid
+ * including its children
+ * @parent: parent resource
+ * @size_params: size parameters
+ * @list: parent list
+ * @resource_list: list of child resources
+ * @occ_get: occupancy getter callback
+ * @occ_get_priv: occupancy getter callback priv
+ */
+struct devlink_resource {
+ const char *name;
+ u64 id;
+ u64 size;
+ u64 size_new;
+ bool size_valid;
+ struct devlink_resource *parent;
+ struct devlink_resource_size_params size_params;
+ struct list_head list;
+ struct list_head resource_list;
+ devlink_resource_occ_get_t *occ_get;
+ void *occ_get_priv;
+};
+
void *devlink_priv(struct devlink *devlink)
{
return &devlink->priv;
@@ -4432,6 +4461,11 @@ static const struct devlink_param devlink_param_generic[] = {
.name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME,
.type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE,
},
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE,
+ },
};
static int devlink_param_generic_verify(const struct devlink_param *param)
@@ -8840,8 +8874,6 @@ static const struct genl_small_ops devlink_nl_ops[] = {
GENL_DONT_VALIDATE_DUMP_STRICT,
.dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT |
- DEVLINK_NL_FLAG_NO_LOCK,
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
@@ -9905,34 +9937,38 @@ out:
}
EXPORT_SYMBOL_GPL(devlink_resource_register);
+static void devlink_resource_unregister(struct devlink *devlink,
+ struct devlink_resource *resource)
+{
+ struct devlink_resource *tmp, *child_resource;
+
+ list_for_each_entry_safe(child_resource, tmp, &resource->resource_list,
+ list) {
+ devlink_resource_unregister(devlink, child_resource);
+ list_del(&child_resource->list);
+ kfree(child_resource);
+ }
+}
+
/**
* devlink_resources_unregister - free all resources
*
* @devlink: devlink
- * @resource: resource
*/
-void devlink_resources_unregister(struct devlink *devlink,
- struct devlink_resource *resource)
+void devlink_resources_unregister(struct devlink *devlink)
{
struct devlink_resource *tmp, *child_resource;
- struct list_head *resource_list;
- if (resource)
- resource_list = &resource->resource_list;
- else
- resource_list = &devlink->resource_list;
-
- if (!resource)
- mutex_lock(&devlink->lock);
+ mutex_lock(&devlink->lock);
- list_for_each_entry_safe(child_resource, tmp, resource_list, list) {
- devlink_resources_unregister(devlink, child_resource);
+ list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list,
+ list) {
+ devlink_resource_unregister(devlink, child_resource);
list_del(&child_resource->list);
kfree(child_resource);
}
- if (!resource)
- mutex_unlock(&devlink->lock);
+ mutex_unlock(&devlink->lock);
}
EXPORT_SYMBOL_GPL(devlink_resources_unregister);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 49442cae6f69..3d0ab2eec916 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -850,7 +850,7 @@ net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata)
}
hw_metadata->input_dev = metadata->input_dev;
- dev_hold(hw_metadata->input_dev);
+ dev_hold_track(hw_metadata->input_dev, &hw_metadata->dev_tracker, GFP_ATOMIC);
return hw_metadata;
@@ -864,9 +864,9 @@ free_hw_metadata:
}
static void
-net_dm_hw_metadata_free(const struct devlink_trap_metadata *hw_metadata)
+net_dm_hw_metadata_free(struct devlink_trap_metadata *hw_metadata)
{
- dev_put(hw_metadata->input_dev);
+ dev_put_track(hw_metadata->input_dev, &hw_metadata->dev_tracker);
kfree(hw_metadata->fa_cookie);
kfree(hw_metadata->trap_name);
kfree(hw_metadata->trap_group_name);
diff --git a/net/core/dst.c b/net/core/dst.c
index 497ef9b3fc6a..d16c2c9bfebd 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -49,7 +49,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
unsigned short flags)
{
dst->dev = dev;
- dev_hold(dev);
+ dev_hold_track(dev, &dst->dev_tracker, GFP_ATOMIC);
dst->ops = ops;
dst_init_metrics(dst, dst_default_metrics.metrics, true);
dst->expires = 0UL;
@@ -117,7 +117,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
if (dst->ops->destroy)
dst->ops->destroy(dst);
- dev_put(dst->dev);
+ dev_put_track(dst->dev, &dst->dev_tracker);
lwtstate_put(dst->lwtstate);
@@ -159,8 +159,8 @@ void dst_dev_put(struct dst_entry *dst)
dst->input = dst_discard;
dst->output = dst_discard_out;
dst->dev = blackhole_netdev;
- dev_hold(dst->dev);
- dev_put(dev);
+ dev_replace_track(dev, blackhole_netdev, &dst->dev_tracker,
+ GFP_ATOMIC);
}
EXPORT_SYMBOL(dst_dev_put);
diff --git a/net/core/failover.c b/net/core/failover.c
index b5cd3c727285..dcaa92a85ea2 100644
--- a/net/core/failover.c
+++ b/net/core/failover.c
@@ -252,7 +252,7 @@ struct failover *failover_register(struct net_device *dev,
return ERR_PTR(-ENOMEM);
rcu_assign_pointer(failover->ops, ops);
- dev_hold(dev);
+ dev_hold_track(dev, &failover->dev_tracker, GFP_KERNEL);
dev->priv_flags |= IFF_FAILOVER;
rcu_assign_pointer(failover->failover_dev, dev);
@@ -285,7 +285,7 @@ void failover_unregister(struct failover *failover)
failover_dev->name);
failover_dev->priv_flags &= ~IFF_FAILOVER;
- dev_put(failover_dev);
+ dev_put_track(failover_dev, &failover->dev_tracker);
spin_lock(&failover_lock);
list_del(&failover->list);
diff --git a/net/core/filter.c b/net/core/filter.c
index 6102f093d59a..8271624a19aa 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -301,7 +301,7 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
break;
case SKF_AD_PKTTYPE:
- *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
+ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET);
*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
@@ -323,7 +323,7 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
offsetof(struct sk_buff, vlan_tci));
break;
case SKF_AD_VLAN_TAG_PRESENT:
- *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET());
+ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET);
if (PKT_VLAN_PRESENT_BIT)
*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT);
if (PKT_VLAN_PRESENT_BIT < 7)
@@ -8029,7 +8029,7 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
* (Fast-path, otherwise approximation that we might be
* a clone, do the rest in helper.)
*/
- *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
+ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET);
*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
@@ -8617,7 +8617,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, pkt_type):
*target_size = 1;
*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
- PKT_TYPE_OFFSET());
+ PKT_TYPE_OFFSET);
*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
@@ -8642,7 +8642,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, vlan_present):
*target_size = 1;
*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
- PKT_VLAN_PRESENT_OFFSET());
+ PKT_VLAN_PRESENT_OFFSET);
if (PKT_VLAN_PRESENT_BIT)
*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT);
if (PKT_VLAN_PRESENT_BIT < 7)
@@ -10543,6 +10543,7 @@ static bool sk_lookup_is_valid_access(int off, int size,
case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
case bpf_ctx_range(struct bpf_sk_lookup, local_port):
+ case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
bpf_ctx_record_field_size(info, sizeof(__u32));
return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
@@ -10632,6 +10633,12 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
bpf_target_off(struct bpf_sk_lookup_kern,
dport, 2, target_size));
break;
+
+ case offsetof(struct bpf_sk_lookup, ingress_ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ ingress_ifindex, 4, target_size));
+ break;
}
return insn - insn_buf;
@@ -10656,14 +10663,10 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
}
-#ifdef CONFIG_DEBUG_INFO_BTF
-BTF_ID_LIST_GLOBAL(btf_sock_ids)
+BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE)
#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
BTF_SOCK_TYPE_xxx
#undef BTF_SOCK_TYPE
-#else
-u32 btf_sock_ids[MAX_BTF_SOCK_TYPE];
-#endif
BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
{
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 3255f57f5131..257976cb55ce 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1460,7 +1460,7 @@ out_bad:
}
EXPORT_SYMBOL(__skb_flow_dissect);
-static siphash_key_t hashrnd __read_mostly;
+static siphash_aligned_key_t hashrnd;
static __always_inline void __flow_hash_secret_init(void)
{
net_get_random_once(&hashrnd, sizeof(hashrnd));
diff --git a/net/core/gro.c b/net/core/gro.c
new file mode 100644
index 000000000000..8ec8b44596da
--- /dev/null
+++ b/net/core/gro.c
@@ -0,0 +1,766 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <net/gro.h>
+#include <net/dst_metadata.h>
+#include <net/busy_poll.h>
+#include <trace/events/net.h>
+
+#define MAX_GRO_SKBS 8
+
+/* This should be increased if a protocol with a bigger head is added. */
+#define GRO_MAX_HEAD (MAX_HEADER + 128)
+
+static DEFINE_SPINLOCK(offload_lock);
+static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base);
+/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
+int gro_normal_batch __read_mostly = 8;
+
+/**
+ * dev_add_offload - register offload handlers
+ * @po: protocol offload declaration
+ *
+ * Add protocol offload handlers to the networking stack. The passed
+ * &proto_offload is linked into kernel lists and may not be freed until
+ * it has been removed from the kernel lists.
+ *
+ * This call does not sleep therefore it can not
+ * guarantee all CPU's that are in middle of receiving packets
+ * will see the new offload handlers (until the next received packet).
+ */
+void dev_add_offload(struct packet_offload *po)
+{
+ struct packet_offload *elem;
+
+ spin_lock(&offload_lock);
+ list_for_each_entry(elem, &offload_base, list) {
+ if (po->priority < elem->priority)
+ break;
+ }
+ list_add_rcu(&po->list, elem->list.prev);
+ spin_unlock(&offload_lock);
+}
+EXPORT_SYMBOL(dev_add_offload);
+
+/**
+ * __dev_remove_offload - remove offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a protocol offload handler that was previously added to the
+ * kernel offload handlers by dev_add_offload(). The passed &offload_type
+ * is removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * The packet type might still be in use by receivers
+ * and must not be freed until after all the CPU's have gone
+ * through a quiescent state.
+ */
+static void __dev_remove_offload(struct packet_offload *po)
+{
+ struct list_head *head = &offload_base;
+ struct packet_offload *po1;
+
+ spin_lock(&offload_lock);
+
+ list_for_each_entry(po1, head, list) {
+ if (po == po1) {
+ list_del_rcu(&po->list);
+ goto out;
+ }
+ }
+
+ pr_warn("dev_remove_offload: %p not found\n", po);
+out:
+ spin_unlock(&offload_lock);
+}
+
+/**
+ * dev_remove_offload - remove packet offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a packet offload handler that was previously added to the kernel
+ * offload handlers by dev_add_offload(). The passed &offload_type is
+ * removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * This call sleeps to guarantee that no CPU is looking at the packet
+ * type after return.
+ */
+void dev_remove_offload(struct packet_offload *po)
+{
+ __dev_remove_offload(po);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(dev_remove_offload);
+
+/**
+ * skb_mac_gso_segment - mac layer segmentation handler.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ */
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+ int vlan_depth = skb->mac_len;
+ __be16 type = skb_network_protocol(skb, &vlan_depth);
+
+ if (unlikely(!type))
+ return ERR_PTR(-EINVAL);
+
+ __skb_pull(skb, vlan_depth);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
+ segs = ptype->callbacks.gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_mac_gso_segment);
+
+int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
+{
+ struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
+ unsigned int offset = skb_gro_offset(skb);
+ unsigned int headlen = skb_headlen(skb);
+ unsigned int len = skb_gro_len(skb);
+ unsigned int delta_truesize;
+ unsigned int new_truesize;
+ struct sk_buff *lp;
+
+ if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
+ return -E2BIG;
+
+ lp = NAPI_GRO_CB(p)->last;
+ pinfo = skb_shinfo(lp);
+
+ if (headlen <= offset) {
+ skb_frag_t *frag;
+ skb_frag_t *frag2;
+ int i = skbinfo->nr_frags;
+ int nr_frags = pinfo->nr_frags + i;
+
+ if (nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ offset -= headlen;
+ pinfo->nr_frags = nr_frags;
+ skbinfo->nr_frags = 0;
+
+ frag = pinfo->frags + nr_frags;
+ frag2 = skbinfo->frags + i;
+ do {
+ *--frag = *--frag2;
+ } while (--i);
+
+ skb_frag_off_add(frag, offset);
+ skb_frag_size_sub(frag, offset);
+
+ /* all fragments truesize : remove (head size + sk_buff) */
+ new_truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ delta_truesize = skb->truesize - new_truesize;
+
+ skb->truesize = new_truesize;
+ skb->len -= skb->data_len;
+ skb->data_len = 0;
+
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
+ goto done;
+ } else if (skb->head_frag) {
+ int nr_frags = pinfo->nr_frags;
+ skb_frag_t *frag = pinfo->frags + nr_frags;
+ struct page *page = virt_to_head_page(skb->head);
+ unsigned int first_size = headlen - offset;
+ unsigned int first_offset;
+
+ if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ first_offset = skb->data -
+ (unsigned char *)page_address(page) +
+ offset;
+
+ pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
+
+ __skb_frag_set_page(frag, page);
+ skb_frag_off_set(frag, first_offset);
+ skb_frag_size_set(frag, first_size);
+
+ memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
+ /* We dont need to clear skbinfo->nr_frags here */
+
+ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ delta_truesize = skb->truesize - new_truesize;
+ skb->truesize = new_truesize;
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
+ goto done;
+ }
+
+merge:
+ /* sk owenrship - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
+ delta_truesize = skb->truesize;
+ if (offset > headlen) {
+ unsigned int eat = offset - headlen;
+
+ skb_frag_off_add(&skbinfo->frags[0], eat);
+ skb_frag_size_sub(&skbinfo->frags[0], eat);
+ skb->data_len -= eat;
+ skb->len -= eat;
+ offset = headlen;
+ }
+
+ __skb_pull(skb, offset);
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+ NAPI_GRO_CB(p)->last = skb;
+ __skb_header_release(skb);
+ lp = p;
+
+done:
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += len;
+ p->truesize += delta_truesize;
+ p->len += len;
+ if (lp != p) {
+ lp->data_len += len;
+ lp->truesize += delta_truesize;
+ lp->len += len;
+ }
+ NAPI_GRO_CB(skb)->same_flow = 1;
+ return 0;
+}
+
+
+static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
+{
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ struct list_head *head = &offload_base;
+ int err = -ENOENT;
+
+ BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
+
+ if (NAPI_GRO_CB(skb)->count == 1) {
+ skb_shinfo(skb)->gso_size = 0;
+ goto out;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
+ ipv6_gro_complete, inet_gro_complete,
+ skb, 0);
+ break;
+ }
+ rcu_read_unlock();
+
+ if (err) {
+ WARN_ON(&ptype->list == head);
+ kfree_skb(skb);
+ return;
+ }
+
+out:
+ gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
+}
+
+static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
+ bool flush_old)
+{
+ struct list_head *head = &napi->gro_hash[index].list;
+ struct sk_buff *skb, *p;
+
+ list_for_each_entry_safe_reverse(skb, p, head, list) {
+ if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
+ return;
+ skb_list_del_init(skb);
+ napi_gro_complete(napi, skb);
+ napi->gro_hash[index].count--;
+ }
+
+ if (!napi->gro_hash[index].count)
+ __clear_bit(index, &napi->gro_bitmask);
+}
+
+/* napi->gro_hash[].list contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+ unsigned long bitmask = napi->gro_bitmask;
+ unsigned int i, base = ~0U;
+
+ while ((i = ffs(bitmask)) != 0) {
+ bitmask >>= i;
+ base += i;
+ __napi_gro_flush_chain(napi, base, flush_old);
+ }
+}
+EXPORT_SYMBOL(napi_gro_flush);
+
+static void gro_list_prepare(const struct list_head *head,
+ const struct sk_buff *skb)
+{
+ unsigned int maclen = skb->dev->hard_header_len;
+ u32 hash = skb_get_hash_raw(skb);
+ struct sk_buff *p;
+
+ list_for_each_entry(p, head, list) {
+ unsigned long diffs;
+
+ NAPI_GRO_CB(p)->flush = 0;
+
+ if (hash != skb_get_hash_raw(p)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+ diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
+ if (skb_vlan_tag_present(p))
+ diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
+ diffs |= skb_metadata_differs(p, skb);
+ if (maclen == ETH_HLEN)
+ diffs |= compare_ether_header(skb_mac_header(p),
+ skb_mac_header(skb));
+ else if (!diffs)
+ diffs = memcmp(skb_mac_header(p),
+ skb_mac_header(skb),
+ maclen);
+
+ /* in most common scenarions 'slow_gro' is 0
+ * otherwise we are already on some slower paths
+ * either skip all the infrequent tests altogether or
+ * avoid trying too hard to skip each of them individually
+ */
+ if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) {
+#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ struct tc_skb_ext *skb_ext;
+ struct tc_skb_ext *p_ext;
+#endif
+
+ diffs |= p->sk != skb->sk;
+ diffs |= skb_metadata_dst_cmp(p, skb);
+ diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
+
+#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ skb_ext = skb_ext_find(skb, TC_SKB_EXT);
+ p_ext = skb_ext_find(p, TC_SKB_EXT);
+
+ diffs |= (!!p_ext) ^ (!!skb_ext);
+ if (!diffs && unlikely(skb_ext))
+ diffs |= p_ext->chain ^ skb_ext->chain;
+#endif
+ }
+
+ NAPI_GRO_CB(p)->same_flow = !diffs;
+ }
+}
+
+static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
+{
+ const struct skb_shared_info *pinfo = skb_shinfo(skb);
+ const skb_frag_t *frag0 = &pinfo->frags[0];
+
+ NAPI_GRO_CB(skb)->data_offset = 0;
+ NAPI_GRO_CB(skb)->frag0 = NULL;
+ NAPI_GRO_CB(skb)->frag0_len = 0;
+
+ if (!skb_headlen(skb) && pinfo->nr_frags &&
+ !PageHighMem(skb_frag_page(frag0)) &&
+ (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
+ NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
+ NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
+ skb_frag_size(frag0),
+ skb->end - skb->tail);
+ }
+}
+
+static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
+{
+ struct skb_shared_info *pinfo = skb_shinfo(skb);
+
+ BUG_ON(skb->end - skb->tail < grow);
+
+ memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+ skb->data_len -= grow;
+ skb->tail += grow;
+
+ skb_frag_off_add(&pinfo->frags[0], grow);
+ skb_frag_size_sub(&pinfo->frags[0], grow);
+
+ if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
+ skb_frag_unref(skb, 0);
+ memmove(pinfo->frags, pinfo->frags + 1,
+ --pinfo->nr_frags * sizeof(pinfo->frags[0]));
+ }
+}
+
+static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
+{
+ struct sk_buff *oldest;
+
+ oldest = list_last_entry(head, struct sk_buff, list);
+
+ /* We are called with head length >= MAX_GRO_SKBS, so this is
+ * impossible.
+ */
+ if (WARN_ON_ONCE(!oldest))
+ return;
+
+ /* Do not adjust napi->gro_hash[].count, caller is adding a new
+ * SKB to the chain.
+ */
+ skb_list_del_init(oldest);
+ napi_gro_complete(napi, oldest);
+}
+
+static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+ u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ struct gro_list *gro_list = &napi->gro_hash[bucket];
+ struct list_head *head = &offload_base;
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ struct sk_buff *pp = NULL;
+ enum gro_result ret;
+ int same_flow;
+ int grow;
+
+ if (netif_elide_gro(skb->dev))
+ goto normal;
+
+ gro_list_prepare(&gro_list->list, skb);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+
+ skb_set_network_header(skb, skb_gro_offset(skb));
+ skb_reset_mac_len(skb);
+ NAPI_GRO_CB(skb)->same_flow = 0;
+ NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
+ NAPI_GRO_CB(skb)->free = 0;
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+ NAPI_GRO_CB(skb)->recursion_counter = 0;
+ NAPI_GRO_CB(skb)->is_fou = 0;
+ NAPI_GRO_CB(skb)->is_atomic = 1;
+ NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
+
+ /* Setup for GRO checksum validation */
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ NAPI_GRO_CB(skb)->csum = skb->csum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+ NAPI_GRO_CB(skb)->csum_cnt = 0;
+ break;
+ case CHECKSUM_UNNECESSARY:
+ NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
+ NAPI_GRO_CB(skb)->csum_valid = 0;
+ break;
+ default:
+ NAPI_GRO_CB(skb)->csum_cnt = 0;
+ NAPI_GRO_CB(skb)->csum_valid = 0;
+ }
+
+ pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ &gro_list->list, skb);
+ break;
+ }
+ rcu_read_unlock();
+
+ if (&ptype->list == head)
+ goto normal;
+
+ if (PTR_ERR(pp) == -EINPROGRESS) {
+ ret = GRO_CONSUMED;
+ goto ok;
+ }
+
+ same_flow = NAPI_GRO_CB(skb)->same_flow;
+ ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
+
+ if (pp) {
+ skb_list_del_init(pp);
+ napi_gro_complete(napi, pp);
+ gro_list->count--;
+ }
+
+ if (same_flow)
+ goto ok;
+
+ if (NAPI_GRO_CB(skb)->flush)
+ goto normal;
+
+ if (unlikely(gro_list->count >= MAX_GRO_SKBS))
+ gro_flush_oldest(napi, &gro_list->list);
+ else
+ gro_list->count++;
+
+ NAPI_GRO_CB(skb)->count = 1;
+ NAPI_GRO_CB(skb)->age = jiffies;
+ NAPI_GRO_CB(skb)->last = skb;
+ skb_shinfo(skb)->gso_size = skb_gro_len(skb);
+ list_add(&skb->list, &gro_list->list);
+ ret = GRO_HELD;
+
+pull:
+ grow = skb_gro_offset(skb) - skb_headlen(skb);
+ if (grow > 0)
+ gro_pull_from_frag0(skb, grow);
+ok:
+ if (gro_list->count) {
+ if (!test_bit(bucket, &napi->gro_bitmask))
+ __set_bit(bucket, &napi->gro_bitmask);
+ } else if (test_bit(bucket, &napi->gro_bitmask)) {
+ __clear_bit(bucket, &napi->gro_bitmask);
+ }
+
+ return ret;
+
+normal:
+ ret = GRO_NORMAL;
+ goto pull;
+}
+
+struct packet_offload *gro_find_receive_by_type(__be16 type)
+{
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_receive_by_type);
+
+struct packet_offload *gro_find_complete_by_type(__be16 type)
+{
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_complete_by_type);
+
+static gro_result_t napi_skb_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ gro_normal_one(napi, skb, 1);
+ break;
+
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+ __kfree_skb(skb);
+ else
+ __kfree_skb_defer(skb);
+ break;
+
+ case GRO_HELD:
+ case GRO_MERGED:
+ case GRO_CONSUMED:
+ break;
+ }
+
+ return ret;
+}
+
+gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+ gro_result_t ret;
+
+ skb_mark_napi_id(skb, napi);
+ trace_napi_gro_receive_entry(skb);
+
+ skb_gro_reset_offset(skb, 0);
+
+ ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
+ trace_napi_gro_receive_exit(ret);
+
+ return ret;
+}
+EXPORT_SYMBOL(napi_gro_receive);
+
+static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
+{
+ if (unlikely(skb->pfmemalloc)) {
+ consume_skb(skb);
+ return;
+ }
+ __skb_pull(skb, skb_headlen(skb));
+ /* restore the reserve we had after netdev_alloc_skb_ip_align() */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
+ __vlan_hwaccel_clear_tag(skb);
+ skb->dev = napi->dev;
+ skb->skb_iif = 0;
+
+ /* eth_type_trans() assumes pkt_type is PACKET_HOST */
+ skb->pkt_type = PACKET_HOST;
+
+ skb->encapsulation = 0;
+ skb_shinfo(skb)->gso_type = 0;
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ if (unlikely(skb->slow_gro)) {
+ skb_orphan(skb);
+ skb_ext_reset(skb);
+ nf_reset_ct(skb);
+ skb->slow_gro = 0;
+ }
+
+ napi->skb = skb;
+}
+
+struct sk_buff *napi_get_frags(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+
+ if (!skb) {
+ skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
+ if (skb) {
+ napi->skb = skb;
+ skb_mark_napi_id(skb, napi);
+ }
+ }
+ return skb;
+}
+EXPORT_SYMBOL(napi_get_frags);
+
+static gro_result_t napi_frags_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ case GRO_HELD:
+ __skb_push(skb, ETH_HLEN);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ if (ret == GRO_NORMAL)
+ gro_normal_one(napi, skb, 1);
+ break;
+
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else
+ napi_reuse_skb(napi, skb);
+ break;
+
+ case GRO_MERGED:
+ case GRO_CONSUMED:
+ break;
+ }
+
+ return ret;
+}
+
+/* Upper GRO stack assumes network header starts at gro_offset=0
+ * Drivers could call both napi_gro_frags() and napi_gro_receive()
+ * We copy ethernet header into skb->data to have a common layout.
+ */
+static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+ const struct ethhdr *eth;
+ unsigned int hlen = sizeof(*eth);
+
+ napi->skb = NULL;
+
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb, hlen);
+
+ if (unlikely(skb_gro_header_hard(skb, hlen))) {
+ eth = skb_gro_header_slow(skb, hlen, 0);
+ if (unlikely(!eth)) {
+ net_warn_ratelimited("%s: dropping impossible skb from %s\n",
+ __func__, napi->dev->name);
+ napi_reuse_skb(napi, skb);
+ return NULL;
+ }
+ } else {
+ eth = (const struct ethhdr *)skb->data;
+ gro_pull_from_frag0(skb, hlen);
+ NAPI_GRO_CB(skb)->frag0 += hlen;
+ NAPI_GRO_CB(skb)->frag0_len -= hlen;
+ }
+ __skb_pull(skb, hlen);
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling.
+ * We'll fix it up properly in napi_frags_finish()
+ */
+ skb->protocol = eth->h_proto;
+
+ return skb;
+}
+
+gro_result_t napi_gro_frags(struct napi_struct *napi)
+{
+ gro_result_t ret;
+ struct sk_buff *skb = napi_frags_skb(napi);
+
+ trace_napi_gro_frags_entry(skb);
+
+ ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
+ trace_napi_gro_frags_exit(ret);
+
+ return ret;
+}
+EXPORT_SYMBOL(napi_gro_frags);
+
+/* Compute the checksum from gro_offset and return the folded value
+ * after adding in any pseudo checksum.
+ */
+__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
+{
+ __wsum wsum;
+ __sum16 sum;
+
+ wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
+
+ /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
+ /* See comments in __skb_checksum_complete(). */
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev, skb);
+ }
+
+ NAPI_GRO_CB(skb)->csum = wsum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_gro_checksum_complete);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 1a455847da54..d7d089963b1d 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -55,7 +55,7 @@ static void rfc2863_policy(struct net_device *dev)
if (operstate == dev->operstate)
return;
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
switch(dev->link_mode) {
case IF_LINK_MODE_TESTING:
@@ -74,7 +74,7 @@ static void rfc2863_policy(struct net_device *dev)
dev->operstate = operstate;
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
}
@@ -109,7 +109,7 @@ static void linkwatch_add_event(struct net_device *dev)
spin_lock_irqsave(&lweventlist_lock, flags);
if (list_empty(&dev->link_watch_list)) {
list_add_tail(&dev->link_watch_list, &lweventlist);
- dev_hold(dev);
+ dev_hold_track(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC);
}
spin_unlock_irqrestore(&lweventlist_lock, flags);
}
@@ -166,7 +166,7 @@ static void linkwatch_do_dev(struct net_device *dev)
netdev_state_change(dev);
}
- dev_put(dev);
+ dev_put_track(dev, &dev->linkwatch_dev_tracker);
}
static void __linkwatch_run_queue(int urgent_only)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index dda12fbd177b..506aa01776df 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -624,7 +624,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
memcpy(n->primary_key, pkey, key_len);
n->dev = dev;
- dev_hold(dev);
+ dev_hold_track(dev, &n->dev_tracker, GFP_ATOMIC);
/* Protocol specific setup. */
if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
@@ -770,10 +770,10 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
write_pnet(&n->net, net);
memcpy(n->key, pkey, key_len);
n->dev = dev;
- dev_hold(dev);
+ dev_hold_track(dev, &n->dev_tracker, GFP_KERNEL);
if (tbl->pconstructor && tbl->pconstructor(n)) {
- dev_put(dev);
+ dev_put_track(dev, &n->dev_tracker);
kfree(n);
n = NULL;
goto out;
@@ -805,7 +805,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
write_unlock_bh(&tbl->lock);
if (tbl->pdestructor)
tbl->pdestructor(n);
- dev_put(n->dev);
+ dev_put_track(n->dev, &n->dev_tracker);
kfree(n);
return 0;
}
@@ -838,7 +838,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
n->next = NULL;
if (tbl->pdestructor)
tbl->pdestructor(n);
- dev_put(n->dev);
+ dev_put_track(n->dev, &n->dev_tracker);
kfree(n);
}
return -ENOENT;
@@ -879,7 +879,7 @@ void neigh_destroy(struct neighbour *neigh)
if (dev->netdev_ops->ndo_neigh_destroy)
dev->netdev_ops->ndo_neigh_destroy(dev, neigh);
- dev_put(dev);
+ dev_put_track(dev, &neigh->dev_tracker);
neigh_parms_put(neigh->parms);
neigh_dbg(2, "neigh %p is destroyed\n", neigh);
@@ -1665,13 +1665,13 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
refcount_set(&p->refcnt, 1);
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
- dev_hold(dev);
+ dev_hold_track(dev, &p->dev_tracker, GFP_KERNEL);
p->dev = dev;
write_pnet(&p->net, net);
p->sysctl_table = NULL;
if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
- dev_put(dev);
+ dev_put_track(dev, &p->dev_tracker);
kfree(p);
return NULL;
}
@@ -1702,7 +1702,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
list_del(&parms->list);
parms->dead = 1;
write_unlock_bh(&tbl->lock);
- dev_put(parms->dev);
+ dev_put_track(parms->dev, &parms->dev_tracker);
call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
}
EXPORT_SYMBOL(neigh_parms_release);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 9c01c642cf9e..53ea262ecafd 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -488,14 +488,6 @@ static ssize_t proto_down_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
- struct net_device *netdev = to_net_dev(dev);
-
- /* The check is also done in change_proto_down; this helps returning
- * early without hitting the trylock/restart in netdev_store.
- */
- if (!netdev->netdev_ops->ndo_change_proto_down)
- return -EOPNOTSUPP;
-
return netdev_store(dev, attr, buf, len, change_proto_down);
}
NETDEVICE_SHOW_RW(proto_down, fmt_dec);
@@ -1012,7 +1004,7 @@ static void rx_queue_release(struct kobject *kobj)
#endif
memset(kobj, 0, sizeof(*kobj));
- dev_put(queue->dev);
+ dev_put_track(queue->dev, &queue->dev_tracker);
}
static const void *rx_queue_namespace(struct kobject *kobj)
@@ -1052,7 +1044,7 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
/* Kobject_put later will trigger rx_queue_release call which
* decreases dev refcount: Take that reference here
*/
- dev_hold(queue->dev);
+ dev_hold_track(queue->dev, &queue->dev_tracker, GFP_KERNEL);
kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
@@ -1201,11 +1193,7 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf)
{
- unsigned long trans_timeout;
-
- spin_lock_irq(&queue->_xmit_lock);
- trans_timeout = queue->trans_timeout;
- spin_unlock_irq(&queue->_xmit_lock);
+ unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout);
return sprintf(buf, fmt_ulong, trans_timeout);
}
@@ -1452,7 +1440,7 @@ static ssize_t xps_queue_show(struct net_device *dev, unsigned int index,
for (i = map->len; i--;) {
if (map->queues[i] == index) {
- set_bit(j, mask);
+ __set_bit(j, mask);
break;
}
}
@@ -1619,7 +1607,7 @@ static void netdev_queue_release(struct kobject *kobj)
struct netdev_queue *queue = to_netdev_queue(kobj);
memset(kobj, 0, sizeof(*kobj));
- dev_put(queue->dev);
+ dev_put_track(queue->dev, &queue->dev_tracker);
}
static const void *netdev_queue_namespace(struct kobject *kobj)
@@ -1659,7 +1647,7 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
/* Kobject_put later will trigger netdev_queue_release call
* which decreases dev refcount: Take that reference here
*/
- dev_hold(queue->dev);
+ dev_hold_track(queue->dev, &queue->dev_tracker, GFP_KERNEL);
kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
@@ -1706,6 +1694,13 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
int i;
int error = 0;
+ /* Tx queue kobjects are allowed to be updated when a device is being
+ * unregistered, but solely to remove queues from qdiscs. Any path
+ * adding queues should be fixed.
+ */
+ WARN(dev->reg_state == NETREG_UNREGISTERING && new_num > old_num,
+ "New queues can't be registered after device unregistration.");
+
for (i = old_num; i < new_num; i++) {
error = netdev_queue_add_kobject(dev, i);
if (error) {
@@ -1820,6 +1815,9 @@ static void remove_queue_kobjects(struct net_device *dev)
net_rx_queue_update_kobjects(dev, real_rx, 0);
netdev_queue_update_kobjects(dev, real_tx, 0);
+
+ dev->real_num_rx_queues = 0;
+ dev->real_num_tx_queues = 0;
#ifdef CONFIG_SYSFS
kset_unregister(dev->queues_kset);
#endif
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index edfc0f8011f8..db724463e7cd 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -776,7 +776,7 @@ put_noaddr:
err = __netpoll_setup(np, ndev);
if (err)
goto put;
-
+ netdev_tracker_alloc(ndev, &np->dev_tracker, GFP_KERNEL);
rtnl_unlock();
return 0;
@@ -853,7 +853,7 @@ void netpoll_cleanup(struct netpoll *np)
if (!np->dev)
goto out;
__netpoll_cleanup(np);
- dev_put(np->dev);
+ dev_put_track(np->dev, &np->dev_tracker);
np->dev = NULL;
out:
rtnl_unlock();
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index a3d74e2704c4..560a5e712dc3 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -410,6 +410,7 @@ struct pktgen_dev {
* device name (not when the inject is
* started as it used to do.)
*/
+ netdevice_tracker dev_tracker;
char odevname[32];
struct flow_state *flows;
unsigned int cflows; /* Concurrent flows (config) */
@@ -2099,7 +2100,7 @@ static int pktgen_setup_dev(const struct pktgen_net *pn,
/* Clean old setups */
if (pkt_dev->odev) {
- dev_put(pkt_dev->odev);
+ dev_put_track(pkt_dev->odev, &pkt_dev->dev_tracker);
pkt_dev->odev = NULL;
}
@@ -2117,6 +2118,7 @@ static int pktgen_setup_dev(const struct pktgen_net *pn,
err = -ENETDOWN;
} else {
pkt_dev->odev = odev;
+ netdev_tracker_alloc(odev, &pkt_dev->dev_tracker, GFP_KERNEL);
return 0;
}
@@ -3805,7 +3807,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
return add_dev_to_thread(t, pkt_dev);
out2:
- dev_put(pkt_dev->odev);
+ dev_put_track(pkt_dev->odev, &pkt_dev->dev_tracker);
out1:
#ifdef CONFIG_XFRM
free_SAs(pkt_dev);
@@ -3899,7 +3901,7 @@ static int pktgen_remove_device(struct pktgen_thread *t,
/* Dis-associate from the interface */
if (pkt_dev->odev) {
- dev_put(pkt_dev->odev);
+ dev_put_track(pkt_dev->odev, &pkt_dev->dev_tracker);
pkt_dev->odev = NULL;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2af8aeeadadf..d6eba554b137 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -842,9 +842,9 @@ static void set_operstate(struct net_device *dev, unsigned char transition)
}
if (dev->operstate != operstate) {
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
dev->operstate = operstate;
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
netdev_state_change(dev);
}
}
@@ -2539,13 +2539,12 @@ static int do_set_proto_down(struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1];
- const struct net_device_ops *ops = dev->netdev_ops;
unsigned long mask = 0;
u32 value;
bool proto_down;
int err;
- if (!ops->ndo_change_proto_down) {
+ if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) {
NL_SET_ERR_MSG(extack, "Protodown not supported by device");
return -EOPNOTSUPP;
}
@@ -2768,7 +2767,7 @@ static int do_setlink(const struct sk_buff *skb,
}
if (dev->gso_max_segs ^ max_segs) {
- dev->gso_max_segs = max_segs;
+ netif_set_gso_max_segs(dev, max_segs);
status |= DO_SETLINK_MODIFIED;
}
}
@@ -2779,11 +2778,11 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_LINKMODE]) {
unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
if (dev->link_mode ^ value)
status |= DO_SETLINK_NOTIFY;
dev->link_mode = value;
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
}
if (tb[IFLA_VFINFO_LIST]) {
@@ -3222,7 +3221,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
if (tb[IFLA_GSO_MAX_SIZE])
netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
if (tb[IFLA_GSO_MAX_SEGS])
- dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+ netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS]));
return dev;
}
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index b5bc680d4755..9b8443774449 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -19,8 +19,8 @@
#include <linux/in6.h>
#include <net/tcp.h>
-static siphash_key_t net_secret __read_mostly;
-static siphash_key_t ts_secret __read_mostly;
+static siphash_aligned_key_t net_secret;
+static siphash_aligned_key_t ts_secret;
static __always_inline void net_secret_init(void)
{
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ba2f38246f07..a33247fdb8f5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -992,12 +992,10 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
}
EXPORT_SYMBOL(napi_consume_skb);
-/* Make sure a field is enclosed inside headers_start/headers_end section */
+/* Make sure a field is contained by headers group */
#define CHECK_SKB_FIELD(field) \
- BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
- offsetof(struct sk_buff, headers_start)); \
- BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
- offsetof(struct sk_buff, headers_end)); \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) != \
+ offsetof(struct sk_buff, headers.field)); \
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
@@ -1009,14 +1007,12 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
__skb_ext_copy(new, old);
__nf_copy(new, old, false);
- /* Note : this field could be in headers_start/headers_end section
+ /* Note : this field could be in the headers group.
* It is not yet because we do not want to have a 16 bit hole
*/
new->queue_mapping = old->queue_mapping;
- memcpy(&new->headers_start, &old->headers_start,
- offsetof(struct sk_buff, headers_end) -
- offsetof(struct sk_buff, headers_start));
+ memcpy(&new->headers, &old->headers, sizeof(new->headers));
CHECK_SKB_FIELD(protocol);
CHECK_SKB_FIELD(csum);
CHECK_SKB_FIELD(hash);
@@ -3919,32 +3915,6 @@ err_linearize:
}
EXPORT_SYMBOL_GPL(skb_segment_list);
-int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
-{
- if (unlikely(p->len + skb->len >= 65536))
- return -E2BIG;
-
- if (NAPI_GRO_CB(p)->last == p)
- skb_shinfo(p)->frag_list = skb;
- else
- NAPI_GRO_CB(p)->last->next = skb;
-
- skb_pull(skb, skb_gro_offset(skb));
-
- NAPI_GRO_CB(p)->last = skb;
- NAPI_GRO_CB(p)->count++;
- p->data_len += skb->len;
-
- /* sk owenrship - if any - completely transferred to the aggregated packet */
- skb->destructor = NULL;
- p->truesize += skb->truesize;
- p->len += skb->len;
-
- NAPI_GRO_CB(skb)->same_flow = 1;
-
- return 0;
-}
-
/**
* skb_segment - Perform protocol segmentation on skb.
* @head_skb: buffer to segment
@@ -4297,122 +4267,6 @@ err:
}
EXPORT_SYMBOL_GPL(skb_segment);
-int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
-{
- struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
- unsigned int offset = skb_gro_offset(skb);
- unsigned int headlen = skb_headlen(skb);
- unsigned int len = skb_gro_len(skb);
- unsigned int delta_truesize;
- unsigned int new_truesize;
- struct sk_buff *lp;
-
- if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
- return -E2BIG;
-
- lp = NAPI_GRO_CB(p)->last;
- pinfo = skb_shinfo(lp);
-
- if (headlen <= offset) {
- skb_frag_t *frag;
- skb_frag_t *frag2;
- int i = skbinfo->nr_frags;
- int nr_frags = pinfo->nr_frags + i;
-
- if (nr_frags > MAX_SKB_FRAGS)
- goto merge;
-
- offset -= headlen;
- pinfo->nr_frags = nr_frags;
- skbinfo->nr_frags = 0;
-
- frag = pinfo->frags + nr_frags;
- frag2 = skbinfo->frags + i;
- do {
- *--frag = *--frag2;
- } while (--i);
-
- skb_frag_off_add(frag, offset);
- skb_frag_size_sub(frag, offset);
-
- /* all fragments truesize : remove (head size + sk_buff) */
- new_truesize = SKB_TRUESIZE(skb_end_offset(skb));
- delta_truesize = skb->truesize - new_truesize;
-
- skb->truesize = new_truesize;
- skb->len -= skb->data_len;
- skb->data_len = 0;
-
- NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
- goto done;
- } else if (skb->head_frag) {
- int nr_frags = pinfo->nr_frags;
- skb_frag_t *frag = pinfo->frags + nr_frags;
- struct page *page = virt_to_head_page(skb->head);
- unsigned int first_size = headlen - offset;
- unsigned int first_offset;
-
- if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
- goto merge;
-
- first_offset = skb->data -
- (unsigned char *)page_address(page) +
- offset;
-
- pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
-
- __skb_frag_set_page(frag, page);
- skb_frag_off_set(frag, first_offset);
- skb_frag_size_set(frag, first_size);
-
- memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
- /* We dont need to clear skbinfo->nr_frags here */
-
- new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
- delta_truesize = skb->truesize - new_truesize;
- skb->truesize = new_truesize;
- NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
- goto done;
- }
-
-merge:
- /* sk owenrship - if any - completely transferred to the aggregated packet */
- skb->destructor = NULL;
- delta_truesize = skb->truesize;
- if (offset > headlen) {
- unsigned int eat = offset - headlen;
-
- skb_frag_off_add(&skbinfo->frags[0], eat);
- skb_frag_size_sub(&skbinfo->frags[0], eat);
- skb->data_len -= eat;
- skb->len -= eat;
- offset = headlen;
- }
-
- __skb_pull(skb, offset);
-
- if (NAPI_GRO_CB(p)->last == p)
- skb_shinfo(p)->frag_list = skb;
- else
- NAPI_GRO_CB(p)->last->next = skb;
- NAPI_GRO_CB(p)->last = skb;
- __skb_header_release(skb);
- lp = p;
-
-done:
- NAPI_GRO_CB(p)->count++;
- p->data_len += len;
- p->truesize += delta_truesize;
- p->len += len;
- if (lp != p) {
- lp->data_len += len;
- lp->truesize += delta_truesize;
- lp->len += len;
- }
- NAPI_GRO_CB(skb)->same_flow = 1;
- return 0;
-}
-
#ifdef CONFIG_SKB_EXTENSIONS
#define SKB_EXT_ALIGN_VALUE 8
#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)
@@ -4849,8 +4703,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM)
+ if (sk_is_tcp(sk))
serr->ee.ee_data -= sk->sk_tskey;
}
@@ -4919,8 +4772,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (tsonly) {
#ifdef CONFIG_INET
if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
- sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM) {
+ sk_is_tcp(sk)) {
skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
ack_skb);
opt_stats = true;
diff --git a/net/core/sock.c b/net/core/sock.c
index 41e91d0f7061..4a499d255f40 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -144,8 +144,6 @@
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
-static void sock_inuse_add(struct net *net, int val);
-
/**
* sk_ns_capable - General socket capability test
* @sk: Socket to use a capability on or through
@@ -327,7 +325,10 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
noreclaim_flag = memalloc_noreclaim_save();
- ret = sk->sk_backlog_rcv(sk, skb);
+ ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
+ tcp_v6_do_rcv,
+ tcp_v4_do_rcv,
+ sk, skb);
memalloc_noreclaim_restore(noreclaim_flag);
return ret;
@@ -872,8 +873,7 @@ int sock_set_timestamping(struct sock *sk, int optname,
if (val & SOF_TIMESTAMPING_OPT_ID &&
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM) {
+ if (sk_is_tcp(sk)) {
if ((1 << sk->sk_state) &
(TCPF_CLOSE | TCPF_LISTEN))
return -EINVAL;
@@ -1135,6 +1135,7 @@ set_sndbuf:
case SO_PRIORITY:
if ((val >= 0 && val <= 6) ||
+ ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
sk->sk_priority = val;
else
@@ -1280,7 +1281,8 @@ set_sndbuf:
clear_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1370,8 +1372,7 @@ set_sndbuf:
case SO_ZEROCOPY:
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
- if (!((sk->sk_type == SOCK_STREAM &&
- sk->sk_protocol == IPPROTO_TCP) ||
+ if (!(sk_is_tcp(sk) ||
(sk->sk_type == SOCK_DGRAM &&
sk->sk_protocol == IPPROTO_UDP)))
ret = -ENOTSUPP;
@@ -2246,17 +2247,22 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
u32 max_segs = 1;
sk_dst_set(sk, dst);
- sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
+ sk->sk_route_caps = dst->dev->features;
+ if (sk_is_tcp(sk))
+ sk->sk_route_caps |= NETIF_F_GSO;
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
- sk->sk_route_caps &= ~sk->sk_route_nocaps;
+ if (unlikely(sk->sk_gso_disabled))
+ sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
if (sk_can_gso(sk)) {
if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
- sk->sk_gso_max_size = dst->dev->gso_max_size;
- max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
+ /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
+ sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
+ /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
+ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
}
}
sk->sk_gso_max_segs = max_segs;
@@ -3532,19 +3538,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
}
#ifdef CONFIG_PROC_FS
-#define PROTO_INUSE_NR 64 /* should be enough for the first time */
-struct prot_inuse {
- int val[PROTO_INUSE_NR];
-};
-
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
-void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
-{
- __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
-}
-EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
-
int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
int cpu, idx = prot->inuse_idx;
@@ -3557,17 +3552,12 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
-static void sock_inuse_add(struct net *net, int val)
-{
- this_cpu_add(*net->core.sock_inuse, val);
-}
-
int sock_inuse_get(struct net *net)
{
int cpu, res = 0;
for_each_possible_cpu(cpu)
- res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+ res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
return res;
}
@@ -3579,22 +3569,12 @@ static int __net_init sock_inuse_init_net(struct net *net)
net->core.prot_inuse = alloc_percpu(struct prot_inuse);
if (net->core.prot_inuse == NULL)
return -ENOMEM;
-
- net->core.sock_inuse = alloc_percpu(int);
- if (net->core.sock_inuse == NULL)
- goto out;
-
return 0;
-
-out:
- free_percpu(net->core.prot_inuse);
- return -ENOMEM;
}
static void __net_exit sock_inuse_exit_net(struct net *net)
{
free_percpu(net->core.prot_inuse);
- free_percpu(net->core.sock_inuse);
}
static struct pernet_operations net_inuse_ops = {
@@ -3640,9 +3620,6 @@ static inline void release_proto_idx(struct proto *prot)
{
}
-static void sock_inuse_add(struct net *net, int val)
-{
-}
#endif
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)