summaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/dev.c582
-rw-r--r--net/core/drop_monitor.c120
-rw-r--r--net/core/filter.c451
-rw-r--r--net/core/flow_dissector.c17
-rw-r--r--net/core/gro.c16
-rw-r--r--net/core/gro_cells.c36
-rw-r--r--net/core/link_watch.c6
-rw-r--r--net/core/neighbour.c6
-rw-r--r--net/core/net_namespace.c20
-rw-r--r--net/core/page_pool.c102
-rw-r--r--net/core/rtnetlink.c526
-rw-r--r--net/core/skbuff.c55
-rw-r--r--net/core/sock.c22
-rw-r--r--net/core/sock_map.c77
-rw-r--r--net/core/sysctl_net_core.c20
-rw-r--r--net/core/utils.c4
-rw-r--r--net/core/xdp.c78
17 files changed, 1718 insertions, 420 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 1baab07820f6..5db2443c2371 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -216,18 +216,38 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}
-static inline void rps_lock(struct softnet_data *sd)
+static inline void rps_lock_irqsave(struct softnet_data *sd,
+ unsigned long *flags)
{
-#ifdef CONFIG_RPS
- spin_lock(&sd->input_pkt_queue.lock);
-#endif
+ if (IS_ENABLED(CONFIG_RPS))
+ spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
+ else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_save(*flags);
}
-static inline void rps_unlock(struct softnet_data *sd)
+static inline void rps_lock_irq_disable(struct softnet_data *sd)
{
-#ifdef CONFIG_RPS
- spin_unlock(&sd->input_pkt_queue.lock);
-#endif
+ if (IS_ENABLED(CONFIG_RPS))
+ spin_lock_irq(&sd->input_pkt_queue.lock);
+ else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
+}
+
+static inline void rps_unlock_irq_restore(struct softnet_data *sd,
+ unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_RPS))
+ spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
+ else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(*flags);
+}
+
+static inline void rps_unlock_irq_enable(struct softnet_data *sd)
+{
+ if (IS_ENABLED(CONFIG_RPS))
+ spin_unlock_irq(&sd->input_pkt_queue.lock);
+ else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
}
static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
@@ -1037,7 +1057,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
/* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i);
if (!strncmp(buf, name_node->name, IFNAMSIZ))
- set_bit(i, inuse);
+ __set_bit(i, inuse);
}
if (!sscanf(d->name, name, &i))
continue;
@@ -1047,7 +1067,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
/* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i);
if (!strncmp(buf, d->name, IFNAMSIZ))
- set_bit(i, inuse);
+ __set_bit(i, inuse);
}
i = find_first_zero_bit(inuse, max_netdevices);
@@ -1602,7 +1622,8 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
- N(PRE_CHANGEADDR)
+ N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
+ N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
}
#undef N
return "UNKNOWN_NETDEV_EVENT";
@@ -1919,6 +1940,32 @@ static int call_netdevice_notifiers_info(unsigned long val,
return raw_notifier_call_chain(&netdev_chain, val, info);
}
+/**
+ * call_netdevice_notifiers_info_robust - call per-netns notifier blocks
+ * for and rollback on error
+ * @val_up: value passed unmodified to notifier function
+ * @val_down: value passed unmodified to the notifier function when
+ * recovering from an error on @val_up
+ * @info: notifier information data
+ *
+ * Call all per-netns network notifier blocks, but not notifier blocks on
+ * the global notifier chain. Parameters and return value are as for
+ * raw_notifier_call_chain_robust().
+ */
+
+static int
+call_netdevice_notifiers_info_robust(unsigned long val_up,
+ unsigned long val_down,
+ struct netdev_notifier_info *info)
+{
+ struct net *net = dev_net(info->dev);
+
+ ASSERT_RTNL();
+
+ return raw_notifier_call_chain_robust(&net->netdev_chain,
+ val_up, val_down, info);
+}
+
static int call_netdevice_notifiers_extack(unsigned long val,
struct net_device *dev,
struct netlink_ext_ack *extack)
@@ -2000,7 +2047,8 @@ void net_dec_egress_queue(void)
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
-static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+EXPORT_SYMBOL(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
@@ -2061,14 +2109,15 @@ EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
skb->tstamp = 0;
+ skb->mono_delivery_time = 0;
if (static_branch_unlikely(&netstamp_needed_key))
- __net_timestamp(skb);
+ skb->tstamp = ktime_get_real();
}
#define net_timestamp_check(COND, SKB) \
if (static_branch_unlikely(&netstamp_needed_key)) { \
if ((COND) && !(SKB)->tstamp) \
- __net_timestamp(SKB); \
+ (SKB)->tstamp = ktime_get_real(); \
} \
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
@@ -4456,11 +4505,11 @@ static void rps_trigger_softirq(void *data)
* If yes, queue it to our IPI list and return 1
* If no, return 0
*/
-static int rps_ipi_queued(struct softnet_data *sd)
+static int napi_schedule_rps(struct softnet_data *sd)
{
-#ifdef CONFIG_RPS
struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
+#ifdef CONFIG_RPS
if (sd != mysd) {
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;
@@ -4469,6 +4518,7 @@ static int rps_ipi_queued(struct softnet_data *sd)
return 1;
}
#endif /* CONFIG_RPS */
+ __napi_schedule_irqoff(&mysd->backlog);
return 0;
}
@@ -4525,9 +4575,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
sd = &per_cpu(softnet_data, cpu);
- local_irq_save(flags);
-
- rps_lock(sd);
+ rps_lock_irqsave(sd, &flags);
if (!netif_running(skb->dev))
goto drop;
qlen = skb_queue_len(&sd->input_pkt_queue);
@@ -4536,26 +4584,21 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
- rps_unlock(sd);
- local_irq_restore(flags);
+ rps_unlock_irq_restore(sd, &flags);
return NET_RX_SUCCESS;
}
/* Schedule NAPI for backlog device
* We can use non atomic operation since we own the queue lock
*/
- if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
- if (!rps_ipi_queued(sd))
- ____napi_schedule(sd, &sd->backlog);
- }
+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
+ napi_schedule_rps(sd);
goto enqueue;
}
drop:
sd->dropped++;
- rps_unlock(sd);
-
- local_irq_restore(flags);
+ rps_unlock_irq_restore(sd, &flags);
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
@@ -4796,7 +4839,6 @@ static int netif_rx_internal(struct sk_buff *skb)
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
- preempt_disable();
rcu_read_lock();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -4806,78 +4848,72 @@ static int netif_rx_internal(struct sk_buff *skb)
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
- preempt_enable();
} else
#endif
{
unsigned int qtail;
- ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
- put_cpu();
+ ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
}
return ret;
}
/**
+ * __netif_rx - Slightly optimized version of netif_rx
+ * @skb: buffer to post
+ *
+ * This behaves as netif_rx except that it does not disable bottom halves.
+ * As a result this function may only be invoked from the interrupt context
+ * (either hard or soft interrupt).
+ */
+int __netif_rx(struct sk_buff *skb)
+{
+ int ret;
+
+ lockdep_assert_once(hardirq_count() | softirq_count());
+
+ trace_netif_rx_entry(skb);
+ ret = netif_rx_internal(skb);
+ trace_netif_rx_exit(ret);
+ return ret;
+}
+EXPORT_SYMBOL(__netif_rx);
+
+/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
- * the upper (protocol) levels to process. It always succeeds. The buffer
- * may be dropped during processing for congestion control or by the
- * protocol layers.
+ * the upper (protocol) levels to process via the backlog NAPI device. It
+ * always succeeds. The buffer may be dropped during processing for
+ * congestion control or by the protocol layers.
+ * The network buffer is passed via the backlog NAPI device. Modern NIC
+ * driver should use NAPI and GRO.
+ * This function can used from interrupt and from process context. The
+ * caller from process context must not disable interrupts before invoking
+ * this function.
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped)
*
*/
-
int netif_rx(struct sk_buff *skb)
{
+ bool need_bh_off = !(hardirq_count() | softirq_count());
int ret;
+ if (need_bh_off)
+ local_bh_disable();
trace_netif_rx_entry(skb);
-
ret = netif_rx_internal(skb);
trace_netif_rx_exit(ret);
-
+ if (need_bh_off)
+ local_bh_enable();
return ret;
}
EXPORT_SYMBOL(netif_rx);
-int netif_rx_ni(struct sk_buff *skb)
-{
- int err;
-
- trace_netif_rx_ni_entry(skb);
-
- preempt_disable();
- err = netif_rx_internal(skb);
- if (local_softirq_pending())
- do_softirq();
- preempt_enable();
- trace_netif_rx_ni_exit(err);
-
- return err;
-}
-EXPORT_SYMBOL(netif_rx_ni);
-
-int netif_rx_any_context(struct sk_buff *skb)
-{
- /*
- * If invoked from contexts which do not invoke bottom half
- * processing either at return from interrupt or when softrqs are
- * reenabled, use netif_rx_ni() which invokes bottomhalf processing
- * directly.
- */
- if (in_interrupt())
- return netif_rx(skb);
- else
- return netif_rx_ni(skb);
-}
-EXPORT_SYMBOL(netif_rx_any_context);
-
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -5650,8 +5686,7 @@ static void flush_backlog(struct work_struct *work)
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
- local_irq_disable();
- rps_lock(sd);
+ rps_lock_irq_disable(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
@@ -5659,8 +5694,7 @@ static void flush_backlog(struct work_struct *work)
input_queue_head_incr(sd);
}
}
- rps_unlock(sd);
- local_irq_enable();
+ rps_unlock_irq_enable(sd);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
@@ -5678,16 +5712,14 @@ static bool flush_required(int cpu)
struct softnet_data *sd = &per_cpu(softnet_data, cpu);
bool do_flush;
- local_irq_disable();
- rps_lock(sd);
+ rps_lock_irq_disable(sd);
/* as insertion into process_queue happens with the rps lock held,
* process_queue access may race only with dequeue
*/
do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
!skb_queue_empty_lockless(&sd->process_queue);
- rps_unlock(sd);
- local_irq_enable();
+ rps_unlock_irq_enable(sd);
return do_flush;
#endif
@@ -5802,8 +5834,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
}
- local_irq_disable();
- rps_lock(sd);
+ rps_lock_irq_disable(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
@@ -5819,8 +5850,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
}
- rps_unlock(sd);
- local_irq_enable();
+ rps_unlock_irq_enable(sd);
}
return work;
@@ -7727,6 +7757,242 @@ void netdev_bonding_info_change(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_bonding_info_change);
+static int netdev_offload_xstats_enable_l3(struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
+ };
+ int err;
+ int rc;
+
+ dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
+ GFP_KERNEL);
+ if (!dev->offload_xstats_l3)
+ return -ENOMEM;
+
+ rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
+ NETDEV_OFFLOAD_XSTATS_DISABLE,
+ &info.info);
+ err = notifier_to_errno(rc);
+ if (err)
+ goto free_stats;
+
+ return 0;
+
+free_stats:
+ kfree(dev->offload_xstats_l3);
+ dev->offload_xstats_l3 = NULL;
+ return err;
+}
+
+int netdev_offload_xstats_enable(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct netlink_ext_ack *extack)
+{
+ ASSERT_RTNL();
+
+ if (netdev_offload_xstats_enabled(dev, type))
+ return -EALREADY;
+
+ switch (type) {
+ case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+ return netdev_offload_xstats_enable_l3(dev, extack);
+ }
+
+ WARN_ON(1);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_enable);
+
+static void netdev_offload_xstats_disable_l3(struct net_device *dev)
+{
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
+ };
+
+ call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
+ &info.info);
+ kfree(dev->offload_xstats_l3);
+ dev->offload_xstats_l3 = NULL;
+}
+
+int netdev_offload_xstats_disable(struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ ASSERT_RTNL();
+
+ if (!netdev_offload_xstats_enabled(dev, type))
+ return -EALREADY;
+
+ switch (type) {
+ case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+ netdev_offload_xstats_disable_l3(dev);
+ return 0;
+ }
+
+ WARN_ON(1);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_disable);
+
+static void netdev_offload_xstats_disable_all(struct net_device *dev)
+{
+ netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
+}
+
+static struct rtnl_hw_stats64 *
+netdev_offload_xstats_get_ptr(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ switch (type) {
+ case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+ return dev->offload_xstats_l3;
+ }
+
+ WARN_ON(1);
+ return NULL;
+}
+
+bool netdev_offload_xstats_enabled(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ ASSERT_RTNL();
+
+ return netdev_offload_xstats_get_ptr(dev, type);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_enabled);
+
+struct netdev_notifier_offload_xstats_ru {
+ bool used;
+};
+
+struct netdev_notifier_offload_xstats_rd {
+ struct rtnl_hw_stats64 stats;
+ bool used;
+};
+
+static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
+ const struct rtnl_hw_stats64 *src)
+{
+ dest->rx_packets += src->rx_packets;
+ dest->tx_packets += src->tx_packets;
+ dest->rx_bytes += src->rx_bytes;
+ dest->tx_bytes += src->tx_bytes;
+ dest->rx_errors += src->rx_errors;
+ dest->tx_errors += src->tx_errors;
+ dest->rx_dropped += src->rx_dropped;
+ dest->tx_dropped += src->tx_dropped;
+ dest->multicast += src->multicast;
+}
+
+static int netdev_offload_xstats_get_used(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ bool *p_used,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_offload_xstats_ru report_used = {};
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .type = type,
+ .report_used = &report_used,
+ };
+ int rc;
+
+ WARN_ON(!netdev_offload_xstats_enabled(dev, type));
+ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
+ &info.info);
+ *p_used = report_used.used;
+ return notifier_to_errno(rc);
+}
+
+static int netdev_offload_xstats_get_stats(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct rtnl_hw_stats64 *p_stats,
+ bool *p_used,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_offload_xstats_rd report_delta = {};
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .type = type,
+ .report_delta = &report_delta,
+ };
+ struct rtnl_hw_stats64 *stats;
+ int rc;
+
+ stats = netdev_offload_xstats_get_ptr(dev, type);
+ if (WARN_ON(!stats))
+ return -EINVAL;
+
+ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
+ &info.info);
+
+ /* Cache whatever we got, even if there was an error, otherwise the
+ * successful stats retrievals would get lost.
+ */
+ netdev_hw_stats64_add(stats, &report_delta.stats);
+
+ if (p_stats)
+ *p_stats = *stats;
+ *p_used = report_delta.used;
+
+ return notifier_to_errno(rc);
+}
+
+int netdev_offload_xstats_get(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct rtnl_hw_stats64 *p_stats, bool *p_used,
+ struct netlink_ext_ack *extack)
+{
+ ASSERT_RTNL();
+
+ if (p_stats)
+ return netdev_offload_xstats_get_stats(dev, type, p_stats,
+ p_used, extack);
+ else
+ return netdev_offload_xstats_get_used(dev, type, p_used,
+ extack);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_get);
+
+void
+netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
+ const struct rtnl_hw_stats64 *stats)
+{
+ report_delta->used = true;
+ netdev_hw_stats64_add(&report_delta->stats, stats);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
+
+void
+netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
+{
+ report_used->used = true;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_report_used);
+
+void netdev_offload_xstats_push_delta(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ const struct rtnl_hw_stats64 *p_stats)
+{
+ struct rtnl_hw_stats64 *stats;
+
+ ASSERT_RTNL();
+
+ stats = netdev_offload_xstats_get_ptr(dev, type);
+ if (WARN_ON(!stats))
+ return;
+
+ netdev_hw_stats64_add(stats, p_stats);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
+
/**
* netdev_get_xmit_slave - Get the xmit slave of master device
* @dev: device
@@ -9143,7 +9409,7 @@ DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
static void net_set_todo(struct net_device *dev)
{
list_add_tail(&dev->todo_list, &net_todo_list);
- dev_net(dev)->dev_unreg_count++;
+ atomic_inc(&dev_net(dev)->dev_unreg_count);
}
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -9683,8 +9949,10 @@ int register_netdevice(struct net_device *dev)
linkwatch_init_dev(dev);
dev_init_scheduler(dev);
- dev_hold(dev);
+
+ dev_hold_track(dev, &dev->dev_registered_tracker, GFP_KERNEL);
list_netdevice(dev);
+
add_device_randomness(dev->dev_addr, dev->addr_len);
/* If the device has permanent device address, driver should
@@ -9813,8 +10081,8 @@ int netdev_unregister_timeout_secs __read_mostly = 10;
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
- * netdev_wait_allrefs - wait until all references are gone.
- * @dev: target net_device
+ * netdev_wait_allrefs_any - wait until all references are gone.
+ * @list: list of net_devices to wait on
*
* This is called when unregistering network devices.
*
@@ -9824,37 +10092,42 @@ int netdev_unregister_timeout_secs __read_mostly = 10;
* We can get stuck here if buggy protocols don't correctly
* call dev_put.
*/
-static void netdev_wait_allrefs(struct net_device *dev)
+static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
{
unsigned long rebroadcast_time, warning_time;
- int wait = 0, refcnt;
-
- linkwatch_forget_dev(dev);
+ struct net_device *dev;
+ int wait = 0;
rebroadcast_time = warning_time = jiffies;
- refcnt = netdev_refcnt_read(dev);
- while (refcnt != 1) {
+ list_for_each_entry(dev, list, todo_list)
+ if (netdev_refcnt_read(dev) == 1)
+ return dev;
+
+ while (true) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
/* Rebroadcast unregister notification */
- call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ list_for_each_entry(dev, list, todo_list)
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
__rtnl_unlock();
rcu_barrier();
rtnl_lock();
- if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
- &dev->state)) {
- /* We must not have linkwatch events
- * pending on unregister. If this
- * happens, we simply run the queue
- * unscheduled, resulting in a noop
- * for this device.
- */
- linkwatch_run_queue();
- }
+ list_for_each_entry(dev, list, todo_list)
+ if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
+ &dev->state)) {
+ /* We must not have linkwatch events
+ * pending on unregister. If this
+ * happens, we simply run the queue
+ * unscheduled, resulting in a noop
+ * for this device.
+ */
+ linkwatch_run_queue();
+ break;
+ }
__rtnl_unlock();
@@ -9869,14 +10142,18 @@ static void netdev_wait_allrefs(struct net_device *dev)
wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
}
- refcnt = netdev_refcnt_read(dev);
+ list_for_each_entry(dev, list, todo_list)
+ if (netdev_refcnt_read(dev) == 1)
+ return dev;
- if (refcnt != 1 &&
- time_after(jiffies, warning_time +
+ if (time_after(jiffies, warning_time +
netdev_unregister_timeout_secs * HZ)) {
- pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
- dev->name, refcnt);
- ref_tracker_dir_print(&dev->refcnt_tracker, 10);
+ list_for_each_entry(dev, list, todo_list) {
+ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
+ dev->name, netdev_refcnt_read(dev));
+ ref_tracker_dir_print(&dev->refcnt_tracker, 10);
+ }
+
warning_time = jiffies;
}
}
@@ -9908,6 +10185,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
*/
void netdev_run_todo(void)
{
+ struct net_device *dev, *tmp;
struct list_head list;
#ifdef CONFIG_LOCKDEP
struct list_head unlink_list;
@@ -9928,26 +10206,24 @@ void netdev_run_todo(void)
__rtnl_unlock();
-
/* Wait for rcu callbacks to finish before next phase */
if (!list_empty(&list))
rcu_barrier();
- while (!list_empty(&list)) {
- struct net_device *dev
- = list_first_entry(&list, struct net_device, todo_list);
- list_del(&dev->todo_list);
-
+ list_for_each_entry_safe(dev, tmp, &list, todo_list) {
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
- pr_err("network todo '%s' but state %d\n",
- dev->name, dev->reg_state);
- dump_stack();
+ netdev_WARN(dev, "run_todo but not unregistering\n");
+ list_del(&dev->todo_list);
continue;
}
dev->reg_state = NETREG_UNREGISTERED;
+ linkwatch_forget_dev(dev);
+ }
- netdev_wait_allrefs(dev);
+ while (!list_empty(&list)) {
+ dev = netdev_wait_allrefs_any(&list);
+ list_del(&dev->todo_list);
/* paranoia */
BUG_ON(netdev_refcnt_read(dev) != 1);
@@ -9963,11 +10239,8 @@ void netdev_run_todo(void)
if (dev->needs_free_netdev)
free_netdev(dev);
- /* Report a network device has been unregistered */
- rtnl_lock();
- dev_net(dev)->dev_unreg_count--;
- __rtnl_unlock();
- wake_up(&netdev_unregistering_wq);
+ if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
+ wake_up(&netdev_unregistering_wq);
/* Free network device */
kobject_put(&dev->dev.kobj);
@@ -10172,7 +10445,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_dev;
- dev_hold(dev);
+ __dev_hold(dev);
#else
refcount_set(&dev->dev_refcnt, 1);
#endif
@@ -10409,6 +10682,8 @@ void unregister_netdevice_many(struct list_head *head)
dev_xdp_uninstall(dev);
+ netdev_offload_xstats_disable_all(dev);
+
/* Notify protocols, that we are about to destroy
* this device. They should clean all the things.
*/
@@ -10449,7 +10724,7 @@ void unregister_netdevice_many(struct list_head *head)
synchronize_net();
list_for_each_entry(dev, head, unreg_list) {
- dev_put(dev);
+ dev_put_track(dev, &dev->dev_registered_tracker);
net_set_todo(dev);
}
@@ -10732,8 +11007,7 @@ static int __net_init netdev_init(struct net *net)
BUILD_BUG_ON(GRO_HASH_BUCKETS >
8 * sizeof_field(struct napi_struct, gro_bitmask));
- if (net != &init_net)
- INIT_LIST_HEAD(&net->dev_base_head);
+ INIT_LIST_HEAD(&net->dev_base_head);
net->dev_name_head = netdev_create_hash();
if (net->dev_name_head == NULL)
@@ -10849,14 +11123,14 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
.exit = netdev_exit,
};
-static void __net_exit default_device_exit(struct net *net)
+static void __net_exit default_device_exit_net(struct net *net)
{
struct net_device *dev, *aux;
/*
* Push all migratable network devices back to the
* initial network namespace
*/
- rtnl_lock();
+ ASSERT_RTNL();
for_each_netdev_safe(net, dev, aux) {
int err;
char fb_name[IFNAMSIZ];
@@ -10880,35 +11154,6 @@ static void __net_exit default_device_exit(struct net *net)
BUG();
}
}
- rtnl_unlock();
-}
-
-static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
-{
- /* Return with the rtnl_lock held when there are no network
- * devices unregistering in any network namespace in net_list.
- */
- struct net *net;
- bool unregistering;
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
- add_wait_queue(&netdev_unregistering_wq, &wait);
- for (;;) {
- unregistering = false;
- rtnl_lock();
- list_for_each_entry(net, net_list, exit_list) {
- if (net->dev_unreg_count > 0) {
- unregistering = true;
- break;
- }
- }
- if (!unregistering)
- break;
- __rtnl_unlock();
-
- wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
- }
- remove_wait_queue(&netdev_unregistering_wq, &wait);
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
@@ -10922,18 +11167,12 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
struct net *net;
LIST_HEAD(dev_kill_list);
- /* To prevent network device cleanup code from dereferencing
- * loopback devices or network devices that have been freed
- * wait here for all pending unregistrations to complete,
- * before unregistring the loopback device and allowing the
- * network namespace be freed.
- *
- * The netdev todo list containing all network devices
- * unregistrations that happen in default_device_exit_batch
- * will run in the rtnl_unlock() at the end of
- * default_device_exit_batch.
- */
- rtnl_lock_unregistering(net_list);
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ default_device_exit_net(net);
+ cond_resched();
+ }
+
list_for_each_entry(net, net_list, exit_list) {
for_each_netdev_reverse(net, dev) {
if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
@@ -10947,7 +11186,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
}
static struct pernet_operations __net_initdata default_device_ops = {
- .exit = default_device_exit,
.exit_batch = default_device_exit_batch,
};
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index d5dc6be2522c..b89e3e95bffc 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -48,10 +48,22 @@
static int trace_state = TRACE_OFF;
static bool monitor_hw;
+#undef EM
+#undef EMe
+
+#define EM(a, b) [a] = #b,
+#define EMe(a, b) [a] = #b
+
+/* drop_reasons is used to translate 'enum skb_drop_reason' to string,
+ * which is reported to user space.
+ */
+static const char * const drop_reasons[] = {
+ TRACE_SKB_DROP_REASON
+};
+
/* net_dm_mutex
*
* An overall lock guarding every operation coming from userspace.
- * It also guards the global 'hw_stats_list' list.
*/
static DEFINE_MUTEX(net_dm_mutex);
@@ -87,11 +99,9 @@ struct per_cpu_dm_data {
};
struct dm_hw_stat_delta {
- struct net_device *dev;
unsigned long last_rx;
- struct list_head list;
- struct rcu_head rcu;
unsigned long last_drop_val;
+ struct rcu_head rcu;
};
static struct genl_family net_drop_monitor_family;
@@ -102,7 +112,6 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
static unsigned long dm_hw_check_delta = 2*HZ;
-static LIST_HEAD(hw_stats_list);
static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY;
static u32 net_dm_trunc_len;
@@ -126,6 +135,7 @@ struct net_dm_skb_cb {
struct devlink_trap_metadata *hw_metadata;
void *pc;
};
+ enum skb_drop_reason reason;
};
#define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0]))
@@ -273,33 +283,27 @@ static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb,
static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi,
int work, int budget)
{
- struct dm_hw_stat_delta *new_stat;
-
+ struct net_device *dev = napi->dev;
+ struct dm_hw_stat_delta *stat;
/*
* Don't check napi structures with no associated device
*/
- if (!napi->dev)
+ if (!dev)
return;
rcu_read_lock();
- list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
- struct net_device *dev;
-
+ stat = rcu_dereference(dev->dm_private);
+ if (stat) {
/*
* only add a note to our monitor buffer if:
- * 1) this is the dev we received on
- * 2) its after the last_rx delta
- * 3) our rx_dropped count has gone up
+ * 1) its after the last_rx delta
+ * 2) our rx_dropped count has gone up
*/
- /* Paired with WRITE_ONCE() in dropmon_net_event() */
- dev = READ_ONCE(new_stat->dev);
- if ((dev == napi->dev) &&
- (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
- (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) &&
+ (dev->stats.rx_dropped != stat->last_drop_val)) {
trace_drop_common(NULL, NULL);
- new_stat->last_drop_val = napi->dev->stats.rx_dropped;
- new_stat->last_rx = jiffies;
- break;
+ stat->last_drop_val = dev->stats.rx_dropped;
+ stat->last_rx = jiffies;
}
}
rcu_read_unlock();
@@ -502,6 +506,7 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
{
ktime_t tstamp = ktime_get_real();
struct per_cpu_dm_data *data;
+ struct net_dm_skb_cb *cb;
struct sk_buff *nskb;
unsigned long flags;
@@ -512,7 +517,11 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
if (!nskb)
return;
- NET_DM_SKB_CB(nskb)->pc = location;
+ if ((unsigned int)reason >= SKB_DROP_REASON_MAX)
+ reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ cb = NET_DM_SKB_CB(nskb);
+ cb->reason = reason;
+ cb->pc = location;
/* Override the timestamp because we care about the time when the
* packet was dropped.
*/
@@ -557,7 +566,8 @@ static size_t net_dm_in_port_size(void)
#define NET_DM_MAX_SYMBOL_LEN 40
-static size_t net_dm_packet_report_size(size_t payload_len)
+static size_t net_dm_packet_report_size(size_t payload_len,
+ enum skb_drop_reason reason)
{
size_t size;
@@ -578,6 +588,8 @@ static size_t net_dm_packet_report_size(size_t payload_len)
nla_total_size(sizeof(u32)) +
/* NET_DM_ATTR_PROTO */
nla_total_size(sizeof(u16)) +
+ /* NET_DM_ATTR_REASON */
+ nla_total_size(strlen(drop_reasons[reason]) + 1) +
/* NET_DM_ATTR_PAYLOAD */
nla_total_size(payload_len);
}
@@ -610,7 +622,7 @@ nla_put_failure:
static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
size_t payload_len)
{
- u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc;
+ struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb);
char buf[NET_DM_MAX_SYMBOL_LEN];
struct nlattr *attr;
void *hdr;
@@ -624,10 +636,15 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW))
goto nla_put_failure;
- if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD))
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc,
+ NET_DM_ATTR_PAD))
goto nla_put_failure;
- snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc);
+ if (nla_put_string(msg, NET_DM_ATTR_REASON,
+ drop_reasons[cb->reason]))
+ goto nla_put_failure;
+
+ snprintf(buf, sizeof(buf), "%pS", cb->pc);
if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
goto nla_put_failure;
@@ -683,7 +700,9 @@ static void net_dm_packet_report(struct sk_buff *skb)
if (net_dm_trunc_len)
payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
- msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL);
+ msg = nlmsg_new(net_dm_packet_report_size(payload_len,
+ NET_DM_SKB_CB(skb)->reason),
+ GFP_KERNEL);
if (!msg)
goto out;
@@ -1169,7 +1188,6 @@ err_module_put:
static void net_dm_trace_off_set(void)
{
- struct dm_hw_stat_delta *new_stat, *temp;
const struct net_dm_alert_ops *ops;
int cpu;
@@ -1193,13 +1211,6 @@ static void net_dm_trace_off_set(void)
consume_skb(skb);
}
- list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
- if (new_stat->dev == NULL) {
- list_del_rcu(&new_stat->list);
- kfree_rcu(new_stat, rcu);
- }
- }
-
module_put(THIS_MODULE);
}
@@ -1560,41 +1571,28 @@ static int dropmon_net_event(struct notifier_block *ev_block,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct dm_hw_stat_delta *new_stat = NULL;
- struct dm_hw_stat_delta *tmp;
+ struct dm_hw_stat_delta *stat;
switch (event) {
case NETDEV_REGISTER:
- new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+ if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private)))
+ break;
+ stat = kzalloc(sizeof(*stat), GFP_KERNEL);
+ if (!stat)
+ break;
- if (!new_stat)
- goto out;
+ stat->last_rx = jiffies;
+ rcu_assign_pointer(dev->dm_private, stat);
- new_stat->dev = dev;
- new_stat->last_rx = jiffies;
- mutex_lock(&net_dm_mutex);
- list_add_rcu(&new_stat->list, &hw_stats_list);
- mutex_unlock(&net_dm_mutex);
break;
case NETDEV_UNREGISTER:
- mutex_lock(&net_dm_mutex);
- list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
- if (new_stat->dev == dev) {
-
- /* Paired with READ_ONCE() in trace_napi_poll_hit() */
- WRITE_ONCE(new_stat->dev, NULL);
-
- if (trace_state == TRACE_OFF) {
- list_del_rcu(&new_stat->list);
- kfree_rcu(new_stat, rcu);
- break;
- }
- }
+ stat = rtnl_dereference(dev->dm_private);
+ if (stat) {
+ rcu_assign_pointer(dev->dm_private, NULL);
+ kfree_rcu(stat, rcu);
}
- mutex_unlock(&net_dm_mutex);
break;
}
-out:
return NOTIFY_DONE;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 9eb785842258..88767f7da150 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2107,7 +2107,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
}
skb->dev = dev;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
dev_xmit_recursion_inc();
ret = dev_queue_xmit(skb);
@@ -2176,7 +2176,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
}
skb->dev = dev;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
skb = skb_expand_head(skb, hh_len);
@@ -2274,7 +2274,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
}
skb->dev = dev;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
skb = skb_expand_head(skb, hh_len);
@@ -2603,7 +2603,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
* account for the headroom.
*/
bytes_sg_total = start - offset + bytes;
- if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len)
+ if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
goto out;
/* At this point we need to linearize multiple scatterlist
@@ -2812,7 +2812,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* Place newly allocated data buffer */
sk_mem_charge(msg->sk, len);
msg->sg.size += len;
- __clear_bit(new, &msg->sg.copy);
+ __clear_bit(new, msg->sg.copy);
sg_set_page(&msg->sg.data[new], page, len + copy, 0);
if (rsge.length) {
get_page(sg_page(&rsge));
@@ -3786,6 +3786,28 @@ static const struct bpf_func_proto sk_skb_change_head_proto = {
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
};
+
+BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
+{
+ return xdp_get_buff_len(xdp);
+}
+
+static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = {
+ .func = bpf_xdp_get_buff_len,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff)
+
+const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = {
+ .func = bpf_xdp_get_buff_len,
+ .gpl_only = false,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0],
+};
+
static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
return xdp_data_meta_unsupported(xdp) ? 0 :
@@ -3820,11 +3842,208 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.arg2_type = ARG_ANYTHING,
};
+static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+ void *buf, unsigned long len, bool flush)
+{
+ unsigned long ptr_len, ptr_off = 0;
+ skb_frag_t *next_frag, *end_frag;
+ struct skb_shared_info *sinfo;
+ void *src, *dst;
+ u8 *ptr_buf;
+
+ if (likely(xdp->data_end - xdp->data >= off + len)) {
+ src = flush ? buf : xdp->data + off;
+ dst = flush ? xdp->data + off : buf;
+ memcpy(dst, src, len);
+ return;
+ }
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ end_frag = &sinfo->frags[sinfo->nr_frags];
+ next_frag = &sinfo->frags[0];
+
+ ptr_len = xdp->data_end - xdp->data;
+ ptr_buf = xdp->data;
+
+ while (true) {
+ if (off < ptr_off + ptr_len) {
+ unsigned long copy_off = off - ptr_off;
+ unsigned long copy_len = min(len, ptr_len - copy_off);
+
+ src = flush ? buf : ptr_buf + copy_off;
+ dst = flush ? ptr_buf + copy_off : buf;
+ memcpy(dst, src, copy_len);
+
+ off += copy_len;
+ len -= copy_len;
+ buf += copy_len;
+ }
+
+ if (!len || next_frag == end_frag)
+ break;
+
+ ptr_off += ptr_len;
+ ptr_buf = skb_frag_address(next_frag);
+ ptr_len = skb_frag_size(next_frag);
+ next_frag++;
+ }
+}
+
+static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ u32 size = xdp->data_end - xdp->data;
+ void *addr = xdp->data;
+ int i;
+
+ if (unlikely(offset > 0xffff || len > 0xffff))
+ return ERR_PTR(-EFAULT);
+
+ if (offset + len > xdp_get_buff_len(xdp))
+ return ERR_PTR(-EINVAL);
+
+ if (offset < size) /* linear area */
+ goto out;
+
+ offset -= size;
+ for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */
+ u32 frag_size = skb_frag_size(&sinfo->frags[i]);
+
+ if (offset < frag_size) {
+ addr = skb_frag_address(&sinfo->frags[i]);
+ size = frag_size;
+ break;
+ }
+ offset -= frag_size;
+ }
+out:
+ return offset + len < size ? addr + offset : NULL;
+}
+
+BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset,
+ void *, buf, u32, len)
+{
+ void *ptr;
+
+ ptr = bpf_xdp_pointer(xdp, offset, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ if (!ptr)
+ bpf_xdp_copy_buf(xdp, offset, buf, len, false);
+ else
+ memcpy(buf, ptr, len);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
+ .func = bpf_xdp_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
+ void *, buf, u32, len)
+{
+ void *ptr;
+
+ ptr = bpf_xdp_pointer(xdp, offset, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ if (!ptr)
+ bpf_xdp_copy_buf(xdp, offset, buf, len, true);
+ else
+ memcpy(ptr, buf, len);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
+ .func = bpf_xdp_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
+ struct xdp_rxq_info *rxq = xdp->rxq;
+ unsigned int tailroom;
+
+ if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz)
+ return -EOPNOTSUPP;
+
+ tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);
+ if (unlikely(offset > tailroom))
+ return -EINVAL;
+
+ memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset);
+ skb_frag_size_add(frag, offset);
+ sinfo->xdp_frags_size += offset;
+
+ return 0;
+}
+
+static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ int i, n_frags_free = 0, len_free = 0;
+
+ if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN))
+ return -EINVAL;
+
+ for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) {
+ skb_frag_t *frag = &sinfo->frags[i];
+ int shrink = min_t(int, offset, skb_frag_size(frag));
+
+ len_free += shrink;
+ offset -= shrink;
+
+ if (skb_frag_size(frag) == shrink) {
+ struct page *page = skb_frag_page(frag);
+
+ __xdp_return(page_address(page), &xdp->rxq->mem,
+ false, NULL);
+ n_frags_free++;
+ } else {
+ skb_frag_size_sub(frag, shrink);
+ break;
+ }
+ }
+ sinfo->nr_frags -= n_frags_free;
+ sinfo->xdp_frags_size -= len_free;
+
+ if (unlikely(!sinfo->nr_frags)) {
+ xdp_buff_clear_frags_flag(xdp);
+ xdp->data_end -= offset;
+ }
+
+ return 0;
+}
+
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
void *data_end = xdp->data_end + offset;
+ if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */
+ if (offset < 0)
+ return bpf_xdp_frags_shrink_tail(xdp, -offset);
+
+ return bpf_xdp_frags_increase_tail(xdp, offset);
+ }
+
/* Notice that xdp_data_hard_end have reserved some tailroom */
if (unlikely(data_end > data_hard_end))
return -EINVAL;
@@ -4050,6 +4269,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
enum bpf_map_type map_type = ri->map_type;
+ /* XDP_REDIRECT is not fully supported yet for xdp frags since
+ * not all XDP capable drivers can map non-linear xdp_frame in
+ * ndo_xdp_xmit.
+ */
+ if (unlikely(xdp_buff_has_frags(xdp) &&
+ map_type != BPF_MAP_TYPE_CPUMAP))
+ return -EOPNOTSUPP;
+
if (map_type == BPF_MAP_TYPE_XSKMAP)
return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
@@ -4593,10 +4820,12 @@ static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
};
#endif
-static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
+static unsigned long bpf_xdp_copy(void *dst, const void *ctx,
unsigned long off, unsigned long len)
{
- memcpy(dst_buff, src_buff + off, len);
+ struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+
+ bpf_xdp_copy_buf(xdp, off, dst, len, false);
return 0;
}
@@ -4607,11 +4836,11 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
- if (unlikely(!xdp ||
- xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
+
+ if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp)))
return -EFAULT;
- return bpf_event_output(map, flags, meta, meta_size, xdp->data,
+ return bpf_event_output(map, flags, meta, meta_size, xdp,
xdp_size, bpf_xdp_copy);
}
@@ -4865,6 +5094,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
case SO_REUSEPORT:
sk->sk_reuseport = valbool;
break;
+ case SO_TXREHASH:
+ if (val < -1 || val > 1) {
+ ret = -EINVAL;
+ break;
+ }
+ sk->sk_txrehash = (u8)val;
+ break;
default:
ret = -EINVAL;
}
@@ -5043,6 +5279,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
case SO_REUSEPORT:
*((int *)optval) = sk->sk_reuseport;
break;
+ case SO_TXREHASH:
+ *((int *)optval) = sk->sk_txrehash;
+ break;
default:
goto err_clear;
}
@@ -7149,6 +7388,43 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_skb_set_delivery_time, struct sk_buff *, skb,
+ u64, dtime, u32, dtime_type)
+{
+ /* skb_clear_delivery_time() is done for inet protocol */
+ if (skb->protocol != htons(ETH_P_IP) &&
+ skb->protocol != htons(ETH_P_IPV6))
+ return -EOPNOTSUPP;
+
+ switch (dtime_type) {
+ case BPF_SKB_DELIVERY_TIME_MONO:
+ if (!dtime)
+ return -EINVAL;
+ skb->tstamp = dtime;
+ skb->mono_delivery_time = 1;
+ break;
+ case BPF_SKB_DELIVERY_TIME_NONE:
+ if (dtime)
+ return -EINVAL;
+ skb->tstamp = 0;
+ skb->mono_delivery_time = 0;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_delivery_time_proto = {
+ .func = bpf_skb_set_delivery_time,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
@@ -7510,6 +7786,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_tcp_gen_syncookie_proto;
case BPF_FUNC_sk_assign:
return &bpf_sk_assign_proto;
+ case BPF_FUNC_skb_set_delivery_time:
+ return &bpf_skb_set_delivery_time_proto;
#endif
default:
return bpf_sk_base_func_proto(func_id);
@@ -7536,6 +7814,12 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_redirect_map_proto;
case BPF_FUNC_xdp_adjust_tail:
return &bpf_xdp_adjust_tail_proto;
+ case BPF_FUNC_xdp_get_buff_len:
+ return &bpf_xdp_get_buff_len_proto;
+ case BPF_FUNC_xdp_load_bytes:
+ return &bpf_xdp_load_bytes_proto;
+ case BPF_FUNC_xdp_store_bytes:
+ return &bpf_xdp_store_bytes_proto;
case BPF_FUNC_fib_lookup:
return &bpf_xdp_fib_lookup_proto;
case BPF_FUNC_check_mtu:
@@ -7843,7 +8127,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
return false;
info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
break;
- case offsetofend(struct __sk_buff, gso_size) ... offsetof(struct __sk_buff, hwtstamp) - 1:
+ case offsetof(struct __sk_buff, delivery_time_type):
+ return false;
+ case offsetofend(struct __sk_buff, delivery_time_type) ... offsetof(struct __sk_buff, hwtstamp) - 1:
/* Explicitly prohibit access to padding in __sk_buff. */
return false;
default:
@@ -8033,6 +8319,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
const int size_default = sizeof(__u32);
+ int field_size;
if (off < 0 || off >= sizeof(struct bpf_sock))
return false;
@@ -8044,7 +8331,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
case offsetof(struct bpf_sock, family):
case offsetof(struct bpf_sock, type):
case offsetof(struct bpf_sock, protocol):
- case offsetof(struct bpf_sock, dst_port):
case offsetof(struct bpf_sock, src_port):
case offsetof(struct bpf_sock, rx_queue_mapping):
case bpf_ctx_range(struct bpf_sock, src_ip4):
@@ -8053,6 +8339,14 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
+ case bpf_ctx_range(struct bpf_sock, dst_port):
+ field_size = size == size_default ?
+ size_default : sizeof_field(struct bpf_sock, dst_port);
+ bpf_ctx_record_field_size(info, field_size);
+ return bpf_ctx_narrow_access_ok(off, size, field_size);
+ case offsetofend(struct bpf_sock, dst_port) ...
+ offsetof(struct bpf_sock, dst_ip4) - 1:
+ return false;
}
return size == size_default;
@@ -8190,6 +8484,15 @@ static bool tc_cls_act_is_valid_access(int off, int size,
break;
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
+ case offsetof(struct __sk_buff, delivery_time_type):
+ /* The convert_ctx_access() on reading and writing
+ * __sk_buff->tstamp depends on whether the bpf prog
+ * has used __sk_buff->delivery_time_type or not.
+ * Thus, we need to set prog->delivery_time_access
+ * earlier during is_valid_access() here.
+ */
+ ((struct bpf_prog *)prog)->delivery_time_access = 1;
+ return size == sizeof(__u8);
}
return bpf_skb_is_valid_access(off, size, type, prog, info);
@@ -8585,6 +8888,45 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->dst_reg;
+ __u8 skb_reg = si->src_reg;
+ __u8 tmp_reg = BPF_REG_AX;
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
+ SKB_MONO_DELIVERY_TIME_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
+ SKB_MONO_DELIVERY_TIME_MASK);
+ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2);
+ /* value_reg = BPF_SKB_DELIVERY_TIME_MONO */
+ *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_MONO);
+ *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 10 : 5);
+
+ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, skb_reg,
+ offsetof(struct sk_buff, tstamp));
+ *insn++ = BPF_JMP_IMM(BPF_JNE, tmp_reg, 0, 2);
+ /* value_reg = BPF_SKB_DELIVERY_TIME_NONE */
+ *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_NONE);
+ *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 6 : 1);
+
+#ifdef CONFIG_NET_CLS_ACT
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
+ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2);
+ /* At ingress, value_reg = 0 */
+ *insn++ = BPF_MOV32_IMM(value_reg, 0);
+ *insn++ = BPF_JMP_A(1);
+#endif
+
+ /* value_reg = BPF_SKB_DELIVERYT_TIME_UNSPEC */
+ *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_UNSPEC);
+
+ /* 15 insns with CONFIG_NET_CLS_ACT */
+ return insn;
+}
+
static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
struct bpf_insn *insn)
{
@@ -8606,6 +8948,71 @@ static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
return insn;
}
+static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->dst_reg;
+ __u8 skb_reg = si->src_reg;
+
+#ifdef CONFIG_NET_CLS_ACT
+ if (!prog->delivery_time_access) {
+ __u8 tmp_reg = BPF_REG_AX;
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
+ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5);
+ /* @ingress, read __sk_buff->tstamp as the (rcv) timestamp,
+ * so check the skb->mono_delivery_time.
+ */
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
+ SKB_MONO_DELIVERY_TIME_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
+ SKB_MONO_DELIVERY_TIME_MASK);
+ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2);
+ /* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. */
+ *insn++ = BPF_MOV64_IMM(value_reg, 0);
+ *insn++ = BPF_JMP_A(1);
+ }
+#endif
+
+ *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
+ offsetof(struct sk_buff, tstamp));
+ return insn;
+}
+
+static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->src_reg;
+ __u8 skb_reg = si->dst_reg;
+
+#ifdef CONFIG_NET_CLS_ACT
+ if (!prog->delivery_time_access) {
+ __u8 tmp_reg = BPF_REG_AX;
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
+ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3);
+ /* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp.
+ * Clear the skb->mono_delivery_time.
+ */
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
+ SKB_MONO_DELIVERY_TIME_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
+ ~SKB_MONO_DELIVERY_TIME_MASK);
+ *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg,
+ SKB_MONO_DELIVERY_TIME_OFFSET);
+ }
+#endif
+
+ /* skb->tstamp = tstamp */
+ *insn++ = BPF_STX_MEM(BPF_DW, skb_reg, value_reg,
+ offsetof(struct sk_buff, tstamp));
+ return insn;
+}
+
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -8914,17 +9321,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_DW,
- si->dst_reg, si->src_reg,
- bpf_target_off(struct sk_buff,
- tstamp, 8,
- target_size));
+ insn = bpf_convert_tstamp_write(prog, si, insn);
else
- *insn++ = BPF_LDX_MEM(BPF_DW,
- si->dst_reg, si->src_reg,
- bpf_target_off(struct sk_buff,
- tstamp, 8,
- target_size));
+ insn = bpf_convert_tstamp_read(prog, si, insn);
+ break;
+
+ case offsetof(struct __sk_buff, delivery_time_type):
+ insn = bpf_convert_dtime_type_read(si, insn);
break;
case offsetof(struct __sk_buff, gso_segs):
@@ -10065,7 +10468,6 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
.convert_ctx_access = tc_cls_act_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue,
.gen_ld_abs = bpf_gen_ld_abs,
- .check_kfunc_call = bpf_prog_test_check_kfunc_call,
};
const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -10604,7 +11006,8 @@ static bool sk_lookup_is_valid_access(int off, int size,
case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
- case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
+ case offsetof(struct bpf_sk_lookup, remote_port) ...
+ offsetof(struct bpf_sk_lookup, local_ip4) - 1:
case bpf_ctx_range(struct bpf_sk_lookup, local_port):
case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
bpf_ctx_record_field_size(info, sizeof(__u32));
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 15833e1d6ea1..34441a32e3be 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -22,6 +22,7 @@
#include <linux/ppp_defs.h>
#include <linux/stddef.h>
#include <linux/if_ether.h>
+#include <linux/if_hsr.h>
#include <linux/mpls.h>
#include <linux/tcp.h>
#include <linux/ptp_classify.h>
@@ -1282,6 +1283,22 @@ proto_again:
break;
}
+ case htons(ETH_P_HSR): {
+ struct hsr_tag *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen,
+ &_hdr);
+ if (!hdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ proto = hdr->encap_proto;
+ nhoff += HSR_HLEN;
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+ }
+
default:
fdret = FLOW_DISSECT_RET_OUT_BAD;
break;
diff --git a/net/core/gro.c b/net/core/gro.c
index a11b286d1495..ee5e7e889d8b 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -459,29 +459,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
skb_set_network_header(skb, skb_gro_offset(skb));
skb_reset_mac_len(skb);
- NAPI_GRO_CB(skb)->same_flow = 0;
+ BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32));
+ BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
+ sizeof(u32))); /* Avoid slow unaligned acc */
+ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
- NAPI_GRO_CB(skb)->free = 0;
- NAPI_GRO_CB(skb)->encap_mark = 0;
- NAPI_GRO_CB(skb)->recursion_counter = 0;
- NAPI_GRO_CB(skb)->is_fou = 0;
NAPI_GRO_CB(skb)->is_atomic = 1;
- NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
/* Setup for GRO checksum validation */
switch (skb->ip_summed) {
case CHECKSUM_COMPLETE:
NAPI_GRO_CB(skb)->csum = skb->csum;
NAPI_GRO_CB(skb)->csum_valid = 1;
- NAPI_GRO_CB(skb)->csum_cnt = 0;
break;
case CHECKSUM_UNNECESSARY:
NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
- NAPI_GRO_CB(skb)->csum_valid = 0;
break;
- default:
- NAPI_GRO_CB(skb)->csum_cnt = 0;
- NAPI_GRO_CB(skb)->csum_valid = 0;
}
pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
@@ -634,7 +627,6 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->encapsulation = 0;
skb_shinfo(skb)->gso_type = 0;
- skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
if (unlikely(skb->slow_gro)) {
skb_orphan(skb);
skb_ext_reset(skb);
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
index 6eb2e5ec2c50..8462f926ab45 100644
--- a/net/core/gro_cells.c
+++ b/net/core/gro_cells.c
@@ -89,8 +89,23 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
}
EXPORT_SYMBOL(gro_cells_init);
+struct percpu_free_defer {
+ struct rcu_head rcu;
+ void __percpu *ptr;
+};
+
+static void percpu_free_defer_callback(struct rcu_head *head)
+{
+ struct percpu_free_defer *defer;
+
+ defer = container_of(head, struct percpu_free_defer, rcu);
+ free_percpu(defer->ptr);
+ kfree(defer);
+}
+
void gro_cells_destroy(struct gro_cells *gcells)
{
+ struct percpu_free_defer *defer;
int i;
if (!gcells->cells)
@@ -102,12 +117,23 @@ void gro_cells_destroy(struct gro_cells *gcells)
__netif_napi_del(&cell->napi);
__skb_queue_purge(&cell->napi_skbs);
}
- /* This barrier is needed because netpoll could access dev->napi_list
- * under rcu protection.
+ /* We need to observe an rcu grace period before freeing ->cells,
+ * because netpoll could access dev->napi_list under rcu protection.
+ * Try hard using call_rcu() instead of synchronize_rcu(),
+ * because we might be called from cleanup_net(), and we
+ * definitely do not want to block this critical task.
*/
- synchronize_net();
-
- free_percpu(gcells->cells);
+ defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN);
+ if (likely(defer)) {
+ defer->ptr = gcells->cells;
+ call_rcu(&defer->rcu, percpu_free_defer_callback);
+ } else {
+ /* We do not hold RTNL at this point, synchronize_net()
+ * would not be able to expedite this sync.
+ */
+ synchronize_rcu_expedited();
+ free_percpu(gcells->cells);
+ }
gcells->cells = NULL;
}
EXPORT_SYMBOL(gro_cells_destroy);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index b0f5344d1185..95098d1a49bd 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -166,10 +166,10 @@ static void linkwatch_do_dev(struct net_device *dev)
netdev_state_change(dev);
}
- /* Note: our callers are responsible for
- * calling netdev_tracker_free().
+ /* Note: our callers are responsible for calling netdev_tracker_free().
+ * This is the reason we use __dev_put() instead of dev_put().
*/
- dev_put(dev);
+ __dev_put(dev);
}
static void __linkwatch_run_queue(int urgent_only)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index ec0bf737b076..f64ebd050f6c 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1171,7 +1171,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
neigh->updated = jiffies;
write_unlock_bh(&neigh->lock);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED);
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
@@ -1193,7 +1193,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
if (!buff)
break;
neigh->arp_queue_len_bytes -= buff->truesize;
- kfree_skb(buff);
+ kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL);
NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
skb_dst_force(skb);
@@ -1215,7 +1215,7 @@ out_dead:
if (neigh->nud_state & NUD_STALE)
goto out_unlock_bh;
write_unlock_bh(&neigh->lock);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD);
trace_neigh_event_send_dead(neigh, 1);
return 1;
}
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a5b5bb99c644..0ec2f5906a27 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -44,13 +44,7 @@ EXPORT_SYMBOL_GPL(net_rwsem);
static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
#endif
-struct net init_net = {
- .ns.count = REFCOUNT_INIT(1),
- .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
-#ifdef CONFIG_KEYS
- .key_domain = &init_net_key_domain,
-#endif
-};
+struct net init_net;
EXPORT_SYMBOL(init_net);
static bool init_net_initialized;
@@ -301,6 +295,7 @@ struct net *get_net_ns_by_id(const struct net *net, int id)
return peer;
}
+EXPORT_SYMBOL_GPL(get_net_ns_by_id);
/*
* setup_net runs the initializers for the network namespace object.
@@ -363,6 +358,8 @@ out_undo:
static int __net_init net_defaults_init_net(struct net *net)
{
net->core.sysctl_somaxconn = SOMAXCONN;
+ net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
+
return 0;
}
@@ -1084,7 +1081,7 @@ out:
rtnl_set_sk_err(net, RTNLGRP_NSID, err);
}
-static int __init net_ns_init(void)
+void __init net_ns_init(void)
{
struct net_generic *ng;
@@ -1105,6 +1102,9 @@ static int __init net_ns_init(void)
rcu_assign_pointer(init_net.gen, ng);
+#ifdef CONFIG_KEYS
+ init_net.key_domain = &init_net_key_domain;
+#endif
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
@@ -1119,12 +1119,8 @@ static int __init net_ns_init(void)
RTNL_FLAG_DOIT_UNLOCKED);
rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
RTNL_FLAG_DOIT_UNLOCKED);
-
- return 0;
}
-pure_initcall(net_ns_init);
-
static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list)
{
ops_pre_exit_list(ops, net_exit_list);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index bd62c01a2ec3..1943c0f0307d 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -26,6 +26,45 @@
#define BIAS_MAX LONG_MAX
+#ifdef CONFIG_PAGE_POOL_STATS
+/* alloc_stat_inc is intended to be used in softirq context */
+#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++)
+/* recycle_stat_inc is safe to use when preemption is possible. */
+#define recycle_stat_inc(pool, __stat) \
+ do { \
+ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+ this_cpu_inc(s->__stat); \
+ } while (0)
+
+bool page_pool_get_stats(struct page_pool *pool,
+ struct page_pool_stats *stats)
+{
+ int cpu = 0;
+
+ if (!stats)
+ return false;
+
+ memcpy(&stats->alloc_stats, &pool->alloc_stats, sizeof(pool->alloc_stats));
+
+ for_each_possible_cpu(cpu) {
+ const struct page_pool_recycle_stats *pcpu =
+ per_cpu_ptr(pool->recycle_stats, cpu);
+
+ stats->recycle_stats.cached += pcpu->cached;
+ stats->recycle_stats.cache_full += pcpu->cache_full;
+ stats->recycle_stats.ring += pcpu->ring;
+ stats->recycle_stats.ring_full += pcpu->ring_full;
+ stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(page_pool_get_stats);
+#else
+#define alloc_stat_inc(pool, __stat)
+#define recycle_stat_inc(pool, __stat)
+#endif
+
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params)
{
@@ -73,6 +112,12 @@ static int page_pool_init(struct page_pool *pool,
pool->p.flags & PP_FLAG_PAGE_FRAG)
return -EINVAL;
+#ifdef CONFIG_PAGE_POOL_STATS
+ pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
+ if (!pool->recycle_stats)
+ return -ENOMEM;
+#endif
+
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
return -ENOMEM;
@@ -117,8 +162,10 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
int pref_nid; /* preferred NUMA node */
/* Quicker fallback, avoid locks when ring is empty */
- if (__ptr_ring_empty(r))
+ if (__ptr_ring_empty(r)) {
+ alloc_stat_inc(pool, empty);
return NULL;
+ }
/* Softirq guarantee CPU and thus NUMA node is stable. This,
* assumes CPU refilling driver RX-ring will also run RX-NAPI.
@@ -145,14 +192,17 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
* This limit stress on page buddy alloactor.
*/
page_pool_return_page(pool, page);
+ alloc_stat_inc(pool, waive);
page = NULL;
break;
}
} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
/* Return last page */
- if (likely(pool->alloc.count > 0))
+ if (likely(pool->alloc.count > 0)) {
page = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, refill);
+ }
return page;
}
@@ -166,6 +216,7 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
if (likely(pool->alloc.count)) {
/* Fast-path */
page = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, fast);
} else {
page = page_pool_refill_alloc_cache(pool);
}
@@ -239,6 +290,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
return NULL;
}
+ alloc_stat_inc(pool, slow_high_order);
page_pool_set_pp_info(pool, page);
/* Track how many pages are held 'in-flight' */
@@ -293,10 +345,12 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
}
/* Return last page */
- if (likely(pool->alloc.count > 0))
+ if (likely(pool->alloc.count > 0)) {
page = pool->alloc.cache[--pool->alloc.count];
- else
+ alloc_stat_inc(pool, slow);
+ } else {
page = NULL;
+ }
/* When page just alloc'ed is should/must have refcnt 1. */
return page;
@@ -394,7 +448,12 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
else
ret = ptr_ring_produce_bh(&pool->ring, page);
- return (ret == 0) ? true : false;
+ if (!ret) {
+ recycle_stat_inc(pool, ring);
+ return true;
+ }
+
+ return false;
}
/* Only allow direct recycling in special circumstances, into the
@@ -405,11 +464,14 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
static bool page_pool_recycle_in_cache(struct page *page,
struct page_pool *pool)
{
- if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
+ if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
+ recycle_stat_inc(pool, cache_full);
return false;
+ }
/* Caller MUST have verified/know (page_ref_count(page) == 1) */
pool->alloc.cache[pool->alloc.count++] = page;
+ recycle_stat_inc(pool, cached);
return true;
}
@@ -423,11 +485,6 @@ static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
unsigned int dma_sync_size, bool allow_direct)
{
- /* It is not the last user for the page frag case */
- if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
- page_pool_atomic_sub_frag_count_return(page, 1))
- return NULL;
-
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but have fallbacks that act like the
* regular page allocator APIs.
@@ -464,6 +521,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
* doing refcnt based recycle tricks, meaning another process
* will be invoking put_page.
*/
+ recycle_stat_inc(pool, released_refcnt);
/* Do not replace this with page_pool_return_page() */
page_pool_release_page(pool, page);
put_page(page);
@@ -471,16 +529,17 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
return NULL;
}
-void page_pool_put_page(struct page_pool *pool, struct page *page,
- unsigned int dma_sync_size, bool allow_direct)
+void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
{
page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
if (page && !page_pool_recycle_in_ring(pool, page)) {
/* Cache full, fallback to free pages */
+ recycle_stat_inc(pool, ring_full);
page_pool_return_page(pool, page);
}
}
-EXPORT_SYMBOL(page_pool_put_page);
+EXPORT_SYMBOL(page_pool_put_defragged_page);
/* Caller must not use data area after call, as this function overwrites it */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
@@ -491,6 +550,10 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
for (i = 0; i < count; i++) {
struct page *page = virt_to_head_page(data[i]);
+ /* It is not the last user for the page frag case */
+ if (!page_pool_is_last_frag(pool, page))
+ continue;
+
page = __page_pool_put_page(pool, page, -1, false);
/* Approved for bulk recycling in ptr_ring cache */
if (page)
@@ -526,8 +589,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
long drain_count = BIAS_MAX - pool->frag_users;
/* Some user is still using the page frag */
- if (likely(page_pool_atomic_sub_frag_count_return(page,
- drain_count)))
+ if (likely(page_pool_defrag_page(page, drain_count)))
return NULL;
if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
@@ -548,8 +610,7 @@ static void page_pool_free_frag(struct page_pool *pool)
pool->frag_page = NULL;
- if (!page ||
- page_pool_atomic_sub_frag_count_return(page, drain_count))
+ if (!page || page_pool_defrag_page(page, drain_count))
return;
page_pool_return_page(pool, page);
@@ -588,7 +649,7 @@ frag_reset:
pool->frag_users = 1;
*offset = 0;
pool->frag_offset = size;
- page_pool_set_frag_count(page, BIAS_MAX);
+ page_pool_fragment_page(page, BIAS_MAX);
return page;
}
@@ -623,6 +684,9 @@ static void page_pool_free(struct page_pool *pool)
if (pool->p.flags & PP_FLAG_DMA_MAP)
put_device(pool->p.dev);
+#ifdef CONFIG_PAGE_POOL_STATS
+ free_percpu(pool->recycle_stats);
+#endif
kfree(pool);
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2fb8eb6791e8..a66b6761b88b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -459,7 +459,7 @@ static void rtnl_lock_unregistering_all(void)
* setup_net() and cleanup_net() are not possible.
*/
for_each_net(net) {
- if (net->dev_unreg_count > 0) {
+ if (atomic_read(&net->dev_unreg_count) > 0) {
unregistering = true;
break;
}
@@ -5048,82 +5048,256 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
(!idxattr || idxattr == attrid);
}
-#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1)
-static int rtnl_get_offload_stats_attr_size(int attr_id)
+static bool
+rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id)
{
- switch (attr_id) {
- case IFLA_OFFLOAD_XSTATS_CPU_HIT:
- return sizeof(struct rtnl_link_stats64);
- }
+ return dev->netdev_ops &&
+ dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats &&
+ dev->netdev_ops->ndo_has_offload_stats(dev, attr_id);
+}
- return 0;
+static unsigned int
+rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id)
+{
+ return rtnl_offload_xstats_have_ndo(dev, attr_id) ?
+ sizeof(struct rtnl_link_stats64) : 0;
}
-static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
- int *prividx)
+static int
+rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id,
+ struct sk_buff *skb)
{
+ unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id);
struct nlattr *attr = NULL;
- int attr_id, size;
void *attr_data;
int err;
- if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
- dev->netdev_ops->ndo_get_offload_stats))
+ if (!size)
return -ENODATA;
- for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
- attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
- if (attr_id < *prividx)
- continue;
+ attr = nla_reserve_64bit(skb, attr_id, size,
+ IFLA_OFFLOAD_XSTATS_UNSPEC);
+ if (!attr)
+ return -EMSGSIZE;
- size = rtnl_get_offload_stats_attr_size(attr_id);
- if (!size)
- continue;
+ attr_data = nla_data(attr);
+ memset(attr_data, 0, size);
- if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
- continue;
+ err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static unsigned int
+rtnl_offload_xstats_get_size_stats(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ bool enabled = netdev_offload_xstats_enabled(dev, type);
+
+ return enabled ? sizeof(struct rtnl_hw_stats64) : 0;
+}
+
+struct rtnl_offload_xstats_request_used {
+ bool request;
+ bool used;
+};
+
+static int
+rtnl_offload_xstats_get_stats(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct rtnl_offload_xstats_request_used *ru,
+ struct rtnl_hw_stats64 *stats,
+ struct netlink_ext_ack *extack)
+{
+ bool request;
+ bool used;
+ int err;
+
+ request = netdev_offload_xstats_enabled(dev, type);
+ if (!request) {
+ used = false;
+ goto out;
+ }
+
+ err = netdev_offload_xstats_get(dev, type, stats, &used, extack);
+ if (err)
+ return err;
+
+out:
+ if (ru) {
+ ru->request = request;
+ ru->used = used;
+ }
+ return 0;
+}
+
+static int
+rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id,
+ struct rtnl_offload_xstats_request_used *ru)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, attr_id);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int
+rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ struct rtnl_offload_xstats_request_used ru_l3;
+ struct nlattr *nest;
+ int err;
+
+ err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack);
+ if (err)
+ return err;
+
+ nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (rtnl_offload_xstats_fill_hw_s_info_one(skb,
+ IFLA_OFFLOAD_XSTATS_L3_STATS,
+ &ru_l3))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
- attr = nla_reserve_64bit(skb, attr_id, size,
+static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev,
+ int *prividx, u32 off_filter_mask,
+ struct netlink_ext_ack *extack)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO;
+ int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS;
+ int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT;
+ bool have_data = false;
+ int err;
+
+ if (*prividx <= attr_id_cpu_hit &&
+ (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) {
+ err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb);
+ if (!err) {
+ have_data = true;
+ } else if (err != -ENODATA) {
+ *prividx = attr_id_cpu_hit;
+ return err;
+ }
+ }
+
+ if (*prividx <= attr_id_hw_s_info &&
+ (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) {
+ *prividx = attr_id_hw_s_info;
+
+ err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack);
+ if (err)
+ return err;
+
+ have_data = true;
+ *prividx = 0;
+ }
+
+ if (*prividx <= attr_id_l3_stats &&
+ (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) {
+ unsigned int size_l3;
+ struct nlattr *attr;
+
+ *prividx = attr_id_l3_stats;
+
+ size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3);
+ attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3,
IFLA_OFFLOAD_XSTATS_UNSPEC);
if (!attr)
- goto nla_put_failure;
+ return -EMSGSIZE;
- attr_data = nla_data(attr);
- memset(attr_data, 0, size);
- err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev,
- attr_data);
+ err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL,
+ nla_data(attr), extack);
if (err)
- goto get_offload_stats_failure;
+ return err;
+
+ have_data = true;
+ *prividx = 0;
}
- if (!attr)
+ if (!have_data)
return -ENODATA;
*prividx = 0;
return 0;
+}
-nla_put_failure:
- err = -EMSGSIZE;
-get_offload_stats_failure:
- *prividx = attr_id;
- return err;
+static unsigned int
+rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ bool enabled = netdev_offload_xstats_enabled(dev, type);
+
+ return nla_total_size(0) +
+ /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */
+ nla_total_size(sizeof(u8)) +
+ /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */
+ (enabled ? nla_total_size(sizeof(u8)) : 0) +
+ 0;
+}
+
+static unsigned int
+rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+
+ return nla_total_size(0) +
+ /* IFLA_OFFLOAD_XSTATS_L3_STATS */
+ rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) +
+ 0;
}
-static int rtnl_get_offload_stats_size(const struct net_device *dev)
+static int rtnl_offload_xstats_get_size(const struct net_device *dev,
+ u32 off_filter_mask)
{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT;
int nla_size = 0;
- int attr_id;
int size;
- if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
- dev->netdev_ops->ndo_get_offload_stats))
- return 0;
+ if (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) {
+ size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit);
+ nla_size += nla_total_size_64bit(size);
+ }
- for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
- attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
- if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
- continue;
- size = rtnl_get_offload_stats_attr_size(attr_id);
+ if (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO))
+ nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev);
+
+ if (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) {
+ size = rtnl_offload_xstats_get_size_stats(dev, t_l3);
nla_size += nla_total_size_64bit(size);
}
@@ -5133,11 +5307,21 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev)
return nla_size;
}
+struct rtnl_stats_dump_filters {
+ /* mask[0] filters outer attributes. Then individual nests have their
+ * filtering mask at the index of the nested attribute.
+ */
+ u32 mask[IFLA_STATS_MAX + 1];
+};
+
static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
- unsigned int flags, unsigned int filter_mask,
- int *idxattr, int *prividx)
+ unsigned int flags,
+ const struct rtnl_stats_dump_filters *filters,
+ int *idxattr, int *prividx,
+ struct netlink_ext_ack *extack)
{
+ unsigned int filter_mask = filters->mask[0];
struct if_stats_msg *ifsm;
struct nlmsghdr *nlh;
struct nlattr *attr;
@@ -5163,8 +5347,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64,
sizeof(struct rtnl_link_stats64),
IFLA_STATS_UNSPEC);
- if (!attr)
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
sp = nla_data(attr);
dev_get_stats(dev, sp);
@@ -5177,8 +5363,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
*idxattr = IFLA_STATS_LINK_XSTATS;
attr = nla_nest_start_noflag(skb,
IFLA_STATS_LINK_XSTATS);
- if (!attr)
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
nla_nest_end(skb, attr);
@@ -5200,8 +5388,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
*idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
attr = nla_nest_start_noflag(skb,
IFLA_STATS_LINK_XSTATS_SLAVE);
- if (!attr)
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
nla_nest_end(skb, attr);
@@ -5213,13 +5403,19 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
*idxattr)) {
+ u32 off_filter_mask;
+
+ off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS];
*idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
attr = nla_nest_start_noflag(skb,
IFLA_STATS_LINK_OFFLOAD_XSTATS);
- if (!attr)
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
- err = rtnl_get_offload_stats(skb, dev, prividx);
+ err = rtnl_offload_xstats_fill(skb, dev, prividx,
+ off_filter_mask, extack);
if (err == -ENODATA)
nla_nest_cancel(skb, attr);
else
@@ -5235,8 +5431,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
*idxattr = IFLA_STATS_AF_SPEC;
attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC);
- if (!attr)
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
rcu_read_lock();
list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
@@ -5280,13 +5478,14 @@ nla_put_failure:
else
nlmsg_end(skb, nlh);
- return -EMSGSIZE;
+ return err;
}
static size_t if_nlmsg_stats_size(const struct net_device *dev,
- u32 filter_mask)
+ const struct rtnl_stats_dump_filters *filters)
{
size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg));
+ unsigned int filter_mask = filters->mask[0];
if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0))
size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
@@ -5322,8 +5521,12 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
}
}
- if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
- size += rtnl_get_offload_stats_size(dev);
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) {
+ u32 off_filter_mask;
+
+ off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS];
+ size += rtnl_offload_xstats_get_size(dev, off_filter_mask);
+ }
if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) {
struct rtnl_af_ops *af_ops;
@@ -5347,6 +5550,79 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
return size;
}
+#define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1)
+
+static const struct nla_policy
+rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = {
+ [IFLA_STATS_LINK_OFFLOAD_XSTATS] =
+ NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID),
+};
+
+static const struct nla_policy
+rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = {
+ [IFLA_STATS_GET_FILTERS] =
+ NLA_POLICY_NESTED(rtnl_stats_get_policy_filters),
+};
+
+static const struct nla_policy
+ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = {
+ [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1),
+};
+
+static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters,
+ struct rtnl_stats_dump_filters *filters,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_STATS_MAX + 1];
+ int err;
+ int at;
+
+ err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters,
+ rtnl_stats_get_policy_filters, extack);
+ if (err < 0)
+ return err;
+
+ for (at = 1; at <= IFLA_STATS_MAX; at++) {
+ if (tb[at]) {
+ if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) {
+ NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask");
+ return -EINVAL;
+ }
+ filters->mask[at] = nla_get_u32(tb[at]);
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_stats_get_parse(const struct nlmsghdr *nlh,
+ u32 filter_mask,
+ struct rtnl_stats_dump_filters *filters,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1];
+ int err;
+ int i;
+
+ filters->mask[0] = filter_mask;
+ for (i = 1; i < ARRAY_SIZE(filters->mask); i++)
+ filters->mask[i] = -1U;
+
+ err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb,
+ IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_STATS_GET_FILTERS]) {
+ err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS],
+ filters, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
bool is_dump, struct netlink_ext_ack *extack)
{
@@ -5369,10 +5645,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request");
return -EINVAL;
}
- if (nlmsg_attrlen(nlh, sizeof(*ifsm))) {
- NL_SET_ERR_MSG(extack, "Invalid attributes after stats header");
- return -EINVAL;
- }
if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) {
NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask");
return -EINVAL;
@@ -5384,12 +5656,12 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ struct rtnl_stats_dump_filters filters;
struct net *net = sock_net(skb->sk);
struct net_device *dev = NULL;
int idxattr = 0, prividx = 0;
struct if_stats_msg *ifsm;
struct sk_buff *nskb;
- u32 filter_mask;
int err;
err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb),
@@ -5406,17 +5678,22 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!dev)
return -ENODEV;
- filter_mask = ifsm->filter_mask;
- if (!filter_mask)
+ if (!ifsm->filter_mask) {
+ NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get");
return -EINVAL;
+ }
+
+ err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack);
+ if (err)
+ return err;
- nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL);
+ nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL);
if (!nskb)
return -ENOBUFS;
err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS,
NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
- 0, filter_mask, &idxattr, &prividx);
+ 0, &filters, &idxattr, &prividx, extack);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_stats_size */
WARN_ON(err == -EMSGSIZE);
@@ -5432,12 +5709,12 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct netlink_ext_ack *extack = cb->extack;
int h, s_h, err, s_idx, s_idxattr, s_prividx;
+ struct rtnl_stats_dump_filters filters;
struct net *net = sock_net(skb->sk);
unsigned int flags = NLM_F_MULTI;
struct if_stats_msg *ifsm;
struct hlist_head *head;
struct net_device *dev;
- u32 filter_mask = 0;
int idx = 0;
s_h = cb->args[0];
@@ -5452,12 +5729,16 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
return err;
ifsm = nlmsg_data(cb->nlh);
- filter_mask = ifsm->filter_mask;
- if (!filter_mask) {
+ if (!ifsm->filter_mask) {
NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump");
return -EINVAL;
}
+ err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters,
+ extack);
+ if (err)
+ return err;
+
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0;
head = &net->dev_index_head[h];
@@ -5467,8 +5748,9 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, 0,
- flags, filter_mask,
- &s_idxattr, &s_prividx);
+ flags, &filters,
+ &s_idxattr, &s_prividx,
+ extack);
/* If we ran out of room on the first message,
* we're in trouble
*/
@@ -5492,6 +5774,107 @@ out:
return skb->len;
}
+void rtnl_offload_xstats_notify(struct net_device *dev)
+{
+ struct rtnl_stats_dump_filters response_filters = {};
+ struct net *net = dev_net(dev);
+ int idxattr = 0, prividx = 0;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ ASSERT_RTNL();
+
+ response_filters.mask[0] |=
+ IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |=
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO);
+
+ skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters),
+ GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0,
+ &response_filters, &idxattr, &prividx, NULL);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL);
+ return;
+
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_STATS, err);
+}
+EXPORT_SYMBOL(rtnl_offload_xstats_notify);
+
+static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ struct rtnl_stats_dump_filters response_filters = {};
+ struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev = NULL;
+ struct if_stats_msg *ifsm;
+ bool notify = false;
+ int err;
+
+ err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb),
+ false, extack);
+ if (err)
+ return err;
+
+ ifsm = nlmsg_data(nlh);
+ if (ifsm->family != AF_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC");
+ return -EINVAL;
+ }
+
+ if (ifsm->ifindex > 0)
+ dev = __dev_get_by_index(net, ifsm->ifindex);
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (ifsm->filter_mask) {
+ NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX,
+ ifla_stats_set_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) {
+ u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]);
+
+ if (req)
+ err = netdev_offload_xstats_enable(dev, t_l3, extack);
+ else
+ err = netdev_offload_xstats_disable(dev, t_l3);
+
+ if (!err)
+ notify = true;
+ else if (err != -EALREADY)
+ return err;
+
+ response_filters.mask[0] |=
+ IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |=
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO);
+ }
+
+ if (notify)
+ rtnl_offload_xstats_notify(dev);
+
+ return 0;
+}
+
/* Process one rtnetlink message. */
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -5717,4 +6100,5 @@ void __init rtnetlink_init(void)
rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump,
0);
+ rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0);
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ea51e23e9247..23f3ba343661 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -201,7 +201,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data,
skb->head = data;
skb->data = data;
skb_reset_tail_pointer(skb);
- skb->end = skb->tail + size;
+ skb_set_end_offset(skb, size);
skb->mac_header = (typeof(skb->mac_header))~0U;
skb->transport_header = (typeof(skb->transport_header))~0U;
@@ -1736,11 +1736,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb->head = data;
skb->head_frag = 0;
skb->data += off;
+
+ skb_set_end_offset(skb, size);
#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->end = size;
off = nhead;
-#else
- skb->end = skb->head + size;
#endif
skb->tail += off;
skb_headers_offset_update(skb, nhead);
@@ -1788,6 +1787,38 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
}
EXPORT_SYMBOL(skb_realloc_headroom);
+int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
+{
+ unsigned int saved_end_offset, saved_truesize;
+ struct skb_shared_info *shinfo;
+ int res;
+
+ saved_end_offset = skb_end_offset(skb);
+ saved_truesize = skb->truesize;
+
+ res = pskb_expand_head(skb, 0, 0, pri);
+ if (res)
+ return res;
+
+ skb->truesize = saved_truesize;
+
+ if (likely(skb_end_offset(skb) == saved_end_offset))
+ return 0;
+
+ shinfo = skb_shinfo(skb);
+
+ /* We are about to change back skb->end,
+ * we need to move skb_shinfo() to its new location.
+ */
+ memmove(skb->head + saved_end_offset,
+ shinfo,
+ offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));
+
+ skb_set_end_offset(skb, saved_end_offset);
+
+ return 0;
+}
+
/**
* skb_expand_head - reallocate header of &sk_buff
* @skb: buffer to reallocate
@@ -4820,7 +4851,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (hwtstamps)
*skb_hwtstamps(skb) = *hwtstamps;
else
- skb->tstamp = ktime_get_real();
+ __net_timestamp(skb);
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
@@ -5350,7 +5381,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
ipvs_reset(skb);
skb->mark = 0;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);
@@ -6044,11 +6075,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb->head = data;
skb->data = data;
skb->head_frag = 0;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->end = size;
-#else
- skb->end = skb->head + size;
-#endif
+ skb_set_end_offset(skb, size);
skb_set_tail_pointer(skb, skb_headlen(skb));
skb_headers_offset_update(skb, 0);
skb->cloned = 0;
@@ -6186,11 +6213,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb->head = data;
skb->head_frag = 0;
skb->data = data;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->end = size;
-#else
- skb->end = skb->head + size;
-#endif
+ skb_set_end_offset(skb, size);
skb_reset_tail_pointer(skb);
skb_headers_offset_update(skb, 0);
skb->cloned = 0;
diff --git a/net/core/sock.c b/net/core/sock.c
index 6eb174805bf0..784c92eaded8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1447,6 +1447,15 @@ set_sndbuf:
break;
}
+ case SO_TXREHASH:
+ if (val < -1 || val > 1) {
+ ret = -EINVAL;
+ break;
+ }
+ /* Paired with READ_ONCE() in tcp_rtx_synack() */
+ WRITE_ONCE(sk->sk_txrehash, (u8)val);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -1834,6 +1843,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_reserved_mem;
break;
+ case SO_TXREHASH:
+ v.val = sk->sk_txrehash;
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -2266,6 +2279,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
+ sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
}
@@ -2611,7 +2625,8 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
switch (cmsg->cmsg_type) {
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
return -EINVAL;
@@ -3278,6 +3293,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_pacing_rate = ~0UL;
WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
+ sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
sk_rx_queue_clear(sk);
/*
@@ -3702,6 +3718,10 @@ int proto_register(struct proto *prot, int alloc_slab)
{
int ret = -ENOBUFS;
+ if (prot->memory_allocated && !prot->sysctl_mem) {
+ pr_err("%s: missing sysctl_mem\n", prot->name);
+ return -EINVAL;
+ }
if (alloc_slab) {
prot->slab = kmem_cache_create_usercopy(prot->name,
prot->obj_size, 0,
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 1827669eedd6..2d213c4011db 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1416,38 +1416,50 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
return NULL;
}
-static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
- struct bpf_prog *old, u32 which)
+static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
+ u32 which)
{
struct sk_psock_progs *progs = sock_map_progs(map);
- struct bpf_prog **pprog;
if (!progs)
return -EOPNOTSUPP;
switch (which) {
case BPF_SK_MSG_VERDICT:
- pprog = &progs->msg_parser;
+ *pprog = &progs->msg_parser;
break;
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
case BPF_SK_SKB_STREAM_PARSER:
- pprog = &progs->stream_parser;
+ *pprog = &progs->stream_parser;
break;
#endif
case BPF_SK_SKB_STREAM_VERDICT:
if (progs->skb_verdict)
return -EBUSY;
- pprog = &progs->stream_verdict;
+ *pprog = &progs->stream_verdict;
break;
case BPF_SK_SKB_VERDICT:
if (progs->stream_verdict)
return -EBUSY;
- pprog = &progs->skb_verdict;
+ *pprog = &progs->skb_verdict;
break;
default:
return -EOPNOTSUPP;
}
+ return 0;
+}
+
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, u32 which)
+{
+ struct bpf_prog **pprog;
+ int ret;
+
+ ret = sock_map_prog_lookup(map, &pprog, which);
+ if (ret)
+ return ret;
+
if (old)
return psock_replace_prog(pprog, prog, old);
@@ -1455,6 +1467,57 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
return 0;
}
+int sock_map_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd;
+ struct bpf_prog **pprog;
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ struct fd f;
+ u32 id = 0;
+ int ret;
+
+ if (attr->query.query_flags)
+ return -EINVAL;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ rcu_read_lock();
+
+ ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type);
+ if (ret)
+ goto end;
+
+ prog = *pprog;
+ prog_cnt = !prog ? 0 : 1;
+
+ if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
+ goto end;
+
+ /* we do not hold the refcnt, the bpf prog may be released
+ * asynchronously and the id would be set to 0.
+ */
+ id = data_race(prog->aux->id);
+ if (id == 0)
+ prog_cnt = 0;
+
+end:
+ rcu_read_unlock();
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) ||
+ (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) ||
+ copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
+ ret = -EFAULT;
+
+ fdput(f);
+ return ret;
+}
+
static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link)
{
switch (link->map->map_type) {
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 7b4d485aac7a..7123fe7feeac 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -103,8 +103,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
if (orig_sock_table) {
static_branch_dec(&rps_needed);
static_branch_dec(&rfs_needed);
- synchronize_rcu();
- vfree(orig_sock_table);
+ kvfree_rcu(orig_sock_table);
}
}
}
@@ -142,8 +141,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
lockdep_is_held(&flow_limit_update_mutex));
if (cur && !cpumask_test_cpu(i, mask)) {
RCU_INIT_POINTER(sd->flow_limit, NULL);
- synchronize_rcu();
- kfree(cur);
+ kfree_rcu(cur);
} else if (!cur && cpumask_test_cpu(i, mask)) {
cur = kzalloc_node(len, GFP_KERNEL,
cpu_to_node(i));
@@ -593,6 +591,15 @@ static struct ctl_table netns_core_table[] = {
.extra1 = SYSCTL_ZERO,
.proc_handler = proc_dointvec_minmax
},
+ {
+ .procname = "txrehash",
+ .data = &init_net.core.sysctl_txrehash,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ .proc_handler = proc_dou8vec_minmax,
+ },
{ }
};
@@ -611,7 +618,7 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
static __net_init int sysctl_core_net_init(struct net *net)
{
- struct ctl_table *tbl;
+ struct ctl_table *tbl, *tmp;
tbl = netns_core_table;
if (!net_eq(net, &init_net)) {
@@ -619,7 +626,8 @@ static __net_init int sysctl_core_net_init(struct net *net)
if (tbl == NULL)
goto err_dup;
- tbl[0].data = &net->core.sysctl_somaxconn;
+ for (tmp = tbl; tmp->procname; tmp++)
+ tmp->data += (char *)net - (char *)&init_net;
/* Don't export any sysctls to unprivileged users */
if (net->user_ns != &init_user_ns) {
diff --git a/net/core/utils.c b/net/core/utils.c
index 1f31a39236d5..938495bc1d34 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -476,9 +476,9 @@ void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
__wsum diff, bool pseudohdr)
{
if (skb->ip_summed != CHECKSUM_PARTIAL) {
- *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum)));
+ csum_replace_by_diff(sum, diff);
if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
- skb->csum = ~csum_add(diff, ~skb->csum);
+ skb->csum = ~csum_sub(diff, skb->csum);
} else if (pseudohdr) {
*sum = ~csum_fold(csum_add(diff, csum_unfold(*sum)));
}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 7aba35504986..361df312ee7f 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -162,8 +162,9 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
}
/* Returns 0 on success, negative on failure */
-int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
- struct net_device *dev, u32 queue_index, unsigned int napi_id)
+int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+ struct net_device *dev, u32 queue_index,
+ unsigned int napi_id, u32 frag_size)
{
if (!dev) {
WARN(1, "Missing net_device from driver");
@@ -185,11 +186,12 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
xdp_rxq->dev = dev;
xdp_rxq->queue_index = queue_index;
xdp_rxq->napi_id = napi_id;
+ xdp_rxq->frag_size = frag_size;
xdp_rxq->reg_state = REG_STATE_REGISTERED;
return 0;
}
-EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
+EXPORT_SYMBOL_GPL(__xdp_rxq_info_reg);
void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
{
@@ -369,8 +371,8 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
* is used for those calls sites. Thus, allowing for faster recycling
* of xdp_frames/pages in those cases.
*/
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
- struct xdp_buff *xdp)
+void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+ struct xdp_buff *xdp)
{
struct xdp_mem_allocator *xa;
struct page *page;
@@ -406,12 +408,38 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
void xdp_return_frame(struct xdp_frame *xdpf)
{
+ struct skb_shared_info *sinfo;
+ int i;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_return(page_address(page), &xdpf->mem, false, NULL);
+ }
+out:
__xdp_return(xdpf->data, &xdpf->mem, false, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
+ struct skb_shared_info *sinfo;
+ int i;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_return(page_address(page), &xdpf->mem, true, NULL);
+ }
+out:
__xdp_return(xdpf->data, &xdpf->mem, true, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
@@ -447,7 +475,7 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf,
struct xdp_mem_allocator *xa;
if (mem->type != MEM_TYPE_PAGE_POOL) {
- __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
+ xdp_return_frame(xdpf);
return;
}
@@ -466,12 +494,38 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf,
bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
}
+ if (unlikely(xdp_frame_has_frags(xdpf))) {
+ struct skb_shared_info *sinfo;
+ int i;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+
+ bq->q[bq->count++] = skb_frag_address(frag);
+ if (bq->count == XDP_BULK_QUEUE_SIZE)
+ xdp_flush_frame_bulk(bq);
+ }
+ }
bq->q[bq->count++] = xdpf->data;
}
EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
void xdp_return_buff(struct xdp_buff *xdp)
{
+ struct skb_shared_info *sinfo;
+ int i;
+
+ if (likely(!xdp_buff_has_frags(xdp)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp);
+ }
+out:
__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
}
@@ -561,8 +615,14 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct sk_buff *skb,
struct net_device *dev)
{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf);
unsigned int headroom, frame_size;
void *hard_start;
+ u8 nr_frags;
+
+ /* xdp frags frame */
+ if (unlikely(xdp_frame_has_frags(xdpf)))
+ nr_frags = sinfo->nr_frags;
/* Part of headroom was reserved to xdpf */
headroom = sizeof(*xdpf) + xdpf->headroom;
@@ -582,6 +642,12 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
if (xdpf->metasize)
skb_metadata_set(skb, xdpf->metasize);
+ if (unlikely(xdp_frame_has_frags(xdpf)))
+ xdp_update_skb_shared_info(skb, nr_frags,
+ sinfo->xdp_frags_size,
+ nr_frags * xdpf->frame_sz,
+ xdp_frame_is_frag_pfmemalloc(xdpf));
+
/* Essential SKB info: protocol and skb->dev */
skb->protocol = eth_type_trans(skb, dev);