Diffstat (limited to 'net/core')
-rw-r--r--  net/core/datagram.c          |  58
-rw-r--r--  net/core/dev.c               | 424
-rw-r--r--  net/core/dst.c               |   4
-rw-r--r--  net/core/ethtool.c           |  15
-rw-r--r--  net/core/fib_rules.c         |  25
-rw-r--r--  net/core/filter.c            | 678
-rw-r--r--  net/core/flow_dissector.c    | 658
-rw-r--r--  net/core/gen_estimator.c     |  13
-rw-r--r--  net/core/link_watch.c        |   4
-rw-r--r--  net/core/neighbour.c         | 128
-rw-r--r--  net/core/net-sysfs.c         | 135
-rw-r--r--  net/core/net_namespace.c     | 188
-rw-r--r--  net/core/netclassid_cgroup.c |   3
-rw-r--r--  net/core/netevent.c          |   5
-rw-r--r--  net/core/pktgen.c            | 121
-rw-r--r--  net/core/request_sock.c      |  45
-rw-r--r--  net/core/rtnetlink.c         | 391
-rw-r--r--  net/core/secure_seq.c        |   2
-rw-r--r--  net/core/skbuff.c            | 474
-rw-r--r--  net/core/sock.c              | 186
-rw-r--r--  net/core/sock_diag.c         | 122
-rw-r--r--  net/core/stream.c            |   6
-rw-r--r--  net/core/sysctl_net_core.c   |   2
-rw-r--r--  net/core/utils.c             |  12
24 files changed, 2635 insertions, 1064 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c
index df493d68330c..4967262b2707 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -131,6 +131,35 @@ out_noerr:
goto out;
}
+static int skb_set_peeked(struct sk_buff *skb)
+{
+ struct sk_buff *nskb;
+
+ if (skb->peeked)
+ return 0;
+
+ /* We have to unshare an skb before modifying it. */
+ if (!skb_shared(skb))
+ goto done;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ skb->prev->next = nskb;
+ skb->next->prev = nskb;
+ nskb->prev = skb->prev;
+ nskb->next = skb->next;
+
+ consume_skb(skb);
+ skb = nskb;
+
+done:
+ skb->peeked = 1;
+
+ return 0;
+}
+
/**
* __skb_recv_datagram - Receive a datagram skbuff
* @sk: socket
@@ -165,7 +194,9 @@ out_noerr:
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
int *peeked, int *off, int *err)
{
+ struct sk_buff_head *queue = &sk->sk_receive_queue;
struct sk_buff *skb, *last;
+ unsigned long cpu_flags;
long timeo;
/*
* Caller is allowed not to check sk->sk_err before skb_recv_datagram()
@@ -184,8 +215,6 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
* Look at current nfs client by the way...
* However, this function was correct in any case. 8)
*/
- unsigned long cpu_flags;
- struct sk_buff_head *queue = &sk->sk_receive_queue;
int _off = *off;
last = (struct sk_buff *)queue;
@@ -199,7 +228,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
_off -= skb->len;
continue;
}
- skb->peeked = 1;
+
+ error = skb_set_peeked(skb);
+ if (error)
+ goto unlock_err;
+
atomic_inc(&skb->users);
} else
__skb_unlink(skb, queue);
@@ -223,6 +256,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
return NULL;
+unlock_err:
+ spin_unlock_irqrestore(&queue->lock, cpu_flags);
no_packet:
*err = error;
return NULL;
@@ -622,7 +657,8 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
!skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev);
}
- skb->csum_valid = !sum;
+ if (!skb_shared(skb))
+ skb->csum_valid = !sum;
return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);
@@ -642,11 +678,13 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb)
netdev_rx_csum_fault(skb->dev);
}
- /* Save full packet checksum */
- skb->csum = csum;
- skb->ip_summed = CHECKSUM_COMPLETE;
- skb->csum_complete_sw = 1;
- skb->csum_valid = !sum;
+ if (!skb_shared(skb)) {
+ /* Save full packet checksum */
+ skb->csum = csum;
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ skb->csum_valid = !sum;
+ }
return sum;
}
@@ -673,7 +711,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
if (!chunk)
return 0;
- if (iov_iter_count(&msg->msg_iter) < chunk) {
+ if (msg_data_left(msg) < chunk) {
if (__skb_checksum_complete(skb))
goto csum_error;
if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
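
The skb_set_peeked() helper added above clones a shared skb and splices the clone into the receive queue before setting the peeked flag, so a MSG_PEEK never modifies a buffer that another holder can still see. A minimal standalone C sketch of that unshare-before-modify pattern follows; struct buf, buf_clone() and buf_set_peeked() are hypothetical stand-ins for sk_buff, skb_clone() and the helper above, not kernel APIs.

    #include <stdlib.h>
    #include <string.h>

    struct buf {
        int refcnt;                 /* >1 means the buffer is shared */
        int peeked;                 /* flag we want to set */
        struct buf *prev, *next;
    };

    static struct buf *buf_clone(const struct buf *b)
    {
        struct buf *n = malloc(sizeof(*n));

        if (!n)
            return NULL;
        memcpy(n, b, sizeof(*n));
        n->refcnt = 1;              /* the clone starts out private */
        return n;
    }

    /*
     * Mark @b as peeked; if it is shared, splice a private clone into the
     * queue in its place first, so other holders never see the flag change.
     */
    static int buf_set_peeked(struct buf *b)
    {
        struct buf *n;

        if (b->peeked)
            return 0;

        if (b->refcnt > 1) {
            n = buf_clone(b);
            if (!n)
                return -1;

            b->prev->next = n;      /* replace @b in the doubly linked queue */
            b->next->prev = n;
            n->prev = b->prev;
            n->next = b->next;

            b->refcnt--;            /* drop the queue's reference on @b */
            b = n;
        }

        b->peeked = 1;
        return 0;
    }
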
diff --git a/net/core/dev.c b/net/core/dev.c
index 45109b70664e..a8e4dd430285 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,6 +135,7 @@
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
+#include <linux/netfilter_ingress.h>
#include "net-sysfs.h"
@@ -468,10 +469,14 @@ EXPORT_SYMBOL(dev_remove_pack);
*/
void dev_add_offload(struct packet_offload *po)
{
- struct list_head *head = &offload_base;
+ struct packet_offload *elem;
spin_lock(&offload_lock);
- list_add_rcu(&po->list, head);
+ list_for_each_entry(elem, &offload_base, list) {
+ if (po->priority < elem->priority)
+ break;
+ }
+ list_add_rcu(&po->list, elem->list.prev);
spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
@@ -660,6 +665,23 @@ __setup("netdev=", netdev_boot_setup);
*******************************************************************************/
/**
+ * dev_get_iflink - get 'iflink' value of an interface
+ * @dev: targeted interface
+ *
+ * Indicates the ifindex the interface is linked to.
+ * Physical interfaces have the same 'ifindex' and 'iflink' values.
+ */
+
+int dev_get_iflink(const struct net_device *dev)
+{
+ if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
+ return dev->netdev_ops->ndo_get_iflink(dev);
+
+ return dev->ifindex;
+}
+EXPORT_SYMBOL(dev_get_iflink);
+
+/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
* @name: name to find
@@ -1385,7 +1407,7 @@ static int __dev_close(struct net_device *dev)
return retval;
}
-static int dev_close_many(struct list_head *head)
+int dev_close_many(struct list_head *head, bool unlink)
{
struct net_device *dev, *tmp;
@@ -1399,11 +1421,13 @@ static int dev_close_many(struct list_head *head)
list_for_each_entry_safe(dev, tmp, head, close_list) {
rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
call_netdevice_notifiers(NETDEV_DOWN, dev);
- list_del_init(&dev->close_list);
+ if (unlink)
+ list_del_init(&dev->close_list);
}
return 0;
}
+EXPORT_SYMBOL(dev_close_many);
/**
* dev_close - shutdown an interface.
@@ -1420,7 +1444,7 @@ int dev_close(struct net_device *dev)
LIST_HEAD(single);
list_add(&dev->close_list, &single);
- dev_close_many(&single);
+ dev_close_many(&single, true);
list_del(&single);
}
return 0;
@@ -1607,6 +1631,22 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
}
EXPORT_SYMBOL(call_netdevice_notifiers);
+#ifdef CONFIG_NET_INGRESS
+static struct static_key ingress_needed __read_mostly;
+
+void net_inc_ingress_queue(void)
+{
+ static_key_slow_inc(&ingress_needed);
+}
+EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
+
+void net_dec_ingress_queue(void)
+{
+ static_key_slow_dec(&ingress_needed);
+}
+EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
+#endif
+
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
@@ -1679,21 +1719,15 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable);
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
- atomic_long_inc(&dev->rx_dropped);
- kfree_skb(skb);
- return NET_RX_DROP;
- }
- }
-
- if (unlikely(!is_skb_forwardable(dev, skb))) {
+ if (skb_orphan_frags(skb, GFP_ATOMIC) ||
+ unlikely(!is_skb_forwardable(dev, skb))) {
atomic_long_inc(&dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
skb_scrub_packet(skb, true);
+ skb->priority = 0;
skb->protocol = eth_type_trans(skb, dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
@@ -1737,7 +1771,8 @@ static inline int deliver_skb(struct sk_buff *skb,
static inline void deliver_ptype_list_skb(struct sk_buff *skb,
struct packet_type **pt,
- struct net_device *dev, __be16 type,
+ struct net_device *orig_dev,
+ __be16 type,
struct list_head *ptype_list)
{
struct packet_type *ptype, *pt_prev = *pt;
@@ -1746,7 +1781,7 @@ static inline void deliver_ptype_list_skb(struct sk_buff *skb,
if (ptype->type != type)
continue;
if (pt_prev)
- deliver_skb(skb, pt_prev, dev);
+ deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
*pt = pt_prev;
@@ -2309,6 +2344,34 @@ void netif_device_attach(struct net_device *dev)
}
EXPORT_SYMBOL(netif_device_attach);
+/*
+ * Returns a Tx hash based on the given packet descriptor and the number of
+ * Tx queues to be used as a distribution range.
+ */
+u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
+ unsigned int num_tx_queues)
+{
+ u32 hash;
+ u16 qoffset = 0;
+ u16 qcount = num_tx_queues;
+
+ if (skb_rx_queue_recorded(skb)) {
+ hash = skb_get_rx_queue(skb);
+ while (unlikely(hash >= num_tx_queues))
+ hash -= num_tx_queues;
+ return hash;
+ }
+
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+ qoffset = dev->tc_to_txq[tc].offset;
+ qcount = dev->tc_to_txq[tc].count;
+ }
+
+ return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
+}
+EXPORT_SYMBOL(__skb_tx_hash);
+
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
static const netdev_features_t null_features = 0;
@@ -2559,12 +2622,26 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
return features;
}
+netdev_features_t passthru_features_check(struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
+{
+ return features;
+}
+EXPORT_SYMBOL(passthru_features_check);
+
+static netdev_features_t dflt_features_check(const struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
+{
+ return vlan_features_check(skb, features);
+}
+
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
netdev_features_t features = dev->features;
u16 gso_segs = skb_shinfo(skb)->gso_segs;
- __be16 protocol = skb->protocol;
if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
features &= ~NETIF_F_GSO_MASK;
@@ -2576,34 +2653,17 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
if (skb->encapsulation)
features &= dev->hw_enc_features;
- if (!skb_vlan_tag_present(skb)) {
- if (unlikely(protocol == htons(ETH_P_8021Q) ||
- protocol == htons(ETH_P_8021AD))) {
- struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
- protocol = veh->h_vlan_encapsulated_proto;
- } else {
- goto finalize;
- }
- }
-
- features = netdev_intersect_features(features,
- dev->vlan_features |
- NETIF_F_HW_VLAN_CTAG_TX |
- NETIF_F_HW_VLAN_STAG_TX);
-
- if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
+ if (skb_vlan_tagged(skb))
features = netdev_intersect_features(features,
- NETIF_F_SG |
- NETIF_F_HIGHDMA |
- NETIF_F_FRAGLIST |
- NETIF_F_GEN_CSUM |
+ dev->vlan_features |
NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX);
-finalize:
if (dev->netdev_ops->ndo_features_check)
features &= dev->netdev_ops->ndo_features_check(skb, dev,
features);
+ else
+ features &= dflt_features_check(skb, dev, features);
return harmonize_features(skb, features);
}
@@ -2675,7 +2735,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
if (unlikely(!skb))
goto out_null;
- if (netif_needs_gso(dev, skb, features)) {
+ if (netif_needs_gso(skb, features)) {
struct sk_buff *segs;
segs = skb_gso_segment(skb, features);
@@ -2857,7 +2917,7 @@ EXPORT_SYMBOL(xmit_recursion);
* dev_loopback_xmit - loop back @skb
* @skb: buffer to transmit
*/
-int dev_loopback_xmit(struct sk_buff *skb)
+int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
{
skb_reset_mac_header(skb);
__skb_pull(skb, skb_network_offset(skb));
@@ -2870,6 +2930,84 @@ int dev_loopback_xmit(struct sk_buff *skb)
}
EXPORT_SYMBOL(dev_loopback_xmit);
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+ struct xps_dev_maps *dev_maps;
+ struct xps_map *map;
+ int queue_index = -1;
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps);
+ if (dev_maps) {
+ map = rcu_dereference(
+ dev_maps->cpu_map[skb->sender_cpu - 1]);
+ if (map) {
+ if (map->len == 1)
+ queue_index = map->queues[0];
+ else
+ queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
+ map->len)];
+ if (unlikely(queue_index >= dev->real_num_tx_queues))
+ queue_index = -1;
+ }
+ }
+ rcu_read_unlock();
+
+ return queue_index;
+#else
+ return -1;
+#endif
+}
+
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ int queue_index = sk_tx_queue_get(sk);
+
+ if (queue_index < 0 || skb->ooo_okay ||
+ queue_index >= dev->real_num_tx_queues) {
+ int new_index = get_xps_queue(dev, skb);
+ if (new_index < 0)
+ new_index = skb_tx_hash(dev, skb);
+
+ if (queue_index != new_index && sk &&
+ rcu_access_pointer(sk->sk_dst_cache))
+ sk_tx_queue_set(sk, new_index);
+
+ queue_index = new_index;
+ }
+
+ return queue_index;
+}
+
+struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+ struct sk_buff *skb,
+ void *accel_priv)
+{
+ int queue_index = 0;
+
+#ifdef CONFIG_XPS
+ if (skb->sender_cpu == 0)
+ skb->sender_cpu = raw_smp_processor_id() + 1;
+#endif
+
+ if (dev->real_num_tx_queues != 1) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ if (ops->ndo_select_queue)
+ queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+ __netdev_pick_tx);
+ else
+ queue_index = __netdev_pick_tx(dev, skb);
+
+ if (!accel_priv)
+ queue_index = netdev_cap_txqueue(dev, queue_index);
+ }
+
+ skb_set_queue_mapping(skb, queue_index);
+ return netdev_get_tx_queue(dev, queue_index);
+}
+
/**
* __dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
@@ -2995,11 +3133,11 @@ out:
return rc;
}
-int dev_queue_xmit(struct sk_buff *skb)
+int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
{
return __dev_queue_xmit(skb, NULL);
}
-EXPORT_SYMBOL(dev_queue_xmit);
+EXPORT_SYMBOL(dev_queue_xmit_sk);
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
@@ -3041,7 +3179,7 @@ static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
{
- if (next_cpu != RPS_NO_CPU) {
+ if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
@@ -3146,7 +3284,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
* If the desired CPU (where last recvmsg was done) is
* different from current CPU (one in the rx-queue flow
* table entry), switch if one of the following holds:
- * - Current CPU is unset (equal to RPS_NO_CPU).
+ * - Current CPU is unset (>= nr_cpu_ids).
* - Current CPU is offline.
* - The current CPU's queue tail has advanced beyond the
* last packet that was enqueued using this table entry.
@@ -3154,14 +3292,14 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
* have been dequeued, thus preserving in order delivery.
*/
if (unlikely(tcpu != next_cpu) &&
- (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+ (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
}
- if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+ if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu;
goto done;
@@ -3202,14 +3340,14 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
- int cpu;
+ unsigned int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id <= flow_table->mask) {
rflow = &flow_table->flows[flow_id];
cpu = ACCESS_ONCE(rflow->cpu);
- if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+ if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
((int)(per_cpu(softnet_data, cpu).input_queue_head -
rflow->last_qtail) <
(int)(10 * flow_table->mask)))
@@ -3310,6 +3448,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
local_irq_save(flags);
rps_lock(sd);
+ if (!netif_running(skb->dev))
+ goto drop;
qlen = skb_queue_len(&sd->input_pkt_queue);
if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
if (qlen) {
@@ -3331,6 +3471,7 @@ enqueue:
goto enqueue;
}
+drop:
sd->dropped++;
rps_unlock(sd);
@@ -3482,68 +3623,47 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev,
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
-#ifdef CONFIG_NET_CLS_ACT
-/* TODO: Maybe we should just force sch_ingress to be compiled in
- * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
- * a compare and 2 stores extra right now if we dont have it on
- * but have CONFIG_NET_CLS_ACT
- * NOTE: This doesn't stop any functionality; if you dont have
- * the ingress scheduler, you just can't add policies on ingress.
- *
- */
-static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
-{
- struct net_device *dev = skb->dev;
- u32 ttl = G_TC_RTTL(skb->tc_verd);
- int result = TC_ACT_OK;
- struct Qdisc *q;
-
- if (unlikely(MAX_RED_LOOP < ttl++)) {
- net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
- skb->skb_iif, dev->ifindex);
- return TC_ACT_SHOT;
- }
-
- skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
- skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
-
- q = rcu_dereference(rxq->qdisc);
- if (q != &noop_qdisc) {
- spin_lock(qdisc_lock(q));
- if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
- result = qdisc_enqueue_root(skb, q);
- spin_unlock(qdisc_lock(q));
- }
-
- return result;
-}
-
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
struct packet_type **pt_prev,
int *ret, struct net_device *orig_dev)
{
- struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
-
- if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
- goto out;
+#ifdef CONFIG_NET_CLS_ACT
+ struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
+ struct tcf_result cl_res;
+ /* If there's at least one ingress present somewhere (so
+ * we get here via enabled static key), remaining devices
+ * that are not configured with an ingress qdisc will bail
+ * out here.
+ */
+ if (!cl)
+ return skb;
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
- switch (ing_filter(skb, rxq)) {
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+ skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+ qdisc_bstats_update_cpu(cl->q, skb);
+
+ switch (tc_classify(skb, cl, &cl_res)) {
+ case TC_ACT_OK:
+ case TC_ACT_RECLASSIFY:
+ skb->tc_index = TC_H_MIN(cl_res.classid);
+ break;
case TC_ACT_SHOT:
+ qdisc_qstats_drop_cpu(cl->q);
case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
kfree_skb(skb);
return NULL;
+ default:
+ break;
}
-
-out:
- skb->tc_verd = 0;
+#endif /* CONFIG_NET_CLS_ACT */
return skb;
}
-#endif
/**
* netdev_rx_handler_register - register receive handler
@@ -3616,6 +3736,22 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
}
}
+static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
+ int *ret, struct net_device *orig_dev)
+{
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (nf_hook_ingress_active(skb)) {
+ if (*pt_prev) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ return nf_hook_ingress(skb);
+ }
+#endif /* CONFIG_NETFILTER_INGRESS */
+ return 0;
+}
+
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
struct packet_type *ptype, *pt_prev;
@@ -3638,8 +3774,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
pt_prev = NULL;
- rcu_read_lock();
-
another_round:
skb->skb_iif = skb->dev->ifindex;
@@ -3649,7 +3783,7 @@ another_round:
skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
- goto unlock;
+ goto out;
}
#ifdef CONFIG_NET_CLS_ACT
@@ -3675,13 +3809,20 @@ another_round:
}
skip_taps:
+#ifdef CONFIG_NET_INGRESS
+ if (static_key_false(&ingress_needed)) {
+ skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
+ if (!skb)
+ goto out;
+
+ if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
+ goto out;
+ }
+#endif
#ifdef CONFIG_NET_CLS_ACT
- skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
- if (!skb)
- goto unlock;
+ skb->tc_verd = 0;
ncls:
#endif
-
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
@@ -3693,7 +3834,7 @@ ncls:
if (vlan_do_receive(&skb))
goto another_round;
else if (unlikely(!skb))
- goto unlock;
+ goto out;
}
rx_handler = rcu_dereference(skb->dev->rx_handler);
@@ -3705,7 +3846,7 @@ ncls:
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
ret = NET_RX_SUCCESS;
- goto unlock;
+ goto out;
case RX_HANDLER_ANOTHER:
goto another_round;
case RX_HANDLER_EXACT:
@@ -3759,8 +3900,7 @@ drop:
ret = NET_RX_DROP;
}
-unlock:
- rcu_read_unlock();
+out:
return ret;
}
@@ -3791,29 +3931,30 @@ static int __netif_receive_skb(struct sk_buff *skb)
static int netif_receive_skb_internal(struct sk_buff *skb)
{
+ int ret;
+
net_timestamp_check(netdev_tstamp_prequeue, skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
+ rcu_read_lock();
+
#ifdef CONFIG_RPS
if (static_key_false(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
- int cpu, ret;
-
- rcu_read_lock();
-
- cpu = get_rps_cpu(skb->dev, skb, &rflow);
+ int cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu >= 0) {
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
return ret;
}
- rcu_read_unlock();
}
#endif
- return __netif_receive_skb(skb);
+ ret = __netif_receive_skb(skb);
+ rcu_read_unlock();
+ return ret;
}
/**
@@ -3831,13 +3972,13 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
-int netif_receive_skb(struct sk_buff *skb)
+int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
{
trace_netif_receive_skb_entry(skb);
return netif_receive_skb_internal(skb);
}
-EXPORT_SYMBOL(netif_receive_skb);
+EXPORT_SYMBOL(netif_receive_skb_sk);
/* Network device is going away, flush any packets still pending
* Called with irqs disabled.
@@ -4358,8 +4499,10 @@ static int process_backlog(struct napi_struct *napi, int quota)
struct sk_buff *skb;
while ((skb = __skb_dequeue(&sd->process_queue))) {
+ rcu_read_lock();
local_irq_enable();
__netif_receive_skb(skb);
+ rcu_read_unlock();
local_irq_disable();
input_queue_head_incr(sd);
if (++work >= quota) {
@@ -5170,7 +5313,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
return -EBUSY;
- if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
+ if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
return -EEXIST;
if (master && netdev_master_upper_dev_get(dev))
@@ -5914,6 +6057,24 @@ int dev_get_phys_port_id(struct net_device *dev,
EXPORT_SYMBOL(dev_get_phys_port_id);
/**
+ * dev_get_phys_port_name - Get device physical port name
+ * @dev: device
+ * @name: port name
+ * @len: limit of bytes to copy to name
+ *
+ * Get device physical port name
+ */
+int dev_get_phys_port_name(struct net_device *dev,
+ char *name, size_t len)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_get_phys_port_name)
+ return -EOPNOTSUPP;
+ return ops->ndo_get_phys_port_name(dev, name, len);
+}
+EXPORT_SYMBOL(dev_get_phys_port_name);
+
+/**
* dev_new_index - allocate an ifindex
* @net: the applicable net namespace
*
@@ -5970,13 +6131,14 @@ static void rollback_registered_many(struct list_head *head)
/* If device is running, close it first. */
list_for_each_entry(dev, head, unreg_list)
list_add_tail(&dev->close_list, &close_head);
- dev_close_many(&close_head);
+ dev_close_many(&close_head, true);
list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
unlist_netdevice(dev);
dev->reg_state = NETREG_UNREGISTERING;
+ on_each_cpu(flush_backlog, dev, 1);
}
synchronize_net();
@@ -6247,7 +6409,8 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
struct netdev_queue *tx;
size_t sz = count * sizeof(*tx);
- BUG_ON(count < 1 || count > 0xffff);
+ if (count < 1 || count > 0xffff)
+ return -EINVAL;
tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
if (!tx) {
@@ -6263,6 +6426,17 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
return 0;
}
+void netif_tx_stop_all_queues(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+ netif_tx_stop_queue(txq);
+ }
+}
+EXPORT_SYMBOL(netif_tx_stop_all_queues);
+
/**
* register_netdevice - register a network device
* @dev: device to register
@@ -6297,8 +6471,6 @@ int register_netdevice(struct net_device *dev)
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
- dev->iflink = -1;
-
ret = dev_get_valid_name(net, dev, dev->name);
if (ret < 0)
goto out;
@@ -6328,9 +6500,6 @@ int register_netdevice(struct net_device *dev)
else if (__dev_get_by_index(net, dev->ifindex))
goto err_uninit;
- if (dev->iflink == -1)
- dev->iflink = dev->ifindex;
-
/* Transfer changeable features to wanted_features and enable
* software offloads (GSO and GRO).
*/
@@ -6605,8 +6774,6 @@ void netdev_run_todo(void)
dev->reg_state = NETREG_UNREGISTERED;
- on_each_cpu(flush_backlog, dev, 1);
-
netdev_wait_allrefs(dev);
/* paranoia */
@@ -6817,6 +6984,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
+
+ nf_hook_ingress_init(dev);
+
return dev;
free_all:
@@ -6843,8 +7013,6 @@ void free_netdev(struct net_device *dev)
{
struct napi_struct *p, *n;
- release_net(dev_net(dev));
-
netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
kvfree(dev->_rx);
@@ -7045,12 +7213,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
dev_net_set(dev, net);
/* If there is an ifindex conflict assign a new one */
- if (__dev_get_by_index(net, dev->ifindex)) {
- int iflink = (dev->iflink == dev->ifindex);
+ if (__dev_get_by_index(net, dev->ifindex))
dev->ifindex = dev_new_index(net);
- if (iflink)
- dev->iflink = dev->ifindex;
- }
/* Send a netdev-add uevent to the new namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
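
Among the dev.c changes above, __skb_tx_hash() maps a flow hash onto a queue range with reciprocal_scale(), which scales a 32-bit value into [0, qcount) using a multiply and shift rather than a modulo, and then adds the traffic-class queue offset. A standalone C sketch of that mapping, assuming reciprocal_scale() is the usual ((u64)val * range) >> 32 construction:

    #include <stdint.h>

    /*
     * Scale a 32-bit hash into [0, range) with a multiply and shift instead
     * of a modulo; this mirrors the kernel's reciprocal_scale() helper.
     */
    static inline uint32_t reciprocal_scale32(uint32_t val, uint32_t range)
    {
        return (uint32_t)(((uint64_t)val * range) >> 32);
    }

    /*
     * Pick a tx queue the way __skb_tx_hash() does for a traffic class that
     * spans [qoffset, qoffset + qcount).
     */
    static uint16_t pick_tx_queue(uint32_t flow_hash, uint16_t qoffset,
                                  uint16_t qcount)
    {
        return (uint16_t)reciprocal_scale32(flow_hash, qcount) + qoffset;
    }
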
diff --git a/net/core/dst.c b/net/core/dst.c
index e956ce6d1378..002144bea935 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -284,7 +284,9 @@ void dst_release(struct dst_entry *dst)
int newrefcnt;
newrefcnt = atomic_dec_return(&dst->__refcnt);
- WARN_ON(newrefcnt < 0);
+ if (unlikely(newrefcnt < 0))
+ net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
+ __func__, dst, newrefcnt);
if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt)
call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index aa378ecef186..b495ab1797fa 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -98,7 +98,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_RXALL_BIT] = "rx-all",
[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
[NETIF_F_BUSY_POLL_BIT] = "busy-poll",
- [NETIF_F_HW_SWITCH_OFFLOAD_BIT] = "hw-switch-offload",
};
static const char
@@ -107,6 +106,13 @@ rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
[ETH_RSS_HASH_XOR_BIT] = "xor",
};
+static const char
+tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_RX_COPYBREAK] = "rx-copybreak",
+ [ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
+};
+
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gfeatures cmd = {
@@ -195,6 +201,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_RSS_HASH_FUNCS)
return ARRAY_SIZE(rss_hash_func_strings);
+ if (sset == ETH_SS_TUNABLES)
+ return ARRAY_SIZE(tunable_strings);
+
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
else
@@ -212,6 +221,8 @@ static void __ethtool_get_strings(struct net_device *dev,
else if (stringset == ETH_SS_RSS_HASH_FUNCS)
memcpy(data, rss_hash_func_strings,
sizeof(rss_hash_func_strings));
+ else if (stringset == ETH_SS_TUNABLES)
+ memcpy(data, tunable_strings, sizeof(tunable_strings));
else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
@@ -790,7 +801,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
if (ops->get_rxfh_indir_size)
dev_indir_size = ops->get_rxfh_indir_size(dev);
if (ops->get_rxfh_key_size)
- dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev);
+ dev_key_size = ops->get_rxfh_key_size(dev);
if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
return -EFAULT;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index e4fdc9dfb2c7..9a12668f7d62 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -31,7 +31,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->pref = pref;
r->table = table;
r->flags = flags;
- r->fr_net = hold_net(ops->fro_net);
+ r->fr_net = ops->fro_net;
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
@@ -116,7 +116,6 @@ static int __fib_rules_register(struct fib_rules_ops *ops)
if (ops->family == o->family)
goto errout;
- hold_net(net);
list_add_tail_rcu(&ops->list, &net->rules_ops);
err = 0;
errout:
@@ -160,15 +159,6 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
}
}
-static void fib_rules_put_rcu(struct rcu_head *head)
-{
- struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu);
- struct net *net = ops->fro_net;
-
- release_net(net);
- kfree(ops);
-}
-
void fib_rules_unregister(struct fib_rules_ops *ops)
{
struct net *net = ops->fro_net;
@@ -178,7 +168,7 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
spin_unlock(&net->rules_mod_lock);
fib_rules_cleanup_ops(ops);
- call_rcu(&ops->rcu, fib_rules_put_rcu);
+ kfree_rcu(ops, rcu);
}
EXPORT_SYMBOL_GPL(fib_rules_unregister);
@@ -303,7 +293,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
err = -ENOMEM;
goto errout;
}
- rule->fr_net = hold_net(net);
+ rule->fr_net = net;
if (tb[FRA_PRIORITY])
rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
@@ -423,7 +413,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
return 0;
errout_free:
- release_net(rule->fr_net);
kfree(rule);
errout:
rules_ops_put(ops);
@@ -492,6 +481,12 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
goto errout;
}
+ if (ops->delete) {
+ err = ops->delete(rule);
+ if (err)
+ goto errout;
+ }
+
list_del_rcu(&rule->list);
if (rule->action == FR_ACT_GOTO) {
@@ -517,8 +512,6 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
notify_rule_change(RTM_DELRULE, rule, ops, nlh,
NETLINK_CB(skb).portid);
- if (ops->delete)
- ops->delete(rule);
fib_rule_put(rule);
flush_route_cache(ops);
rules_ops_put(ops);
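
The fib_nl_delrule() change above moves the ops->delete() call ahead of list_del_rcu(), so a callback that fails can veto the removal before the rule is unhooked (the old code only invoked it after the fact). A standalone C sketch of that check-before-unlink ordering; the rule type and delete hook below are illustrative, not the kernel's fib_rules structures.

    #include <stddef.h>

    struct rule {
        struct rule *prev, *next;
        int (*del_hook)(struct rule *r);   /* optional hook that may veto deletion */
    };

    /*
     * Unlink @r from its doubly linked list, but only after the delete hook
     * has succeeded; on failure the rule stays linked and the error is returned.
     */
    static int rule_delete(struct rule *r)
    {
        int err;

        if (r->del_hook) {
            err = r->del_hook(r);
            if (err)
                return err;                /* veto: leave the rule in place */
        }

        r->prev->next = r->next;
        r->next->prev = r->prev;
        r->prev = r->next = NULL;
        return 0;
    }
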
diff --git a/net/core/filter.c b/net/core/filter.c
index f6bdc2b1ba01..be3098fb65e4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -36,6 +36,7 @@
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <net/sock.h>
+#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -45,6 +46,7 @@
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
+#include <net/sch_generic.h>
/**
* sk_filter - run a packet through a socket filter
@@ -150,10 +152,62 @@ static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
return prandom_u32();
}
+static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (skb_field) {
+ case SKF_AD_MARK:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, mark));
+ break;
+
+ case SKF_AD_PKTTYPE:
+ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
+#endif
+ break;
+
+ case SKF_AD_QUEUE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, queue_mapping));
+ break;
+
+ case SKF_AD_VLAN_TAG:
+ case SKF_AD_VLAN_TAG_PRESENT:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
+ BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+
+ /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, vlan_tci));
+ if (skb_field == SKF_AD_VLAN_TAG) {
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
+ ~VLAN_TAG_PRESENT);
+ } else {
+ /* dst_reg >>= 12 */
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
+ /* dst_reg &= 1 */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
+ }
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
static bool convert_bpf_extensions(struct sock_filter *fp,
struct bpf_insn **insnp)
{
struct bpf_insn *insn = *insnp;
+ u32 cnt;
switch (fp->k) {
case SKF_AD_OFF + SKF_AD_PROTOCOL:
@@ -167,13 +221,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
break;
case SKF_AD_OFF + SKF_AD_PKTTYPE:
- *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX,
- PKT_TYPE_OFFSET());
- *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX);
-#ifdef __BIG_ENDIAN_BITFIELD
- insn++;
- *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5);
-#endif
+ cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
break;
case SKF_AD_OFF + SKF_AD_IFINDEX:
@@ -197,10 +246,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
break;
case SKF_AD_OFF + SKF_AD_MARK:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
-
- *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
- offsetof(struct sk_buff, mark));
+ cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
break;
case SKF_AD_OFF + SKF_AD_RXHASH:
@@ -211,29 +258,30 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
break;
case SKF_AD_OFF + SKF_AD_QUEUE:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
-
- *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
- offsetof(struct sk_buff, queue_mapping));
+ cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
break;
case SKF_AD_OFF + SKF_AD_VLAN_TAG:
+ cnt = convert_skb_access(SKF_AD_VLAN_TAG,
+ BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
- BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+ cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
+ BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
- /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */
+ case SKF_AD_OFF + SKF_AD_VLAN_TPID:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
+
+ /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
- offsetof(struct sk_buff, vlan_tci));
- if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) {
- *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A,
- ~VLAN_TAG_PRESENT);
- } else {
- /* A >>= 12 */
- *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12);
- /* A &= 1 */
- *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1);
- }
+ offsetof(struct sk_buff, vlan_proto));
+ /* A = ntohs(A) [emitting a nop or swap16] */
+ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
break;
case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
@@ -309,8 +357,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
* for socket filters: ctx == 'struct sk_buff *', for seccomp:
* ctx == 'struct seccomp_data *'.
*/
-int bpf_convert_filter(struct sock_filter *prog, int len,
- struct bpf_insn *new_prog, int *new_len)
+static int bpf_convert_filter(struct sock_filter *prog, int len,
+ struct bpf_insn *new_prog, int *new_len)
{
int new_flen = 0, pass = 0, target, i;
struct bpf_insn *new_insn;
@@ -325,7 +373,8 @@ int bpf_convert_filter(struct sock_filter *prog, int len,
return -EINVAL;
if (new_prog) {
- addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL);
+ addrs = kcalloc(len, sizeof(*addrs),
+ GFP_KERNEL | __GFP_NOWARN);
if (!addrs)
return -ENOMEM;
}
@@ -705,7 +754,8 @@ static bool chk_code_allowed(u16 code_to_probe)
*
* Returns 0 if the rule set is legal or -EINVAL if not.
*/
-int bpf_check_classic(const struct sock_filter *filter, unsigned int flen)
+static int bpf_check_classic(const struct sock_filter *filter,
+ unsigned int flen)
{
bool anc_found;
int pc;
@@ -779,7 +829,6 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen)
return -EINVAL;
}
-EXPORT_SYMBOL(bpf_check_classic);
static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
const struct sock_fprog *fprog)
@@ -793,7 +842,9 @@ static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
fkprog = fp->orig_prog;
fkprog->len = fprog->len;
- fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL);
+
+ fkprog->filter = kmemdup(fp->insns, fsize,
+ GFP_KERNEL | __GFP_NOWARN);
if (!fkprog->filter) {
kfree(fp->orig_prog);
return -ENOMEM;
@@ -814,7 +865,7 @@ static void bpf_release_orig_filter(struct bpf_prog *fp)
static void __bpf_prog_release(struct bpf_prog *prog)
{
- if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
+ if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
bpf_prog_put(prog);
} else {
bpf_release_orig_filter(prog);
@@ -895,7 +946,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
* pass. At this time, the user BPF is stored in fp->insns.
*/
old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
- GFP_KERNEL);
+ GFP_KERNEL | __GFP_NOWARN);
if (!old_prog) {
err = -ENOMEM;
goto out_err;
@@ -942,7 +993,8 @@ out_err:
return ERR_PTR(err);
}
-static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
+static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
+ bpf_aux_classic_check_t trans)
{
int err;
@@ -955,6 +1007,17 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
return ERR_PTR(err);
}
+ /* There might be additional checks and transformations
+ * needed on classic filters, f.e. in case of seccomp.
+ */
+ if (trans) {
+ err = trans(fp->insns, fp->len);
+ if (err) {
+ __bpf_prog_release(fp);
+ return ERR_PTR(err);
+ }
+ }
+
/* Probe if we can JIT compile the filter and if so, do
* the compilation of the filter.
*/
@@ -1004,7 +1067,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
/* bpf_prepare_filter() already takes care of freeing
* memory in case something goes wrong.
*/
- fp = bpf_prepare_filter(fp);
+ fp = bpf_prepare_filter(fp, NULL);
if (IS_ERR(fp))
return PTR_ERR(fp);
@@ -1013,12 +1076,85 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
}
EXPORT_SYMBOL_GPL(bpf_prog_create);
+/**
+ * bpf_prog_create_from_user - create an unattached filter from user buffer
+ * @pfp: the unattached filter that is created
+ * @fprog: the filter program
+ * @trans: post-classic verifier transformation handler
+ *
+ * This function effectively does the same as bpf_prog_create(), only
+ * that it builds up its insns buffer from user space provided buffer.
+ * It also allows for passing a bpf_aux_classic_check_t handler.
+ */
+int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
+ bpf_aux_classic_check_t trans)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct bpf_prog *fp;
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (fprog->filter == NULL)
+ return -EINVAL;
+
+ fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
+ if (!fp)
+ return -ENOMEM;
+
+ if (copy_from_user(fp->insns, fprog->filter, fsize)) {
+ __bpf_prog_free(fp);
+ return -EFAULT;
+ }
+
+ fp->len = fprog->len;
+ /* Since unattached filters are not copied back to user
+ * space through sk_get_filter(), we do not need to hold
+ * a copy here, and can spare us the work.
+ */
+ fp->orig_prog = NULL;
+
+ /* bpf_prepare_filter() already takes care of freeing
+ * memory in case something goes wrong.
+ */
+ fp = bpf_prepare_filter(fp, trans);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ *pfp = fp;
+ return 0;
+}
+
void bpf_prog_destroy(struct bpf_prog *fp)
{
__bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);
+static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
+{
+ struct sk_filter *fp, *old_fp;
+
+ fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+ if (!fp)
+ return -ENOMEM;
+
+ fp->prog = prog;
+ atomic_set(&fp->refcnt, 0);
+
+ if (!sk_filter_charge(sk, fp)) {
+ kfree(fp);
+ return -ENOMEM;
+ }
+
+ old_fp = rcu_dereference_protected(sk->sk_filter,
+ sock_owned_by_user(sk));
+ rcu_assign_pointer(sk->sk_filter, fp);
+
+ if (old_fp)
+ sk_filter_uncharge(sk, old_fp);
+
+ return 0;
+}
+
/**
* sk_attach_filter - attach a socket filter
* @fprog: the filter program
@@ -1031,7 +1167,6 @@ EXPORT_SYMBOL_GPL(bpf_prog_destroy);
*/
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
- struct sk_filter *fp, *old_fp;
unsigned int fsize = bpf_classic_proglen(fprog);
unsigned int bpf_fsize = bpf_prog_size(fprog->len);
struct bpf_prog *prog;
@@ -1064,40 +1199,24 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
/* bpf_prepare_filter() already takes care of freeing
* memory in case something goes wrong.
*/
- prog = bpf_prepare_filter(prog);
+ prog = bpf_prepare_filter(prog, NULL);
if (IS_ERR(prog))
return PTR_ERR(prog);
- fp = kmalloc(sizeof(*fp), GFP_KERNEL);
- if (!fp) {
+ err = __sk_attach_prog(prog, sk);
+ if (err < 0) {
__bpf_prog_release(prog);
- return -ENOMEM;
- }
- fp->prog = prog;
-
- atomic_set(&fp->refcnt, 0);
-
- if (!sk_filter_charge(sk, fp)) {
- __sk_filter_release(fp);
- return -ENOMEM;
+ return err;
}
- old_fp = rcu_dereference_protected(sk->sk_filter,
- sock_owned_by_user(sk));
- rcu_assign_pointer(sk->sk_filter, fp);
-
- if (old_fp)
- sk_filter_uncharge(sk, old_fp);
-
return 0;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);
-#ifdef CONFIG_BPF_SYSCALL
int sk_attach_bpf(u32 ufd, struct sock *sk)
{
- struct sk_filter *fp, *old_fp;
struct bpf_prog *prog;
+ int err;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return -EPERM;
@@ -1106,40 +1225,207 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
- /* valid fd, but invalid program type */
+ if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
bpf_prog_put(prog);
return -EINVAL;
}
- fp = kmalloc(sizeof(*fp), GFP_KERNEL);
- if (!fp) {
+ err = __sk_attach_prog(prog, sk);
+ if (err < 0) {
bpf_prog_put(prog);
- return -ENOMEM;
+ return err;
}
- fp->prog = prog;
- atomic_set(&fp->refcnt, 0);
+ return 0;
+}
- if (!sk_filter_charge(sk, fp)) {
- __sk_filter_release(fp);
- return -ENOMEM;
+#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1)
+
+static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ int offset = (int) r2;
+ void *from = (void *) (long) r3;
+ unsigned int len = (unsigned int) r4;
+ char buf[16];
+ void *ptr;
+
+ /* bpf verifier guarantees that:
+ * 'from' pointer points to bpf program stack
+ * 'len' bytes of it were initialized
+ * 'len' > 0
+ * 'skb' is a valid pointer to 'struct sk_buff'
+ *
+ * so check for invalid 'offset' and too large 'len'
+ */
+ if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
+ return -EFAULT;
+
+ if (unlikely(skb_cloned(skb) &&
+ !skb_clone_writable(skb, offset + len)))
+ return -EFAULT;
+
+ ptr = skb_header_pointer(skb, offset, len, buf);
+ if (unlikely(!ptr))
+ return -EFAULT;
+
+ if (BPF_RECOMPUTE_CSUM(flags))
+ skb_postpull_rcsum(skb, ptr, len);
+
+ memcpy(ptr, from, len);
+
+ if (ptr == buf)
+ /* skb_store_bits cannot return -EFAULT here */
+ skb_store_bits(skb, offset, ptr, len);
+
+ if (BPF_RECOMPUTE_CSUM(flags) && skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0));
+ return 0;
+}
+
+const struct bpf_func_proto bpf_skb_store_bytes_proto = {
+ .func = bpf_skb_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_STACK,
+ .arg4_type = ARG_CONST_STACK_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
+#define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f)
+#define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10)
+
+static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ int offset = (int) r2;
+ __sum16 sum, *ptr;
+
+ if (unlikely((u32) offset > 0xffff))
+ return -EFAULT;
+
+ if (unlikely(skb_cloned(skb) &&
+ !skb_clone_writable(skb, offset + sizeof(sum))))
+ return -EFAULT;
+
+ ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
+ if (unlikely(!ptr))
+ return -EFAULT;
+
+ switch (BPF_HEADER_FIELD_SIZE(flags)) {
+ case 2:
+ csum_replace2(ptr, from, to);
+ break;
+ case 4:
+ csum_replace4(ptr, from, to);
+ break;
+ default:
+ return -EINVAL;
}
- old_fp = rcu_dereference_protected(sk->sk_filter,
- sock_owned_by_user(sk));
- rcu_assign_pointer(sk->sk_filter, fp);
+ if (ptr == &sum)
+ /* skb_store_bits guaranteed to not return -EFAULT here */
+ skb_store_bits(skb, offset, ptr, sizeof(sum));
- if (old_fp)
- sk_filter_uncharge(sk, old_fp);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_l3_csum_replace_proto = {
+ .func = bpf_l3_csum_replace,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags);
+ int offset = (int) r2;
+ __sum16 sum, *ptr;
+
+ if (unlikely((u32) offset > 0xffff))
+ return -EFAULT;
+
+ if (unlikely(skb_cloned(skb) &&
+ !skb_clone_writable(skb, offset + sizeof(sum))))
+ return -EFAULT;
+
+ ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
+ if (unlikely(!ptr))
+ return -EFAULT;
+
+ switch (BPF_HEADER_FIELD_SIZE(flags)) {
+ case 2:
+ inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
+ break;
+ case 4:
+ inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (ptr == &sum)
+ /* skb_store_bits guaranteed to not return -EFAULT here */
+ skb_store_bits(skb, offset, ptr, sizeof(sum));
return 0;
}
-/* allow socket filters to call
- * bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem()
- */
-static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func_id)
+const struct bpf_func_proto bpf_l4_csum_replace_proto = {
+ .func = bpf_l4_csum_replace,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1)
+
+static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
+ struct net_device *dev;
+
+ dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
+ if (unlikely(!dev))
+ return -EINVAL;
+
+ if (unlikely(!(dev->flags & IFF_UP)))
+ return -EINVAL;
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!skb2))
+ return -ENOMEM;
+
+ if (BPF_IS_REDIRECT_INGRESS(flags))
+ return dev_forward_skb(dev, skb2);
+
+ skb2->dev = dev;
+ return dev_queue_xmit(skb2);
+}
+
+const struct bpf_func_proto bpf_clone_redirect_proto = {
+ .func = bpf_clone_redirect,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+sk_filter_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -1148,39 +1434,239 @@ static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func
return &bpf_map_update_elem_proto;
case BPF_FUNC_map_delete_elem:
return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_get_prandom_u32:
+ return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_tail_call:
+ return &bpf_tail_call_proto;
+ case BPF_FUNC_ktime_get_ns:
+ return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_trace_printk:
+ return bpf_get_trace_printk_proto();
default:
return NULL;
}
}
-static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type)
+static const struct bpf_func_proto *
+tc_cls_act_func_proto(enum bpf_func_id func_id)
{
- /* skb fields cannot be accessed yet */
- return false;
+ switch (func_id) {
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_l3_csum_replace:
+ return &bpf_l3_csum_replace_proto;
+ case BPF_FUNC_l4_csum_replace:
+ return &bpf_l4_csum_replace_proto;
+ case BPF_FUNC_clone_redirect:
+ return &bpf_clone_redirect_proto;
+ default:
+ return sk_filter_func_proto(func_id);
+ }
}
-static struct bpf_verifier_ops sock_filter_ops = {
- .get_func_proto = sock_filter_func_proto,
- .is_valid_access = sock_filter_is_valid_access,
+static bool __is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ /* check bounds */
+ if (off < 0 || off >= sizeof(struct __sk_buff))
+ return false;
+
+ /* disallow misaligned access */
+ if (off % size != 0)
+ return false;
+
+ /* all __sk_buff fields are __u32 */
+ if (size != 4)
+ return false;
+
+ return true;
+}
+
+static bool sk_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ return __is_valid_access(off, size, type);
+}
+
+static bool tc_cls_act_is_valid_access(int off, int size,
+ enum bpf_access_type type)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct __sk_buff, mark):
+ case offsetof(struct __sk_buff, tc_index):
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+ return __is_valid_access(off, size, type);
+}
+
+static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+ int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct __sk_buff, len):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, len));
+ break;
+
+ case offsetof(struct __sk_buff, protocol):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, protocol));
+ break;
+
+ case offsetof(struct __sk_buff, vlan_proto):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, vlan_proto));
+ break;
+
+ case offsetof(struct __sk_buff, priority):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, priority));
+ break;
+
+ case offsetof(struct __sk_buff, ingress_ifindex):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, skb_iif));
+ break;
+
+ case offsetof(struct __sk_buff, ifindex):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ dst_reg, src_reg,
+ offsetof(struct sk_buff, dev));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+ offsetof(struct net_device, ifindex));
+ break;
+
+ case offsetof(struct __sk_buff, mark):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, mark));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, mark));
+ break;
+
+ case offsetof(struct __sk_buff, pkt_type):
+ return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
+
+ case offsetof(struct __sk_buff, queue_mapping):
+ return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);
+
+ case offsetof(struct __sk_buff, vlan_present):
+ return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
+ dst_reg, src_reg, insn);
+
+ case offsetof(struct __sk_buff, vlan_tci):
+ return convert_skb_access(SKF_AD_VLAN_TAG,
+ dst_reg, src_reg, insn);
+
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
+
+ ctx_off -= offsetof(struct __sk_buff, cb[0]);
+ ctx_off += offsetof(struct sk_buff, cb);
+ ctx_off += offsetof(struct qdisc_skb_cb, data);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+ break;
+
+ case offsetof(struct __sk_buff, tc_index):
+#ifdef CONFIG_NET_SCHED
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, tc_index));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, tc_index));
+ break;
+#else
+ if (type == BPF_WRITE)
+ *insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
+ else
+ *insn++ = BPF_MOV64_IMM(dst_reg, 0);
+ break;
+#endif
+ }
+
+ return insn - insn_buf;
+}
+
+static const struct bpf_verifier_ops sk_filter_ops = {
+ .get_func_proto = sk_filter_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = bpf_net_convert_ctx_access,
};
-static struct bpf_prog_type_list tl = {
- .ops = &sock_filter_ops,
+static const struct bpf_verifier_ops tc_cls_act_ops = {
+ .get_func_proto = tc_cls_act_func_proto,
+ .is_valid_access = tc_cls_act_is_valid_access,
+ .convert_ctx_access = bpf_net_convert_ctx_access,
+};
+
+static struct bpf_prog_type_list sk_filter_type __read_mostly = {
+ .ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
};
-static int __init register_sock_filter_ops(void)
+static struct bpf_prog_type_list sched_cls_type __read_mostly = {
+ .ops = &tc_cls_act_ops,
+ .type = BPF_PROG_TYPE_SCHED_CLS,
+};
+
+static struct bpf_prog_type_list sched_act_type __read_mostly = {
+ .ops = &tc_cls_act_ops,
+ .type = BPF_PROG_TYPE_SCHED_ACT,
+};
+
+static int __init register_sk_filter_ops(void)
{
- bpf_register_prog_type(&tl);
+ bpf_register_prog_type(&sk_filter_type);
+ bpf_register_prog_type(&sched_cls_type);
+ bpf_register_prog_type(&sched_act_type);
+
return 0;
}
-late_initcall(register_sock_filter_ops);
-#else
-int sk_attach_bpf(u32 ufd, struct sock *sk)
-{
- return -EOPNOTSUPP;
-}
-#endif
+late_initcall(register_sk_filter_ops);
+
int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
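
convert_skb_access() above emits eBPF instructions that derive the classic VLAN ancillary values from skb->vlan_tci, relying on VLAN_TAG_PRESENT being the 0x1000 bit. The emitted mask/shift sequences are equivalent to the plain C sketch below (illustration only, not kernel code):

    #include <stdint.h>

    #define VLAN_TAG_PRESENT 0x1000   /* bit the kernel uses to flag a present tag */

    /* SKF_AD_VLAN_TAG: the tag value with the "present" bit masked out */
    static uint32_t vlan_tag(uint16_t vlan_tci)
    {
        return vlan_tci & ~VLAN_TAG_PRESENT;
    }

    /* SKF_AD_VLAN_TAG_PRESENT: shift the flag bit down to 0 or 1 */
    static uint32_t vlan_tag_present(uint16_t vlan_tci)
    {
        return (vlan_tci >> 12) & 1;
    }
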
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 2c35c02a931e..2a834c6179b9 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1,3 +1,4 @@
+#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/export.h>
#include <linux/ip.h>
@@ -12,19 +13,60 @@
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
-#include <net/flow_keys.h>
+#include <linux/stddef.h>
+#include <linux/if_ether.h>
+#include <linux/mpls.h>
+#include <net/flow_dissector.h>
#include <scsi/fc/fc_fcoe.h>
-/* copy saddr & daddr, possibly using 64bit load/store
- * Equivalent to : flow->src = iph->saddr;
- * flow->dst = iph->daddr;
- */
-static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
+static bool skb_flow_dissector_uses_key(struct flow_dissector *flow_dissector,
+ enum flow_dissector_key_id key_id)
+{
+ return flow_dissector->used_keys & (1 << key_id);
+}
+
+static void skb_flow_dissector_set_key(struct flow_dissector *flow_dissector,
+ enum flow_dissector_key_id key_id)
+{
+ flow_dissector->used_keys |= (1 << key_id);
+}
+
+static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
+ enum flow_dissector_key_id key_id,
+ void *target_container)
+{
+ return ((char *) target_container) + flow_dissector->offset[key_id];
+}
+
+void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
+ const struct flow_dissector_key *key,
+ unsigned int key_count)
{
- BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
- offsetof(typeof(*flow), src) + sizeof(flow->src));
- memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
+ unsigned int i;
+
+ memset(flow_dissector, 0, sizeof(*flow_dissector));
+
+ for (i = 0; i < key_count; i++, key++) {
+ /* User should make sure that every key target offset is within
+ * the boundaries of an unsigned short.
+ */
+ BUG_ON(key->offset > USHRT_MAX);
+ BUG_ON(skb_flow_dissector_uses_key(flow_dissector,
+ key->key_id));
+
+ skb_flow_dissector_set_key(flow_dissector, key->key_id);
+ flow_dissector->offset[key->key_id] = key->offset;
+ }
+
+ /* Ensure that the dissector always includes control and basic key.
+ * That way we are able to avoid handling lack of these in fast path.
+ */
+ BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_CONTROL));
+ BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_BASIC));
}
+EXPORT_SYMBOL(skb_flow_dissector_init);
/**
* __skb_flow_get_ports - extract the upper layer ports and return them
@@ -63,18 +105,31 @@ EXPORT_SYMBOL(__skb_flow_get_ports);
/**
* __skb_flow_dissect - extract the flow_keys struct and return it
* @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
+ * @flow_dissector: list of keys to dissect
+ * @target_container: target structure to put dissected values into
* @data: raw buffer pointer to the packet, if NULL use skb->data
* @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
* @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
* @hlen: packet header length, if @data is NULL use skb_headlen(skb)
*
- * The function will try to retrieve the struct flow_keys from either the skbuff
- * or a raw buffer specified by the rest parameters
+ * The function will try to retrieve individual keys into the target specified
+ * by flow_dissector from either the skbuff or a raw buffer specified by the
+ * remaining parameters.
+ *
+ * Caller must take care of zeroing target container memory.
*/
-bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
+bool __skb_flow_dissect(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container,
void *data, __be16 proto, int nhoff, int hlen)
{
- u8 ip_proto;
+ struct flow_dissector_key_control *key_control;
+ struct flow_dissector_key_basic *key_basic;
+ struct flow_dissector_key_addrs *key_addrs;
+ struct flow_dissector_key_ports *key_ports;
+ struct flow_dissector_key_tags *key_tags;
+ struct flow_dissector_key_keyid *key_keyid;
+ u8 ip_proto = 0;
if (!data) {
data = skb->data;
@@ -83,7 +138,30 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
hlen = skb_headlen(skb);
}
- memset(flow, 0, sizeof(*flow));
+ /* It is ensured by skb_flow_dissector_init() that the control key
+ * will always be present.
+ */
+ key_control = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_CONTROL,
+ target_container);
+
+ /* It is ensured by skb_flow_dissector_init() that the basic key
+ * will always be present.
+ */
+ key_basic = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_BASIC,
+ target_container);
+
+ if (skb_flow_dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+ struct ethhdr *eth = eth_hdr(skb);
+ struct flow_dissector_key_eth_addrs *key_eth_addrs;
+
+ key_eth_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ETH_ADDRS,
+ target_container);
+ memcpy(key_eth_addrs, &eth->h_dest, sizeof(*key_eth_addrs));
+ }
again:
switch (proto) {
@@ -100,14 +178,15 @@ ip:
if (ip_is_fragment(iph))
ip_proto = 0;
- /* skip the address processing if skb is NULL. The assumption
- * here is that if there is no skb we are not looking for flow
- * info but lengths and protocols.
- */
- if (!skb)
+ if (!skb_flow_dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS))
break;
- iph_to_flow_copy_addrs(flow, iph);
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container);
+ memcpy(&key_addrs->v4addrs, &iph->saddr,
+ sizeof(key_addrs->v4addrs));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
break;
}
case htons(ETH_P_IPV6): {
@@ -123,25 +202,27 @@ ipv6:
ip_proto = iph->nexthdr;
nhoff += sizeof(struct ipv6hdr);
- /* see comment above in IPv4 section */
- if (!skb)
- break;
+ if (skb_flow_dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+ struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs;
+
+ key_ipv6_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ target_container);
- flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
- flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
+ memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ }
flow_label = ip6_flowlabel(iph);
if (flow_label) {
- /* Awesome, IPv6 packet has a flow label so we can
- * use that to represent the ports without any
- * further dissection.
- */
- flow->n_proto = proto;
- flow->ip_proto = ip_proto;
- flow->ports = flow_label;
- flow->thoff = (u16)nhoff;
-
- return true;
+ if (skb_flow_dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+ key_tags = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL,
+ target_container);
+ key_tags->flow_label = ntohl(flow_label);
+ }
}
break;
@@ -155,6 +236,15 @@ ipv6:
if (!vlan)
return false;
+ if (skb_flow_dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_VLANID)) {
+ key_tags = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_VLANID,
+ target_container);
+
+ key_tags->vlan_id = skb_vlan_tag_get_id(skb);
+ }
+
proto = vlan->h_vlan_encapsulated_proto;
nhoff += sizeof(*vlan);
goto again;
@@ -186,19 +276,58 @@ ipv6:
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr)
return false;
- flow->src = hdr->srcnode;
- flow->dst = 0;
- flow->n_proto = proto;
- flow->thoff = (u16)nhoff;
+ key_basic->n_proto = proto;
+ key_control->thoff = (u16)nhoff;
+
+ if (skb_flow_dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_TIPC_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_TIPC_ADDRS,
+ target_container);
+ key_addrs->tipcaddrs.srcnode = hdr->srcnode;
+ key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS;
+ }
+ return true;
+ }
+
+ case htons(ETH_P_MPLS_UC):
+ case htons(ETH_P_MPLS_MC): {
+ struct mpls_label *hdr, _hdr[2];
+mpls:
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
+ hlen, &_hdr);
+ if (!hdr)
+ return false;
+
+ if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >>
+ MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) {
+ if (skb_flow_dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
+ key_keyid = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
+ target_container);
+ key_keyid->keyid = hdr[1].entry &
+ htonl(MPLS_LS_LABEL_MASK);
+ }
+
+ key_basic->n_proto = proto;
+ key_basic->ip_proto = ip_proto;
+ key_control->thoff = (u16)nhoff;
+
+ return true;
+ }
+
return true;
}
+
case htons(ETH_P_FCOE):
- flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
+ key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
/* fall through */
default:
return false;
}
+ip_proto_again:
switch (ip_proto) {
case IPPROTO_GRE: {
struct gre_hdr {
@@ -213,30 +342,65 @@ ipv6:
* Only look inside GRE if version zero and no
* routing
*/
- if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
- proto = hdr->proto;
+ if (hdr->flags & (GRE_VERSION | GRE_ROUTING))
+ break;
+
+ proto = hdr->proto;
+ nhoff += 4;
+ if (hdr->flags & GRE_CSUM)
nhoff += 4;
- if (hdr->flags & GRE_CSUM)
- nhoff += 4;
- if (hdr->flags & GRE_KEY)
- nhoff += 4;
- if (hdr->flags & GRE_SEQ)
- nhoff += 4;
- if (proto == htons(ETH_P_TEB)) {
- const struct ethhdr *eth;
- struct ethhdr _eth;
-
- eth = __skb_header_pointer(skb, nhoff,
- sizeof(_eth),
- data, hlen, &_eth);
- if (!eth)
- return false;
- proto = eth->h_proto;
- nhoff += sizeof(*eth);
+ if (hdr->flags & GRE_KEY) {
+ const __be32 *keyid;
+ __be32 _keyid;
+
+ keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid),
+ data, hlen, &_keyid);
+
+ if (!keyid)
+ return false;
+
+ if (skb_flow_dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_GRE_KEYID)) {
+ key_keyid = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_GRE_KEYID,
+ target_container);
+ key_keyid->keyid = *keyid;
}
- goto again;
+ nhoff += 4;
}
- break;
+ if (hdr->flags & GRE_SEQ)
+ nhoff += 4;
+ if (proto == htons(ETH_P_TEB)) {
+ const struct ethhdr *eth;
+ struct ethhdr _eth;
+
+ eth = __skb_header_pointer(skb, nhoff,
+ sizeof(_eth),
+ data, hlen, &_eth);
+ if (!eth)
+ return false;
+ proto = eth->h_proto;
+ nhoff += sizeof(*eth);
+ }
+ goto again;
+ }
+ case NEXTHDR_HOP:
+ case NEXTHDR_ROUTING:
+ case NEXTHDR_DEST: {
+ u8 _opthdr[2], *opthdr;
+
+ if (proto != htons(ETH_P_IPV6))
+ break;
+
+ opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr),
+ data, hlen, &_opthdr);
+ if (!opthdr)
+ return false;
+
+ ip_proto = opthdr[0];
+ nhoff += (opthdr[1] + 1) << 3;
+
+ goto ip_proto_again;
}
case IPPROTO_IPIP:
proto = htons(ETH_P_IP);
@@ -244,18 +408,25 @@ ipv6:
case IPPROTO_IPV6:
proto = htons(ETH_P_IPV6);
goto ipv6;
+ case IPPROTO_MPLS:
+ proto = htons(ETH_P_MPLS_UC);
+ goto mpls;
default:
break;
}
- flow->n_proto = proto;
- flow->ip_proto = ip_proto;
- flow->thoff = (u16) nhoff;
-
- /* unless skb is set we don't need to record port info */
- if (skb)
- flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
- data, hlen);
+ key_basic->n_proto = proto;
+ key_basic->ip_proto = ip_proto;
+ key_control->thoff = (u16)nhoff;
+
+ if (skb_flow_dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS)) {
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS,
+ target_container);
+ key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
+ data, hlen);
+ }
return true;
}
@@ -267,27 +438,109 @@ static __always_inline void __flow_hash_secret_init(void)
net_get_random_once(&hashrnd, sizeof(hashrnd));
}
-static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c)
+static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval)
{
- __flow_hash_secret_init();
- return jhash_3words(a, b, c, hashrnd);
+ return jhash2(words, length, keyval);
}
-static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
+static inline void *flow_keys_hash_start(struct flow_keys *flow)
{
- u32 hash;
+ BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32));
+ return (void *)flow + FLOW_KEYS_HASH_OFFSET;
+}
+
+static inline size_t flow_keys_hash_length(struct flow_keys *flow)
+{
+ size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
+ BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
+ BUILD_BUG_ON(offsetof(typeof(*flow), addrs) !=
+ sizeof(*flow) - sizeof(flow->addrs));
+
+ switch (flow->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ diff -= sizeof(flow->addrs.v4addrs);
+ break;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ diff -= sizeof(flow->addrs.v6addrs);
+ break;
+ case FLOW_DISSECTOR_KEY_TIPC_ADDRS:
+ diff -= sizeof(flow->addrs.tipcaddrs);
+ break;
+ }
+ return (sizeof(*flow) - diff) / sizeof(u32);
+}
+
+__be32 flow_get_u32_src(const struct flow_keys *flow)
+{
+ switch (flow->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ return flow->addrs.v4addrs.src;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ return (__force __be32)ipv6_addr_hash(
+ &flow->addrs.v6addrs.src);
+ case FLOW_DISSECTOR_KEY_TIPC_ADDRS:
+ return flow->addrs.tipcaddrs.srcnode;
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(flow_get_u32_src);
- /* get a consistent hash (same value on both flow directions) */
- if (((__force u32)keys->dst < (__force u32)keys->src) ||
- (((__force u32)keys->dst == (__force u32)keys->src) &&
- ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) {
- swap(keys->dst, keys->src);
- swap(keys->port16[0], keys->port16[1]);
+__be32 flow_get_u32_dst(const struct flow_keys *flow)
+{
+ switch (flow->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ return flow->addrs.v4addrs.dst;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ return (__force __be32)ipv6_addr_hash(
+ &flow->addrs.v6addrs.dst);
+ default:
+ return 0;
}
+}
+EXPORT_SYMBOL(flow_get_u32_dst);
- hash = __flow_hash_3words((__force u32)keys->dst,
- (__force u32)keys->src,
- (__force u32)keys->ports);
+static inline void __flow_hash_consistentify(struct flow_keys *keys)
+{
+ int addr_diff, i;
+
+ switch (keys->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ addr_diff = (__force u32)keys->addrs.v4addrs.dst -
+ (__force u32)keys->addrs.v4addrs.src;
+ if ((addr_diff < 0) ||
+ (addr_diff == 0 &&
+ ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src))) {
+ swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
+ swap(keys->ports.src, keys->ports.dst);
+ }
+ break;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ addr_diff = memcmp(&keys->addrs.v6addrs.dst,
+ &keys->addrs.v6addrs.src,
+ sizeof(keys->addrs.v6addrs.dst));
+ if ((addr_diff < 0) ||
+ (addr_diff == 0 &&
+ ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src))) {
+ for (i = 0; i < 4; i++)
+ swap(keys->addrs.v6addrs.src.s6_addr32[i],
+ keys->addrs.v6addrs.dst.s6_addr32[i]);
+ swap(keys->ports.src, keys->ports.dst);
+ }
+ break;
+ }
+}
+
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
+{
+ u32 hash;
+
+ __flow_hash_consistentify(keys);
+
+ hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys),
+ flow_keys_hash_length(keys), keyval);
if (!hash)
hash = 1;
@@ -296,12 +549,52 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
u32 flow_hash_from_keys(struct flow_keys *keys)
{
- return __flow_hash_from_keys(keys);
+ __flow_hash_secret_init();
+ return __flow_hash_from_keys(keys, hashrnd);
}
EXPORT_SYMBOL(flow_hash_from_keys);
-/*
- * __skb_get_hash: calculate a flow hash based on src/dst addresses
+static inline u32 ___skb_get_hash(const struct sk_buff *skb,
+ struct flow_keys *keys, u32 keyval)
+{
+ if (!skb_flow_dissect_flow_keys(skb, keys))
+ return 0;
+
+ return __flow_hash_from_keys(keys, keyval);
+}
+
+struct _flow_keys_digest_data {
+ __be16 n_proto;
+ u8 ip_proto;
+ u8 padding;
+ __be32 ports;
+ __be32 src;
+ __be32 dst;
+};
+
+void make_flow_keys_digest(struct flow_keys_digest *digest,
+ const struct flow_keys *flow)
+{
+ struct _flow_keys_digest_data *data =
+ (struct _flow_keys_digest_data *)digest;
+
+ BUILD_BUG_ON(sizeof(*data) > sizeof(*digest));
+
+ memset(digest, 0, sizeof(*digest));
+
+ data->n_proto = flow->basic.n_proto;
+ data->ip_proto = flow->basic.ip_proto;
+ data->ports = flow->ports.ports;
+ data->src = flow->addrs.v4addrs.src;
+ data->dst = flow->addrs.v4addrs.dst;
+}
+EXPORT_SYMBOL(make_flow_keys_digest);
+
+/**
+ * __skb_get_hash: calculate a flow hash
+ * @skb: sk_buff to calculate flow hash from
+ *
+ * This function calculates a flow hash based on src/dst addresses
* and src/dst port numbers. Sets hash in skb to non-zero hash value
* on success, zero indicates no valid hash. Also, sets l4_hash in skb
* if hash is a canonical 4-tuple hash over transport ports.
@@ -309,53 +602,34 @@ EXPORT_SYMBOL(flow_hash_from_keys);
void __skb_get_hash(struct sk_buff *skb)
{
struct flow_keys keys;
+ u32 hash;
- if (!skb_flow_dissect(skb, &keys))
- return;
+ __flow_hash_secret_init();
- if (keys.ports)
+ hash = ___skb_get_hash(skb, &keys, hashrnd);
+ if (!hash)
+ return;
+ if (keys.ports.ports)
skb->l4_hash = 1;
-
skb->sw_hash = 1;
-
- skb->hash = __flow_hash_from_keys(&keys);
+ skb->hash = hash;
}
EXPORT_SYMBOL(__skb_get_hash);
-/*
- * Returns a Tx hash based on the given packet descriptor a Tx queues' number
- * to be used as a distribution range.
- */
-u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
- unsigned int num_tx_queues)
+__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
{
- u32 hash;
- u16 qoffset = 0;
- u16 qcount = num_tx_queues;
-
- if (skb_rx_queue_recorded(skb)) {
- hash = skb_get_rx_queue(skb);
- while (unlikely(hash >= num_tx_queues))
- hash -= num_tx_queues;
- return hash;
- }
-
- if (dev->num_tc) {
- u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
- qoffset = dev->tc_to_txq[tc].offset;
- qcount = dev->tc_to_txq[tc].count;
- }
+ struct flow_keys keys;
- return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
+ return ___skb_get_hash(skb, &keys, perturb);
}
-EXPORT_SYMBOL(__skb_tx_hash);
+EXPORT_SYMBOL(skb_get_hash_perturb);
u32 __skb_get_poff(const struct sk_buff *skb, void *data,
const struct flow_keys *keys, int hlen)
{
- u32 poff = keys->thoff;
+ u32 poff = keys->control.thoff;
- switch (keys->ip_proto) {
+ switch (keys->basic.ip_proto) {
case IPPROTO_TCP: {
/* access doff as u8 to avoid unaligned access */
const u8 *doff;
@@ -396,8 +670,12 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
return poff;
}
-/* skb_get_poff() returns the offset to the payload as far as it could
- * be dissected. The main user is currently BPF, so that we can dynamically
+/**
+ * skb_get_poff - get the offset to the payload
+ * @skb: sk_buff to get the payload offset from
+ *
+ * The function will get the offset to the payload as far as it could
+ * be dissected. The main user is currently BPF, so that we can dynamically
* truncate packets without needing to push actual payload to the user
* space and can analyze headers only, instead.
*/
@@ -405,86 +683,76 @@ u32 skb_get_poff(const struct sk_buff *skb)
{
struct flow_keys keys;
- if (!skb_flow_dissect(skb, &keys))
+ if (!skb_flow_dissect_flow_keys(skb, &keys))
return 0;
return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
}
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+static const struct flow_dissector_key flow_keys_dissector_keys[] = {
+ {
+ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
+ .offset = offsetof(struct flow_keys, control),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_BASIC,
+ .offset = offsetof(struct flow_keys, basic),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.v4addrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.v6addrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.tipcaddrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_PORTS,
+ .offset = offsetof(struct flow_keys, ports),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_VLANID,
+ .offset = offsetof(struct flow_keys, tags),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
+ .offset = offsetof(struct flow_keys, tags),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
+ .offset = offsetof(struct flow_keys, keyid),
+ },
+};
+
+static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
+ {
+ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
+ .offset = offsetof(struct flow_keys, control),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_BASIC,
+ .offset = offsetof(struct flow_keys, basic),
+ },
+};
+
+struct flow_dissector flow_keys_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_dissector);
+
+struct flow_dissector flow_keys_buf_dissector __read_mostly;
+
+static int __init init_default_flow_dissectors(void)
{
-#ifdef CONFIG_XPS
- struct xps_dev_maps *dev_maps;
- struct xps_map *map;
- int queue_index = -1;
-
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_maps);
- if (dev_maps) {
- map = rcu_dereference(
- dev_maps->cpu_map[skb->sender_cpu - 1]);
- if (map) {
- if (map->len == 1)
- queue_index = map->queues[0];
- else
- queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
- map->len)];
- if (unlikely(queue_index >= dev->real_num_tx_queues))
- queue_index = -1;
- }
- }
- rcu_read_unlock();
-
- return queue_index;
-#else
- return -1;
-#endif
-}
-
-static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
-{
- struct sock *sk = skb->sk;
- int queue_index = sk_tx_queue_get(sk);
-
- if (queue_index < 0 || skb->ooo_okay ||
- queue_index >= dev->real_num_tx_queues) {
- int new_index = get_xps_queue(dev, skb);
- if (new_index < 0)
- new_index = skb_tx_hash(dev, skb);
-
- if (queue_index != new_index && sk &&
- rcu_access_pointer(sk->sk_dst_cache))
- sk_tx_queue_set(sk, new_index);
-
- queue_index = new_index;
- }
-
- return queue_index;
+ skb_flow_dissector_init(&flow_keys_dissector,
+ flow_keys_dissector_keys,
+ ARRAY_SIZE(flow_keys_dissector_keys));
+ skb_flow_dissector_init(&flow_keys_buf_dissector,
+ flow_keys_buf_dissector_keys,
+ ARRAY_SIZE(flow_keys_buf_dissector_keys));
+ return 0;
}
-struct netdev_queue *netdev_pick_tx(struct net_device *dev,
- struct sk_buff *skb,
- void *accel_priv)
-{
- int queue_index = 0;
-
-#ifdef CONFIG_XPS
- if (skb->sender_cpu == 0)
- skb->sender_cpu = raw_smp_processor_id() + 1;
-#endif
-
- if (dev->real_num_tx_queues != 1) {
- const struct net_device_ops *ops = dev->netdev_ops;
- if (ops->ndo_select_queue)
- queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
- __netdev_pick_tx);
- else
- queue_index = __netdev_pick_tx(dev, skb);
-
- if (!accel_priv)
- queue_index = netdev_cap_txqueue(dev, queue_index);
- }
-
- skb_set_queue_mapping(skb, queue_index);
- return netdev_get_tx_queue(dev, queue_index);
-}
+late_initcall_sync(init_default_flow_dissectors);
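
For orientation, a minimal sketch of how a user of the new dissector API might describe its own target container and register it, mirroring the flow_keys_dissector_keys[] setup above. The my_* names and the chosen key set are illustrative only, not part of the patch.

#include <linux/kernel.h>
#include <linux/cache.h>
#include <net/flow_dissector.h>

/* Hypothetical per-user target container; each registered key records the
 * offset at which __skb_flow_dissect() stores the corresponding value.
 */
struct my_keys {
	struct flow_dissector_key_control control;	/* mandatory */
	struct flow_dissector_key_basic basic;		/* mandatory */
	struct flow_dissector_key_ports ports;
};

static const struct flow_dissector_key my_keys_dissector_keys[] = {
	{ .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct my_keys, control) },
	{ .key_id = FLOW_DISSECTOR_KEY_BASIC,   .offset = offsetof(struct my_keys, basic) },
	{ .key_id = FLOW_DISSECTOR_KEY_PORTS,   .offset = offsetof(struct my_keys, ports) },
};

static struct flow_dissector my_keys_dissector __read_mostly;

static int __init my_keys_dissector_init(void)
{
	/* CONTROL and BASIC must both be listed; skb_flow_dissector_init()
	 * BUG()s otherwise, so the fast path never has to check for them.
	 */
	skb_flow_dissector_init(&my_keys_dissector, my_keys_dissector_keys,
				ARRAY_SIZE(my_keys_dissector_keys));
	return 0;
}
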
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9dfb88a933e7..92d886f4adcb 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -66,7 +66,7 @@
NOTES.
- * avbps is scaled by 2^5, avpps is scaled by 2^10.
+ * avbps and avpps are scaled by 2^5.
* both values are reported as 32 bit unsigned values. bps can
overflow for fast links : max speed being 34360Mbit/sec
* Minimal interval is HZ/4=250msec (it is the greatest common divisor
@@ -85,10 +85,10 @@ struct gen_estimator
struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
int ewma_log;
+ u32 last_packets;
+ unsigned long avpps;
u64 last_bytes;
u64 avbps;
- u32 last_packets;
- u32 avpps;
struct rcu_head e_rcu;
struct rb_node node;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
@@ -118,8 +118,8 @@ static void est_timer(unsigned long arg)
rcu_read_lock();
list_for_each_entry_rcu(e, &elist[idx].list, list) {
struct gnet_stats_basic_packed b = {0};
+ unsigned long rate;
u64 brate;
- u32 rate;
spin_lock(e->stats_lock);
read_lock(&est_lock);
@@ -133,10 +133,11 @@ static void est_timer(unsigned long arg)
e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
e->rate_est->bps = (e->avbps+0xF)>>5;
- rate = (b.packets - e->last_packets)<<(12 - idx);
+ rate = b.packets - e->last_packets;
+ rate <<= (7 - idx);
e->last_packets = b.packets;
e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
- e->rate_est->pps = (e->avpps+0x1FF)>>10;
+ e->rate_est->pps = (e->avpps + 0xF) >> 5;
skip:
read_unlock(&est_lock);
spin_unlock(e->stats_lock);
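
One way to read the avpps change above: the packet rate is now kept in the same 2^5 fixed-point format as avbps, so the reported value is pps = (avpps + 0xF) >> 5, a divide by 32 with rounding. For example, an avpps of 3200 reports as (3200 + 15) / 32 = 100 packets/s, where the old code divided a 2^10-scaled value by 1024 instead.
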
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 49a9e3e06c08..982861607f88 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -40,7 +40,7 @@ static DEFINE_SPINLOCK(lweventlist_lock);
static unsigned char default_operstate(const struct net_device *dev)
{
if (!netif_carrier_ok(dev))
- return (dev->ifindex != dev->iflink ?
+ return (dev->ifindex != dev_get_iflink(dev) ?
IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN);
if (netif_dormant(dev))
@@ -89,7 +89,7 @@ static bool linkwatch_urgent_event(struct net_device *dev)
if (!netif_running(dev))
return false;
- if (dev->ifindex != dev->iflink)
+ if (dev->ifindex != dev_get_iflink(dev))
return true;
if (dev->priv_flags & IFF_TEAM_PORT)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 70fe9e10ac86..84195dacb8b6 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -397,25 +397,15 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
struct net_device *dev)
{
struct neighbour *n;
- int key_len = tbl->key_len;
- u32 hash_val;
- struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, lookups);
rcu_read_lock_bh();
- nht = rcu_dereference_bh(tbl->nht);
- hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
-
- for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
- n != NULL;
- n = rcu_dereference_bh(n->next)) {
- if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
- if (!atomic_inc_not_zero(&n->refcnt))
- n = NULL;
- NEIGH_CACHE_STAT_INC(tbl, hits);
- break;
- }
+ n = __neigh_lookup_noref(tbl, pkey, dev);
+ if (n) {
+ if (!atomic_inc_not_zero(&n->refcnt))
+ n = NULL;
+ NEIGH_CACHE_STAT_INC(tbl, hits);
}
rcu_read_unlock_bh();
@@ -601,7 +591,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
if (!n)
goto out;
- write_pnet(&n->net, hold_net(net));
+ write_pnet(&n->net, net);
memcpy(n->key, pkey, key_len);
n->dev = dev;
if (dev)
@@ -610,7 +600,6 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
if (tbl->pconstructor && tbl->pconstructor(n)) {
if (dev)
dev_put(dev);
- release_net(net);
kfree(n);
n = NULL;
goto out;
@@ -644,7 +633,6 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
tbl->pdestructor(n);
if (n->dev)
dev_put(n->dev);
- release_net(pneigh_net(n));
kfree(n);
return 0;
}
@@ -667,7 +655,6 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
tbl->pdestructor(n);
if (n->dev)
dev_put(n->dev);
- release_net(pneigh_net(n));
kfree(n);
continue;
}
@@ -830,10 +817,9 @@ out:
static __inline__ int neigh_max_probes(struct neighbour *n)
{
struct neigh_parms *p = n->parms;
- int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES);
- if (!(n->nud_state & NUD_PROBE))
- max_probes += NEIGH_VAR(p, MCAST_PROBES);
- return max_probes;
+ return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) +
+ (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) :
+ NEIGH_VAR(p, MCAST_PROBES));
}
static void neigh_invalidate(struct neighbour *neigh)
@@ -927,6 +913,7 @@ static void neigh_timer_handler(unsigned long arg)
neigh->nud_state = NUD_PROBE;
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
+ notify = 1;
next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
}
} else {
@@ -971,6 +958,8 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
rc = 0;
if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
goto out_unlock_bh;
+ if (neigh->dead)
+ goto out_dead;
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
@@ -1027,6 +1016,13 @@ out_unlock_bh:
write_unlock(&neigh->lock);
local_bh_enable();
return rc;
+
+out_dead:
+ if (neigh->nud_state & NUD_STALE)
+ goto out_unlock_bh;
+ write_unlock_bh(&neigh->lock);
+ kfree_skb(skb);
+ return 1;
}
EXPORT_SYMBOL(__neigh_event_send);
@@ -1090,6 +1086,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
(old & (NUD_NOARP | NUD_PERMANENT)))
goto out;
+ if (neigh->dead)
+ goto out;
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
@@ -1158,6 +1156,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
if (new != old) {
neigh_del_timer(neigh);
+ if (new & NUD_PROBE)
+ atomic_set(&neigh->probes, 0);
if (new & NUD_IN_TIMER)
neigh_add_timer(neigh, (jiffies +
((new & NUD_REACHABLE) ?
@@ -1239,6 +1239,8 @@ EXPORT_SYMBOL(neigh_update);
*/
void __neigh_set_probe_once(struct neighbour *neigh)
{
+ if (neigh->dead)
+ return;
neigh->updated = jiffies;
if (!(neigh->nud_state & NUD_FAILED))
return;
@@ -1263,10 +1265,10 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
EXPORT_SYMBOL(neigh_event_ns);
/* called with read_lock_bh(&n->lock); */
-static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst)
+static void neigh_hh_init(struct neighbour *n)
{
- struct net_device *dev = dst->dev;
- __be16 prot = dst->ops->protocol;
+ struct net_device *dev = n->dev;
+ __be16 prot = n->tbl->protocol;
struct hh_cache *hh = &n->hh;
write_lock_bh(&n->lock);
@@ -1280,43 +1282,19 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst)
write_unlock_bh(&n->lock);
}
-/* This function can be used in contexts, where only old dev_queue_xmit
- * worked, f.e. if you want to override normal output path (eql, shaper),
- * but resolution is not made yet.
- */
-
-int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
-
- __skb_pull(skb, skb_network_offset(skb));
-
- if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,
- skb->len) < 0 &&
- dev_rebuild_header(skb))
- return 0;
-
- return dev_queue_xmit(skb);
-}
-EXPORT_SYMBOL(neigh_compat_output);
-
/* Slow and careful. */
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
int rc = 0;
- if (!dst)
- goto discard;
-
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
unsigned int seq;
if (dev->header_ops->cache && !neigh->hh.hh_len)
- neigh_hh_init(neigh, dst);
+ neigh_hh_init(neigh);
do {
__skb_pull(skb, skb_network_offset(skb));
@@ -1332,8 +1310,6 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
}
out:
return rc;
-discard:
- neigh_dbg(1, "%s: dst=%p neigh=%p\n", __func__, dst, neigh);
out_kfree_skb:
rc = -EINVAL;
kfree_skb(skb);
@@ -1464,11 +1440,10 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
dev_hold(dev);
p->dev = dev;
- write_pnet(&p->net, hold_net(net));
+ write_pnet(&p->net, net);
p->sysctl_table = NULL;
if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
- release_net(net);
dev_put(dev);
kfree(p);
return NULL;
@@ -1508,7 +1483,6 @@ EXPORT_SYMBOL(neigh_parms_release);
static void neigh_parms_destroy(struct neigh_parms *parms)
{
- release_net(neigh_parms_net(parms));
kfree(parms);
}
@@ -1783,6 +1757,8 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
NEIGH_VAR(parms, UCAST_PROBES)) ||
nla_put_u32(skb, NDTPA_MCAST_PROBES,
NEIGH_VAR(parms, MCAST_PROBES)) ||
+ nla_put_u32(skb, NDTPA_MCAST_REPROBES,
+ NEIGH_VAR(parms, MCAST_REPROBES)) ||
nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) ||
nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
NEIGH_VAR(parms, BASE_REACHABLE_TIME)) ||
@@ -1942,6 +1918,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_APP_PROBES] = { .type = NLA_U32 },
[NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
[NDTPA_MCAST_PROBES] = { .type = NLA_U32 },
+ [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 },
[NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 },
[NDTPA_GC_STALETIME] = { .type = NLA_U64 },
[NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 },
@@ -2042,6 +2019,10 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
NEIGH_VAR_SET(p, MCAST_PROBES,
nla_get_u32(tbp[i]));
break;
+ case NDTPA_MCAST_REPROBES:
+ NEIGH_VAR_SET(p, MCAST_REPROBES,
+ nla_get_u32(tbp[i]));
+ break;
case NDTPA_BASE_REACHABLE_TIME:
NEIGH_VAR_SET(p, BASE_REACHABLE_TIME,
nla_get_msecs(tbp[i]));
@@ -2427,6 +2408,40 @@ void __neigh_for_each_release(struct neigh_table *tbl,
}
EXPORT_SYMBOL(__neigh_for_each_release);
+int neigh_xmit(int index, struct net_device *dev,
+ const void *addr, struct sk_buff *skb)
+{
+ int err = -EAFNOSUPPORT;
+ if (likely(index < NEIGH_NR_TABLES)) {
+ struct neigh_table *tbl;
+ struct neighbour *neigh;
+
+ tbl = neigh_tables[index];
+ if (!tbl)
+ goto out;
+ neigh = __neigh_lookup_noref(tbl, addr, dev);
+ if (!neigh)
+ neigh = __neigh_create(tbl, addr, dev, false);
+ err = PTR_ERR(neigh);
+ if (IS_ERR(neigh))
+ goto out_kfree_skb;
+ err = neigh->output(neigh, skb);
+ }
+ else if (index == NEIGH_LINK_TABLE) {
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ addr, NULL, skb->len);
+ if (err < 0)
+ goto out_kfree_skb;
+ err = dev_queue_xmit(skb);
+ }
+out:
+ return err;
+out_kfree_skb:
+ kfree_skb(skb);
+ goto out;
+}
+EXPORT_SYMBOL(neigh_xmit);
+
#ifdef CONFIG_PROC_FS
static struct neighbour *neigh_get_first(struct seq_file *seq)
@@ -2994,6 +3009,7 @@ static struct neigh_sysctl_table {
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"),
NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
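
As a rough sketch of how the new neigh_xmit() helper above might be called: the wrapper below, its name, and the assumption that the caller provides RCU-bh protection around the lockless lookup are all illustrative.

#include <net/neighbour.h>

/* Hypothetical caller: hand an skb to L2 resolution for an IPv4 next hop.
 * neigh_xmit() looks up (or creates) the neighbour entry for @nh in the
 * ARP table and invokes its output function.
 */
static int example_xmit_ipv4(struct net_device *dev, __be32 nh,
			     struct sk_buff *skb)
{
	int err;

	rcu_read_lock_bh();	/* __neigh_lookup_noref() is RCU-bh based */
	err = neigh_xmit(NEIGH_ARP_TABLE, dev, &nh, skb);
	rcu_read_unlock_bh();

	return err;
}
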
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index f2aa73bfb0e4..18b34d771ed4 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -23,6 +23,7 @@
#include <linux/export.h>
#include <linux/jiffies.h>
#include <linux/pm_runtime.h>
+#include <linux/of.h>
#include "net-sysfs.h"
@@ -108,11 +109,19 @@ NETDEVICE_SHOW_RO(dev_id, fmt_hex);
NETDEVICE_SHOW_RO(dev_port, fmt_dec);
NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);
NETDEVICE_SHOW_RO(addr_len, fmt_dec);
-NETDEVICE_SHOW_RO(iflink, fmt_dec);
NETDEVICE_SHOW_RO(ifindex, fmt_dec);
NETDEVICE_SHOW_RO(type, fmt_dec);
NETDEVICE_SHOW_RO(link_mode, fmt_dec);
+static ssize_t iflink_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
+
+ return sprintf(buf, fmt_dec, dev_get_iflink(ndev));
+}
+static DEVICE_ATTR_RO(iflink);
+
static ssize_t format_name_assign_type(const struct net_device *dev, char *buf)
{
return sprintf(buf, fmt_dec, dev->name_assign_type);
@@ -417,6 +426,28 @@ static ssize_t phys_port_id_show(struct device *dev,
}
static DEVICE_ATTR_RO(phys_port_id);
+static ssize_t phys_port_name_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ char name[IFNAMSIZ];
+
+ ret = dev_get_phys_port_name(netdev, name, sizeof(name));
+ if (!ret)
+ ret = sprintf(buf, "%s\n", name);
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_port_name);
+
static ssize_t phys_switch_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -427,11 +458,15 @@ static ssize_t phys_switch_id_show(struct device *dev,
return restart_syscall();
if (dev_isalive(netdev)) {
- struct netdev_phys_item_id ppid;
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
- ret = netdev_switch_parent_id_get(netdev, &ppid);
+ ret = switchdev_port_attr_get(netdev, &attr);
if (!ret)
- ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ ret = sprintf(buf, "%*phN\n", attr.u.ppid.id_len,
+ attr.u.ppid.id);
}
rtnl_unlock();
@@ -464,6 +499,7 @@ static struct attribute *net_class_attrs[] = {
&dev_attr_tx_queue_len.attr,
&dev_attr_gro_flush_timeout.attr,
&dev_attr_phys_port_id.attr,
+ &dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr,
NULL,
};
@@ -950,6 +986,60 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue,
return sprintf(buf, "%lu", trans_timeout);
}
+#ifdef CONFIG_XPS
+static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+{
+ struct net_device *dev = queue->dev;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++)
+ if (queue == &dev->_tx[i])
+ break;
+
+ BUG_ON(i >= dev->num_tx_queues);
+
+ return i;
+}
+
+static ssize_t show_tx_maxrate(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", queue->tx_maxrate);
+}
+
+static ssize_t set_tx_maxrate(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct net_device *dev = queue->dev;
+ int err, index = get_netdev_queue_index(queue);
+ u32 rate = 0;
+
+ err = kstrtou32(buf, 10, &rate);
+ if (err < 0)
+ return err;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ err = -EOPNOTSUPP;
+ if (dev->netdev_ops->ndo_set_tx_maxrate)
+ err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate);
+
+ rtnl_unlock();
+ if (!err) {
+ queue->tx_maxrate = rate;
+ return len;
+ }
+ return err;
+}
+
+static struct netdev_queue_attribute queue_tx_maxrate =
+ __ATTR(tx_maxrate, S_IRUGO | S_IWUSR,
+ show_tx_maxrate, set_tx_maxrate);
+#endif
+
static struct netdev_queue_attribute queue_trans_timeout =
__ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
@@ -1064,18 +1154,6 @@ static struct attribute_group dql_group = {
#endif /* CONFIG_BQL */
#ifdef CONFIG_XPS
-static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
-{
- struct net_device *dev = queue->dev;
- unsigned int i;
-
- i = queue - dev->_tx;
- BUG_ON(i >= dev->num_tx_queues);
-
- return i;
-}
-
-
static ssize_t show_xps_map(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute, char *buf)
{
@@ -1152,6 +1230,7 @@ static struct attribute *netdev_queue_default_attrs[] = {
&queue_trans_timeout.attr,
#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
+ &queue_tx_maxrate.attr,
#endif
NULL
};
@@ -1374,6 +1453,30 @@ static struct class net_class = {
.namespace = net_namespace,
};
+#ifdef CONFIG_OF_NET
+static int of_dev_node_match(struct device *dev, const void *data)
+{
+ int ret = 0;
+
+ if (dev->parent)
+ ret = dev->parent->of_node == data;
+
+ return ret == 0 ? dev->of_node == data : ret;
+}
+
+struct net_device *of_find_net_device_by_node(struct device_node *np)
+{
+ struct device *dev;
+
+ dev = class_find_device(&net_class, NULL, np, of_dev_node_match);
+ if (!dev)
+ return NULL;
+
+ return to_net_dev(dev);
+}
+EXPORT_SYMBOL(of_find_net_device_by_node);
+#endif
+
/* Delete sysfs entries but hold kobject reference until after all
* netdev references are gone.
*/
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 70d3450588b2..2c2eb1b629b1 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -16,7 +16,6 @@
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <linux/net_namespace.h>
-#include <linux/rtnetlink.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
@@ -148,18 +147,17 @@ static void ops_free_list(const struct pernet_operations *ops,
}
}
+/* should be called with nsid_lock held */
static int alloc_netid(struct net *net, struct net *peer, int reqid)
{
int min = 0, max = 0;
- ASSERT_RTNL();
-
if (reqid >= 0) {
min = reqid;
max = reqid + 1;
}
- return idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL);
+ return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC);
}
/* This function is used by idr_for_each(). If net is equal to peer, the
@@ -175,11 +173,16 @@ static int net_eq_idr(int id, void *net, void *peer)
return 0;
}
-static int __peernet2id(struct net *net, struct net *peer, bool alloc)
+/* Should be called with nsid_lock held. If a new id is assigned, *alloc is
+ * set to true so the caller knows that the new id must be notified via
+ * rtnl.
+ */
+static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
{
int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
+ bool alloc_it = *alloc;
- ASSERT_RTNL();
+ *alloc = false;
/* Magic value for id 0. */
if (id == NET_ID_ZERO)
@@ -187,36 +190,77 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc)
if (id > 0)
return id;
- if (alloc)
- return alloc_netid(net, peer, -1);
+ if (alloc_it) {
+ id = alloc_netid(net, peer, -1);
+ *alloc = true;
+ return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
+ }
+
+ return NETNSA_NSID_NOT_ASSIGNED;
+}
+
+/* should be called with nsid_lock held */
+static int __peernet2id(struct net *net, struct net *peer)
+{
+ bool no = false;
- return -ENOENT;
+ return __peernet2id_alloc(net, peer, &no);
}
+static void rtnl_net_notifyid(struct net *net, int cmd, int id);
/* This function returns the id of a peer netns. If no id is assigned, one will
* be allocated and returned.
*/
+int peernet2id_alloc(struct net *net, struct net *peer)
+{
+ unsigned long flags;
+ bool alloc;
+ int id;
+
+ spin_lock_irqsave(&net->nsid_lock, flags);
+ alloc = atomic_read(&peer->count) == 0 ? false : true;
+ id = __peernet2id_alloc(net, peer, &alloc);
+ spin_unlock_irqrestore(&net->nsid_lock, flags);
+ if (alloc && id >= 0)
+ rtnl_net_notifyid(net, RTM_NEWNSID, id);
+ return id;
+}
+EXPORT_SYMBOL(peernet2id_alloc);
+
+/* This function returns, if assigned, the id of a peer netns. */
int peernet2id(struct net *net, struct net *peer)
{
- bool alloc = atomic_read(&peer->count) == 0 ? false : true;
+ unsigned long flags;
int id;
- id = __peernet2id(net, peer, alloc);
- return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
+ spin_lock_irqsave(&net->nsid_lock, flags);
+ id = __peernet2id(net, peer);
+ spin_unlock_irqrestore(&net->nsid_lock, flags);
+ return id;
+}
+
+/* This function returns true if the peer netns has an id assigned in the
+ * current netns.
+ */
+bool peernet_has_id(struct net *net, struct net *peer)
+{
+ return peernet2id(net, peer) >= 0;
}
-EXPORT_SYMBOL(peernet2id);
struct net *get_net_ns_by_id(struct net *net, int id)
{
+ unsigned long flags;
struct net *peer;
if (id < 0)
return NULL;
rcu_read_lock();
+ spin_lock_irqsave(&net->nsid_lock, flags);
peer = idr_find(&net->netns_ids, id);
if (peer)
get_net(peer);
+ spin_unlock_irqrestore(&net->nsid_lock, flags);
rcu_read_unlock();
return peer;
@@ -237,10 +281,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
net->dev_base_seq = 1;
net->user_ns = user_ns;
idr_init(&net->netns_ids);
-
-#ifdef NETNS_REFCNT_DEBUG
- atomic_set(&net->use_count, 0);
-#endif
+ spin_lock_init(&net->nsid_lock);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
@@ -296,13 +337,6 @@ out_free:
static void net_free(struct net *net)
{
-#ifdef NETNS_REFCNT_DEBUG
- if (unlikely(atomic_read(&net->use_count) != 0)) {
- pr_emerg("network namespace not free! Usage: %d\n",
- atomic_read(&net->use_count));
- return;
- }
-#endif
kfree(rcu_access_pointer(net->gen));
kmem_cache_free(net_cachep, net);
}
@@ -368,12 +402,19 @@ static void cleanup_net(struct work_struct *work)
list_del_rcu(&net->list);
list_add_tail(&net->exit_list, &net_exit_list);
for_each_net(tmp) {
- int id = __peernet2id(tmp, net, false);
+ int id;
+ spin_lock_irq(&tmp->nsid_lock);
+ id = __peernet2id(tmp, net);
if (id >= 0)
idr_remove(&tmp->netns_ids, id);
+ spin_unlock_irq(&tmp->nsid_lock);
+ if (id >= 0)
+ rtnl_net_notifyid(tmp, RTM_DELNSID, id);
}
+ spin_lock_irq(&net->nsid_lock);
idr_destroy(&net->netns_ids);
+ spin_unlock_irq(&net->nsid_lock);
}
rtnl_unlock();
@@ -501,6 +542,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1];
+ unsigned long flags;
struct net *peer;
int nsid, err;
@@ -521,14 +563,19 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
if (IS_ERR(peer))
return PTR_ERR(peer);
- if (__peernet2id(net, peer, false) >= 0) {
+ spin_lock_irqsave(&net->nsid_lock, flags);
+ if (__peernet2id(net, peer) >= 0) {
+ spin_unlock_irqrestore(&net->nsid_lock, flags);
err = -EEXIST;
goto out;
}
err = alloc_netid(net, peer, nsid);
- if (err > 0)
+ spin_unlock_irqrestore(&net->nsid_lock, flags);
+ if (err >= 0) {
+ rtnl_net_notifyid(net, RTM_NEWNSID, err);
err = 0;
+ }
out:
put_net(peer);
return err;
@@ -542,13 +589,10 @@ static int rtnl_net_get_size(void)
}
static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
- int cmd, struct net *net, struct net *peer)
+ int cmd, struct net *net, int nsid)
{
struct nlmsghdr *nlh;
struct rtgenmsg *rth;
- int id;
-
- ASSERT_RTNL();
nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
if (!nlh)
@@ -557,10 +601,7 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
rth = nlmsg_data(nlh);
rth->rtgen_family = AF_UNSPEC;
- id = __peernet2id(net, peer, false);
- if (id < 0)
- id = NETNSA_NSID_NOT_ASSIGNED;
- if (nla_put_s32(skb, NETNSA_NSID, id))
+ if (nla_put_s32(skb, NETNSA_NSID, nsid))
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -576,8 +617,8 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1];
struct sk_buff *msg;
- int err = -ENOBUFS;
struct net *peer;
+ int err, id;
err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
rtnl_net_policy);
@@ -599,8 +640,9 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
goto out;
}
+ id = peernet2id(net, peer);
err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
- RTM_GETNSID, net, peer);
+ RTM_NEWNSID, net, id);
if (err < 0)
goto err_out;
@@ -614,6 +656,75 @@ out:
return err;
}
+struct rtnl_net_dump_cb {
+ struct net *net;
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
+ int idx;
+ int s_idx;
+};
+
+static int rtnl_net_dumpid_one(int id, void *peer, void *data)
+{
+ struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
+ int ret;
+
+ if (net_cb->idx < net_cb->s_idx)
+ goto cont;
+
+ ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid,
+ net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWNSID, net_cb->net, id);
+ if (ret < 0)
+ return ret;
+
+cont:
+ net_cb->idx++;
+ return 0;
+}
+
+static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct rtnl_net_dump_cb net_cb = {
+ .net = net,
+ .skb = skb,
+ .cb = cb,
+ .idx = 0,
+ .s_idx = cb->args[0],
+ };
+ unsigned long flags;
+
+ spin_lock_irqsave(&net->nsid_lock, flags);
+ idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
+ spin_unlock_irqrestore(&net->nsid_lock, flags);
+
+ cb->args[0] = net_cb.idx;
+ return skb->len;
+}
+
+static void rtnl_net_notifyid(struct net *net, int cmd, int id)
+{
+ struct sk_buff *msg;
+ int err = -ENOMEM;
+
+ msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
+ if (!msg)
+ goto out;
+
+ err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id);
+ if (err < 0)
+ goto err_out;
+
+ rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0);
+ return;
+
+err_out:
+ nlmsg_free(msg);
+out:
+ rtnl_set_sk_err(net, RTNLGRP_NSID, err);
+}
+
static int __init net_ns_init(void)
{
struct net_generic *ng;
@@ -648,7 +759,8 @@ static int __init net_ns_init(void)
register_pernet_subsys(&net_ns_ops);
rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
+ NULL);
return 0;
}
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 1f2a126f4ffa..6441f47b1a8f 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -23,7 +23,8 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state
struct cgroup_cls_state *task_cls_state(struct task_struct *p)
{
- return css_cls_state(task_css(p, net_cls_cgrp_id));
+ return css_cls_state(task_css_check(p, net_cls_cgrp_id,
+ rcu_read_lock_bh_held()));
}
EXPORT_SYMBOL_GPL(task_cls_state);
diff --git a/net/core/netevent.c b/net/core/netevent.c
index f17ccd291d39..8b3bc4fac613 100644
--- a/net/core/netevent.c
+++ b/net/core/netevent.c
@@ -31,10 +31,7 @@ static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);
*/
int register_netevent_notifier(struct notifier_block *nb)
{
- int err;
-
- err = atomic_notifier_chain_register(&netevent_notif_chain, nb);
- return err;
+ return atomic_notifier_chain_register(&netevent_notif_chain, nb);
}
EXPORT_SYMBOL_GPL(register_netevent_notifier);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 508155b283dd..1ebdf1c0d118 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -177,7 +177,7 @@
#include <asm/dma.h>
#include <asm/div64.h> /* do_div */
-#define VERSION "2.74"
+#define VERSION "2.75"
#define IP_NAME_SZ 32
#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
#define MPLS_STACK_BOTTOM htonl(0x00000100)
@@ -210,6 +210,10 @@
#define T_REMDEVALL (1<<2) /* Remove all devs */
#define T_REMDEV (1<<3) /* Remove one dev */
+/* Xmit modes */
+#define M_START_XMIT 0 /* Default normal TX */
+#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
+
/* If lock -- protects updating of if_list */
#define if_lock(t) spin_lock(&(t->if_lock));
#define if_unlock(t) spin_unlock(&(t->if_lock));
@@ -251,13 +255,14 @@ struct pktgen_dev {
* we will do a random selection from within the range.
*/
__u32 flags;
- int removal_mark; /* non-zero => the device is marked for
- * removal by worker thread */
-
+ int xmit_mode;
int min_pkt_size;
int max_pkt_size;
int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
int nfrags;
+ int removal_mark; /* non-zero => the device is marked for
+ * removal by worker thread */
+
struct page *page;
u64 delay; /* nano-seconds */
@@ -507,7 +512,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
pktgen_reset_all_threads(pn);
else
- pr_warn("Unknown command: %s\n", data);
+ return -EINVAL;
return count;
}
@@ -567,7 +572,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
" dst_min: %s dst_max: %s\n",
pkt_dev->dst_min, pkt_dev->dst_max);
seq_printf(seq,
- " src_min: %s src_max: %s\n",
+ " src_min: %s src_max: %s\n",
pkt_dev->src_min, pkt_dev->src_max);
}
@@ -620,6 +625,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->node >= 0)
seq_printf(seq, " node: %d\n", pkt_dev->node);
+ if (pkt_dev->xmit_mode == M_NETIF_RECEIVE)
+ seq_puts(seq, " xmit_mode: netif_receive\n");
+
seq_puts(seq, " Flags: ");
if (pkt_dev->flags & F_IPV6)
@@ -1081,7 +1089,8 @@ static ssize_t pktgen_if_write(struct file *file,
if (len < 0)
return len;
if ((value > 0) &&
- (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
+ ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
+ !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
return -ENOTSUPP;
i += len;
pkt_dev->clone_skb = value;
@@ -1134,7 +1143,7 @@ static ssize_t pktgen_if_write(struct file *file,
return len;
i += len;
- if ((value > 1) &&
+ if ((value > 1) && (pkt_dev->xmit_mode == M_START_XMIT) &&
(!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
return -ENOTSUPP;
pkt_dev->burst = value < 1 ? 1 : value;
@@ -1160,6 +1169,45 @@ static ssize_t pktgen_if_write(struct file *file,
sprintf(pg_result, "ERROR: node not possible");
return count;
}
+ if (!strcmp(name, "xmit_mode")) {
+ char f[32];
+
+ memset(f, 0, 32);
+ len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(f, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+
+ if (strcmp(f, "start_xmit") == 0) {
+ pkt_dev->xmit_mode = M_START_XMIT;
+ } else if (strcmp(f, "netif_receive") == 0) {
+ /* clone_skb set earlier, not supported in this mode */
+ if (pkt_dev->clone_skb > 0)
+ return -ENOTSUPP;
+
+ pkt_dev->xmit_mode = M_NETIF_RECEIVE;
+
+ /* make sure a new packet is allocated every time
+ * pktgen_xmit() is called
+ */
+ pkt_dev->last_ok = 1;
+
+ /* override clone_skb if the user passed a default value
+ * at module loading time
+ */
+ pkt_dev->clone_skb = 0;
+ } else {
+ sprintf(pg_result,
+ "xmit_mode -:%s:- unknown\nAvailable modes: %s",
+ f, "start_xmit, netif_receive\n");
+ return count;
+ }
+ sprintf(pg_result, "OK: xmit_mode=%s", f);
+ return count;
+ }
if (!strcmp(name, "flag")) {
char f[32];
memset(f, 0, 32);
@@ -1267,6 +1315,9 @@ static ssize_t pktgen_if_write(struct file *file,
else if (strcmp(f, "NO_TIMESTAMP") == 0)
pkt_dev->flags |= F_NO_TIMESTAMP;
+ else if (strcmp(f, "!NO_TIMESTAMP") == 0)
+ pkt_dev->flags &= ~F_NO_TIMESTAMP;
+
else {
sprintf(pg_result,
"Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
@@ -2212,8 +2263,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
do {
set_current_state(TASK_INTERRUPTIBLE);
hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
- if (!hrtimer_active(&t.timer))
- t.task = NULL;
if (likely(t.task))
schedule();
@@ -2594,9 +2643,9 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
int nhead = 0;
if (x) {
- int ret;
- __u8 *eth;
+ struct ethhdr *eth;
struct iphdr *iph;
+ int ret;
nhead = x->props.header_len - skb_headroom(skb);
if (nhead > 0) {
@@ -2616,9 +2665,9 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
goto err;
}
/* restore ll */
- eth = (__u8 *) skb_push(skb, ETH_HLEN);
- memcpy(eth, pkt_dev->hh, 12);
- *(u16 *) &eth[12] = protocol;
+ eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
+ memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN);
+ eth->h_proto = protocol;
/* Update IPv4 header len as well as checksum value */
iph = ip_hdr(skb);
@@ -3317,6 +3366,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
unsigned int burst = ACCESS_ONCE(pkt_dev->burst);
struct net_device *odev = pkt_dev->odev;
struct netdev_queue *txq;
+ struct sk_buff *skb;
int ret;
/* If device is offline, then don't send */
@@ -3354,6 +3404,37 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
if (pkt_dev->delay && pkt_dev->last_ok)
spin(pkt_dev, pkt_dev->next_tx);
+ if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
+ skb = pkt_dev->skb;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ atomic_add(burst, &skb->users);
+ local_bh_disable();
+ do {
+ ret = netif_receive_skb(skb);
+ if (ret == NET_RX_DROP)
+ pkt_dev->errors++;
+ pkt_dev->sofar++;
+ pkt_dev->seq_num++;
+ if (atomic_read(&skb->users) != burst) {
+ /* skb was queued by rps/rfs or taps,
+ * so cannot reuse this skb
+ */
+ atomic_sub(burst - 1, &skb->users);
+ /* get out of the loop and wait
+ * until skb is consumed
+ */
+ break;
+ }
+ /* skb was 'freed' by stack, so clean a few
+ * bits and reuse it
+ */
+#ifdef CONFIG_NET_CLS_ACT
+ skb->tc_verd = 0; /* reset reclass/redir ttl */
+#endif
+ } while (--burst > 0);
+ goto out; /* Skips xmit_mode M_START_XMIT */
+ }
+
txq = skb_get_tx_queue(odev, pkt_dev->skb);
local_bh_disable();
@@ -3401,6 +3482,7 @@ xmit_more:
unlock:
HARD_TX_UNLOCK(odev, txq);
+out:
local_bh_enable();
/* If pkt_dev->count is zero, then run forever */
@@ -3489,13 +3571,6 @@ static int pktgen_thread_worker(void *arg)
pr_debug("%s removing thread\n", t->tsk->comm);
pktgen_rem_thread(t);
- /* Wait for kthread_stop */
- while (!kthread_should_stop()) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule();
- }
- __set_current_state(TASK_RUNNING);
-
return 0;
}
@@ -3687,6 +3762,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
}
t->net = pn;
+ get_task_struct(p);
wake_up_process(p);
wait_for_completion(&t->start_done);
@@ -3809,6 +3885,7 @@ static void __net_exit pg_net_exit(struct net *net)
t = list_entry(q, struct pktgen_thread, th_list);
list_del(&t->th_list);
kthread_stop(t->tsk);
+ put_task_struct(t->tsk);
kfree(t);
}
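
A brief usage note for the new pktgen knob above: the mode is selected through the per-device proc file, e.g. echo "xmit_mode netif_receive" > /proc/net/pktgen/eth0 (eth0 standing in for whichever interface has been added to a pktgen thread), and echo "xmit_mode start_xmit" restores the default path. clone_skb must remain 0 in netif_receive mode, as enforced by the -ENOTSUPP checks, while burst is allowed and drives the skb->users-based reuse loop.
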
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 04db318e6218..87b22c0bc08c 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -58,14 +58,14 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
return -ENOMEM;
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
- rwlock_init(&queue->syn_wait_lock);
+ spin_lock_init(&queue->syn_wait_lock);
queue->rskq_accept_head = NULL;
lopt->nr_table_entries = nr_table_entries;
lopt->max_qlen_log = ilog2(nr_table_entries);
- write_lock_bh(&queue->syn_wait_lock);
+ spin_lock_bh(&queue->syn_wait_lock);
queue->listen_opt = lopt;
- write_unlock_bh(&queue->syn_wait_lock);
+ spin_unlock_bh(&queue->syn_wait_lock);
return 0;
}
@@ -81,10 +81,10 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk(
{
struct listen_sock *lopt;
- write_lock_bh(&queue->syn_wait_lock);
+ spin_lock_bh(&queue->syn_wait_lock);
lopt = queue->listen_opt;
queue->listen_opt = NULL;
- write_unlock_bh(&queue->syn_wait_lock);
+ spin_unlock_bh(&queue->syn_wait_lock);
return lopt;
}
@@ -94,21 +94,26 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
/* make all the listen_opt local to us */
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
- if (lopt->qlen != 0) {
+ if (listen_sock_qlen(lopt) != 0) {
unsigned int i;
for (i = 0; i < lopt->nr_table_entries; i++) {
struct request_sock *req;
+ spin_lock_bh(&queue->syn_wait_lock);
while ((req = lopt->syn_table[i]) != NULL) {
lopt->syn_table[i] = req->dl_next;
- lopt->qlen--;
- reqsk_free(req);
+ atomic_inc(&lopt->qlen_dec);
+ if (del_timer(&req->rsk_timer))
+ reqsk_put(req);
+ reqsk_put(req);
}
+ spin_unlock_bh(&queue->syn_wait_lock);
}
}
- WARN_ON(lopt->qlen != 0);
+ if (WARN_ON(listen_sock_qlen(lopt) != 0))
+ pr_err("qlen %u\n", listen_sock_qlen(lopt));
kvfree(lopt);
}
@@ -153,24 +158,22 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
* case might also exist in tcp_v4_hnd_req() that will trigger this locking
* order.
*
- * When a TFO req is created, it needs to sock_hold its listener to prevent
- * the latter data structure from going away.
- *
- * This function also sets "treq->listener" to NULL and unreference listener
- * socket. treq->listener is used by the listener so it is protected by the
+ * This function also sets "treq->tfo_listener" to false.
+ * treq->tfo_listener is used by the listener so it is protected by the
* fastopenq->lock in this function.
*/
void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
bool reset)
{
- struct sock *lsk = tcp_rsk(req)->listener;
- struct fastopen_queue *fastopenq =
- inet_csk(lsk)->icsk_accept_queue.fastopenq;
+ struct sock *lsk = req->rsk_listener;
+ struct fastopen_queue *fastopenq;
+
+ fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq;
tcp_sk(sk)->fastopen_rsk = NULL;
spin_lock_bh(&fastopenq->lock);
fastopenq->qlen--;
- tcp_rsk(req)->listener = NULL;
+ tcp_rsk(req)->tfo_listener = false;
if (req->sk) /* the child socket hasn't been accepted yet */
goto out;
@@ -179,8 +182,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
* special RST handling below.
*/
spin_unlock_bh(&fastopenq->lock);
- sock_put(lsk);
- reqsk_free(req);
+ reqsk_put(req);
return;
}
/* Wait for 60secs before removing a req that has triggered RST.
@@ -190,7 +192,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
*
* For more details see CoNext'11 "TCP Fast Open" paper.
*/
- req->expires = jiffies + 60*HZ;
+ req->rsk_timer.expires = jiffies + 60*HZ;
if (fastopenq->rskq_rst_head == NULL)
fastopenq->rskq_rst_head = req;
else
@@ -201,5 +203,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
fastopenq->qlen++;
out:
spin_unlock_bh(&fastopenq->lock);
- sock_put(lsk);
}
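
With syn_wait_lock converted to a plain spinlock, reqsk_queue_destroy() tears requests down purely by reference count: one put for the pending timer it managed to cancel, one for the hash table's own reference. A stripped-down sketch of that ownership rule, with hypothetical names:

#include <linux/atomic.h>
#include <linux/slab.h>
#include <linux/timer.h>

struct example_req {
        atomic_t                refcnt;         /* table + pending timer each hold one */
        struct timer_list       timer;
};

static void example_req_put(struct example_req *req)
{
        if (atomic_dec_and_test(&req->refcnt))
                kfree(req);
}

static void example_req_unlink(struct example_req *req)
{
        /* Only the path that actually cancels the pending timer may drop
         * the timer's reference; otherwise the timer callback drops it.
         */
        if (del_timer(&req->timer))
                example_req_put(req);   /* the timer's reference */
        example_req_put(req);           /* the table's reference */
}
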
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 7ebed55b5f7d..dc004b1e1f85 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -818,7 +818,20 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
nla_total_size(sizeof(struct ifla_vf_vlan)) +
nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
nla_total_size(sizeof(struct ifla_vf_rate)) +
- nla_total_size(sizeof(struct ifla_vf_link_state)));
+ nla_total_size(sizeof(struct ifla_vf_link_state)) +
+ nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
+ /* IFLA_VF_STATS_RX_PACKETS */
+ nla_total_size(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_PACKETS */
+ nla_total_size(sizeof(__u64)) +
+ /* IFLA_VF_STATS_RX_BYTES */
+ nla_total_size(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_BYTES */
+ nla_total_size(sizeof(__u64)) +
+ /* IFLA_VF_STATS_BROADCAST */
+ nla_total_size(sizeof(__u64)) +
+ /* IFLA_VF_STATS_MULTICAST */
+ nla_total_size(sizeof(__u64)));
return size;
} else
return 0;
@@ -982,19 +995,41 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
return 0;
}
+static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ char name[IFNAMSIZ];
+ int err;
+
+ err = dev_get_phys_port_name(dev, name, sizeof(name));
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
{
int err;
- struct netdev_phys_item_id psid;
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
- err = netdev_switch_parent_id_get(dev, &psid);
+ err = switchdev_port_attr_get(dev, &attr);
if (err) {
if (err == -EOPNOTSUPP)
return 0;
return err;
}
- if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id))
+ if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.u.ppid.id_len,
+ attr.u.ppid.id))
return -EMSGSIZE;
return 0;
@@ -1037,8 +1072,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
#ifdef CONFIG_RPS
nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
#endif
- (dev->ifindex != dev->iflink &&
- nla_put_u32(skb, IFLA_LINK, dev->iflink)) ||
+ (dev->ifindex != dev_get_iflink(dev) &&
+ nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) ||
(upper_dev &&
nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) ||
nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
@@ -1072,6 +1107,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
if (rtnl_phys_port_id_fill(skb, dev))
goto nla_put_failure;
+ if (rtnl_phys_port_name_fill(skb, dev))
+ goto nla_put_failure;
+
if (rtnl_phys_switch_id_fill(skb, dev))
goto nla_put_failure;
@@ -1097,7 +1135,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
&& (ext_filter_mask & RTEXT_FILTER_VF)) {
int i;
- struct nlattr *vfinfo, *vf;
+ struct nlattr *vfinfo, *vf, *vfstats;
int num_vfs = dev_num_vf(dev->dev.parent);
vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
@@ -1111,14 +1149,17 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
struct ifla_vf_tx_rate vf_tx_rate;
struct ifla_vf_spoofchk vf_spoofchk;
struct ifla_vf_link_state vf_linkstate;
+ struct ifla_vf_rss_query_en vf_rss_query_en;
+ struct ifla_vf_stats vf_stats;
/*
* Not all SR-IOV capable drivers support the
- * spoofcheck query. Preset to -1 so the user
- * space tool can detect that the driver didn't
- * report anything.
+ * spoofcheck and "RSS query enable" query. Preset to
+ * -1 so the user space tool can detect that the driver
+ * didn't report anything.
*/
ivi.spoofchk = -1;
+ ivi.rss_query_en = -1;
memset(ivi.mac, 0, sizeof(ivi.mac));
/* The default value for VF link state is "auto"
* IFLA_VF_LINK_STATE_AUTO which equals zero
@@ -1131,7 +1172,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
vf_rate.vf =
vf_tx_rate.vf =
vf_spoofchk.vf =
- vf_linkstate.vf = ivi.vf;
+ vf_linkstate.vf =
+ vf_rss_query_en.vf = ivi.vf;
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
vf_vlan.vlan = ivi.vlan;
@@ -1141,6 +1183,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
vf_rate.max_tx_rate = ivi.max_tx_rate;
vf_spoofchk.setting = ivi.spoofchk;
vf_linkstate.link_state = ivi.linkstate;
+ vf_rss_query_en.setting = ivi.rss_query_en;
vf = nla_nest_start(skb, IFLA_VF_INFO);
if (!vf) {
nla_nest_cancel(skb, vfinfo);
@@ -1155,8 +1198,35 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
&vf_spoofchk) ||
nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
- &vf_linkstate))
+ &vf_linkstate) ||
+ nla_put(skb, IFLA_VF_RSS_QUERY_EN,
+ sizeof(vf_rss_query_en),
+ &vf_rss_query_en))
goto nla_put_failure;
+ memset(&vf_stats, 0, sizeof(vf_stats));
+ if (dev->netdev_ops->ndo_get_vf_stats)
+ dev->netdev_ops->ndo_get_vf_stats(dev, i,
+ &vf_stats);
+ vfstats = nla_nest_start(skb, IFLA_VF_STATS);
+ if (!vfstats) {
+ nla_nest_cancel(skb, vf);
+ nla_nest_cancel(skb, vfinfo);
+ goto nla_put_failure;
+ }
+ if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS,
+ vf_stats.rx_packets) ||
+ nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS,
+ vf_stats.tx_packets) ||
+ nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES,
+ vf_stats.rx_bytes) ||
+ nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES,
+ vf_stats.tx_bytes) ||
+ nla_put_u64(skb, IFLA_VF_STATS_BROADCAST,
+ vf_stats.broadcast) ||
+ nla_put_u64(skb, IFLA_VF_STATS_MULTICAST,
+ vf_stats.multicast))
+ goto nla_put_failure;
+ nla_nest_end(skb, vfstats);
nla_nest_end(skb, vf);
}
nla_nest_end(skb, vfinfo);
@@ -1175,7 +1245,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
if (!net_eq(dev_net(dev), link_net)) {
- int id = peernet2id(dev_net(dev), link_net);
+ int id = peernet2id_alloc(dev_net(dev), link_net);
if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
goto nla_put_failure;
@@ -1258,10 +1328,6 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
[IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED },
};
-static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = {
- [IFLA_VF_INFO] = { .type = NLA_NESTED },
-};
-
static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
[IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
@@ -1269,6 +1335,17 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) },
[IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) },
[IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) },
+ [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) },
+ [IFLA_VF_STATS] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = {
+ [IFLA_VF_STATS_RX_PACKETS] = { .type = NLA_U64 },
+ [IFLA_VF_STATS_TX_PACKETS] = { .type = NLA_U64 },
+ [IFLA_VF_STATS_RX_BYTES] = { .type = NLA_U64 },
+ [IFLA_VF_STATS_TX_BYTES] = { .type = NLA_U64 },
+ [IFLA_VF_STATS_BROADCAST] = { .type = NLA_U64 },
+ [IFLA_VF_STATS_MULTICAST] = { .type = NLA_U64 },
};
static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
@@ -1407,85 +1484,98 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
return 0;
}
-static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
+static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
{
- int rem, err = -EINVAL;
- struct nlattr *vf;
const struct net_device_ops *ops = dev->netdev_ops;
+ int err = -EINVAL;
- nla_for_each_nested(vf, attr, rem) {
- switch (nla_type(vf)) {
- case IFLA_VF_MAC: {
- struct ifla_vf_mac *ivm;
- ivm = nla_data(vf);
- err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_mac)
- err = ops->ndo_set_vf_mac(dev, ivm->vf,
- ivm->mac);
- break;
- }
- case IFLA_VF_VLAN: {
- struct ifla_vf_vlan *ivv;
- ivv = nla_data(vf);
- err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_vlan)
- err = ops->ndo_set_vf_vlan(dev, ivv->vf,
- ivv->vlan,
- ivv->qos);
- break;
- }
- case IFLA_VF_TX_RATE: {
- struct ifla_vf_tx_rate *ivt;
- struct ifla_vf_info ivf;
- ivt = nla_data(vf);
- err = -EOPNOTSUPP;
- if (ops->ndo_get_vf_config)
- err = ops->ndo_get_vf_config(dev, ivt->vf,
- &ivf);
- if (err)
- break;
- err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_rate)
- err = ops->ndo_set_vf_rate(dev, ivt->vf,
- ivf.min_tx_rate,
- ivt->rate);
- break;
- }
- case IFLA_VF_RATE: {
- struct ifla_vf_rate *ivt;
- ivt = nla_data(vf);
- err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_rate)
- err = ops->ndo_set_vf_rate(dev, ivt->vf,
- ivt->min_tx_rate,
- ivt->max_tx_rate);
- break;
- }
- case IFLA_VF_SPOOFCHK: {
- struct ifla_vf_spoofchk *ivs;
- ivs = nla_data(vf);
- err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_spoofchk)
- err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
- ivs->setting);
- break;
- }
- case IFLA_VF_LINK_STATE: {
- struct ifla_vf_link_state *ivl;
- ivl = nla_data(vf);
- err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_link_state)
- err = ops->ndo_set_vf_link_state(dev, ivl->vf,
- ivl->link_state);
- break;
- }
- default:
- err = -EINVAL;
- break;
- }
- if (err)
- break;
+ if (tb[IFLA_VF_MAC]) {
+ struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]);
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_mac)
+ err = ops->ndo_set_vf_mac(dev, ivm->vf,
+ ivm->mac);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_VLAN]) {
+ struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]);
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_vlan)
+ err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
+ ivv->qos);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_TX_RATE]) {
+ struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]);
+ struct ifla_vf_info ivf;
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_get_vf_config)
+ err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf);
+ if (err < 0)
+ return err;
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivf.min_tx_rate,
+ ivt->rate);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_RATE]) {
+ struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]);
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivt->min_tx_rate,
+ ivt->max_tx_rate);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_SPOOFCHK]) {
+ struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]);
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_spoofchk)
+ err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
+ ivs->setting);
+ if (err < 0)
+ return err;
}
+
+ if (tb[IFLA_VF_LINK_STATE]) {
+ struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]);
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_link_state)
+ err = ops->ndo_set_vf_link_state(dev, ivl->vf,
+ ivl->link_state);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_RSS_QUERY_EN]) {
+ struct ifla_vf_rss_query_en *ivrssq_en;
+
+ err = -EOPNOTSUPP;
+ ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]);
+ if (ops->ndo_set_vf_rss_query_en)
+ err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf,
+ ivrssq_en->setting);
+ if (err < 0)
+ return err;
+ }
+
return err;
}
@@ -1681,14 +1771,21 @@ static int do_setlink(const struct sk_buff *skb,
}
if (tb[IFLA_VFINFO_LIST]) {
+ struct nlattr *vfinfo[IFLA_VF_MAX + 1];
struct nlattr *attr;
int rem;
+
nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
- if (nla_type(attr) != IFLA_VF_INFO) {
+ if (nla_type(attr) != IFLA_VF_INFO ||
+ nla_len(attr) < NLA_HDRLEN) {
err = -EINVAL;
goto errout;
}
- err = do_setvfinfo(dev, attr);
+ err = nla_parse_nested(vfinfo, IFLA_VF_MAX, attr,
+ ifla_vf_policy);
+ if (err < 0)
+ goto errout;
+ err = do_setvfinfo(dev, vfinfo);
if (err < 0)
goto errout;
status |= DO_SETLINK_NOTIFY;
@@ -1707,10 +1804,13 @@ static int do_setlink(const struct sk_buff *skb,
goto errout;
nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) {
- if (nla_type(attr) != IFLA_VF_PORT)
- continue;
- err = nla_parse_nested(port, IFLA_PORT_MAX,
- attr, ifla_port_policy);
+ if (nla_type(attr) != IFLA_VF_PORT ||
+ nla_len(attr) < NLA_HDRLEN) {
+ err = -EINVAL;
+ goto errout;
+ }
+ err = nla_parse_nested(port, IFLA_PORT_MAX, attr,
+ ifla_port_policy);
if (err < 0)
goto errout;
if (!port[IFLA_PORT_VF]) {
@@ -1815,6 +1915,42 @@ errout:
return err;
}
+static int rtnl_group_dellink(const struct net *net, int group)
+{
+ struct net_device *dev, *aux;
+ LIST_HEAD(list_kill);
+ bool found = false;
+
+ if (!group)
+ return -EPERM;
+
+ for_each_netdev(net, dev) {
+ if (dev->group == group) {
+ const struct rtnl_link_ops *ops;
+
+ found = true;
+ ops = dev->rtnl_link_ops;
+ if (!ops || !ops->dellink)
+ return -EOPNOTSUPP;
+ }
+ }
+
+ if (!found)
+ return -ENODEV;
+
+ for_each_netdev_safe(net, dev, aux) {
+ if (dev->group == group) {
+ const struct rtnl_link_ops *ops;
+
+ ops = dev->rtnl_link_ops;
+ ops->dellink(dev, &list_kill);
+ }
+ }
+ unregister_netdevice_many(&list_kill);
+
+ return 0;
+}
+
static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
@@ -1838,6 +1974,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
dev = __dev_get_by_index(net, ifm->ifi_index);
else if (tb[IFLA_IFNAME])
dev = __dev_get_by_name(net, ifname);
+ else if (tb[IFLA_GROUP])
+ return rtnl_group_dellink(net, nla_get_u32(tb[IFLA_GROUP]));
else
return -EINVAL;
@@ -1873,7 +2011,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
EXPORT_SYMBOL(rtnl_configure_link);
struct net_device *rtnl_create_link(struct net *net,
- char *ifname, unsigned char name_assign_type,
+ const char *ifname, unsigned char name_assign_type,
const struct rtnl_link_ops *ops, struct nlattr *tb[])
{
int err;
@@ -2337,6 +2475,9 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
{
struct sk_buff *skb;
+ if (dev->reg_state != NETREG_REGISTERED)
+ return;
+
skb = rtmsg_ifinfo_build_skb(type, dev, change, flags);
if (skb)
rtmsg_ifinfo_send(skb, dev, flags);
@@ -2345,7 +2486,7 @@ EXPORT_SYMBOL(rtmsg_ifinfo);
static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
struct net_device *dev,
- u8 *addr, u32 pid, u32 seq,
+ u8 *addr, u16 vid, u32 pid, u32 seq,
int type, unsigned int flags,
int nlflags)
{
@@ -2367,6 +2508,9 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
goto nla_put_failure;
+ if (vid)
+ if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid))
+ goto nla_put_failure;
nlmsg_end(skb, nlh);
return 0;
@@ -2381,7 +2525,7 @@ static inline size_t rtnl_fdb_nlmsg_size(void)
return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN);
}
-static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type)
+static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type)
{
struct net *net = dev_net(dev);
struct sk_buff *skb;
@@ -2391,7 +2535,8 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type)
if (!skb)
goto errout;
- err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF, 0);
+ err = nlmsg_populate_fdb_fill(skb, dev, addr, vid,
+ 0, 0, type, NTF_SELF, 0);
if (err < 0) {
kfree_skb(skb);
goto errout;
@@ -2526,7 +2671,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
nlh->nlmsg_flags);
if (!err) {
- rtnl_fdb_notify(dev, addr, RTM_NEWNEIGH);
+ rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH);
ndm->ndm_flags &= ~NTF_SELF;
}
}
@@ -2627,7 +2772,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
if (!err) {
- rtnl_fdb_notify(dev, addr, RTM_DELNEIGH);
+ rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH);
ndm->ndm_flags &= ~NTF_SELF;
}
}
@@ -2652,7 +2797,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
if (*idx < cb->args[0])
goto skip;
- err = nlmsg_populate_fdb_fill(skb, dev, ha->addr,
+ err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
portid, seq,
RTM_NEWNEIGH, NTF_SELF,
NLM_F_MULTI);
@@ -2695,7 +2840,6 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net_device *dev;
struct nlattr *tb[IFLA_MAX+1];
- struct net_device *bdev = NULL;
struct net_device *br_dev = NULL;
const struct net_device_ops *ops = NULL;
const struct net_device_ops *cops = NULL;
@@ -2719,7 +2863,6 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
return -ENODEV;
ops = br_dev->netdev_ops;
- bdev = br_dev;
}
for_each_netdev(net, dev) {
@@ -2732,7 +2875,6 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
cops = br_dev->netdev_ops;
}
- bdev = dev;
} else {
if (dev != br_dev &&
!(dev->priv_flags & IFF_BRIDGE_PORT))
@@ -2742,7 +2884,6 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
!(dev->priv_flags & IFF_EBRIDGE))
continue;
- bdev = br_dev;
cops = ops;
}
@@ -2775,7 +2916,11 @@ static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
struct net_device *dev, u16 mode,
- u32 flags, u32 mask)
+ u32 flags, u32 mask, int nlflags,
+ u32 filter_mask,
+ int (*vlan_fill)(struct sk_buff *skb,
+ struct net_device *dev,
+ u32 filter_mask))
{
struct nlmsghdr *nlh;
struct ifinfomsg *ifm;
@@ -2783,8 +2928,9 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
struct nlattr *protinfo;
u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ int err = 0;
- nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI);
+ nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
if (nlh == NULL)
return -EMSGSIZE;
@@ -2804,8 +2950,8 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) ||
(dev->addr_len &&
nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
- (dev->ifindex != dev->iflink &&
- nla_put_u32(skb, IFLA_LINK, dev->iflink)))
+ (dev->ifindex != dev_get_iflink(dev) &&
+ nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
goto nla_put_failure;
br_afspec = nla_nest_start(skb, IFLA_AF_SPEC);
@@ -2823,6 +2969,13 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
goto nla_put_failure;
}
}
+ if (vlan_fill) {
+ err = vlan_fill(skb, dev, filter_mask);
+ if (err) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+ }
nla_nest_end(skb, br_afspec);
protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);
@@ -2856,9 +3009,9 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
return 0;
nla_put_failure:
nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
+ return err ? err : -EMSGSIZE;
}
-EXPORT_SYMBOL(ndo_dflt_bridge_getlink);
+EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink);
static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -2890,7 +3043,8 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
if (idx >= cb->args[0] &&
br_dev->netdev_ops->ndo_bridge_getlink(
- skb, portid, seq, dev, filter_mask) < 0)
+ skb, portid, seq, dev, filter_mask,
+ NLM_F_MULTI) < 0)
break;
idx++;
}
@@ -2898,7 +3052,8 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
if (ops->ndo_bridge_getlink) {
if (idx >= cb->args[0] &&
ops->ndo_bridge_getlink(skb, portid, seq, dev,
- filter_mask) < 0)
+ filter_mask,
+ NLM_F_MULTI) < 0)
break;
idx++;
}
@@ -2939,7 +3094,7 @@ static int rtnl_bridge_notify(struct net_device *dev)
goto errout;
}
- err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0);
+ err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0);
if (err < 0)
goto errout;
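
The per-VF statistics are emitted as a nested IFLA_VF_STATS attribute inside each IFLA_VF_INFO entry, which is why rtnl_vfinfo_size() grows by six u64-sized attributes. A minimal sketch of the nest-start/put/cancel/end pattern involved (real attribute names, hypothetical helper, nla_put_u64() as available on a kernel of this vintage):

#include <linux/errno.h>
#include <linux/if_link.h>
#include <net/netlink.h>

static int example_fill_vf_stats(struct sk_buff *skb,
                                 const struct ifla_vf_stats *stats)
{
        struct nlattr *vfstats = nla_nest_start(skb, IFLA_VF_STATS);

        if (!vfstats)
                return -EMSGSIZE;
        if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, stats->rx_packets) ||
            nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, stats->tx_packets) ||
            nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES, stats->rx_bytes) ||
            nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, stats->tx_bytes)) {
                nla_nest_cancel(skb, vfstats);  /* roll back the partial nest */
                return -EMSGSIZE;
        }
        nla_nest_end(skb, vfstats);
        return 0;
}
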
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 51dd3193a33e..fd3ce461fbe6 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -154,7 +154,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
net_secret_init();
memcpy(hash, saddr, 16);
for (i = 0; i < 4; i++)
- secret[i] = net_secret[i] + daddr[i];
+ secret[i] = net_secret[i] + (__force u32)daddr[i];
secret[4] = net_secret[4] +
(((__force u16)sport << 16) + (__force u16)dport);
for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8e4ac97c8477..b6a19ca0f99e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -280,13 +280,14 @@ nodata:
EXPORT_SYMBOL(__alloc_skb);
/**
- * build_skb - build a network buffer
+ * __build_skb - build a network buffer
* @data: data buffer provided by caller
- * @frag_size: size of fragment, or 0 if head was kmalloced
+ * @frag_size: size of data, or 0 if head was kmalloced
*
* Allocate a new &sk_buff. Caller provides space holding head and
* skb_shared_info. @data must have been allocated by kmalloc() only if
- * @frag_size is 0, otherwise data should come from the page allocator.
+ * @frag_size is 0, otherwise data should come from the page allocator
+ * or vmalloc()
* The return is the new skb buffer.
* On a failure the return is %NULL, and @data is not freed.
* Notes :
@@ -297,7 +298,7 @@ EXPORT_SYMBOL(__alloc_skb);
* before giving packet to stack.
* RX rings only contains data buffers, not full skbs.
*/
-struct sk_buff *build_skb(void *data, unsigned int frag_size)
+struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
struct skb_shared_info *shinfo;
struct sk_buff *skb;
@@ -311,7 +312,6 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
memset(skb, 0, offsetof(struct sk_buff, tail));
skb->truesize = SKB_TRUESIZE(size);
- skb->head_frag = frag_size != 0;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
@@ -328,95 +328,37 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
return skb;
}
-EXPORT_SYMBOL(build_skb);
-
-struct netdev_alloc_cache {
- struct page_frag frag;
- /* we maintain a pagecount bias, so that we dont dirty cache line
- * containing page->_count every time we allocate a fragment.
- */
- unsigned int pagecnt_bias;
-};
-static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
-
-static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
- gfp_t gfp_mask)
-{
- const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER;
- struct page *page = NULL;
- gfp_t gfp = gfp_mask;
-
- if (order) {
- gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
- page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
- nc->frag.size = PAGE_SIZE << (page ? order : 0);
- }
-
- if (unlikely(!page))
- page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-
- nc->frag.page = page;
-
- return page;
-}
-static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache,
- unsigned int fragsz, gfp_t gfp_mask)
+/* build_skb() is a wrapper over __build_skb() that specifically
+ * takes care of skb->head and skb->pfmemalloc
+ * This means that if @frag_size is not zero, then @data must be backed
+ * by a page fragment, not kmalloc() or vmalloc()
+ */
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
- struct netdev_alloc_cache *nc = this_cpu_ptr(cache);
- struct page *page = nc->frag.page;
- unsigned int size;
- int offset;
-
- if (unlikely(!page)) {
-refill:
- page = __page_frag_refill(nc, gfp_mask);
- if (!page)
- return NULL;
-
- /* if size can vary use frag.size else just use PAGE_SIZE */
- size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
-
- /* Even if we own the page, we do not use atomic_set().
- * This would break get_page_unless_zero() users.
- */
- atomic_add(size - 1, &page->_count);
-
- /* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
- nc->frag.offset = size;
- }
+ struct sk_buff *skb = __build_skb(data, frag_size);
- offset = nc->frag.offset - fragsz;
- if (unlikely(offset < 0)) {
- if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
- goto refill;
-
- /* if size can vary use frag.size else just use PAGE_SIZE */
- size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
-
- /* OK, page count is 0, we can safely set it */
- atomic_set(&page->_count, size);
-
- /* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
- offset = size - fragsz;
+ if (skb && frag_size) {
+ skb->head_frag = 1;
+ if (virt_to_head_page(data)->pfmemalloc)
+ skb->pfmemalloc = 1;
}
-
- nc->pagecnt_bias--;
- nc->frag.offset = offset;
-
- return page_address(page) + offset;
+ return skb;
}
+EXPORT_SYMBOL(build_skb);
+
+static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
+ struct page_frag_cache *nc;
unsigned long flags;
void *data;
local_irq_save(flags);
- data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask);
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = __alloc_page_frag(nc, fragsz, gfp_mask);
local_irq_restore(flags);
return data;
}
@@ -436,7 +378,9 @@ EXPORT_SYMBOL(netdev_alloc_frag);
static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
- return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask);
+ struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+ return __alloc_page_frag(nc, fragsz, gfp_mask);
}
void *napi_alloc_frag(unsigned int fragsz)
@@ -446,76 +390,70 @@ void *napi_alloc_frag(unsigned int fragsz)
EXPORT_SYMBOL(napi_alloc_frag);
/**
- * __alloc_rx_skb - allocate an skbuff for rx
+ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ * @dev: network device to receive on
* @length: length to allocate
* @gfp_mask: get_free_pages mask, passed to alloc_skb
- * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
- * allocations in case we have to fallback to __alloc_skb()
- * If SKB_ALLOC_NAPI is set, page fragment will be allocated
- * from napi_cache instead of netdev_cache.
*
* Allocate a new &sk_buff and assign it a usage count of one. The
- * buffer has unspecified headroom built in. Users should allocate
+ * buffer has NET_SKB_PAD headroom built in. Users should allocate
* the headroom they think they need without accounting for the
* built in space. The built in space is used for optimisations.
*
* %NULL is returned if there is no free memory.
*/
-static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask,
- int flags)
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
+ gfp_t gfp_mask)
{
- struct sk_buff *skb = NULL;
- unsigned int fragsz = SKB_DATA_ALIGN(length) +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ struct page_frag_cache *nc;
+ unsigned long flags;
+ struct sk_buff *skb;
+ bool pfmemalloc;
+ void *data;
- if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
- void *data;
+ len += NET_SKB_PAD;
- if (sk_memalloc_socks())
- gfp_mask |= __GFP_MEMALLOC;
+ if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+ (gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+ if (!skb)
+ goto skb_fail;
+ goto skb_success;
+ }
- data = (flags & SKB_ALLOC_NAPI) ?
- __napi_alloc_frag(fragsz, gfp_mask) :
- __netdev_alloc_frag(fragsz, gfp_mask);
+ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ len = SKB_DATA_ALIGN(len);
- if (likely(data)) {
- skb = build_skb(data, fragsz);
- if (unlikely(!skb))
- put_page(virt_to_head_page(data));
- }
- } else {
- skb = __alloc_skb(length, gfp_mask,
- SKB_ALLOC_RX, NUMA_NO_NODE);
- }
- return skb;
-}
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
-/**
- * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
- * @dev: network device to receive on
- * @length: length to allocate
- * @gfp_mask: get_free_pages mask, passed to alloc_skb
- *
- * Allocate a new &sk_buff and assign it a usage count of one. The
- * buffer has NET_SKB_PAD headroom built in. Users should allocate
- * the headroom they think they need without accounting for the
- * built in space. The built in space is used for optimisations.
- *
- * %NULL is returned if there is no free memory.
- */
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
- unsigned int length, gfp_t gfp_mask)
-{
- struct sk_buff *skb;
+ local_irq_save(flags);
- length += NET_SKB_PAD;
- skb = __alloc_rx_skb(length, gfp_mask, 0);
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = __alloc_page_frag(nc, len, gfp_mask);
+ pfmemalloc = nc->pfmemalloc;
- if (likely(skb)) {
- skb_reserve(skb, NET_SKB_PAD);
- skb->dev = dev;
+ local_irq_restore(flags);
+
+ if (unlikely(!data))
+ return NULL;
+
+ skb = __build_skb(data, len);
+ if (unlikely(!skb)) {
+ skb_free_frag(data);
+ return NULL;
}
+ /* use OR instead of assignment to avoid clearing of bits in mask */
+ if (pfmemalloc)
+ skb->pfmemalloc = 1;
+ skb->head_frag = 1;
+
+skb_success:
+ skb_reserve(skb, NET_SKB_PAD);
+ skb->dev = dev;
+
+skb_fail:
return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
@@ -533,19 +471,49 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
*
* %NULL is returned if there is no free memory.
*/
-struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
- unsigned int length, gfp_t gfp_mask)
+struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
+ gfp_t gfp_mask)
{
+ struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
+ void *data;
- length += NET_SKB_PAD + NET_IP_ALIGN;
- skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI);
+ len += NET_SKB_PAD + NET_IP_ALIGN;
- if (likely(skb)) {
- skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
- skb->dev = napi->dev;
+ if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+ (gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+ if (!skb)
+ goto skb_fail;
+ goto skb_success;
}
+ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ len = SKB_DATA_ALIGN(len);
+
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
+
+ data = __alloc_page_frag(nc, len, gfp_mask);
+ if (unlikely(!data))
+ return NULL;
+
+ skb = __build_skb(data, len);
+ if (unlikely(!skb)) {
+ skb_free_frag(data);
+ return NULL;
+ }
+
+ /* use OR instead of assignment to avoid clearing of bits in mask */
+ if (nc->pfmemalloc)
+ skb->pfmemalloc = 1;
+ skb->head_frag = 1;
+
+skb_success:
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+ skb->dev = napi->dev;
+
+skb_fail:
return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);
@@ -593,10 +561,12 @@ static void skb_clone_fraglist(struct sk_buff *skb)
static void skb_free_head(struct sk_buff *skb)
{
+ unsigned char *head = skb->head;
+
if (skb->head_frag)
- put_page(virt_to_head_page(skb->head));
+ skb_free_frag(head);
else
- kfree(skb->head);
+ kfree(head);
}
static void skb_release_data(struct sk_buff *skb)
@@ -1900,15 +1870,39 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
return false;
}
+ssize_t skb_socket_splice(struct sock *sk,
+ struct pipe_inode_info *pipe,
+ struct splice_pipe_desc *spd)
+{
+ int ret;
+
+ /* Drop the socket lock, otherwise we have reverse
+ * locking dependencies between sk_lock and i_mutex
+ * here as compared to sendfile(). We enter here
+ * with the socket lock held, and splice_to_pipe() will
+ * grab the pipe inode lock. For sendfile() emulation,
+ * we call into ->sendpage() with the i_mutex lock held
+ * and networking will grab the socket lock.
+ */
+ release_sock(sk);
+ ret = splice_to_pipe(pipe, spd);
+ lock_sock(sk);
+
+ return ret;
+}
+
/*
* Map data from the skb to a pipe. Should handle both the linear part,
* the fragments, and the frag list. It does NOT handle frag lists within
* the frag list, if such a thing exists. We'd probably need to recurse to
* handle that cleanly.
*/
-int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
+int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int tlen,
- unsigned int flags)
+ unsigned int flags,
+ ssize_t (*splice_cb)(struct sock *,
+ struct pipe_inode_info *,
+ struct splice_pipe_desc *))
{
struct partial_page partial[MAX_SKB_FRAGS];
struct page *pages[MAX_SKB_FRAGS];
@@ -1921,7 +1915,6 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
.spd_release = sock_spd_release,
};
struct sk_buff *frag_iter;
- struct sock *sk = skb->sk;
int ret = 0;
/*
@@ -1944,23 +1937,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
}
done:
- if (spd.nr_pages) {
- /*
- * Drop the socket lock, otherwise we have reverse
- * locking dependencies between sk_lock and i_mutex
- * here as compared to sendfile(). We enter here
- * with the socket lock held, and splice_to_pipe() will
- * grab the pipe inode lock. For sendfile() emulation,
- * we call into ->sendpage() with the i_mutex lock held
- * and networking will grab the socket lock.
- */
- release_sock(sk);
- ret = splice_to_pipe(pipe, &spd);
- lock_sock(sk);
- }
+ if (spd.nr_pages)
+ ret = splice_cb(sk, pipe, &spd);
return ret;
}
+EXPORT_SYMBOL_GPL(skb_splice_bits);
/**
* skb_store_bits - store bits from kernel buffer to skb
@@ -2865,7 +2847,6 @@ static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
* @from: search offset
* @to: search limit
* @config: textsearch configuration
- * @state: uninitialized textsearch state variable
*
* Finds a pattern in the skb data according to the specified
* textsearch configuration. Use textsearch_next() to retrieve
@@ -2873,17 +2854,17 @@ static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
* to the first occurrence or UINT_MAX if no match was found.
*/
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
- unsigned int to, struct ts_config *config,
- struct ts_state *state)
+ unsigned int to, struct ts_config *config)
{
+ struct ts_state state;
unsigned int ret;
config->get_next_block = skb_ts_get_next_block;
config->finish = skb_ts_finish;
- skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
+ skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
- ret = textsearch_find(config, state);
+ ret = textsearch_find(config, &state);
return (ret <= to - from ? ret : UINT_MAX);
}
EXPORT_SYMBOL(skb_find_text);
@@ -2946,6 +2927,24 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
}
EXPORT_SYMBOL(skb_append_datato_frags);
+int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
+ int offset, size_t size)
+{
+ int i = skb_shinfo(skb)->nr_frags;
+
+ if (skb_can_coalesce(skb, i, page, offset)) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
+ } else if (i < MAX_SKB_FRAGS) {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, size);
+ } else {
+ return -EMSGSIZE;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_append_pagefrags);
+
/**
* skb_pull_rcsum - pull skb and update receive checksum
* @skb: buffer to update
@@ -3207,10 +3206,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
unsigned int offset = skb_gro_offset(skb);
unsigned int headlen = skb_headlen(skb);
- struct sk_buff *nskb, *lp, *p = *head;
unsigned int len = skb_gro_len(skb);
+ struct sk_buff *lp, *p = *head;
unsigned int delta_truesize;
- unsigned int headroom;
if (unlikely(p->len + len >= 65536))
return -E2BIG;
@@ -3277,48 +3275,6 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
goto done;
}
- /* switch back to head shinfo */
- pinfo = skb_shinfo(p);
-
- if (pinfo->frag_list)
- goto merge;
- if (skb_gro_len(p) != pinfo->gso_size)
- return -E2BIG;
-
- headroom = skb_headroom(p);
- nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
- if (unlikely(!nskb))
- return -ENOMEM;
-
- __copy_skb_header(nskb, p);
- nskb->mac_len = p->mac_len;
-
- skb_reserve(nskb, headroom);
- __skb_put(nskb, skb_gro_offset(p));
-
- skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
- skb_set_network_header(nskb, skb_network_offset(p));
- skb_set_transport_header(nskb, skb_transport_offset(p));
-
- __skb_pull(p, skb_gro_offset(p));
- memcpy(skb_mac_header(nskb), skb_mac_header(p),
- p->data - skb_mac_header(p));
-
- skb_shinfo(nskb)->frag_list = p;
- skb_shinfo(nskb)->gso_size = pinfo->gso_size;
- pinfo->gso_size = 0;
- __skb_header_release(p);
- NAPI_GRO_CB(nskb)->last = p;
-
- nskb->data_len += p->len;
- nskb->truesize += p->truesize;
- nskb->len += p->len;
-
- *head = nskb;
- nskb->next = p->next;
- p->next = NULL;
-
- p = nskb;
merge:
delta_truesize = skb->truesize;
@@ -3796,7 +3752,6 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
-
/**
* skb_partial_csum_set - set up and verify partial csum values for packet
* @skb: the skb to set
@@ -4057,6 +4012,93 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
}
EXPORT_SYMBOL(skb_checksum_setup);
+/**
+ * skb_checksum_maybe_trim - maybe trims the given skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ *
+ * Checks whether the given skb has data beyond the given transport length.
+ * If so, returns a cloned skb trimmed to this transport length.
+ * Otherwise returns the provided skb. Returns NULL in error cases
+ * (e.g. transport_len exceeds skb length or out-of-memory).
+ *
+ * Caller needs to set the skb transport header and release the returned skb.
+ * Provided skb is consumed.
+ */
+static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
+ unsigned int transport_len)
+{
+ struct sk_buff *skb_chk;
+ unsigned int len = skb_transport_offset(skb) + transport_len;
+ int ret;
+
+ if (skb->len < len) {
+ kfree_skb(skb);
+ return NULL;
+ } else if (skb->len == len) {
+ return skb;
+ }
+
+ skb_chk = skb_clone(skb, GFP_ATOMIC);
+ kfree_skb(skb);
+
+ if (!skb_chk)
+ return NULL;
+
+ ret = pskb_trim_rcsum(skb_chk, len);
+ if (ret) {
+ kfree_skb(skb_chk);
+ return NULL;
+ }
+
+ return skb_chk;
+}
+
+/**
+ * skb_checksum_trimmed - validate checksum of an skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ * @skb_chkf: checksum function to use
+ *
+ * Applies the given checksum function skb_chkf to the provided skb.
+ * Returns a checked and maybe trimmed skb. Returns NULL on error.
+ *
+ * If the skb has data beyond the given transport length, then a
+ * trimmed & cloned skb is checked and returned.
+ *
+ * Caller needs to set the skb transport header and release the returned skb.
+ * Provided skb is consumed.
+ */
+struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
+ unsigned int transport_len,
+ __sum16(*skb_chkf)(struct sk_buff *skb))
+{
+ struct sk_buff *skb_chk;
+ unsigned int offset = skb_transport_offset(skb);
+ __sum16 ret;
+
+ skb_chk = skb_checksum_maybe_trim(skb, transport_len);
+ if (!skb_chk)
+ return NULL;
+
+ if (!pskb_may_pull(skb_chk, offset)) {
+ kfree_skb(skb_chk);
+ return NULL;
+ }
+
+ __skb_pull(skb_chk, offset);
+ ret = skb_chkf(skb_chk);
+ __skb_push(skb_chk, offset);
+
+ if (ret) {
+ kfree_skb(skb_chk);
+ return NULL;
+ }
+
+ return skb_chk;
+}
+EXPORT_SYMBOL(skb_checksum_trimmed);
+
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
@@ -4169,19 +4211,21 @@ EXPORT_SYMBOL(skb_try_coalesce);
*/
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
- if (xnet)
- skb_orphan(skb);
skb->tstamp.tv64 = 0;
skb->pkt_type = PACKET_HOST;
skb->skb_iif = 0;
skb->ignore_df = 0;
skb_dst_drop(skb);
- skb->mark = 0;
skb_sender_cpu_clear(skb);
- skb_init_secmark(skb);
secpath_reset(skb);
nf_reset(skb);
nf_reset_trace(skb);
+
+ if (!xnet)
+ return;
+
+ skb_orphan(skb);
+ skb->mark = 0;
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);
@@ -4423,7 +4467,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
while (order) {
if (npages >= 1 << order) {
- page = alloc_pages(gfp_mask |
+ page = alloc_pages((gfp_mask & ~__GFP_WAIT) |
__GFP_COMP |
__GFP_NOWARN |
__GFP_NORETRY,
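
After this rework the per-CPU caches are plain struct page_frag_cache, and build_skb() is the wrapper that marks the head as a page fragment and propagates pfmemalloc. A rough sketch of how a driver-style receive path would combine netdev_alloc_frag() with build_skb(); the copy standing in for DMA and the sizing are illustrative only:

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static struct sk_buff *example_rx_build(const void *hwdata, unsigned int datalen)
{
        unsigned int truesize = SKB_DATA_ALIGN(datalen) +
                                SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        void *buf = netdev_alloc_frag(truesize);
        struct sk_buff *skb;

        if (!buf)
                return NULL;

        memcpy(buf, hwdata, datalen);           /* stand-in for the device DMA */
        skb = build_skb(buf, truesize);         /* sets head_frag and pfmemalloc */
        if (!skb) {
                skb_free_frag(buf);             /* helper introduced alongside these changes */
                return NULL;
        }
        __skb_put(skb, datalen);
        return skb;
}
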
diff --git a/net/core/sock.c b/net/core/sock.c
index 71e3e5f1eaa0..193901d09757 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -131,6 +131,7 @@
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
+#include <linux/sock_diag.h>
#include <linux/filter.h>
@@ -354,15 +355,12 @@ void sk_clear_memalloc(struct sock *sk)
/*
* SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
- * progress of swapping. However, if SOCK_MEMALLOC is cleared while
- * it has rmem allocations there is a risk that the user of the
- * socket cannot make forward progress due to exceeding the rmem
- * limits. By rights, sk_clear_memalloc() should only be called
- * on sockets being torn down but warn and reset the accounting if
- * that assumption breaks.
+ * progress of swapping. SOCK_MEMALLOC may be cleared while
+ * it has rmem allocations due to the last swapfile being deactivated
+ * but there is a risk that the socket is unusable due to exceeding
+ * the rmem limits. Reclaim the reserves and obey rmem limits again.
*/
- if (WARN_ON(sk->sk_forward_alloc))
- sk_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
@@ -466,7 +464,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
skb_dst_force(skb);
spin_lock_irqsave(&list->lock, flags);
- skb->dropcount = atomic_read(&sk->sk_drops);
+ sock_skb_set_dropcount(sk, skb);
__skb_queue_tail(list, skb);
spin_unlock_irqrestore(&list->lock, flags);
@@ -947,8 +945,6 @@ set_rcvbuf:
sk->sk_mark = val;
break;
- /* We implement the SO_SNDLOWAT etc to
- not be settable (1003.1g 5.3) */
case SO_RXQ_OVFL:
sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
break;
@@ -1253,6 +1249,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
default:
+ /* We implement the SO_SNDLOWAT etc to not be settable
+ * (1003.1g 7).
+ */
return -ENOPROTOOPT;
}
@@ -1395,9 +1394,10 @@ EXPORT_SYMBOL_GPL(sock_update_netprioidx);
* @family: protocol family
* @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
* @prot: struct proto associated with this new sock instance
+ * @kern: is this to be a kernel socket?
*/
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
- struct proto *prot)
+ struct proto *prot, int kern)
{
struct sock *sk;
@@ -1410,7 +1410,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sock_lock_init(sk);
- sock_net_set(sk, get_net(net));
+ sk->sk_net_refcnt = kern ? 0 : 1;
+ if (likely(sk->sk_net_refcnt))
+ get_net(net);
+ sock_net_set(sk, net);
atomic_set(&sk->sk_wmem_alloc, 1);
sock_update_classid(sk);
@@ -1421,7 +1424,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
}
EXPORT_SYMBOL(sk_alloc);
-static void __sk_free(struct sock *sk)
+void sk_destruct(struct sock *sk)
{
struct sk_filter *filter;
@@ -1444,10 +1447,19 @@ static void __sk_free(struct sock *sk)
if (sk->sk_peer_cred)
put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
- put_net(sock_net(sk));
+ if (likely(sk->sk_net_refcnt))
+ put_net(sock_net(sk));
sk_prot_free(sk->sk_prot_creator, sk);
}
+static void __sk_free(struct sock *sk)
+{
+ if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
+ sock_diag_broadcast_destroy(sk);
+ else
+ sk_destruct(sk);
+}
+
void sk_free(struct sock *sk)
{
/*
@@ -1460,26 +1472,6 @@ void sk_free(struct sock *sk)
}
EXPORT_SYMBOL(sk_free);
-/*
- * Last sock_put should drop reference to sk->sk_net. It has already
- * been dropped in sk_change_net. Taking reference to stopping namespace
- * is not an option.
- * Take reference to a socket to remove it from hash _alive_ and after that
- * destroy it in the context of init_net.
- */
-void sk_release_kernel(struct sock *sk)
-{
- if (sk == NULL || sk->sk_socket == NULL)
- return;
-
- sock_hold(sk);
- sock_release(sk->sk_socket);
- release_net(sock_net(sk));
- sock_net_set(sk, get_net(&init_net));
- sock_put(sk);
-}
-EXPORT_SYMBOL(sk_release_kernel);
-
static void sk_update_clone(const struct sock *sk, struct sock *newsk)
{
if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
@@ -1505,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
sock_copy(newsk, sk);
/* SANITY */
- get_net(sock_net(newsk));
+ if (likely(newsk->sk_net_refcnt))
+ get_net(sock_net(newsk));
sk_node_init(&newsk->sk_node);
sock_lock_init(newsk);
bh_lock_sock(newsk);
@@ -1557,6 +1550,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_err = 0;
newsk->sk_priority = 0;
newsk->sk_incoming_cpu = raw_smp_processor_id();
+ atomic64_set(&newsk->sk_cookie, 0);
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
@@ -1594,6 +1588,8 @@ EXPORT_SYMBOL_GPL(sk_clone_lock);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
+ u32 max_segs = 1;
+
__sk_dst_set(sk, dst);
sk->sk_route_caps = dst->dev->features;
if (sk->sk_route_caps & NETIF_F_GSO)
@@ -1605,9 +1601,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
sk->sk_gso_max_size = dst->dev->gso_max_size;
- sk->sk_gso_max_segs = dst->dev->gso_max_segs;
+ max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
}
}
+ sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
@@ -1684,19 +1681,6 @@ void sock_efree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_efree);
-#ifdef CONFIG_INET
-void sock_edemux(struct sk_buff *skb)
-{
- struct sock *sk = skb->sk;
-
- if (sk->sk_state == TCP_TIME_WAIT)
- inet_twsk_put(inet_twsk(sk));
- else
- sock_put(sk);
-}
-EXPORT_SYMBOL(sock_edemux);
-#endif
-
kuid_t sock_i_uid(struct sock *sk)
{
kuid_t uid;
@@ -1895,7 +1879,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
pfrag->offset = 0;
if (SKB_FRAG_PAGE_ORDER) {
- pfrag->page = alloc_pages(gfp | __GFP_COMP |
+ pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
__GFP_NOWARN | __GFP_NORETRY,
SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
@@ -1984,20 +1968,21 @@ static void __release_sock(struct sock *sk)
* sk_wait_data - wait for data to arrive at sk_receive_queue
* @sk: sock to wait on
* @timeo: for how long
+ * @skb: last skb seen on sk_receive_queue
*
* Now socket state including sk->sk_err is changed only under lock,
* hence we may omit checks after joining wait queue.
* We check receive queue before schedule() only as optimization;
* it is very likely that release_sock() added new data.
*/
-int sk_wait_data(struct sock *sk, long *timeo)
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
int rc;
DEFINE_WAIT(wait);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
- rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
+ rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
finish_wait(sk_sleep(sk), &wait);
return rc;
@@ -2095,12 +2080,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);
/**
* __sk_mem_reclaim - reclaim memory_allocated
* @sk: socket
+ * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
*/
-void __sk_mem_reclaim(struct sock *sk)
+void __sk_mem_reclaim(struct sock *sk, int amount)
{
- sk_memory_allocated_sub(sk,
- sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
- sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+ amount >>= SK_MEM_QUANTUM_SHIFT;
+ sk_memory_allocated_sub(sk, amount);
+ sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
if (sk_under_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
@@ -2186,15 +2172,14 @@ int sock_no_getsockopt(struct socket *sock, int level, int optname,
}
EXPORT_SYMBOL(sock_no_getsockopt);
-int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
- size_t len)
+int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
-int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
- size_t len, int flags)
+int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
+ int flags)
{
return -EOPNOTSUPP;
}
@@ -2286,7 +2271,6 @@ static void sock_def_write_space(struct sock *sk)
static void sock_def_destruct(struct sock *sk)
{
- kfree(sk->sk_protinfo);
}
void sk_send_sigurg(struct sock *sk)
@@ -2566,14 +2550,14 @@ int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
-int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t size, int flags)
+int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
{
struct sock *sk = sock->sk;
int addr_len = 0;
int err;
- err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+ err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
@@ -2750,6 +2734,42 @@ static inline void release_proto_idx(struct proto *prot)
}
#endif
+static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
+{
+ if (!rsk_prot)
+ return;
+ kfree(rsk_prot->slab_name);
+ rsk_prot->slab_name = NULL;
+ if (rsk_prot->slab) {
+ kmem_cache_destroy(rsk_prot->slab);
+ rsk_prot->slab = NULL;
+ }
+}
+
+static int req_prot_init(const struct proto *prot)
+{
+ struct request_sock_ops *rsk_prot = prot->rsk_prot;
+
+ if (!rsk_prot)
+ return 0;
+
+ rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
+ prot->name);
+ if (!rsk_prot->slab_name)
+ return -ENOMEM;
+
+ rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
+ rsk_prot->obj_size, 0,
+ 0, NULL);
+
+ if (!rsk_prot->slab) {
+ pr_crit("%s: Can't create request sock SLAB cache!\n",
+ prot->name);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
int proto_register(struct proto *prot, int alloc_slab)
{
if (alloc_slab) {
@@ -2763,21 +2783,8 @@ int proto_register(struct proto *prot, int alloc_slab)
goto out;
}
- if (prot->rsk_prot != NULL) {
- prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
- if (prot->rsk_prot->slab_name == NULL)
- goto out_free_sock_slab;
-
- prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
- prot->rsk_prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN, NULL);
-
- if (prot->rsk_prot->slab == NULL) {
- pr_crit("%s: Can't create request sock SLAB cache!\n",
- prot->name);
- goto out_free_request_sock_slab_name;
- }
- }
+ if (req_prot_init(prot))
+ goto out_free_request_sock_slab;
if (prot->twsk_prot != NULL) {
prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
@@ -2789,8 +2796,7 @@ int proto_register(struct proto *prot, int alloc_slab)
kmem_cache_create(prot->twsk_prot->twsk_slab_name,
prot->twsk_prot->twsk_obj_size,
0,
- SLAB_HWCACHE_ALIGN |
- prot->slab_flags,
+ prot->slab_flags,
NULL);
if (prot->twsk_prot->twsk_slab == NULL)
goto out_free_timewait_sock_slab_name;
@@ -2806,14 +2812,8 @@ int proto_register(struct proto *prot, int alloc_slab)
out_free_timewait_sock_slab_name:
kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
- if (prot->rsk_prot && prot->rsk_prot->slab) {
- kmem_cache_destroy(prot->rsk_prot->slab);
- prot->rsk_prot->slab = NULL;
- }
-out_free_request_sock_slab_name:
- if (prot->rsk_prot)
- kfree(prot->rsk_prot->slab_name);
-out_free_sock_slab:
+ req_prot_cleanup(prot->rsk_prot);
+
kmem_cache_destroy(prot->slab);
prot->slab = NULL;
out:
@@ -2833,11 +2833,7 @@ void proto_unregister(struct proto *prot)
prot->slab = NULL;
}
- if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
- kmem_cache_destroy(prot->rsk_prot->slab);
- kfree(prot->rsk_prot->slab_name);
- prot->rsk_prot->slab = NULL;
- }
+ req_prot_cleanup(prot->rsk_prot);
if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
kmem_cache_destroy(prot->twsk_prot->twsk_slab);
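
sk_wait_data() now takes the last skb the caller has already seen and sleeps until the tail of the receive queue changes, rather than merely until the queue is non-empty, so a reader peeking at the queue does not spin on data it has already looked at. The calling convention, sketched under the usual assumption that the socket lock is held:

#include <net/sock.h>

/* Returns the sk_wait_event() result: non-zero once something new is queued. */
static int example_wait_for_new_data(struct sock *sk, long *timeo)
{
        const struct sk_buff *last = skb_peek_tail(&sk->sk_receive_queue);

        return sk_wait_data(sk, timeo, last);
}
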
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index ad704c757bb4..d79866c5f8bc 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -5,6 +5,9 @@
#include <net/net_namespace.h>
#include <linux/module.h>
#include <net/sock.h>
+#include <linux/kernel.h>
+#include <linux/tcp.h>
+#include <linux/workqueue.h>
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
@@ -12,23 +15,41 @@
static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
static DEFINE_MUTEX(sock_diag_table_mutex);
+static struct workqueue_struct *broadcast_wq;
-int sock_diag_check_cookie(void *sk, __u32 *cookie)
+static u64 sock_gen_cookie(struct sock *sk)
{
- if ((cookie[0] != INET_DIAG_NOCOOKIE ||
- cookie[1] != INET_DIAG_NOCOOKIE) &&
- ((u32)(unsigned long)sk != cookie[0] ||
- (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1]))
- return -ESTALE;
- else
+ while (1) {
+ u64 res = atomic64_read(&sk->sk_cookie);
+
+ if (res)
+ return res;
+ res = atomic64_inc_return(&sock_net(sk)->cookie_gen);
+ atomic64_cmpxchg(&sk->sk_cookie, 0, res);
+ }
+}
+
+int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie)
+{
+ u64 res;
+
+ if (cookie[0] == INET_DIAG_NOCOOKIE && cookie[1] == INET_DIAG_NOCOOKIE)
return 0;
+
+ res = sock_gen_cookie(sk);
+ if ((u32)res != cookie[0] || (u32)(res >> 32) != cookie[1])
+ return -ESTALE;
+
+ return 0;
}
EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
-void sock_diag_save_cookie(void *sk, __u32 *cookie)
+void sock_diag_save_cookie(struct sock *sk, __u32 *cookie)
{
- cookie[0] = (u32)(unsigned long)sk;
- cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+ u64 res = sock_gen_cookie(sk);
+
+ cookie[0] = (u32)res;
+ cookie[1] = (u32)(res >> 32);
}
EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
@@ -84,6 +105,62 @@ out:
}
EXPORT_SYMBOL(sock_diag_put_filterinfo);
+struct broadcast_sk {
+ struct sock *sk;
+ struct work_struct work;
+};
+
+static size_t sock_diag_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct inet_diag_msg)
+ + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */
+ + nla_total_size(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
+}
+
+static void sock_diag_broadcast_destroy_work(struct work_struct *work)
+{
+ struct broadcast_sk *bsk =
+ container_of(work, struct broadcast_sk, work);
+ struct sock *sk = bsk->sk;
+ const struct sock_diag_handler *hndl;
+ struct sk_buff *skb;
+ const enum sknetlink_groups group = sock_diag_destroy_group(sk);
+ int err = -1;
+
+ WARN_ON(group == SKNLGRP_NONE);
+
+ skb = nlmsg_new(sock_diag_nlmsg_size(), GFP_KERNEL);
+ if (!skb)
+ goto out;
+
+ mutex_lock(&sock_diag_table_mutex);
+ hndl = sock_diag_handlers[sk->sk_family];
+ if (hndl && hndl->get_info)
+ err = hndl->get_info(skb, sk);
+ mutex_unlock(&sock_diag_table_mutex);
+
+ if (!err)
+ nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group,
+ GFP_KERNEL);
+ else
+ kfree_skb(skb);
+out:
+ sk_destruct(sk);
+ kfree(bsk);
+}
+
+void sock_diag_broadcast_destroy(struct sock *sk)
+{
+ /* Note, this function is often called from an interrupt context. */
+ struct broadcast_sk *bsk =
+ kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC);
+ if (!bsk)
+ return sk_destruct(sk);
+ bsk->sk = sk;
+ INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work);
+ queue_work(broadcast_wq, &bsk->work);
+}
+
void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
{
mutex_lock(&sock_diag_table_mutex);
@@ -194,10 +271,32 @@ static void sock_diag_rcv(struct sk_buff *skb)
mutex_unlock(&sock_diag_mutex);
}
+static int sock_diag_bind(struct net *net, int group)
+{
+ switch (group) {
+ case SKNLGRP_INET_TCP_DESTROY:
+ case SKNLGRP_INET_UDP_DESTROY:
+ if (!sock_diag_handlers[AF_INET])
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, AF_INET);
+ break;
+ case SKNLGRP_INET6_TCP_DESTROY:
+ case SKNLGRP_INET6_UDP_DESTROY:
+ if (!sock_diag_handlers[AF_INET6])
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, AF_INET6);
+ break;
+ }
+ return 0;
+}
+
static int __net_init diag_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
+ .groups = SKNLGRP_MAX,
.input = sock_diag_rcv,
+ .bind = sock_diag_bind,
+ .flags = NL_CFG_F_NONROOT_RECV,
};
net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg);
@@ -217,12 +316,15 @@ static struct pernet_operations diag_net_ops = {
static int __init sock_diag_init(void)
{
+ broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0);
+ BUG_ON(!broadcast_wq);
return register_pernet_subsys(&diag_net_ops);
}
static void __exit sock_diag_exit(void)
{
unregister_pernet_subsys(&diag_net_ops);
+ destroy_workqueue(broadcast_wq);
}
module_init(sock_diag_init);
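
sock_gen_cookie() assigns each socket a stable 64-bit cookie lazily and without locks: read the field, and if it is still zero, race a cmpxchg against other readers so exactly one generated value sticks. The same idiom on a hypothetical structure:

#include <linux/atomic.h>
#include <linux/types.h>

struct example_obj {
        atomic64_t cookie;
};

static atomic64_t example_cookie_gen;

static u64 example_get_cookie(struct example_obj *obj)
{
        for (;;) {
                u64 res = atomic64_read(&obj->cookie);

                if (res)
                        return res;     /* already assigned, possibly by a racer */
                res = atomic64_inc_return(&example_cookie_gen);
                /* only the first writer wins; losers re-read the winner's value */
                atomic64_cmpxchg(&obj->cookie, 0, res);
        }
}
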
diff --git a/net/core/stream.c b/net/core/stream.c
index 301c05f26060..d70f77a0c889 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -119,6 +119,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
int err = 0;
long vm_wait = 0;
long current_timeo = *timeo_p;
+ bool noblock = (*timeo_p ? false : true);
DEFINE_WAIT(wait);
if (sk_stream_memory_free(sk))
@@ -131,8 +132,11 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
- if (!*timeo_p)
+ if (!*timeo_p) {
+ if (noblock)
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
goto do_nonblock;
+ }
if (signal_pending(current))
goto do_interrupted;
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
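
Setting SOCK_NOSPACE in the zero-timeout case matters because the TCP write-space path only issues a wakeup when that bit is set; without it a non-blocking sender that hit EAGAIN could sit in poll() and miss the buffer-space notification. The user-space caller pattern this serves, as a sketch:

#include <errno.h>
#include <poll.h>
#include <sys/socket.h>
#include <sys/types.h>

static ssize_t send_all_nonblock(int fd, const char *buf, size_t len)
{
        size_t off = 0;

        while (off < len) {
                ssize_t n = send(fd, buf + off, len - off, MSG_DONTWAIT);

                if (n > 0) {
                        off += n;
                        continue;
                }
                if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
                        struct pollfd pfd = { .fd = fd, .events = POLLOUT };

                        /* wait for the kernel's write-space wakeup */
                        if (poll(&pfd, 1, -1) < 0)
                                return -1;
                        continue;
                }
                return -1;
        }
        return off;
}
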
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 8ce351ffceb1..95b6139d710c 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -24,7 +24,6 @@
static int zero = 0;
static int one = 1;
-static int ushort_max = USHRT_MAX;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
@@ -403,7 +402,6 @@ static struct ctl_table netns_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
- .extra2 = &ushort_max,
.proc_handler = proc_dointvec_minmax
},
{ }
diff --git a/net/core/utils.c b/net/core/utils.c
index 7b803884c162..a7732a068043 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -304,13 +304,15 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
__be32 from, __be32 to, int pseudohdr)
{
if (skb->ip_summed != CHECKSUM_PARTIAL) {
- *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from),
- to));
+ csum_replace4(sum, from, to);
if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
- skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to);
+ skb->csum = ~csum_add(csum_sub(~(skb->csum),
+ (__force __wsum)from),
+ (__force __wsum)to);
} else if (pseudohdr)
- *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from),
- to));
+ *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum),
+ (__force __wsum)from),
+ (__force __wsum)to));
}
EXPORT_SYMBOL(inet_proto_csum_replace4);
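
csum_replace4() performs exactly the incremental one's-complement update the deleted expression spelled out: unfold the 16-bit checksum, subtract the old 32-bit word, add the new one, and fold back. Written out with the same helpers (and the __force casts sparse expects):

#include <net/checksum.h>

static void example_csum_replace4(__sum16 *sum, __be32 from, __be32 to)
{
        __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from);

        *sum = csum_fold(csum_add(tmp, (__force __wsum)to));
}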