summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/Kconfig26
-rw-r--r--net/atm/clip.c6
-rw-r--r--net/bpf/test_run.c43
-rw-r--r--net/can/af_can.c3
-rw-r--r--net/can/j1939/main.c9
-rw-r--r--net/can/j1939/socket.c94
-rw-r--r--net/can/j1939/transport.c36
-rw-r--r--net/core/bpf_sk_storage.c2
-rw-r--r--net/core/dev.c5
-rw-r--r--net/core/devlink.c13
-rw-r--r--net/core/filter.c12
-rw-r--r--net/core/net-sysfs.c25
-rw-r--r--net/core/page_pool.c189
-rw-r--r--net/core/rtnetlink.c23
-rw-r--r--net/core/skmsg.c13
-rw-r--r--net/core/xdp.c126
-rw-r--r--net/dsa/Kconfig7
-rw-r--r--net/dsa/Makefile1
-rw-r--r--net/dsa/dsa_priv.h4
-rw-r--r--net/dsa/port.c19
-rw-r--r--net/dsa/tag_8021q.c2
-rw-r--r--net/dsa/tag_ocelot.c241
-rw-r--r--net/ipv4/Kconfig218
-rw-r--r--net/ipv4/fib_frontend.c10
-rw-r--r--net/ipv4/ip_gre.c6
-rw-r--r--net/ipv4/ip_input.c35
-rw-r--r--net/ipv4/ip_output.c4
-rw-r--r--net/ipv4/ip_tunnel_core.c140
-rw-r--r--net/ipv4/ipconfig.c3
-rw-r--r--net/ipv4/ipmr.c3
-rw-r--r--net/ipv4/netfilter/nf_flow_table_ipv4.c2
-rw-r--r--net/ipv4/nexthop.c1
-rw-r--r--net/ipv4/route.c42
-rw-r--r--net/ipv4/sysctl_net_ipv4.c6
-rw-r--r--net/ipv4/tcp_cong.c6
-rw-r--r--net/ipv4/tcp_ulp.c3
-rw-r--r--net/ipv4/udp.c29
-rw-r--r--net/ipv4/xfrm4_output.c2
-rw-r--r--net/ipv6/ip6_fib.c4
-rw-r--r--net/ipv6/ip6_input.c26
-rw-r--r--net/ipv6/ip6_output.c4
-rw-r--r--net/ipv6/ipv6_sockglue.c4
-rw-r--r--net/ipv6/netfilter/Kconfig28
-rw-r--r--net/ipv6/netfilter/nf_flow_table_ipv6.c2
-rw-r--r--net/ipv6/route.c5
-rw-r--r--net/ipv6/seg6_local.c33
-rw-r--r--net/ipv6/xfrm6_output.c2
-rw-r--r--net/mac80211/Makefile3
-rw-r--r--net/mac80211/airtime.c597
-rw-r--r--net/mac80211/debugfs.c88
-rw-r--r--net/mac80211/debugfs_sta.c43
-rw-r--r--net/mac80211/ieee80211_i.h8
-rw-r--r--net/mac80211/main.c10
-rw-r--r--net/mac80211/sta_info.c52
-rw-r--r--net/mac80211/sta_info.h12
-rw-r--r--net/mac80211/status.c39
-rw-r--r--net/mac80211/tx.c72
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c23
-rw-r--r--net/netfilter/nf_flow_table_core.c8
-rw-r--r--net/netfilter/nf_flow_table_inet.c25
-rw-r--r--net/netfilter/nf_flow_table_offload.c179
-rw-r--r--net/netfilter/nf_tables_api.c49
-rw-r--r--net/netfilter/nf_tables_offload.c95
-rw-r--r--net/netfilter/nft_cmp.c6
-rw-r--r--net/netfilter/nft_meta.c18
-rw-r--r--net/netfilter/nft_payload.c94
-rw-r--r--net/netfilter/xt_time.c19
-rw-r--r--net/nfc/hci/Kconfig14
-rw-r--r--net/openvswitch/datapath.c48
-rw-r--r--net/openvswitch/datapath.h12
-rw-r--r--net/rds/ib_cm.c23
-rw-r--r--net/sched/act_ct.c1
-rw-r--r--net/sched/act_mpls.c1
-rw-r--r--net/sched/act_pedit.c12
-rw-r--r--net/sched/act_tunnel_key.c207
-rw-r--r--net/sched/cls_flower.c254
-rw-r--r--net/sched/sch_pie.c120
-rw-r--r--net/sched/sch_taprio.c28
-rw-r--r--net/smc/af_smc.c23
-rw-r--r--net/smc/smc_cdc.c3
-rw-r--r--net/smc/smc_clc.c2
-rw-r--r--net/smc/smc_close.c27
-rw-r--r--net/smc/smc_core.c234
-rw-r--r--net/smc/smc_core.h9
-rw-r--r--net/smc/smc_ib.c9
-rw-r--r--net/smc/smc_ib.h3
-rw-r--r--net/smc/smc_ism.c22
-rw-r--r--net/smc/smc_llc.c9
-rw-r--r--net/smc/smc_tx.c2
-rw-r--r--net/smc/smc_wr.c37
-rw-r--r--net/smc/smc_wr.h10
-rw-r--r--net/tipc/bcast.c4
-rw-r--r--net/tipc/bcast.h2
-rw-r--r--net/tipc/core.c2
-rw-r--r--net/tipc/core.h6
-rw-r--r--net/tipc/link.c2
-rw-r--r--net/tipc/name_table.c51
-rw-r--r--net/tipc/name_table.h4
-rw-r--r--net/tipc/node.c8
-rw-r--r--net/tipc/socket.c2
-rw-r--r--net/tls/tls_main.c1
-rw-r--r--net/tls/tls_proc.c2
-rw-r--r--net/tls/tls_sw.c11
-rw-r--r--net/vmw_vsock/af_vsock.c389
-rw-r--r--net/vmw_vsock/hyperv_transport.c70
-rw-r--r--net/vmw_vsock/virtio_transport.c177
-rw-r--r--net/vmw_vsock/virtio_transport_common.c166
-rw-r--r--net/vmw_vsock/vmci_transport.c140
-rw-r--r--net/vmw_vsock/vmci_transport.h3
-rw-r--r--net/vmw_vsock/vmci_transport_notify.h1
-rw-r--r--net/xfrm/Kconfig10
-rw-r--r--net/xfrm/xfrm_input.c3
-rw-r--r--net/xfrm/xfrm_interface.c23
-rw-r--r--net/xfrm/xfrm_state.c2
114 files changed, 3967 insertions, 1179 deletions
diff --git a/net/Kconfig b/net/Kconfig
index 3101bfcbdd7a..bd191f978a23 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -258,7 +258,7 @@ config XPS
default y
config HWBM
- bool
+ bool
config CGROUP_NET_PRIO
bool "Network priority cgroup"
@@ -309,12 +309,12 @@ config BPF_STREAM_PARSER
select STREAM_PARSER
select NET_SOCK_MSG
---help---
- Enabling this allows a stream parser to be used with
- BPF_MAP_TYPE_SOCKMAP.
+ Enabling this allows a stream parser to be used with
+ BPF_MAP_TYPE_SOCKMAP.
- BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets.
- It can be used to enforce socket policy, implement socket redirects,
- etc.
+ BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets.
+ It can be used to enforce socket policy, implement socket redirects,
+ etc.
config NET_FLOW_LIMIT
bool
@@ -349,12 +349,12 @@ config NET_DROP_MONITOR
tristate "Network packet drop alerting service"
depends on INET && TRACEPOINTS
---help---
- This feature provides an alerting service to userspace in the
- event that packets are discarded in the network stack. Alerts
- are broadcast via netlink socket to any listening user space
- process. If you don't need network drop alerts, or if you are ok
- just checking the various proc files and other utilities for
- drop statistics, say N here.
+ This feature provides an alerting service to userspace in the
+ event that packets are discarded in the network stack. Alerts
+ are broadcast via netlink socket to any listening user space
+ process. If you don't need network drop alerts, or if you are ok
+ just checking the various proc files and other utilities for
+ drop statistics, say N here.
endmenu
@@ -433,7 +433,7 @@ config NET_DEVLINK
imply NET_DROP_MONITOR
config PAGE_POOL
- bool
+ bool
config FAILOVER
tristate "Generic failover module"
diff --git a/net/atm/clip.c b/net/atm/clip.c
index a7972da7235d..294cb9efe3d3 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -89,7 +89,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
struct clip_vcc **walk;
if (!entry) {
- pr_crit("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
+ pr_err("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
return;
}
netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */
@@ -109,10 +109,10 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
error = neigh_update(entry->neigh, NULL, NUD_NONE,
NEIGH_UPDATE_F_ADMIN, 0);
if (error)
- pr_crit("neigh_update failed with %d\n", error);
+ pr_err("neigh_update failed with %d\n", error);
goto out;
}
- pr_crit("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc);
+ pr_err("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc);
out:
netif_tx_unlock_bh(entry->neigh->dev);
}
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 0be4497cb832..915c2d6f7fb9 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -105,6 +105,40 @@ out:
return err;
}
+/* Integer types of various sizes and pointer combinations cover a variety of
+ * architecture-dependent calling conventions. Functions with 7+ arguments
+ * can be supported in the future.
+ */
+int noinline bpf_fentry_test1(int a)
+{
+ return a + 1;
+}
+
+int noinline bpf_fentry_test2(int a, u64 b)
+{
+ return a + b;
+}
+
+int noinline bpf_fentry_test3(char a, int b, u64 c)
+{
+ return a + b + c;
+}
+
+int noinline bpf_fentry_test4(void *a, char b, int c, u64 d)
+{
+ return (long)a + b + c + d;
+}
+
+int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e)
+{
+ return a + (long)b + c + d + e;
+}
+
+int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f)
+{
+ return a + (long)b + c + d + (long)e + f;
+}
+
static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
u32 headroom, u32 tailroom)
{
@@ -122,6 +156,15 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
kfree(data);
return ERR_PTR(-EFAULT);
}
+ if (bpf_fentry_test1(1) != 2 ||
+ bpf_fentry_test2(2, 3) != 5 ||
+ bpf_fentry_test3(4, 5, 6) != 15 ||
+ bpf_fentry_test4((void *)7, 8, 9, 10) != 34 ||
+ bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 ||
+ bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111) {
+ kfree(data);
+ return ERR_PTR(-EFAULT);
+ }
return data;
}
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 5518a7d9eed9..128d37a4c2e0 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -86,11 +86,12 @@ static atomic_t skbcounter = ATOMIC_INIT(0);
/* af_can socket functions */
-static void can_sock_destruct(struct sock *sk)
+void can_sock_destruct(struct sock *sk)
{
skb_queue_purge(&sk->sk_receive_queue);
skb_queue_purge(&sk->sk_error_queue);
}
+EXPORT_SYMBOL(can_sock_destruct);
static const struct can_proto *can_get_proto(int protocol)
{
diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c
index def2f813ffce..137054bff9ec 100644
--- a/net/can/j1939/main.c
+++ b/net/can/j1939/main.c
@@ -51,6 +51,7 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data)
if (!skb)
return;
+ j1939_priv_get(priv);
can_skb_set_owner(skb, iskb->sk);
/* get a pointer to the header of the skb
@@ -104,6 +105,7 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data)
j1939_simple_recv(priv, skb);
j1939_sk_recv(priv, skb);
done:
+ j1939_priv_put(priv);
kfree_skb(skb);
}
@@ -150,6 +152,10 @@ static void __j1939_priv_release(struct kref *kref)
netdev_dbg(priv->ndev, "%s: 0x%p\n", __func__, priv);
+ WARN_ON_ONCE(!list_empty(&priv->active_session_list));
+ WARN_ON_ONCE(!list_empty(&priv->ecus));
+ WARN_ON_ONCE(!list_empty(&priv->j1939_socks));
+
dev_put(ndev);
kfree(priv);
}
@@ -207,6 +213,9 @@ static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev)
{
struct can_ml_priv *can_ml_priv = ndev->ml_priv;
+ if (!can_ml_priv)
+ return NULL;
+
return can_ml_priv->j1939_priv;
}
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index 4d8ba701e15d..de09b0a65791 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -78,7 +78,6 @@ static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk)
{
jsk->state |= J1939_SOCK_BOUND;
j1939_priv_get(priv);
- jsk->priv = priv;
spin_lock_bh(&priv->j1939_socks_lock);
list_add_tail(&jsk->list, &priv->j1939_socks);
@@ -91,7 +90,6 @@ static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk)
list_del_init(&jsk->list);
spin_unlock_bh(&priv->j1939_socks_lock);
- jsk->priv = NULL;
j1939_priv_put(priv);
jsk->state &= ~J1939_SOCK_BOUND;
}
@@ -349,6 +347,34 @@ void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb)
spin_unlock_bh(&priv->j1939_socks_lock);
}
+static void j1939_sk_sock_destruct(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ /* This function will be called by the generic networking code when
+ * the socket is ultimately closed (sk->sk_destruct).
+ *
+ * The race between
+ * - processing a received CAN frame
+ * (can_receive -> j1939_can_recv)
+ * and accessing j1939_priv
+ * ... and ...
+ * - closing a socket
+ * (j1939_can_rx_unregister -> can_rx_unregister)
+ * and calling the final j1939_priv_put()
+ *
+ * is avoided by calling the final j1939_priv_put() from this
+ * RCU deferred cleanup call.
+ */
+ if (jsk->priv) {
+ j1939_priv_put(jsk->priv);
+ jsk->priv = NULL;
+ }
+
+ /* call generic CAN sock destruct */
+ can_sock_destruct(sk);
+}
+
static int j1939_sk_init(struct sock *sk)
{
struct j1939_sock *jsk = j1939_sk(sk);
@@ -371,6 +397,7 @@ static int j1939_sk_init(struct sock *sk)
atomic_set(&jsk->skb_pending, 0);
spin_lock_init(&jsk->sk_session_queue_lock);
INIT_LIST_HEAD(&jsk->sk_session_queue);
+ sk->sk_destruct = j1939_sk_sock_destruct;
return 0;
}
@@ -443,6 +470,12 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)
}
jsk->ifindex = addr->can_ifindex;
+
+ /* the corresponding j1939_priv_put() is called via
+ * sk->sk_destruct, which points to j1939_sk_sock_destruct()
+ */
+ j1939_priv_get(priv);
+ jsk->priv = priv;
}
/* set default transmit pgn */
@@ -560,8 +593,8 @@ static int j1939_sk_release(struct socket *sock)
if (!sk)
return 0;
- jsk = j1939_sk(sk);
lock_sock(sk);
+ jsk = j1939_sk(sk);
if (jsk->state & J1939_SOCK_BOUND) {
struct j1939_priv *priv = jsk->priv;
@@ -1059,51 +1092,72 @@ static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg,
{
struct sock *sk = sock->sk;
struct j1939_sock *jsk = j1939_sk(sk);
- struct j1939_priv *priv = jsk->priv;
+ struct j1939_priv *priv;
int ifindex;
int ret;
+ lock_sock(sock->sk);
/* various socket state tests */
- if (!(jsk->state & J1939_SOCK_BOUND))
- return -EBADFD;
+ if (!(jsk->state & J1939_SOCK_BOUND)) {
+ ret = -EBADFD;
+ goto sendmsg_done;
+ }
+ priv = jsk->priv;
ifindex = jsk->ifindex;
- if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR)
+ if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) {
/* no source address assigned yet */
- return -EBADFD;
+ ret = -EBADFD;
+ goto sendmsg_done;
+ }
/* deal with provided destination address info */
if (msg->msg_name) {
struct sockaddr_can *addr = msg->msg_name;
- if (msg->msg_namelen < J1939_MIN_NAMELEN)
- return -EINVAL;
+ if (msg->msg_namelen < J1939_MIN_NAMELEN) {
+ ret = -EINVAL;
+ goto sendmsg_done;
+ }
- if (addr->can_family != AF_CAN)
- return -EINVAL;
+ if (addr->can_family != AF_CAN) {
+ ret = -EINVAL;
+ goto sendmsg_done;
+ }
- if (addr->can_ifindex && addr->can_ifindex != ifindex)
- return -EBADFD;
+ if (addr->can_ifindex && addr->can_ifindex != ifindex) {
+ ret = -EBADFD;
+ goto sendmsg_done;
+ }
if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) &&
- !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn))
- return -EINVAL;
+ !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) {
+ ret = -EINVAL;
+ goto sendmsg_done;
+ }
if (!addr->can_addr.j1939.name &&
addr->can_addr.j1939.addr == J1939_NO_ADDR &&
- !sock_flag(sk, SOCK_BROADCAST))
+ !sock_flag(sk, SOCK_BROADCAST)) {
/* broadcast, but SO_BROADCAST not set */
- return -EACCES;
+ ret = -EACCES;
+ goto sendmsg_done;
+ }
} else {
if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR &&
- !sock_flag(sk, SOCK_BROADCAST))
+ !sock_flag(sk, SOCK_BROADCAST)) {
/* broadcast, but SO_BROADCAST not set */
- return -EACCES;
+ ret = -EACCES;
+ goto sendmsg_done;
+ }
}
ret = j1939_sk_send_loop(priv, sk, msg, size);
+sendmsg_done:
+ release_sock(sock->sk);
+
return ret;
}
diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
index e5f1a56994c6..9f99af5b0b11 100644
--- a/net/can/j1939/transport.c
+++ b/net/can/j1939/transport.c
@@ -255,6 +255,7 @@ static void __j1939_session_drop(struct j1939_session *session)
return;
j1939_sock_pending_del(session->sk);
+ sock_put(session->sk);
}
static void j1939_session_destroy(struct j1939_session *session)
@@ -266,6 +267,9 @@ static void j1939_session_destroy(struct j1939_session *session)
netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+ WARN_ON_ONCE(!list_empty(&session->sk_session_queue_entry));
+ WARN_ON_ONCE(!list_empty(&session->active_session_list_entry));
+
skb_queue_purge(&session->skb_queue);
__j1939_session_drop(session);
j1939_priv_put(session->priv);
@@ -1042,12 +1046,13 @@ j1939_session_deactivate_activate_next(struct j1939_session *session)
j1939_sk_queue_activate_next(session);
}
-static void j1939_session_cancel(struct j1939_session *session,
+static void __j1939_session_cancel(struct j1939_session *session,
enum j1939_xtp_abort err)
{
struct j1939_priv *priv = session->priv;
WARN_ON_ONCE(!err);
+ lockdep_assert_held(&session->priv->active_session_list_lock);
session->err = j1939_xtp_abort_to_errno(priv, err);
/* do not send aborts on incoming broadcasts */
@@ -1062,6 +1067,20 @@ static void j1939_session_cancel(struct j1939_session *session,
j1939_sk_send_loop_abort(session->sk, session->err);
}
+static void j1939_session_cancel(struct j1939_session *session,
+ enum j1939_xtp_abort err)
+{
+ j1939_session_list_lock(session->priv);
+
+ if (session->state >= J1939_SESSION_ACTIVE &&
+ session->state < J1939_SESSION_WAITING_ABORT) {
+ j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS);
+ __j1939_session_cancel(session, err);
+ }
+
+ j1939_session_list_unlock(session->priv);
+}
+
static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer)
{
struct j1939_session *session =
@@ -1108,8 +1127,6 @@ static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer)
netdev_alert(priv->ndev, "%s: 0x%p: tx aborted with unknown reason: %i\n",
__func__, session, ret);
if (session->skcb.addr.type != J1939_SIMPLE) {
- j1939_tp_set_rxtimeout(session,
- J1939_XTP_ABORT_TIMEOUT_MS);
j1939_session_cancel(session, J1939_XTP_ABORT_OTHER);
} else {
session->err = ret;
@@ -1169,7 +1186,7 @@ static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer)
hrtimer_start(&session->rxtimer,
ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS),
HRTIMER_MODE_REL_SOFT);
- j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT);
+ __j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT);
}
j1939_session_list_unlock(session->priv);
}
@@ -1375,7 +1392,6 @@ j1939_xtp_rx_cts_one(struct j1939_session *session, struct sk_buff *skb)
out_session_cancel:
j1939_session_timers_cancel(session);
- j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS);
j1939_session_cancel(session, err);
}
@@ -1572,7 +1588,6 @@ static int j1939_xtp_rx_rts_session_active(struct j1939_session *session,
/* RTS on active session */
j1939_session_timers_cancel(session);
- j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS);
j1939_session_cancel(session, J1939_XTP_ABORT_BUSY);
}
@@ -1583,7 +1598,6 @@ static int j1939_xtp_rx_rts_session_active(struct j1939_session *session,
session->last_cmd);
j1939_session_timers_cancel(session);
- j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS);
j1939_session_cancel(session, J1939_XTP_ABORT_BUSY);
return -EBUSY;
@@ -1785,7 +1799,6 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session,
out_session_cancel:
j1939_session_timers_cancel(session);
- j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS);
j1939_session_cancel(session, J1939_XTP_ABORT_FAULT);
j1939_session_put(session);
}
@@ -1866,6 +1879,7 @@ struct j1939_session *j1939_tp_send(struct j1939_priv *priv,
return ERR_PTR(-ENOMEM);
/* skb is recounted in j1939_session_new() */
+ sock_hold(skb->sk);
session->sk = skb->sk;
session->transmission = true;
session->pkt.total = (size + 6) / 7;
@@ -2028,7 +2042,11 @@ int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk)
&priv->active_session_list,
active_session_list_entry) {
if (!sk || sk == session->sk) {
- j1939_session_timers_cancel(session);
+ if (hrtimer_try_to_cancel(&session->txtimer) == 1)
+ j1939_session_put(session);
+ if (hrtimer_try_to_cancel(&session->rxtimer) == 1)
+ j1939_session_put(session);
+
session->err = ESHUTDOWN;
j1939_session_deactivate_locked(session);
}
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index da5639a5bd3b..458be6b3eda9 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -798,7 +798,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
* Try to grab map refcnt to make sure that it's still
* alive and prevent concurrent removal.
*/
- map = bpf_map_inc_not_zero(&smap->map, false);
+ map = bpf_map_inc_not_zero(&smap->map);
if (IS_ERR(map))
continue;
diff --git a/net/core/dev.c b/net/core/dev.c
index 1c799d486623..c7fc902ccbdc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5586,7 +5586,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi,
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
if (skb_vlan_tag_present(p))
- diffs |= p->vlan_tci ^ skb->vlan_tci;
+ diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
diffs |= skb_metadata_dst_cmp(p, skb);
diffs |= skb_metadata_differs(p, skb);
if (maclen == ETH_HLEN)
@@ -5611,8 +5611,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
NAPI_GRO_CB(skb)->frag0 = NULL;
NAPI_GRO_CB(skb)->frag0_len = 0;
- if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
- pinfo->nr_frags &&
+ if (!skb_headlen(skb) && pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 9bad78388a07..4c63c9a4c09e 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2812,7 +2812,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
struct net *dest_net = NULL;
int err;
- if (!devlink_reload_supported(devlink))
+ if (!devlink_reload_supported(devlink) || !devlink->reload_enabled)
return -EOPNOTSUPP;
err = devlink_resources_validate(devlink, NULL, info);
@@ -3006,6 +3006,11 @@ static const struct devlink_param devlink_param_generic[] = {
.name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME,
.type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE,
},
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE,
+ },
};
static int devlink_param_generic_verify(const struct devlink_param *param)
@@ -4742,6 +4747,7 @@ struct devlink_health_reporter {
bool auto_recover;
u8 health_state;
u64 dump_ts;
+ u64 dump_real_ts;
u64 error_count;
u64 recovery_count;
u64 last_recovery_ts;
@@ -4918,6 +4924,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
goto dump_err;
reporter->dump_ts = jiffies;
+ reporter->dump_real_ts = ktime_get_real_ns();
return 0;
@@ -5067,6 +5074,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
jiffies_to_msecs(reporter->dump_ts),
DEVLINK_ATTR_PAD))
goto reporter_nest_cancel;
+ if (reporter->dump_fmsg &&
+ nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS,
+ reporter->dump_real_ts, DEVLINK_ATTR_PAD))
+ goto reporter_nest_cancel;
nla_nest_end(msg, reporter_attr);
genlmsg_end(msg, hdr);
diff --git a/net/core/filter.c b/net/core/filter.c
index fc303abec8fa..49ded4a7588a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3816,7 +3816,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
-static u32 bpf_skb_output_btf_ids[5];
+static int bpf_skb_output_btf_ids[5];
const struct bpf_func_proto bpf_skb_output_proto = {
.func = bpf_skb_event_output,
.gpl_only = true,
@@ -8684,16 +8684,6 @@ out:
}
#ifdef CONFIG_INET
-struct sk_reuseport_kern {
- struct sk_buff *skb;
- struct sock *sk;
- struct sock *selected_sk;
- void *data_end;
- u32 hash;
- u32 reuseport_id;
- bool bind_inany;
-};
-
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
struct sock_reuseport *reuse,
struct sock *sk, struct sk_buff *skb,
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 865ba6ca16eb..ae3bcb1540ec 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -923,21 +923,23 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
"rx-%u", index);
if (error)
- return error;
+ goto err;
dev_hold(queue->dev);
if (dev->sysfs_rx_queue_group) {
error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
- if (error) {
- kobject_put(kobj);
- return error;
- }
+ if (error)
+ goto err;
}
kobject_uevent(kobj, KOBJ_ADD);
return error;
+
+err:
+ kobject_put(kobj);
+ return error;
}
#endif /* CONFIG_SYSFS */
@@ -1461,21 +1463,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
"tx-%u", index);
if (error)
- return error;
+ goto err;
dev_hold(queue->dev);
#ifdef CONFIG_BQL
error = sysfs_create_group(kobj, &dql_group);
- if (error) {
- kobject_put(kobj);
- return error;
- }
+ if (error)
+ goto err;
#endif
kobject_uevent(kobj, KOBJ_ADD);
-
return 0;
+
+err:
+ kobject_put(kobj);
+ return error;
}
#endif /* CONFIG_SYSFS */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5bc65587f1c4..a6aefe989043 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -18,6 +18,9 @@
#include <trace/events/page_pool.h>
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (60 * HZ)
+
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params)
{
@@ -44,6 +47,21 @@ static int page_pool_init(struct page_pool *pool,
(pool->p.dma_dir != DMA_BIDIRECTIONAL))
return -EINVAL;
+ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
+ /* In order to request DMA-sync-for-device the page
+ * needs to be mapped
+ */
+ if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ return -EINVAL;
+
+ if (!pool->p.max_len)
+ return -EINVAL;
+
+ /* pool->p.offset has to be set according to the address
+ * offset used by the DMA engine to start copying rx data
+ */
+ }
+
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
return -ENOMEM;
@@ -112,6 +130,16 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
return page;
}
+static void page_pool_dma_sync_for_device(struct page_pool *pool,
+ struct page *page,
+ unsigned int dma_sync_size)
+{
+ dma_sync_size = min(dma_sync_size, pool->p.max_len);
+ dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
+ pool->p.offset, dma_sync_size,
+ pool->p.dma_dir);
+}
+
/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
@@ -156,6 +184,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
}
page->dma_addr = dma;
+ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+ page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
+
skip_dma_map:
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
@@ -193,22 +224,14 @@ static s32 page_pool_inflight(struct page_pool *pool)
{
u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
- s32 distance;
+ s32 inflight;
- distance = _distance(hold_cnt, release_cnt);
+ inflight = _distance(hold_cnt, release_cnt);
- trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
- return distance;
-}
-
-static bool __page_pool_safe_to_destroy(struct page_pool *pool)
-{
- s32 inflight = page_pool_inflight(pool);
-
- /* The distance should not be able to become negative */
+ trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
- return (inflight == 0);
+ return inflight;
}
/* Cleanup page_pool state from page */
@@ -216,6 +239,7 @@ static void __page_pool_clean_page(struct page_pool *pool,
struct page *page)
{
dma_addr_t dma;
+ int count;
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
goto skip_dma_unmap;
@@ -227,9 +251,11 @@ static void __page_pool_clean_page(struct page_pool *pool,
DMA_ATTR_SKIP_CPU_SYNC);
page->dma_addr = 0;
skip_dma_unmap:
- atomic_inc(&pool->pages_state_release_cnt);
- trace_page_pool_state_release(pool, page,
- atomic_read(&pool->pages_state_release_cnt));
+ /* This may be the last page returned, releasing the pool, so
+ * it is not safe to reference pool afterwards.
+ */
+ count = atomic_inc_return(&pool->pages_state_release_cnt);
+ trace_page_pool_state_release(pool, page, count);
}
/* unmap the page and clean our state */
@@ -283,8 +309,19 @@ static bool __page_pool_recycle_direct(struct page *page,
return true;
}
-void __page_pool_put_page(struct page_pool *pool,
- struct page *page, bool allow_direct)
+/* page is NOT reusable when:
+ * 1) allocated when system is under some pressure. (page_is_pfmemalloc)
+ * 2) belongs to a different NUMA node than pool->p.nid.
+ *
+ * To update pool->p.nid users must call page_pool_update_nid.
+ */
+static bool pool_page_reusable(struct page_pool *pool, struct page *page)
+{
+ return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid;
+}
+
+void __page_pool_put_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
{
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but have fallbacks that act like the
@@ -292,9 +329,14 @@ void __page_pool_put_page(struct page_pool *pool,
*
* refcnt == 1 means page_pool owns page, and can recycle it.
*/
- if (likely(page_ref_count(page) == 1)) {
+ if (likely(page_ref_count(page) == 1 &&
+ pool_page_reusable(pool, page))) {
/* Read barrier done in page_ref_count / READ_ONCE */
+ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+ page_pool_dma_sync_for_device(pool, page,
+ dma_sync_size);
+
if (allow_direct && in_serving_softirq())
if (__page_pool_recycle_direct(page, pool))
return;
@@ -338,31 +380,10 @@ static void __page_pool_empty_ring(struct page_pool *pool)
}
}
-static void __warn_in_flight(struct page_pool *pool)
+static void page_pool_free(struct page_pool *pool)
{
- u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
- u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
- s32 distance;
-
- distance = _distance(hold_cnt, release_cnt);
-
- /* Drivers should fix this, but only problematic when DMA is used */
- WARN(1, "Still in-flight pages:%d hold:%u released:%u",
- distance, hold_cnt, release_cnt);
-}
-
-void __page_pool_free(struct page_pool *pool)
-{
- /* Only last user actually free/release resources */
- if (!page_pool_put(pool))
- return;
-
- WARN(pool->alloc.count, "API usage violation");
- WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
-
- /* Can happen due to forced shutdown */
- if (!__page_pool_safe_to_destroy(pool))
- __warn_in_flight(pool);
+ if (pool->disconnect)
+ pool->disconnect(pool);
ptr_ring_cleanup(&pool->ring, NULL);
@@ -371,15 +392,14 @@ void __page_pool_free(struct page_pool *pool)
kfree(pool);
}
-EXPORT_SYMBOL(__page_pool_free);
-/* Request to shutdown: release pages cached by page_pool, and check
- * for in-flight pages
- */
-bool __page_pool_request_shutdown(struct page_pool *pool)
+static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
struct page *page;
+ if (pool->destroy_cnt)
+ return;
+
/* Empty alloc cache, assume caller made sure this is
* no-longer in use, and page_pool_alloc_pages() cannot be
* call concurrently.
@@ -388,12 +408,83 @@ bool __page_pool_request_shutdown(struct page_pool *pool)
page = pool->alloc.cache[--pool->alloc.count];
__page_pool_return_page(pool, page);
}
+}
+
+static void page_pool_scrub(struct page_pool *pool)
+{
+ page_pool_empty_alloc_cache_once(pool);
+ pool->destroy_cnt++;
/* No more consumers should exist, but producers could still
* be in-flight.
*/
__page_pool_empty_ring(pool);
+}
+
+static int page_pool_release(struct page_pool *pool)
+{
+ int inflight;
+
+ page_pool_scrub(pool);
+ inflight = page_pool_inflight(pool);
+ if (!inflight)
+ page_pool_free(pool);
+
+ return inflight;
+}
+
+static void page_pool_release_retry(struct work_struct *wq)
+{
+ struct delayed_work *dwq = to_delayed_work(wq);
+ struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
+ int inflight;
+
+ inflight = page_pool_release(pool);
+ if (!inflight)
+ return;
+
+ /* Periodic warning */
+ if (time_after_eq(jiffies, pool->defer_warn)) {
+ int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
+
+ pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
+ __func__, inflight, sec);
+ pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+ }
+
+ /* Still not ready to be disconnected, retry later */
+ schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
- return __page_pool_safe_to_destroy(pool);
+void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
+{
+ refcount_inc(&pool->user_cnt);
+ pool->disconnect = disconnect;
+}
+
+void page_pool_destroy(struct page_pool *pool)
+{
+ if (!pool)
+ return;
+
+ if (!page_pool_put(pool))
+ return;
+
+ if (!page_pool_release(pool))
+ return;
+
+ pool->defer_start = jiffies;
+ pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+
+ INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
+ schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+EXPORT_SYMBOL(page_pool_destroy);
+
+/* Caller must provide appropriate safe context, e.g. NAPI. */
+void page_pool_update_nid(struct page_pool *pool, int new_nid)
+{
+ trace_page_pool_update_nid(pool, new_nid);
+ pool->p.nid = new_nid;
}
-EXPORT_SYMBOL(__page_pool_request_shutdown);
+EXPORT_SYMBOL(page_pool_update_nid);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 000eddb1207d..9f7aa448bd11 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2251,6 +2251,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_MAC]) {
struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]);
+ if (ivm->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_mac)
err = ops->ndo_set_vf_mac(dev, ivm->vf,
@@ -2262,6 +2264,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_VLAN]) {
struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]);
+ if (ivv->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_vlan)
err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
@@ -2294,6 +2298,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (len == 0)
return -EINVAL;
+ if (ivvl[0]->vf >= INT_MAX)
+ return -EINVAL;
err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
ivvl[0]->qos, ivvl[0]->vlan_proto);
if (err < 0)
@@ -2304,6 +2310,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]);
struct ifla_vf_info ivf;
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_get_vf_config)
err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf);
@@ -2322,6 +2330,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_RATE]) {
struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_rate)
err = ops->ndo_set_vf_rate(dev, ivt->vf,
@@ -2334,6 +2344,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_SPOOFCHK]) {
struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]);
+ if (ivs->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_spoofchk)
err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
@@ -2345,6 +2357,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_LINK_STATE]) {
struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]);
+ if (ivl->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_link_state)
err = ops->ndo_set_vf_link_state(dev, ivl->vf,
@@ -2358,6 +2372,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
err = -EOPNOTSUPP;
ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]);
+ if (ivrssq_en->vf >= INT_MAX)
+ return -EINVAL;
if (ops->ndo_set_vf_rss_query_en)
err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf,
ivrssq_en->setting);
@@ -2368,6 +2384,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_TRUST]) {
struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_trust)
err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting);
@@ -2378,15 +2396,18 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_IB_NODE_GUID]) {
struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
if (!ops->ndo_set_vf_guid)
return -EOPNOTSUPP;
-
return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
}
if (tb[IFLA_VF_IB_PORT_GUID]) {
struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
if (!ops->ndo_set_vf_guid)
return -EOPNOTSUPP;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index ad31e4e53d0a..a469d2124f3f 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -793,15 +793,18 @@ static void sk_psock_strp_data_ready(struct sock *sk)
static void sk_psock_write_space(struct sock *sk)
{
struct sk_psock *psock;
- void (*write_space)(struct sock *sk);
+ void (*write_space)(struct sock *sk) = NULL;
rcu_read_lock();
psock = sk_psock(sk);
- if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)))
- schedule_work(&psock->work);
- write_space = psock->saved_write_space;
+ if (likely(psock)) {
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ schedule_work(&psock->work);
+ write_space = psock->saved_write_space;
+ }
rcu_read_unlock();
- write_space(sk);
+ if (write_space)
+ write_space(sk);
}
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 20781ad5f9c3..e334fad0a6b8 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -70,77 +70,63 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
xa = container_of(rcu, struct xdp_mem_allocator, rcu);
- /* Allocator have indicated safe to remove before this is called */
- if (xa->mem.type == MEM_TYPE_PAGE_POOL)
- page_pool_free(xa->page_pool);
-
/* Allow this ID to be reused */
ida_simple_remove(&mem_id_pool, xa->mem.id);
- /* Poison memory */
- xa->mem.id = 0xFFFF;
- xa->mem.type = 0xF0F0;
- xa->allocator = (void *)0xDEAD9001;
-
kfree(xa);
}
-static bool __mem_id_disconnect(int id, bool force)
+static void mem_xa_remove(struct xdp_mem_allocator *xa)
{
- struct xdp_mem_allocator *xa;
- bool safe_to_remove = true;
+ trace_mem_disconnect(xa);
mutex_lock(&mem_id_lock);
- xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (!xa) {
- mutex_unlock(&mem_id_lock);
- WARN(1, "Request remove non-existing id(%d), driver bug?", id);
- return true;
- }
- xa->disconnect_cnt++;
-
- /* Detects in-flight packet-pages for page_pool */
- if (xa->mem.type == MEM_TYPE_PAGE_POOL)
- safe_to_remove = page_pool_request_shutdown(xa->page_pool);
-
- trace_mem_disconnect(xa, safe_to_remove, force);
-
- if ((safe_to_remove || force) &&
- !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+ if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
mutex_unlock(&mem_id_lock);
- return (safe_to_remove|force);
}
-#define DEFER_TIME (msecs_to_jiffies(1000))
-#define DEFER_WARN_INTERVAL (30 * HZ)
-#define DEFER_MAX_RETRIES 120
+static void mem_allocator_disconnect(void *allocator)
+{
+ struct xdp_mem_allocator *xa;
+ struct rhashtable_iter iter;
+
+ rhashtable_walk_enter(mem_id_ht, &iter);
+ do {
+ rhashtable_walk_start(&iter);
+
+ while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) {
+ if (xa->allocator == allocator)
+ mem_xa_remove(xa);
+ }
+
+ rhashtable_walk_stop(&iter);
-static void mem_id_disconnect_defer_retry(struct work_struct *wq)
+ } while (xa == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&iter);
+}
+
+static void mem_id_disconnect(int id)
{
- struct delayed_work *dwq = to_delayed_work(wq);
- struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
- bool force = false;
+ struct xdp_mem_allocator *xa;
- if (xa->disconnect_cnt > DEFER_MAX_RETRIES)
- force = true;
+ mutex_lock(&mem_id_lock);
- if (__mem_id_disconnect(xa->mem.id, force))
+ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+ WARN(1, "Request remove non-existing id(%d), driver bug?", id);
return;
+ }
- /* Periodic warning */
- if (time_after_eq(jiffies, xa->defer_warn)) {
- int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ;
+ trace_mem_disconnect(xa);
- pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n",
- __func__, xa->mem.id, xa->disconnect_cnt, sec);
- xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
- }
+ if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
- /* Still not ready to be disconnected, retry later */
- schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+ mutex_unlock(&mem_id_lock);
}
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
@@ -153,38 +139,21 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
return;
}
- if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL &&
- xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) {
- return;
- }
-
if (id == 0)
return;
- if (__mem_id_disconnect(id, false))
- return;
-
- /* Could not disconnect, defer new disconnect attempt to later */
- mutex_lock(&mem_id_lock);
+ if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY)
+ return mem_id_disconnect(id);
- xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (!xa) {
- mutex_unlock(&mem_id_lock);
- return;
+ if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
+ rcu_read_lock();
+ xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+ page_pool_destroy(xa->page_pool);
+ rcu_read_unlock();
}
- xa->defer_start = jiffies;
- xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
-
- INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry);
- mutex_unlock(&mem_id_lock);
- schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
-/* This unregister operation will also cleanup and destroy the
- * allocator. The page_pool_free() operation is first called when it's
- * safe to remove, possibly deferred to a workqueue.
- */
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -371,7 +340,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
}
if (type == MEM_TYPE_PAGE_POOL)
- page_pool_get(xdp_alloc->page_pool);
+ page_pool_use_xdp_mem(allocator, mem_allocator_disconnect);
mutex_unlock(&mem_id_lock);
@@ -402,15 +371,8 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
- if (likely(xa)) {
- napi_direct &= !xdp_return_frame_no_direct();
- page_pool_put_page(xa->page_pool, page, napi_direct);
- } else {
- /* Hopefully stack show who to blame for late return */
- WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id);
- trace_mem_return_failed(mem, page);
- put_page(page);
- }
+ napi_direct &= !xdp_return_frame_no_direct();
+ page_pool_put_page(xa->page_pool, page, napi_direct);
rcu_read_unlock();
break;
case MEM_TYPE_PAGE_SHARED:
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 136612792c08..1e6c3cac11e6 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -79,6 +79,13 @@ config NET_DSA_TAG_KSZ
Say Y if you want to enable support for tagging frames for the
Microchip 8795/9477/9893 families of switches.
+config NET_DSA_TAG_OCELOT
+ tristate "Tag driver for Ocelot family of switches"
+ select PACKING
+ help
+ Say Y or M if you want to enable support for tagging frames for the
+ Ocelot switches (VSC7511, VSC7512, VSC7513, VSC7514, VSC9959).
+
config NET_DSA_TAG_QCA
tristate "Tag driver for Qualcomm Atheros QCA8K switches"
help
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 2c6d286f0511..9a482c38bdb1 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o
obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
+obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o
obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o
obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 53e7577896b6..2dd86d9bcda9 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -153,8 +153,8 @@ void dsa_port_link_unregister_of(struct dsa_port *dp);
void dsa_port_phylink_validate(struct phylink_config *config,
unsigned long *supported,
struct phylink_link_state *state);
-int dsa_port_phylink_mac_link_state(struct phylink_config *config,
- struct phylink_link_state *state);
+void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config,
+ struct phylink_link_state *state);
void dsa_port_phylink_mac_config(struct phylink_config *config,
unsigned int mode,
const struct phylink_link_state *state);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 6e93c36bf0c0..46ac9ba21987 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -429,19 +429,22 @@ void dsa_port_phylink_validate(struct phylink_config *config,
}
EXPORT_SYMBOL_GPL(dsa_port_phylink_validate);
-int dsa_port_phylink_mac_link_state(struct phylink_config *config,
- struct phylink_link_state *state)
+void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config,
+ struct phylink_link_state *state)
{
struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
struct dsa_switch *ds = dp->ds;
- /* Only called for SGMII and 802.3z */
- if (!ds->ops->phylink_mac_link_state)
- return -EOPNOTSUPP;
+ /* Only called for inband modes */
+ if (!ds->ops->phylink_mac_link_state) {
+ state->link = 0;
+ return;
+ }
- return ds->ops->phylink_mac_link_state(ds, dp->index, state);
+ if (ds->ops->phylink_mac_link_state(ds, dp->index, state) < 0)
+ state->link = 0;
}
-EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_state);
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_pcs_get_state);
void dsa_port_phylink_mac_config(struct phylink_config *config,
unsigned int mode,
@@ -510,7 +513,7 @@ EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up);
const struct phylink_mac_ops dsa_port_phylink_mac_ops = {
.validate = dsa_port_phylink_validate,
- .mac_link_state = dsa_port_phylink_mac_link_state,
+ .mac_pcs_get_state = dsa_port_phylink_mac_pcs_get_state,
.mac_config = dsa_port_phylink_mac_config,
.mac_an_restart = dsa_port_phylink_mac_an_restart,
.mac_link_down = dsa_port_phylink_mac_link_down,
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 73632d21f1a6..2fb6c26294b5 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -105,7 +105,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port)
slave = dsa_to_port(ds, port)->slave;
err = br_vlan_get_pvid(slave, &pvid);
- if (err < 0)
+ if (!pvid || err < 0)
/* There is no pvid on the bridge for this port, which is
* perfectly valid. Nothing to restore, bye-bye!
*/
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
new file mode 100644
index 000000000000..8e3e7283d430
--- /dev/null
+++ b/net/dsa/tag_ocelot.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2019 NXP Semiconductors
+ */
+#include <soc/mscc/ocelot.h>
+#include <linux/packing.h>
+#include "dsa_priv.h"
+
+/* The CPU injection header and the CPU extraction header can have 3 types of
+ * prefixes: long, short and no prefix. The format of the header itself is the
+ * same in all 3 cases.
+ *
+ * Extraction with long prefix:
+ *
+ * +-------------------+-------------------+------+------+------------+-------+
+ * | ff:ff:ff:ff:ff:ff | ff:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame |
+ * | | | | | header | |
+ * +-------------------+-------------------+------+------+------------+-------+
+ * 48 bits 48 bits 16 bits 16 bits 128 bits
+ *
+ * Extraction with short prefix:
+ *
+ * +------+------+------------+-------+
+ * | 8880 | 000a | extraction | frame |
+ * | | | header | |
+ * +------+------+------------+-------+
+ * 16 bits 16 bits 128 bits
+ *
+ * Extraction with no prefix:
+ *
+ * +------------+-------+
+ * | extraction | frame |
+ * | header | |
+ * +------------+-------+
+ * 128 bits
+ *
+ *
+ * Injection with long prefix:
+ *
+ * +-------------------+-------------------+------+------+------------+-------+
+ * | any dmac | any smac | 8880 | 000a | injection | frame |
+ * | | | | | header | |
+ * +-------------------+-------------------+------+------+------------+-------+
+ * 48 bits 48 bits 16 bits 16 bits 128 bits
+ *
+ * Injection with short prefix:
+ *
+ * +------+------+------------+-------+
+ * | 8880 | 000a | injection | frame |
+ * | | | header | |
+ * +------+------+------------+-------+
+ * 16 bits 16 bits 128 bits
+ *
+ * Injection with no prefix:
+ *
+ * +------------+-------+
+ * | injection | frame |
+ * | header | |
+ * +------------+-------+
+ * 128 bits
+ *
+ * The injection header looks like this (network byte order, bit 127
+ * is part of lowest address byte in memory, bit 0 is part of highest
+ * address byte):
+ *
+ * +------+------+------+------+------+------+------+------+
+ * 127:120 |BYPASS| MASQ | MASQ_PORT |REW_OP|REW_OP|
+ * +------+------+------+------+------+------+------+------+
+ * 119:112 | REW_OP |
+ * +------+------+------+------+------+------+------+------+
+ * 111:104 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 103: 96 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 95: 88 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 87: 80 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 79: 72 | RSV |
+ * +------+------+------+------+------+------+------+------+
+ * 71: 64 | RSV | DEST |
+ * +------+------+------+------+------+------+------+------+
+ * 63: 56 | DEST |
+ * +------+------+------+------+------+------+------+------+
+ * 55: 48 | RSV |
+ * +------+------+------+------+------+------+------+------+
+ * 47: 40 | RSV | SRC_PORT | RSV |TFRM_TIMER|
+ * +------+------+------+------+------+------+------+------+
+ * 39: 32 | TFRM_TIMER | RSV |
+ * +------+------+------+------+------+------+------+------+
+ * 31: 24 | RSV | DP | POP_CNT | CPUQ |
+ * +------+------+------+------+------+------+------+------+
+ * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE|
+ * +------+------+------+------+------+------+------+------+
+ * 15: 8 | PCP | DEI | VID |
+ * +------+------+------+------+------+------+------+------+
+ * 7: 0 | VID |
+ * +------+------+------+------+------+------+------+------+
+ *
+ * And the extraction header looks like this:
+ *
+ * +------+------+------+------+------+------+------+------+
+ * 127:120 | RSV | REW_OP |
+ * +------+------+------+------+------+------+------+------+
+ * 119:112 | REW_OP | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 111:104 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 103: 96 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 95: 88 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 87: 80 | REW_VAL | LLEN |
+ * +------+------+------+------+------+------+------+------+
+ * 79: 72 | LLEN | WLEN |
+ * +------+------+------+------+------+------+------+------+
+ * 71: 64 | WLEN | RSV |
+ * +------+------+------+------+------+------+------+------+
+ * 63: 56 | RSV |
+ * +------+------+------+------+------+------+------+------+
+ * 55: 48 | RSV |
+ * +------+------+------+------+------+------+------+------+
+ * 47: 40 | RSV | SRC_PORT | ACL_ID |
+ * +------+------+------+------+------+------+------+------+
+ * 39: 32 | ACL_ID | RSV | SFLOW_ID |
+ * +------+------+------+------+------+------+------+------+
+ * 31: 24 |ACL_HIT| DP | LRN_FLAGS | CPUQ |
+ * +------+------+------+------+------+------+------+------+
+ * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE|
+ * +------+------+------+------+------+------+------+------+
+ * 15: 8 | PCP | DEI | VID |
+ * +------+------+------+------+------+------+------+------+
+ * 7: 0 | VID |
+ * +------+------+------+------+------+------+------+------+
+ */
+
+static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(netdev);
+ u64 bypass, dest, src, qos_class, rew_op;
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+ struct ocelot *ocelot = ds->priv;
+ struct ocelot_port *ocelot_port = ocelot->ports[port];
+ u8 *injection;
+
+ if (unlikely(skb_cow_head(skb, OCELOT_TAG_LEN) < 0)) {
+ netdev_err(netdev, "Cannot make room for tag.\n");
+ return NULL;
+ }
+
+ injection = skb_push(skb, OCELOT_TAG_LEN);
+
+ memset(injection, 0, OCELOT_TAG_LEN);
+
+ src = dsa_upstream_port(ds, port);
+ dest = BIT(port);
+ bypass = true;
+ qos_class = skb->priority;
+
+ packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0);
+ packing(injection, &dest, 68, 56, OCELOT_TAG_LEN, PACK, 0);
+ packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0);
+ packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0);
+
+ if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) {
+ rew_op = ocelot_port->ptp_cmd;
+ if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) {
+ rew_op |= (ocelot_port->ts_id % 4) << 3;
+ ocelot_port->ts_id++;
+ }
+
+ packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0);
+ }
+
+ return skb;
+}
+
+static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
+ struct net_device *netdev,
+ struct packet_type *pt)
+{
+ u64 src_port, qos_class;
+ u8 *start = skb->data;
+ u8 *extraction;
+
+ /* Revert skb->data by the amount consumed by the DSA master,
+ * so it points to the beginning of the frame.
+ */
+ skb_push(skb, ETH_HLEN);
+ /* We don't care about the long prefix, it is just for easy entrance
+ * into the DSA master's RX filter. Discard it now by moving it into
+ * the headroom.
+ */
+ skb_pull(skb, OCELOT_LONG_PREFIX_LEN);
+ /* And skb->data now points to the extraction frame header.
+ * Keep a pointer to it.
+ */
+ extraction = skb->data;
+ /* Now the EFH is part of the headroom as well */
+ skb_pull(skb, OCELOT_TAG_LEN);
+ /* Reset the pointer to the real MAC header */
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+ /* And move skb->data to the correct location again */
+ skb_pull(skb, ETH_HLEN);
+
+ /* Remove from inet csum the extraction header */
+ skb_postpull_rcsum(skb, start, OCELOT_LONG_PREFIX_LEN + OCELOT_TAG_LEN);
+
+ packing(extraction, &src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0);
+ packing(extraction, &qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0);
+
+ skb->dev = dsa_master_find_slave(netdev, 0, src_port);
+ if (!skb->dev)
+ /* The switch will reflect back some frames sent through
+ * sockets opened on the bare DSA master. These will come back
+ * with src_port equal to the index of the CPU port, for which
+ * there is no slave registered. So don't print any error
+ * message here (ignore and drop those frames).
+ */
+ return NULL;
+
+ skb->offload_fwd_mark = 1;
+ skb->priority = qos_class;
+
+ return skb;
+}
+
+static struct dsa_device_ops ocelot_netdev_ops = {
+ .name = "ocelot",
+ .proto = DSA_TAG_PROTO_OCELOT,
+ .xmit = ocelot_xmit,
+ .rcv = ocelot_rcv,
+ .overhead = OCELOT_TAG_LEN + OCELOT_LONG_PREFIX_LEN,
+};
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT);
+
+module_dsa_tag_driver(ocelot_netdev_ops);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 03381f3e12ba..fc816b187170 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -180,8 +180,8 @@ config NET_IPIP
config NET_IPGRE_DEMUX
tristate "IP: GRE demultiplexer"
help
- This is helper module to demultiplex GRE packets on GRE version field criteria.
- Required by ip_gre and pptp modules.
+ This is a helper module to demultiplex GRE packets on GRE version field criteria.
+ Required by ip_gre and pptp modules.
config NET_IP_TUNNEL
tristate
@@ -459,200 +459,200 @@ config TCP_CONG_BIC
tristate "Binary Increase Congestion (BIC) control"
default m
---help---
- BIC-TCP is a sender-side only change that ensures a linear RTT
- fairness under large windows while offering both scalability and
- bounded TCP-friendliness. The protocol combines two schemes
- called additive increase and binary search increase. When the
- congestion window is large, additive increase with a large
- increment ensures linear RTT fairness as well as good
- scalability. Under small congestion windows, binary search
- increase provides TCP friendliness.
- See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+ BIC-TCP is a sender-side only change that ensures a linear RTT
+ fairness under large windows while offering both scalability and
+ bounded TCP-friendliness. The protocol combines two schemes
+ called additive increase and binary search increase. When the
+ congestion window is large, additive increase with a large
+ increment ensures linear RTT fairness as well as good
+ scalability. Under small congestion windows, binary search
+ increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
config TCP_CONG_CUBIC
tristate "CUBIC TCP"
default y
---help---
- This is version 2.0 of BIC-TCP which uses a cubic growth function
- among other techniques.
- See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+ This is version 2.0 of BIC-TCP which uses a cubic growth function
+ among other techniques.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
config TCP_CONG_WESTWOOD
tristate "TCP Westwood+"
default m
---help---
- TCP Westwood+ is a sender-side only modification of the TCP Reno
- protocol stack that optimizes the performance of TCP congestion
- control. It is based on end-to-end bandwidth estimation to set
- congestion window and slow start threshold after a congestion
- episode. Using this estimation, TCP Westwood+ adaptively sets a
- slow start threshold and a congestion window which takes into
- account the bandwidth used at the time congestion is experienced.
- TCP Westwood+ significantly increases fairness wrt TCP Reno in
- wired networks and throughput over wireless links.
+ TCP Westwood+ is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP congestion
+ control. It is based on end-to-end bandwidth estimation to set
+ congestion window and slow start threshold after a congestion
+ episode. Using this estimation, TCP Westwood+ adaptively sets a
+ slow start threshold and a congestion window which takes into
+ account the bandwidth used at the time congestion is experienced.
+ TCP Westwood+ significantly increases fairness wrt TCP Reno in
+ wired networks and throughput over wireless links.
config TCP_CONG_HTCP
tristate "H-TCP"
default m
---help---
- H-TCP is a send-side only modifications of the TCP Reno
- protocol stack that optimizes the performance of TCP
- congestion control for high speed network links. It uses a
- modeswitch to change the alpha and beta parameters of TCP Reno
- based on network conditions and in a way so as to be fair with
- other Reno and H-TCP flows.
+ H-TCP is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP
+ congestion control for high speed network links. It uses a
+ modeswitch to change the alpha and beta parameters of TCP Reno
+ based on network conditions and in a way so as to be fair with
+ other Reno and H-TCP flows.
config TCP_CONG_HSTCP
tristate "High Speed TCP"
default n
---help---
- Sally Floyd's High Speed TCP (RFC 3649) congestion control.
- A modification to TCP's congestion control mechanism for use
- with large congestion windows. A table indicates how much to
- increase the congestion window by when an ACK is received.
- For more detail see http://www.icir.org/floyd/hstcp.html
+ Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+ A modification to TCP's congestion control mechanism for use
+ with large congestion windows. A table indicates how much to
+ increase the congestion window by when an ACK is received.
+ For more detail see http://www.icir.org/floyd/hstcp.html
config TCP_CONG_HYBLA
tristate "TCP-Hybla congestion control algorithm"
default n
---help---
- TCP-Hybla is a sender-side only change that eliminates penalization of
- long-RTT, large-bandwidth connections, like when satellite legs are
- involved, especially when sharing a common bottleneck with normal
- terrestrial connections.
+ TCP-Hybla is a sender-side only change that eliminates penalization of
+ long-RTT, large-bandwidth connections, like when satellite legs are
+ involved, especially when sharing a common bottleneck with normal
+ terrestrial connections.
config TCP_CONG_VEGAS
tristate "TCP Vegas"
default n
---help---
- TCP Vegas is a sender-side only change to TCP that anticipates
- the onset of congestion by estimating the bandwidth. TCP Vegas
- adjusts the sending rate by modifying the congestion
- window. TCP Vegas should provide less packet loss, but it is
- not as aggressive as TCP Reno.
+ TCP Vegas is a sender-side only change to TCP that anticipates
+ the onset of congestion by estimating the bandwidth. TCP Vegas
+ adjusts the sending rate by modifying the congestion
+ window. TCP Vegas should provide less packet loss, but it is
+ not as aggressive as TCP Reno.
config TCP_CONG_NV
- tristate "TCP NV"
- default n
- ---help---
- TCP NV is a follow up to TCP Vegas. It has been modified to deal with
- 10G networks, measurement noise introduced by LRO, GRO and interrupt
- coalescence. In addition, it will decrease its cwnd multiplicatively
- instead of linearly.
+ tristate "TCP NV"
+ default n
+ ---help---
+ TCP NV is a follow up to TCP Vegas. It has been modified to deal with
+ 10G networks, measurement noise introduced by LRO, GRO and interrupt
+ coalescence. In addition, it will decrease its cwnd multiplicatively
+ instead of linearly.
- Note that in general congestion avoidance (cwnd decreased when # packets
- queued grows) cannot coexist with congestion control (cwnd decreased only
- when there is packet loss) due to fairness issues. One scenario when they
- can coexist safely is when the CA flows have RTTs << CC flows RTTs.
+ Note that in general congestion avoidance (cwnd decreased when # packets
+ queued grows) cannot coexist with congestion control (cwnd decreased only
+ when there is packet loss) due to fairness issues. One scenario when they
+ can coexist safely is when the CA flows have RTTs << CC flows RTTs.
- For further details see http://www.brakmo.org/networking/tcp-nv/
+ For further details see http://www.brakmo.org/networking/tcp-nv/
config TCP_CONG_SCALABLE
tristate "Scalable TCP"
default n
---help---
- Scalable TCP is a sender-side only change to TCP which uses a
- MIMD congestion control algorithm which has some nice scaling
- properties, though is known to have fairness issues.
- See http://www.deneholme.net/tom/scalable/
+ Scalable TCP is a sender-side only change to TCP which uses a
+ MIMD congestion control algorithm which has some nice scaling
+ properties, though is known to have fairness issues.
+ See http://www.deneholme.net/tom/scalable/
config TCP_CONG_LP
tristate "TCP Low Priority"
default n
---help---
- TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
- to utilize only the excess network bandwidth as compared to the
- ``fair share`` of bandwidth as targeted by TCP.
- See http://www-ece.rice.edu/networks/TCP-LP/
+ TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+ to utilize only the excess network bandwidth as compared to the
+ ``fair share`` of bandwidth as targeted by TCP.
+ See http://www-ece.rice.edu/networks/TCP-LP/
config TCP_CONG_VENO
tristate "TCP Veno"
default n
---help---
- TCP Veno is a sender-side only enhancement of TCP to obtain better
- throughput over wireless networks. TCP Veno makes use of state
- distinguishing to circumvent the difficult judgment of the packet loss
- type. TCP Veno cuts down less congestion window in response to random
- loss packets.
- See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
+ TCP Veno is a sender-side only enhancement of TCP to obtain better
+ throughput over wireless networks. TCP Veno makes use of state
+ distinguishing to circumvent the difficult judgment of the packet loss
+ type. TCP Veno reduces the congestion window less in response to random
+ packet loss.
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
config TCP_CONG_YEAH
tristate "YeAH TCP"
select TCP_CONG_VEGAS
default n
---help---
- YeAH-TCP is a sender-side high-speed enabled TCP congestion control
- algorithm, which uses a mixed loss/delay approach to compute the
- congestion window. It's design goals target high efficiency,
- internal, RTT and Reno fairness, resilience to link loss while
- keeping network elements load as low as possible.
+ YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+ algorithm, which uses a mixed loss/delay approach to compute the
+ congestion window. Its design goals target high efficiency,
+ internal, RTT and Reno fairness, resilience to link loss while
+ keeping network elements load as low as possible.
- For further details look here:
- http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+ For further details look here:
+ http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
config TCP_CONG_ILLINOIS
tristate "TCP Illinois"
default n
---help---
- TCP-Illinois is a sender-side modification of TCP Reno for
- high speed long delay links. It uses round-trip-time to
- adjust the alpha and beta parameters to achieve a higher average
- throughput and maintain fairness.
+ TCP-Illinois is a sender-side modification of TCP Reno for
+ high speed long delay links. It uses round-trip-time to
+ adjust the alpha and beta parameters to achieve a higher average
+ throughput and maintain fairness.
- For further details see:
- http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+ For further details see:
+ http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
config TCP_CONG_DCTCP
tristate "DataCenter TCP (DCTCP)"
default n
---help---
- DCTCP leverages Explicit Congestion Notification (ECN) in the network to
- provide multi-bit feedback to the end hosts. It is designed to provide:
+ DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+ provide multi-bit feedback to the end hosts. It is designed to provide:
- - High burst tolerance (incast due to partition/aggregate),
- - Low latency (short flows, queries),
- - High throughput (continuous data updates, large file transfers) with
- commodity, shallow-buffered switches.
+ - High burst tolerance (incast due to partition/aggregate),
+ - Low latency (short flows, queries),
+ - High throughput (continuous data updates, large file transfers) with
+ commodity, shallow-buffered switches.
- All switches in the data center network running DCTCP must support
- ECN marking and be configured for marking when reaching defined switch
- buffer thresholds. The default ECN marking threshold heuristic for
- DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
- (~100KB) at 10Gbps, but might need further careful tweaking.
+ All switches in the data center network running DCTCP must support
+ ECN marking and be configured for marking when reaching defined switch
+ buffer thresholds. The default ECN marking threshold heuristic for
+ DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+ (~100KB) at 10Gbps, but might need further careful tweaking.
- For further details see:
- http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+ For further details see:
+ http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
config TCP_CONG_CDG
tristate "CAIA Delay-Gradient (CDG)"
default n
---help---
- CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
- the TCP sender in order to:
+ CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
+ the TCP sender in order to:
o Use the delay gradient as a congestion signal.
o Back off with an average probability that is independent of the RTT.
o Coexist with flows that use loss-based congestion control.
o Tolerate packet loss unrelated to congestion.
- For further details see:
- D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
- delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+ For further details see:
+ D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
config TCP_CONG_BBR
tristate "BBR TCP"
default n
---help---
- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
- maximize network utilization and minimize queues. It builds an explicit
- model of the the bottleneck delivery rate and path round-trip
- propagation delay. It tolerates packet loss and delay unrelated to
- congestion. It can operate over LAN, WAN, cellular, wifi, or cable
- modem links. It can coexist with flows that use loss-based congestion
- control, and can operate with shallow buffers, deep buffers,
- bufferbloat, policers, or AQM schemes that do not provide a delay
- signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+ maximize network utilization and minimize queues. It builds an explicit
+ model of the bottleneck delivery rate and path round-trip
+ propagation delay. It tolerates packet loss and delay unrelated to
+ congestion. It can operate over LAN, WAN, cellular, wifi, or cable
+ modem links. It can coexist with flows that use loss-based congestion
+ control, and can operate with shallow buffers, deep buffers,
+ bufferbloat, policers, or AQM schemes that do not provide a delay
+ signal. It requires the fq ("Fair Queue") pacing packet scheduler.
choice
prompt "Default TCP congestion control"
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 71c78d223dfd..577db1d50a24 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -70,11 +70,6 @@ fail:
fib_free_table(main_table);
return -ENOMEM;
}
-
-static bool fib4_has_custom_rules(struct net *net)
-{
- return false;
-}
#else
struct fib_table *fib_new_table(struct net *net, u32 id)
@@ -131,11 +126,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
}
return NULL;
}
-
-static bool fib4_has_custom_rules(struct net *net)
-{
- return net->ipv4.fib_has_custom_rules;
-}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
static void fib_replace_table(struct net *net, struct fib_table *old,
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 10636fb6093e..572b6307a2df 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -340,6 +340,8 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
+ const struct iphdr *tnl_params;
+
if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
raw_proto, false) < 0)
goto drop;
@@ -348,7 +350,9 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
skb_pop_mac_header(skb);
else
skb_reset_mac_header(skb);
- if (tunnel->collect_md) {
+
+ tnl_params = &tunnel->parms.iph;
+ if (tunnel->collect_md || tnl_params->daddr == 0) {
__be16 flags;
__be64 tun_id;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 24a95126e698..aa438c6758a7 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -302,16 +302,31 @@ drop:
return true;
}
+/* A hint is usable only when one was provided, @skb has no dst attached yet,
+ * and the hint packet carries the same destination address and TOS as @iph.
+ */
+static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
+			    const struct sk_buff *hint)
+{
+	const struct iphdr *hint_iph;
+
+	if (!hint || skb_dst(skb))
+		return false;
+
+	hint_iph = ip_hdr(hint);
+	return hint_iph->daddr == iph->daddr && hint_iph->tos == iph->tos;
+}
+
INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *));
static int ip_rcv_finish_core(struct net *net, struct sock *sk,
- struct sk_buff *skb, struct net_device *dev)
+ struct sk_buff *skb, struct net_device *dev,
+ const struct sk_buff *hint)
{
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
struct rtable *rt;
int err;
+ if (ip_can_use_hint(skb, iph, hint)) {
+ err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
+ dev, hint);
+ if (unlikely(err))
+ goto drop_error;
+ }
+
if (net->ipv4.sysctl_ip_early_demux &&
!skb_dst(skb) &&
!skb->sk &&
@@ -408,7 +423,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
if (!skb)
return NET_RX_SUCCESS;
- ret = ip_rcv_finish_core(net, sk, skb, dev);
+ ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
@@ -535,11 +550,20 @@ static void ip_sublist_rcv_finish(struct list_head *head)
}
}
+/* Pick the skb to be used as route hint for the packets that follow it.
+ * Returns @skb itself, or NULL when custom FIB rules are in place or the
+ * route type is RTN_BROADCAST, in which case no hint is offered.
+ */
+static struct sk_buff *ip_extract_route_hint(const struct net *net,
+					     struct sk_buff *skb, int rt_type)
+{
+	bool hint_ok = !fib4_has_custom_rules(net) &&
+		       rt_type != RTN_BROADCAST;
+
+	return hint_ok ? skb : NULL;
+}
+
static void ip_list_rcv_finish(struct net *net, struct sock *sk,
struct list_head *head)
{
+ struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
- struct sk_buff *skb, *next;
struct list_head sublist;
INIT_LIST_HEAD(&sublist);
@@ -554,11 +578,14 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
skb = l3mdev_ip_rcv(skb);
if (!skb)
continue;
- if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP)
+ if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP)
continue;
dst = skb_dst(skb);
if (curr_dst != dst) {
+ hint = ip_extract_route_hint(net, skb,
+ ((struct rtable *)dst)->rt_type);
+
/* dispatch old sublist */
if (!list_empty(&sublist))
ip_sublist_rcv_finish(&sublist);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 3d8baaaf7086..9d83cb320dcb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -422,7 +422,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_device *dev = skb_dst(skb)->dev;
+ struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
@@ -430,7 +430,7 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
skb->protocol = htons(ETH_P_IP);
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, dev,
+ net, sk, skb, indev, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index c724fb30d048..47f8b947eef1 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -215,6 +215,7 @@ void ip_tunnel_get_stats64(struct net_device *dev,
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
+ [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS },
[LWTUNNEL_IP_ID] = { .type = NLA_U64 },
[LWTUNNEL_IP_DST] = { .type = NLA_U32 },
[LWTUNNEL_IP_SRC] = { .type = NLA_U32 },
@@ -251,7 +252,7 @@ erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = {
};
static int ip_tun_parse_opts_geneve(struct nlattr *attr,
- struct ip_tunnel_info *info,
+ struct ip_tunnel_info *info, int opts_len,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1];
@@ -273,7 +274,7 @@ static int ip_tun_parse_opts_geneve(struct nlattr *attr,
return -EINVAL;
if (info) {
- struct geneve_opt *opt = ip_tunnel_info_opts(info);
+ struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len;
memcpy(opt->opt_data, nla_data(attr), data_len);
opt->length = data_len / 4;
@@ -288,7 +289,7 @@ static int ip_tun_parse_opts_geneve(struct nlattr *attr,
}
static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
- struct ip_tunnel_info *info,
+ struct ip_tunnel_info *info, int opts_len,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1];
@@ -303,7 +304,8 @@ static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
return -EINVAL;
if (info) {
- struct vxlan_metadata *md = ip_tunnel_info_opts(info);
+ struct vxlan_metadata *md =
+ ip_tunnel_info_opts(info) + opts_len;
attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP];
md->gbp = nla_get_u32(attr);
@@ -314,11 +316,12 @@ static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
}
static int ip_tun_parse_opts_erspan(struct nlattr *attr,
- struct ip_tunnel_info *info,
+ struct ip_tunnel_info *info, int opts_len,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1];
int err;
+ u8 ver;
err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr,
erspan_opt_policy, extack);
@@ -328,23 +331,31 @@ static int ip_tun_parse_opts_erspan(struct nlattr *attr,
if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER])
return -EINVAL;
- if (info) {
- struct erspan_metadata *md = ip_tunnel_info_opts(info);
+ ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]);
+ if (ver == 1) {
+ if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX])
+ return -EINVAL;
+ } else if (ver == 2) {
+ if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] ||
+ !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID])
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
- attr = tb[LWTUNNEL_IP_OPT_ERSPAN_VER];
- md->version = nla_get_u8(attr);
+ if (info) {
+ struct erspan_metadata *md =
+ ip_tunnel_info_opts(info) + opts_len;
- if (md->version == 1 && tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]) {
+ md->version = ver;
+ if (ver == 1) {
attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX];
md->u.index = nla_get_be32(attr);
- } else if (md->version == 2 && tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] &&
- tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]) {
+ } else {
attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR];
md->u.md2.dir = nla_get_u8(attr);
attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID];
set_hwid(&md->u.md2, nla_get_u8(attr));
- } else {
- return -EINVAL;
}
info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
@@ -356,30 +367,57 @@ static int ip_tun_parse_opts_erspan(struct nlattr *attr,
static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
struct netlink_ext_ack *extack)
{
- struct nlattr *tb[LWTUNNEL_IP_OPTS_MAX + 1];
- int err;
+ int err, rem, opt_len, opts_len = 0, type = 0;
+ struct nlattr *nla;
if (!attr)
return 0;
- err = nla_parse_nested(tb, LWTUNNEL_IP_OPTS_MAX, attr,
- ip_opts_policy, extack);
+ err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX,
+ ip_opts_policy, extack);
if (err)
return err;
- if (tb[LWTUNNEL_IP_OPTS_GENEVE])
- err = ip_tun_parse_opts_geneve(tb[LWTUNNEL_IP_OPTS_GENEVE],
- info, extack);
- else if (tb[LWTUNNEL_IP_OPTS_VXLAN])
- err = ip_tun_parse_opts_vxlan(tb[LWTUNNEL_IP_OPTS_VXLAN],
- info, extack);
- else if (tb[LWTUNNEL_IP_OPTS_ERSPAN])
- err = ip_tun_parse_opts_erspan(tb[LWTUNNEL_IP_OPTS_ERSPAN],
- info, extack);
- else
- err = -EINVAL;
+ nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
+ switch (nla_type(nla)) {
+ case LWTUNNEL_IP_OPTS_GENEVE:
+ if (type && type != TUNNEL_GENEVE_OPT)
+ return -EINVAL;
+ opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len,
+ extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ if (opts_len > IP_TUNNEL_OPTS_MAX)
+ return -EINVAL;
+ type = TUNNEL_GENEVE_OPT;
+ break;
+ case LWTUNNEL_IP_OPTS_VXLAN:
+ if (type)
+ return -EINVAL;
+ opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len,
+ extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_VXLAN_OPT;
+ break;
+ case LWTUNNEL_IP_OPTS_ERSPAN:
+ if (type)
+ return -EINVAL;
+ opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len,
+ extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_ERSPAN_OPT;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
- return err;
+ return opts_len;
}
static int ip_tun_get_optlen(struct nlattr *attr,
@@ -477,18 +515,23 @@ static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb,
{
struct geneve_opt *opt;
struct nlattr *nest;
+ int offset = 0;
nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE);
if (!nest)
return -ENOMEM;
- opt = ip_tunnel_info_opts(tun_info);
- if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, opt->opt_class) ||
- nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) ||
- nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4,
- opt->opt_data)) {
- nla_nest_cancel(skb, nest);
- return -ENOMEM;
+ while (tun_info->options_len > offset) {
+ opt = ip_tunnel_info_opts(tun_info) + offset;
+ if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS,
+ opt->opt_class) ||
+ nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) ||
+ nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4,
+ opt->opt_data)) {
+ nla_nest_cancel(skb, nest);
+ return -ENOMEM;
+ }
+ offset += sizeof(*opt) + opt->length * 4;
}
nla_nest_end(skb, nest);
@@ -526,7 +569,7 @@ static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb,
return -ENOMEM;
md = ip_tunnel_info_opts(tun_info);
- if (nla_put_u32(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version))
+ if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version))
goto err;
if (md->version == 1 &&
@@ -602,13 +645,18 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */
if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
- struct geneve_opt *opt = ip_tunnel_info_opts(info);
-
- opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_GENEVE */
- + nla_total_size(2) /* OPT_GENEVE_CLASS */
- + nla_total_size(1) /* OPT_GENEVE_TYPE */
- + nla_total_size(opt->length * 4);
- /* OPT_GENEVE_DATA */
+ struct geneve_opt *opt;
+ int offset = 0;
+
+ opt_len += nla_total_size(0); /* LWTUNNEL_IP_OPTS_GENEVE */
+ while (info->options_len > offset) {
+ opt = ip_tunnel_info_opts(info) + offset;
+ opt_len += nla_total_size(2) /* OPT_GENEVE_CLASS */
+ + nla_total_size(1) /* OPT_GENEVE_TYPE */
+ + nla_total_size(opt->length * 4);
+ /* OPT_GENEVE_DATA */
+ offset += sizeof(*opt) + opt->length * 4;
+ }
} else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */
+ nla_total_size(4); /* OPT_VXLAN_GBP */
@@ -661,12 +709,14 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
};
static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
+ [LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS },
[LWTUNNEL_IP6_ID] = { .type = NLA_U64 },
[LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) },
[LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) },
[LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 },
[LWTUNNEL_IP6_TC] = { .type = NLA_U8 },
[LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
+ [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED },
};
static int ip6_tun_build_state(struct nlattr *attr,
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 32e20b758b68..f35308ff84c3 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1412,6 +1412,9 @@ static int __init wait_for_devices(void)
struct net_device *dev;
int found = 0;
+ /* make sure deferred device probes are finished */
+ wait_for_device_probe();
+
rtnl_lock();
for_each_netdev(&init_net, dev) {
if (ic_is_init_dev(dev)) {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 440294bdb752..6e68def66822 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2291,7 +2291,8 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
rcu_read_unlock();
return -ENODEV;
}
- skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr));
if (!skb2) {
read_unlock(&mrt_lock);
rcu_read_unlock();
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 168b72e18be0..e32e41b99f0f 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -10,7 +10,7 @@ static struct nf_flowtable_type flowtable_ipv4 = {
.family = NFPROTO_IPV4,
.init = nf_flow_table_init,
.setup = nf_flow_table_offload_setup,
- .action = nf_flow_rule_route,
+ .action = nf_flow_rule_route_ipv4,
.free = nf_flow_table_free,
.hook = nf_flow_offload_ip_hook,
.owner = THIS_MODULE,
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index fc34fd1668d6..511eaa94e2d1 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -23,7 +23,6 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
- [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 },
[NHA_ID] = { .type = NLA_U32 },
[NHA_GROUP] = { .type = NLA_BINARY },
[NHA_GROUP_TYPE] = { .type = NLA_U16 },
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index dcc4fa10138d..f88c93c38f11 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2019,10 +2019,52 @@ static int ip_mkroute_input(struct sk_buff *skb,
return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
+/* Implements all the saddr-related checks as ip_route_input_slow(),
+ * assuming daddr is valid and the destination is not a local broadcast one.
+ * Uses the route attached to the provided hint skb instead of performing a
+ * route lookup.
+ *
+ * Returns 0 after copying the hint's dst to @skb. Returns -EINVAL when @dev
+ * has no in_device or the source address is a martian one, or the negative
+ * value reported by fib_validate_source().
+ */
+int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+		      u8 tos, struct net_device *dev,
+		      const struct sk_buff *hint)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	/* @hint is an skb; the route is the dst attached to it, so it must be
+	 * fetched through the accessor rather than by casting the skb itself.
+	 */
+	struct rtable *rt = skb_rtable(hint);
+	struct net *net = dev_net(dev);
+	int err = -EINVAL;
+	u32 tag = 0;
+
+	/* in_dev is used unconditionally by the checks and helpers below,
+	 * including the martian logging path, so bail out directly.
+	 */
+	if (!in_dev)
+		return -EINVAL;
+
+	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
+		goto martian_source;
+
+	if (ipv4_is_zeronet(saddr))
+		goto martian_source;
+
+	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+		goto martian_source;
+
+	/* source validation is only performed for locally delivered routes */
+	if (rt->rt_type != RTN_LOCAL)
+		goto skip_validate_source;
+
+	tos &= IPTOS_RT_MASK;
+	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
+	if (err < 0)
+		goto martian_source;
+
+skip_validate_source:
+	skb_dst_copy(skb, hint);
+	return 0;
+
+martian_source:
+	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+	return err;
+}
+
/*
* NOTE. We drop all the packets that has local source
* addresses, because every properly looped back packet
* must have correct destination already attached by output routine.
+ * Changes in the enforced policies must be applied also to
+ * ip_route_use_hint().
*
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 59ded25acd04..fcb2cd167f64 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -340,6 +340,10 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
user_key[i * 4 + 1],
user_key[i * 4 + 2],
user_key[i * 4 + 3]);
+
+ if (WARN_ON_ONCE(off >= tbl.maxlen - 1))
+ break;
+
if (i + 1 < n_keys)
off += snprintf(tbl.data + off, tbl.maxlen - off, ",");
}
@@ -1037,7 +1041,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_fib_multipath_hash_policy,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = &two,
},
#endif
{
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index c445a81d144e..3737ec096650 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -256,6 +256,9 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ca->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
@@ -285,6 +288,9 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ca->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 4849edb62d52..12ab5db2b71c 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -92,6 +92,9 @@ void tcp_get_available_ulp(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ulp_ops->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1d58ce829dca..4da5758cc718 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1297,6 +1297,27 @@ out:
#define UDP_SKB_IS_STATELESS 0x80000000
+/* udp_rcv() clears every head state (dst, sk, nf conntrack) apart from the
+ * skb extensions.
+ *
+ * A secpath, when present, has to survive so that IP_CMSG_PASSSEC can still
+ * be processed at recvmsg() time; any other extension can be cleared.
+ *
+ * Returns true when the skb ends up without extensions, false when a secpath
+ * is present and the extensions are therefore left in place.
+ */
+static bool udp_try_make_stateless(struct sk_buff *skb)
+{
+	if (skb_has_extensions(skb)) {
+		if (secpath_exists(skb))
+			return false;
+
+		skb_ext_reset(skb);
+	}
+
+	return true;
+}
+
static void udp_set_dev_scratch(struct sk_buff *skb)
{
struct udp_dev_scratch *scratch = udp_skb_scratch(skb);
@@ -1308,11 +1329,7 @@ static void udp_set_dev_scratch(struct sk_buff *skb)
scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
scratch->is_linear = !skb_is_nonlinear(skb);
#endif
- /* all head states execept sp (dst, sk, nf) are always cleared by
- * udp_rcv() and we need to preserve secpath, if present, to eventually
- * process IP_CMSG_PASSSEC at recvmsg() time
- */
- if (likely(!skb_sec_path(skb)))
+ if (udp_try_make_stateless(skb))
scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
}
@@ -2534,9 +2551,11 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
case UDP_ENCAP:
switch (val) {
case 0:
+#ifdef CONFIG_XFRM
case UDP_ENCAP_ESPINUDP:
case UDP_ENCAP_ESPINUDP_NON_IKE:
up->encap_rcv = xfrm4_udp_encap_rcv;
+#endif
/* FALLTHROUGH */
case UDP_ENCAP_L2TPINUDP:
up->encap_type = val;
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index ecff3fce9807..89ba7c87de5d 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -92,7 +92,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, skb_dst(skb)->dev,
+ net, sk, skb, skb->dev, skb_dst(skb)->dev,
__xfrm4_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f66bc2af4e9d..7bae6a91b487 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1461,6 +1461,8 @@ out:
}
#endif
goto failure;
+ } else if (fib6_requires_src(rt)) {
+ fib6_routes_require_src_inc(info->nl_net);
}
return err;
@@ -1933,6 +1935,8 @@ int fib6_del(struct fib6_info *rt, struct nl_info *info)
struct fib6_info *cur = rcu_dereference_protected(*rtp,
lockdep_is_held(&table->tb6_lock));
if (rt == cur) {
+ if (fib6_requires_src(cur))
+ fib6_routes_require_src_dec(info->nl_net);
fib6_del_route(table, fn, rtp, info);
return 0;
}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index ef7f707d9ae3..7b089d0ac8cd 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -86,11 +86,27 @@ static void ip6_sublist_rcv_finish(struct list_head *head)
}
}
+/* A hint is usable only when one was provided, @skb has no dst attached yet,
+ * and both packets carry the same IPv6 destination address.
+ */
+static bool ip6_can_use_hint(const struct sk_buff *skb,
+			     const struct sk_buff *hint)
+{
+	if (!hint || skb_dst(skb))
+		return false;
+
+	return ipv6_addr_equal(&ipv6_hdr(hint)->daddr, &ipv6_hdr(skb)->daddr);
+}
+
+/* Pick the skb to be used as route hint for the packets that follow it.
+ * Returns @skb itself, or NULL when some route requires the source address
+ * or custom FIB rules are in place, in which case no hint is offered.
+ */
+static struct sk_buff *ip6_extract_route_hint(const struct net *net,
+					      struct sk_buff *skb)
+{
+	bool hint_ok = !fib6_routes_require_src(net) &&
+		       !fib6_has_custom_rules(net);
+
+	return hint_ok ? skb : NULL;
+}
+
static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
struct list_head *head)
{
+ struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
- struct sk_buff *skb, *next;
struct list_head sublist;
INIT_LIST_HEAD(&sublist);
@@ -104,9 +120,15 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
skb = l3mdev_ip6_rcv(skb);
if (!skb)
continue;
- ip6_rcv_finish_core(net, sk, skb);
+
+ if (ip6_can_use_hint(skb, hint))
+ skb_dst_copy(skb, hint);
+ else
+ ip6_rcv_finish_core(net, sk, skb);
dst = skb_dst(skb);
if (curr_dst != dst) {
+ hint = ip6_extract_route_hint(net, skb);
+
/* dispatch old sublist */
if (!list_empty(&sublist))
ip6_sublist_rcv_finish(&sublist);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 71827b56c006..945508a7cb0f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -160,7 +160,7 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_device *dev = skb_dst(skb)->dev;
+ struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
skb->protocol = htons(ETH_P_IPV6);
@@ -173,7 +173,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
}
return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, dev,
+ net, sk, skb, indev, dev,
ip6_finish_output,
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 264c292e7dcc..79fc012dd2ca 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -363,8 +363,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
break;
case IPV6_TRANSPARENT:
- if (valbool && !ns_capable(net->user_ns, CAP_NET_ADMIN) &&
- !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ if (valbool && !ns_capable(net->user_ns, CAP_NET_RAW) &&
+ !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
retv = -EPERM;
break;
}
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 69443e9a3aa5..0594131fa46d 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -128,9 +128,9 @@ config IP6_NF_MATCH_HL
depends on NETFILTER_ADVANCED
select NETFILTER_XT_MATCH_HL
---help---
- This is a backwards-compat option for the user's convenience
- (e.g. when running oldconfig). It selects
- CONFIG_NETFILTER_XT_MATCH_HL.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MATCH_HL.
config IP6_NF_MATCH_IPV6HEADER
tristate '"ipv6header" IPv6 Extension Headers Match'
@@ -184,9 +184,9 @@ config IP6_NF_TARGET_HL
depends on NETFILTER_ADVANCED && IP6_NF_MANGLE
select NETFILTER_XT_TARGET_HL
---help---
- This is a backwards-compatible option for the user's convenience
- (e.g. when running oldconfig). It selects
- CONFIG_NETFILTER_XT_TARGET_HL.
+ This is a backwards-compatible option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_HL.
config IP6_NF_FILTER
tristate "Packet filtering"
@@ -245,14 +245,14 @@ config IP6_NF_RAW
# security table for MAC policy
config IP6_NF_SECURITY
- tristate "Security table"
- depends on SECURITY
- depends on NETFILTER_ADVANCED
- help
- This option adds a `security' table to iptables, for use
- with Mandatory Access Control (MAC) policy.
-
- If unsure, say N.
+ tristate "Security table"
+ depends on SECURITY
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `security' table to iptables, for use
+ with Mandatory Access Control (MAC) policy.
+
+ If unsure, say N.
config IP6_NF_NAT
tristate "ip6tables NAT support"
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index f069bc0dc056..a8566ee12e83 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -11,7 +11,7 @@ static struct nf_flowtable_type flowtable_ipv6 = {
.family = NFPROTO_IPV6,
.init = nf_flow_table_init,
.setup = nf_flow_table_offload_setup,
- .action = nf_flow_rule_route,
+ .action = nf_flow_rule_route_ipv6,
.free = nf_flow_table_free,
.hook = nf_flow_offload_ipv6_hook,
.owner = THIS_MODULE,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index edcb52543518..b59940416cb5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -634,7 +634,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
* Router Reachability Probe MUST be rate-limited
* to no more than one per minute.
*/
- if (fib6_nh->fib_nh_gw_family)
+ if (!fib6_nh->fib_nh_gw_family)
return;
nh_gw = &fib6_nh->fib_nh_gw6;
@@ -6199,6 +6199,9 @@ static int __net_init ip6_route_net_init(struct net *net)
dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
ip6_template_metrics, true);
INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
+#ifdef CONFIG_IPV6_SUBTREES
+ net->ipv6.fib6_routes_require_src = 0;
+#endif
#endif
net->ipv6.sysctl.flush_delay = 0;
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 9d4f75e0d33a..85a5447a3e8d 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -81,6 +81,11 @@ static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb)
if (!pskb_may_pull(skb, srhoff + len))
return NULL;
+ /* note that pskb_may_pull may change pointers in header;
+ * for this reason it is necessary to reload them when needed.
+ */
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
if (!seg6_validate_srh(srh, len))
return NULL;
@@ -144,8 +149,9 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr)
*daddr = *addr;
}
-int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
- u32 tbl_id)
+static int
+seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
+ u32 tbl_id, bool local_delivery)
{
struct net *net = dev_net(skb->dev);
struct ipv6hdr *hdr = ipv6_hdr(skb);
@@ -153,6 +159,7 @@ int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
struct dst_entry *dst = NULL;
struct rt6_info *rt;
struct flowi6 fl6;
+ int dev_flags = 0;
fl6.flowi6_iif = skb->dev->ifindex;
fl6.daddr = nhaddr ? *nhaddr : hdr->daddr;
@@ -177,7 +184,13 @@ int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
dst = &rt->dst;
}
- if (dst && dst->dev->flags & IFF_LOOPBACK && !dst->error) {
+ /* we want to discard traffic destined for local packet processing,
+ * if @local_delivery is set to false.
+ */
+ if (!local_delivery)
+ dev_flags |= IFF_LOOPBACK;
+
+ if (dst && (dst->dev->flags & dev_flags) && !dst->error) {
dst_release(dst);
dst = NULL;
}
@@ -194,6 +207,12 @@ out:
return dst->error;
}
+int seg6_lookup_nexthop(struct sk_buff *skb,
+ struct in6_addr *nhaddr, u32 tbl_id)
+{
+ return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false); /* local_delivery = false: a dst on a loopback device is dropped */
+}
+
/* regular endpoint function */
static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
{
@@ -336,6 +355,8 @@ static int input_action_end_dx6(struct sk_buff *skb,
if (!ipv6_addr_any(&slwt->nh6))
nhaddr = &slwt->nh6;
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
seg6_lookup_nexthop(skb, nhaddr, 0);
return dst_input(skb);
@@ -365,6 +386,8 @@ static int input_action_end_dx4(struct sk_buff *skb,
skb_dst_drop(skb);
+ skb_set_transport_header(skb, sizeof(struct iphdr));
+
err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev);
if (err)
goto drop;
@@ -385,7 +408,9 @@ static int input_action_end_dt6(struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto drop;
- seg6_lookup_nexthop(skb, NULL, slwt->table);
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ seg6_lookup_any_nexthop(skb, NULL, slwt->table, true);
return dst_input(skb);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index eecac1b7148e..fbe51d40bd7e 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -187,7 +187,7 @@ skip_frag:
int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, skb_dst(skb)->dev,
+ net, sk, skb, skb->dev, skb_dst(skb)->dev,
__xfrm6_output,
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index 4f03ebe732fa..6cbb1286d6c0 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -32,7 +32,8 @@ mac80211-y := \
chan.o \
trace.o mlme.o \
tdls.o \
- ocb.o
+ ocb.o \
+ airtime.o
mac80211-$(CONFIG_MAC80211_LEDS) += led.o
mac80211-$(CONFIG_MAC80211_DEBUGFS) += \
diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c
new file mode 100644
index 000000000000..63cb0028b02d
--- /dev/null
+++ b/net/mac80211/airtime.c
@@ -0,0 +1,597 @@
+// SPDX-License-Identifier: ISC
+/*
+ * Copyright (C) 2019 Felix Fietkau <nbd@nbd.name>
+ */
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+
+#define AVG_PKT_SIZE 1024
+
+/* Number of bits for an average sized packet */
+#define MCS_NBITS (AVG_PKT_SIZE << 3)
+
+/* Number of kilo-symbols (symbols * 1024) for a packet with (bps) bits per
+ * symbol. We use k-symbols to avoid rounding in the _TIME macros below.
+ */
+#define MCS_N_KSYMS(bps) DIV_ROUND_UP(MCS_NBITS << 10, (bps))
+
+/* Transmission time (in 1024 * usec) for a packet containing (ksyms) * 1024
+ * symbols.
+ */
+#define MCS_SYMBOL_TIME(sgi, ksyms) \
+ (sgi ? \
+ ((ksyms) * 4 * 18) / 20 : /* 3.6 us per sym */ \
+ ((ksyms) * 4) /* 4.0 us per sym */ \
+ )
+
+/* Transmit duration for the raw data part of an average sized packet */
+#define MCS_DURATION(streams, sgi, bps) \
+ ((u32)MCS_SYMBOL_TIME(sgi, MCS_N_KSYMS((streams) * (bps))))
+
+#define MCS_DURATION_S(shift, streams, sgi, bps) \
+ ((u16)((MCS_DURATION(streams, sgi, bps) >> shift)))
+
+/* These should match the values in enum nl80211_he_gi */
+#define HE_GI_08 0
+#define HE_GI_16 1
+#define HE_GI_32 2
+
+/* Transmission time (in 1024 * usec) for a packet containing (ksyms) k-symbols */
+#define HE_SYMBOL_TIME(gi, ksyms) \
+ (gi == HE_GI_08 ? \
+ ((ksyms) * 16 * 17) / 20 : /* 13.6 us per sym */ \
+ (gi == HE_GI_16 ? \
+ ((ksyms) * 16 * 18) / 20 : /* 14.4 us per sym */ \
+ ((ksyms) * 16) /* 16.0 us per sym */ \
+ ))
+
+/* Transmit duration for the raw data part of an average sized packet */
+#define HE_DURATION(streams, gi, bps) \
+ ((u32)HE_SYMBOL_TIME(gi, MCS_N_KSYMS((streams) * (bps))))
+
+#define HE_DURATION_S(shift, streams, gi, bps) \
+ (HE_DURATION(streams, gi, bps) >> shift)
+
+#define BW_20 0
+#define BW_40 1
+#define BW_80 2
+#define BW_160 3
+
+/*
+ * Define group sort order: HT40 -> SGI -> #streams
+ */
+#define IEEE80211_MAX_STREAMS 4
+#define IEEE80211_HT_STREAM_GROUPS 4 /* BW(=2) * SGI(=2) */
+#define IEEE80211_VHT_STREAM_GROUPS 8 /* BW(=4) * SGI(=2) */
+
+#define IEEE80211_HE_MAX_STREAMS 8
+#define IEEE80211_HE_STREAM_GROUPS 12 /* BW(=4) * GI(=3) */
+
+#define IEEE80211_HT_GROUPS_NB (IEEE80211_MAX_STREAMS * \
+ IEEE80211_HT_STREAM_GROUPS)
+#define IEEE80211_VHT_GROUPS_NB (IEEE80211_MAX_STREAMS * \
+ IEEE80211_VHT_STREAM_GROUPS)
+#define IEEE80211_HE_GROUPS_NB (IEEE80211_HE_MAX_STREAMS * \
+ IEEE80211_HE_STREAM_GROUPS)
+#define IEEE80211_GROUPS_NB (IEEE80211_HT_GROUPS_NB + \
+ IEEE80211_VHT_GROUPS_NB + \
+ IEEE80211_HE_GROUPS_NB)
+
+#define IEEE80211_HT_GROUP_0 0
+#define IEEE80211_VHT_GROUP_0 (IEEE80211_HT_GROUP_0 + IEEE80211_HT_GROUPS_NB)
+#define IEEE80211_HE_GROUP_0 (IEEE80211_VHT_GROUP_0 + IEEE80211_VHT_GROUPS_NB)
+
+#define MCS_GROUP_RATES 12
+
+#define HT_GROUP_IDX(_streams, _sgi, _ht40) \
+ IEEE80211_HT_GROUP_0 + \
+ IEEE80211_MAX_STREAMS * 2 * _ht40 + \
+ IEEE80211_MAX_STREAMS * _sgi + \
+ _streams - 1
+
+#define _MAX(a, b) (((a)>(b))?(a):(b))
+
+#define GROUP_SHIFT(duration) \
+ _MAX(0, 16 - __builtin_clz(duration))
+
+/* MCS rate information for an MCS group */
+#define __MCS_GROUP(_streams, _sgi, _ht40, _s) \
+ [HT_GROUP_IDX(_streams, _sgi, _ht40)] = { \
+ .shift = _s, \
+ .duration = { \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 54 : 26), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 108 : 52), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 162 : 78), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 216 : 104), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 324 : 156), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 432 : 208), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 486 : 234), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 540 : 260) \
+ } \
+}
+
+#define MCS_GROUP_SHIFT(_streams, _sgi, _ht40) \
+ GROUP_SHIFT(MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26))
+
+#define MCS_GROUP(_streams, _sgi, _ht40) \
+ __MCS_GROUP(_streams, _sgi, _ht40, \
+ MCS_GROUP_SHIFT(_streams, _sgi, _ht40))
+
+#define VHT_GROUP_IDX(_streams, _sgi, _bw) \
+ (IEEE80211_VHT_GROUP_0 + \
+ IEEE80211_MAX_STREAMS * 2 * (_bw) + \
+ IEEE80211_MAX_STREAMS * (_sgi) + \
+ (_streams) - 1)
+
+#define BW2VBPS(_bw, r4, r3, r2, r1) \
+ (_bw == BW_160 ? r4 : _bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1)
+
+#define __VHT_GROUP(_streams, _sgi, _bw, _s) \
+ [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \
+ .shift = _s, \
+ .duration = { \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 234, 117, 54, 26)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 468, 234, 108, 52)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 702, 351, 162, 78)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 936, 468, 216, 104)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 1404, 702, 324, 156)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 1872, 936, 432, 208)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 2106, 1053, 486, 234)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 2340, 1170, 540, 260)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 2808, 1404, 648, 312)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 3120, 1560, 720, 346)) \
+ } \
+}
+
+#define VHT_GROUP_SHIFT(_streams, _sgi, _bw) \
+ GROUP_SHIFT(MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 234, 117, 54, 26))) /* MCS 0 bits per symbol, same values as the first __VHT_GROUP entry */
+
+#define VHT_GROUP(_streams, _sgi, _bw) \
+ __VHT_GROUP(_streams, _sgi, _bw, \
+ VHT_GROUP_SHIFT(_streams, _sgi, _bw))
+
+
+#define HE_GROUP_IDX(_streams, _gi, _bw) \
+ (IEEE80211_HE_GROUP_0 + \
+ IEEE80211_HE_MAX_STREAMS * 3 * (_bw) + \
+ IEEE80211_HE_MAX_STREAMS * (_gi) + \
+ (_streams) - 1)
+
+#define __HE_GROUP(_streams, _gi, _bw, _s) \
+ [HE_GROUP_IDX(_streams, _gi, _bw)] = { \
+ .shift = _s, \
+ .duration = { \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 979, 489, 230, 115)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 1958, 979, 475, 230)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 2937, 1468, 705, 345)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 3916, 1958, 936, 475)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 5875, 2937, 1411, 705)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 7833, 3916, 1872, 936)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 8827, 4406, 2102, 1051)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 9806, 4896, 2347, 1166)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 11764, 5875, 2808, 1411)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 13060, 6523, 3124, 1555)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 14702, 7344, 3513, 1756)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 16329, 8164, 3902, 1944)) \
+ } \
+}
+
+#define HE_GROUP_SHIFT(_streams, _gi, _bw) \
+ GROUP_SHIFT(HE_DURATION(_streams, _gi, \
+ BW2VBPS(_bw, 979, 489, 230, 115)))
+
+#define HE_GROUP(_streams, _gi, _bw) \
+ __HE_GROUP(_streams, _gi, _bw, \
+ HE_GROUP_SHIFT(_streams, _gi, _bw))
+struct mcs_group { /* precomputed duration data for one MCS group */
+ u8 shift; /* left shift applied to a duration[] entry when it is looked up */
+ u16 duration[MCS_GROUP_RATES]; /* per-rate duration of an average sized packet, stored right-shifted by shift */
+};
+
+static const struct mcs_group airtime_mcs_groups[] = {
+ MCS_GROUP(1, 0, BW_20),
+ MCS_GROUP(2, 0, BW_20),
+ MCS_GROUP(3, 0, BW_20),
+ MCS_GROUP(4, 0, BW_20),
+
+ MCS_GROUP(1, 1, BW_20),
+ MCS_GROUP(2, 1, BW_20),
+ MCS_GROUP(3, 1, BW_20),
+ MCS_GROUP(4, 1, BW_20),
+
+ MCS_GROUP(1, 0, BW_40),
+ MCS_GROUP(2, 0, BW_40),
+ MCS_GROUP(3, 0, BW_40),
+ MCS_GROUP(4, 0, BW_40),
+
+ MCS_GROUP(1, 1, BW_40),
+ MCS_GROUP(2, 1, BW_40),
+ MCS_GROUP(3, 1, BW_40),
+ MCS_GROUP(4, 1, BW_40),
+
+ VHT_GROUP(1, 0, BW_20),
+ VHT_GROUP(2, 0, BW_20),
+ VHT_GROUP(3, 0, BW_20),
+ VHT_GROUP(4, 0, BW_20),
+
+ VHT_GROUP(1, 1, BW_20),
+ VHT_GROUP(2, 1, BW_20),
+ VHT_GROUP(3, 1, BW_20),
+ VHT_GROUP(4, 1, BW_20),
+
+ VHT_GROUP(1, 0, BW_40),
+ VHT_GROUP(2, 0, BW_40),
+ VHT_GROUP(3, 0, BW_40),
+ VHT_GROUP(4, 0, BW_40),
+
+ VHT_GROUP(1, 1, BW_40),
+ VHT_GROUP(2, 1, BW_40),
+ VHT_GROUP(3, 1, BW_40),
+ VHT_GROUP(4, 1, BW_40),
+
+ VHT_GROUP(1, 0, BW_80),
+ VHT_GROUP(2, 0, BW_80),
+ VHT_GROUP(3, 0, BW_80),
+ VHT_GROUP(4, 0, BW_80),
+
+ VHT_GROUP(1, 1, BW_80),
+ VHT_GROUP(2, 1, BW_80),
+ VHT_GROUP(3, 1, BW_80),
+ VHT_GROUP(4, 1, BW_80),
+
+ VHT_GROUP(1, 0, BW_160),
+ VHT_GROUP(2, 0, BW_160),
+ VHT_GROUP(3, 0, BW_160),
+ VHT_GROUP(4, 0, BW_160),
+
+ VHT_GROUP(1, 1, BW_160),
+ VHT_GROUP(2, 1, BW_160),
+ VHT_GROUP(3, 1, BW_160),
+ VHT_GROUP(4, 1, BW_160),
+
+ HE_GROUP(1, HE_GI_08, BW_20),
+ HE_GROUP(2, HE_GI_08, BW_20),
+ HE_GROUP(3, HE_GI_08, BW_20),
+ HE_GROUP(4, HE_GI_08, BW_20),
+ HE_GROUP(5, HE_GI_08, BW_20),
+ HE_GROUP(6, HE_GI_08, BW_20),
+ HE_GROUP(7, HE_GI_08, BW_20),
+ HE_GROUP(8, HE_GI_08, BW_20),
+
+ HE_GROUP(1, HE_GI_16, BW_20),
+ HE_GROUP(2, HE_GI_16, BW_20),
+ HE_GROUP(3, HE_GI_16, BW_20),
+ HE_GROUP(4, HE_GI_16, BW_20),
+ HE_GROUP(5, HE_GI_16, BW_20),
+ HE_GROUP(6, HE_GI_16, BW_20),
+ HE_GROUP(7, HE_GI_16, BW_20),
+ HE_GROUP(8, HE_GI_16, BW_20),
+
+ HE_GROUP(1, HE_GI_32, BW_20),
+ HE_GROUP(2, HE_GI_32, BW_20),
+ HE_GROUP(3, HE_GI_32, BW_20),
+ HE_GROUP(4, HE_GI_32, BW_20),
+ HE_GROUP(5, HE_GI_32, BW_20),
+ HE_GROUP(6, HE_GI_32, BW_20),
+ HE_GROUP(7, HE_GI_32, BW_20),
+ HE_GROUP(8, HE_GI_32, BW_20),
+
+ HE_GROUP(1, HE_GI_08, BW_40),
+ HE_GROUP(2, HE_GI_08, BW_40),
+ HE_GROUP(3, HE_GI_08, BW_40),
+ HE_GROUP(4, HE_GI_08, BW_40),
+ HE_GROUP(5, HE_GI_08, BW_40),
+ HE_GROUP(6, HE_GI_08, BW_40),
+ HE_GROUP(7, HE_GI_08, BW_40),
+ HE_GROUP(8, HE_GI_08, BW_40),
+
+ HE_GROUP(1, HE_GI_16, BW_40),
+ HE_GROUP(2, HE_GI_16, BW_40),
+ HE_GROUP(3, HE_GI_16, BW_40),
+ HE_GROUP(4, HE_GI_16, BW_40),
+ HE_GROUP(5, HE_GI_16, BW_40),
+ HE_GROUP(6, HE_GI_16, BW_40),
+ HE_GROUP(7, HE_GI_16, BW_40),
+ HE_GROUP(8, HE_GI_16, BW_40),
+
+ HE_GROUP(1, HE_GI_32, BW_40),
+ HE_GROUP(2, HE_GI_32, BW_40),
+ HE_GROUP(3, HE_GI_32, BW_40),
+ HE_GROUP(4, HE_GI_32, BW_40),
+ HE_GROUP(5, HE_GI_32, BW_40),
+ HE_GROUP(6, HE_GI_32, BW_40),
+ HE_GROUP(7, HE_GI_32, BW_40),
+ HE_GROUP(8, HE_GI_32, BW_40),
+
+ HE_GROUP(1, HE_GI_08, BW_80),
+ HE_GROUP(2, HE_GI_08, BW_80),
+ HE_GROUP(3, HE_GI_08, BW_80),
+ HE_GROUP(4, HE_GI_08, BW_80),
+ HE_GROUP(5, HE_GI_08, BW_80),
+ HE_GROUP(6, HE_GI_08, BW_80),
+ HE_GROUP(7, HE_GI_08, BW_80),
+ HE_GROUP(8, HE_GI_08, BW_80),
+
+ HE_GROUP(1, HE_GI_16, BW_80),
+ HE_GROUP(2, HE_GI_16, BW_80),
+ HE_GROUP(3, HE_GI_16, BW_80),
+ HE_GROUP(4, HE_GI_16, BW_80),
+ HE_GROUP(5, HE_GI_16, BW_80),
+ HE_GROUP(6, HE_GI_16, BW_80),
+ HE_GROUP(7, HE_GI_16, BW_80),
+ HE_GROUP(8, HE_GI_16, BW_80),
+
+ HE_GROUP(1, HE_GI_32, BW_80),
+ HE_GROUP(2, HE_GI_32, BW_80),
+ HE_GROUP(3, HE_GI_32, BW_80),
+ HE_GROUP(4, HE_GI_32, BW_80),
+ HE_GROUP(5, HE_GI_32, BW_80),
+ HE_GROUP(6, HE_GI_32, BW_80),
+ HE_GROUP(7, HE_GI_32, BW_80),
+ HE_GROUP(8, HE_GI_32, BW_80),
+
+ HE_GROUP(1, HE_GI_08, BW_160),
+ HE_GROUP(2, HE_GI_08, BW_160),
+ HE_GROUP(3, HE_GI_08, BW_160),
+ HE_GROUP(4, HE_GI_08, BW_160),
+ HE_GROUP(5, HE_GI_08, BW_160),
+ HE_GROUP(6, HE_GI_08, BW_160),
+ HE_GROUP(7, HE_GI_08, BW_160),
+ HE_GROUP(8, HE_GI_08, BW_160),
+
+ HE_GROUP(1, HE_GI_16, BW_160),
+ HE_GROUP(2, HE_GI_16, BW_160),
+ HE_GROUP(3, HE_GI_16, BW_160),
+ HE_GROUP(4, HE_GI_16, BW_160),
+ HE_GROUP(5, HE_GI_16, BW_160),
+ HE_GROUP(6, HE_GI_16, BW_160),
+ HE_GROUP(7, HE_GI_16, BW_160),
+ HE_GROUP(8, HE_GI_16, BW_160),
+
+ HE_GROUP(1, HE_GI_32, BW_160),
+ HE_GROUP(2, HE_GI_32, BW_160),
+ HE_GROUP(3, HE_GI_32, BW_160),
+ HE_GROUP(4, HE_GI_32, BW_160),
+ HE_GROUP(5, HE_GI_32, BW_160),
+ HE_GROUP(6, HE_GI_32, BW_160),
+ HE_GROUP(7, HE_GI_32, BW_160),
+ HE_GROUP(8, HE_GI_32, BW_160),
+};
+
+/* Duration of a legacy (CCK or OFDM) frame: fixed overhead plus payload time */
+static u32 ieee80211_calc_legacy_rate_duration(u16 bitrate, bool short_pre,
+ bool cck, int len)
+{
+ int payload_bits = len << 3;
+ u32 airtime;
+
+ if (!cck) {
+ airtime = 20 + 16; /* preamble + SIFS */
+ } else {
+ /* preamble + PLCP, halved when a short preamble is used */
+ airtime = short_pre ? (144 + 48) >> 1 : 144 + 48;
+
+ airtime += 10; /* SIFS */
+ }
+
+ /* NOTE(review): bitrate is presumably in units of 100 kbit/s -- confirm */
+ airtime += (payload_bits * 10) / bitrate;
+
+ return airtime;
+}
+
+/*
+ * Estimate the airtime of a frame of @len bytes described by @status.
+ * Legacy rates are computed directly from the band's bitrate table; HT, VHT
+ * and HE rates are looked up in airtime_mcs_groups[].
+ * Returns 0 when the bandwidth, encoding, band, legacy rate index or stream
+ * count cannot be handled.
+ */
+u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
+ struct ieee80211_rx_status *status,
+ int len)
+{
+ struct ieee80211_supported_band *sband;
+ const struct ieee80211_rate *rate;
+ bool sgi = status->enc_flags & RX_ENC_FLAG_SHORT_GI;
+ bool sp = status->enc_flags & RX_ENC_FLAG_SHORTPRE;
+ int bw, streams;
+ int group, idx;
+ u32 duration;
+ bool cck;
+
+ /* map the rate_info bandwidth onto the BW_* index used by the tables */
+ switch (status->bw) {
+ case RATE_INFO_BW_20:
+ bw = BW_20;
+ break;
+ case RATE_INFO_BW_40:
+ bw = BW_40;
+ break;
+ case RATE_INFO_BW_80:
+ bw = BW_80;
+ break;
+ case RATE_INFO_BW_160:
+ bw = BW_160;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ switch (status->encoding) {
+ case RX_ENC_LEGACY:
+ if (WARN_ON_ONCE(status->band > NL80211_BAND_5GHZ))
+ return 0;
+
+ sband = hw->wiphy->bands[status->band];
+ /* rate_idx indexes bitrates[], so n_bitrates itself is out of range */
+ if (!sband || status->rate_idx >= sband->n_bitrates)
+ return 0;
+
+ rate = &sband->bitrates[status->rate_idx];
+ cck = rate->flags & IEEE80211_RATE_MANDATORY_B;
+
+ return ieee80211_calc_legacy_rate_duration(rate->bitrate, sp,
+ cck, len);
+
+ case RX_ENC_VHT:
+ streams = status->nss;
+ idx = status->rate_idx;
+ group = VHT_GROUP_IDX(streams, sgi, bw);
+ break;
+ case RX_ENC_HT:
+ /* HT rate_idx carries the stream count in bits 3-4 and the MCS in bits 0-2 */
+ streams = ((status->rate_idx >> 3) & 3) + 1;
+ idx = status->rate_idx & 7;
+ group = HT_GROUP_IDX(streams, sgi, bw);
+ break;
+ case RX_ENC_HE:
+ streams = status->nss;
+ idx = status->rate_idx;
+ group = HE_GROUP_IDX(streams, status->he_gi, bw);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ if (WARN_ON_ONCE((status->encoding != RX_ENC_HE && streams > 4) ||
+ (status->encoding == RX_ENC_HE && streams > 8)))
+ return 0;
+
+ /* undo the storage shift, then scale from an AVG_PKT_SIZE packet in
+ * 1024 * usec to a len byte packet
+ */
+ duration = airtime_mcs_groups[group].duration[idx];
+ duration <<= airtime_mcs_groups[group].shift;
+ duration *= len;
+ duration /= AVG_PKT_SIZE;
+ duration /= 1024;
+
+ /* NOTE(review): presumably fixed preamble overhead plus 4 per stream -- confirm */
+ duration += 36 + (streams << 2);
+
+ return duration;
+}
+EXPORT_SYMBOL_GPL(ieee80211_calc_rx_airtime);
+
+/*
+ * Estimate the airtime of one transmission of a @len byte frame at @rate by
+ * translating the TX rate into an rx status and reusing
+ * ieee80211_calc_rx_airtime().  Returns 0 for a rate entry with a negative
+ * index or a zero count.
+ */
+static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw,
+ struct ieee80211_tx_rate *rate,
+ u8 band, int len)
+{
+ struct ieee80211_rx_status stat = {
+ .band = band,
+ };
+
+ if (rate->idx < 0 || !rate->count)
+ return 0;
+
+ /* check the widest bandwidth flag first; no width flag means 20 MHz */
+ if (rate->flags & IEEE80211_TX_RC_160_MHZ_WIDTH)
+ stat.bw = RATE_INFO_BW_160;
+ else if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH)
+ stat.bw = RATE_INFO_BW_80;
+ else if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+ stat.bw = RATE_INFO_BW_40;
+ else
+ stat.bw = RATE_INFO_BW_20;
+
+ stat.enc_flags = 0;
+ if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE)
+ stat.enc_flags |= RX_ENC_FLAG_SHORTPRE;
+ if (rate->flags & IEEE80211_TX_RC_SHORT_GI)
+ stat.enc_flags |= RX_ENC_FLAG_SHORT_GI;
+
+ /* for VHT the MCS and stream count are extracted from the rate by helpers */
+ stat.rate_idx = rate->idx;
+ if (rate->flags & IEEE80211_TX_RC_VHT_MCS) {
+ stat.encoding = RX_ENC_VHT;
+ stat.rate_idx = ieee80211_rate_get_vht_mcs(rate);
+ stat.nss = ieee80211_rate_get_vht_nss(rate);
+ } else if (rate->flags & IEEE80211_TX_RC_MCS) {
+ stat.encoding = RX_ENC_HT;
+ } else {
+ stat.encoding = RX_ENC_LEGACY;
+ }
+
+ return ieee80211_calc_rx_airtime(hw, &stat, len);
+}
+
+/* Sum the estimated airtime of every attempt recorded in info->status.rates */
+u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw,
+ struct ieee80211_tx_info *info,
+ int len)
+{
+ struct ieee80211_tx_rate *attempt = info->status.rates;
+ u32 total = 0;
+ int n;
+
+ for (n = 0; n < ARRAY_SIZE(info->status.rates); n++, attempt++) {
+ u32 per_try = ieee80211_calc_tx_airtime_rate(hw, attempt,
+ info->band, len);
+
+ /* stop at the first rate entry that yields no estimate */
+ if (per_try == 0)
+ return total;
+
+ total += per_try * attempt->count;
+ }
+
+ return total;
+}
+EXPORT_SYMBOL_GPL(ieee80211_calc_tx_airtime);
+
+u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *pubsta,
+ int len)
+{ /* Expected airtime of a len byte frame: from the station's last TX rate if pubsta is set, else from the lowest basic rate */
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_chanctx_conf *conf;
+ int rateidx, shift = 0;
+ bool cck, short_pream;
+ u32 basic_rates;
+ u8 band = 0; /* stays 0 when the vif has no channel context */
+ u16 rate;
+
+ len += 38; /* Ethernet header length (NOTE(review): 38 looks like 802.11 framing overhead -- confirm) */
+
+ conf = rcu_dereference(vif->chanctx_conf); /* NOTE(review): presumably requires the caller to hold the RCU read lock -- confirm */
+ if (conf) {
+ band = conf->def.chan->band;
+ shift = ieee80211_chandef_get_shift(&conf->def);
+ }
+
+ if (pubsta) { /* known station: estimate from its last recorded TX rate */
+ struct sta_info *sta = container_of(pubsta, struct sta_info,
+ sta);
+
+ return ieee80211_calc_tx_airtime_rate(hw,
+ &sta->tx_stats.last_rate,
+ band, len);
+ }
+
+ if (!conf) /* no station and no channel context: no estimate */
+ return 0;
+
+ /* No station to get latest rate from, so calculate the worst-case
+ * duration using the lowest configured basic rate.
+ */
+ sband = hw->wiphy->bands[band];
+
+ basic_rates = vif->bss_conf.basic_rates;
+ short_pream = vif->bss_conf.use_short_preamble;
+
+ rateidx = basic_rates ? ffs(basic_rates) - 1 : 0; /* lowest set bit of basic_rates, or index 0 if none is set */
+ rate = sband->bitrates[rateidx].bitrate << shift;
+ cck = sband->bitrates[rateidx].flags & IEEE80211_RATE_MANDATORY_B;
+
+ return ieee80211_calc_legacy_rate_duration(rate, short_pream, cck, len);
+}
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 568b3b276931..ad41d74530c6 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -59,6 +59,8 @@ static const struct file_operations name## _ops = { \
debugfs_create_file(#name, mode, phyd, local, &name## _ops);
+DEBUGFS_READONLY_FILE(hw_conf, "%x",
+ local->hw.conf.flags);
DEBUGFS_READONLY_FILE(user_power, "%d",
local->user_power_level);
DEBUGFS_READONLY_FILE(power, "%d",
@@ -148,6 +150,87 @@ static const struct file_operations aqm_ops = {
.llseek = default_llseek,
};
+/* debugfs read handler: print the low/high AQL limits kept in struct ieee80211_local, one line per AC */
+static ssize_t aql_txq_limit_read(struct file *file,
+ char __user *user_buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ const u32 *low = local->aql_txq_limit_low;
+ const u32 *high = local->aql_txq_limit_high;
+ char text[400];
+ int used;
+
+ used = scnprintf(text, sizeof(text),
+ "AC AQL limit low AQL limit high\n"
+ "VO %u %u\n"
+ "VI %u %u\n"
+ "BE %u %u\n"
+ "BK %u %u\n",
+ low[IEEE80211_AC_VO], high[IEEE80211_AC_VO],
+ low[IEEE80211_AC_VI], high[IEEE80211_AC_VI],
+ low[IEEE80211_AC_BE], high[IEEE80211_AC_BE],
+ low[IEEE80211_AC_BK], high[IEEE80211_AC_BK]);
+
+ return simple_read_from_buffer(user_buf, count, ppos, text, used);
+}
+
+/*
+ * debugfs write handler: parse "<ac> <low> <high>" and store it as the AQL
+ * limits for that AC in struct ieee80211_local.  Stations whose limits still
+ * equal the previous values are switched to the new ones; stations with
+ * different (customized) limits are left alone.
+ * Returns @count on success, -EINVAL on oversized or malformed input and
+ * -EFAULT if the user buffer cannot be read.
+ */
+static ssize_t aql_txq_limit_write(struct file *file,
+ const char __user *user_buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[100];
+ u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old;
+ struct sta_info *sta;
+
+ /* leave room for the terminating NUL */
+ if (count >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, user_buf, count))
+ return -EFAULT;
+
+ /* terminate right after the copied data so that no uninitialized
+ * stack bytes get parsed; a single trailing newline is dropped
+ */
+ if (count && buf[count - 1] == '\n')
+ buf[count - 1] = '\0';
+ else
+ buf[count] = '\0';
+
+ if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3)
+ return -EINVAL;
+
+ if (ac >= IEEE80211_NUM_ACS)
+ return -EINVAL;
+
+ q_limit_low_old = local->aql_txq_limit_low[ac];
+ q_limit_high_old = local->aql_txq_limit_high[ac];
+
+ local->aql_txq_limit_low[ac] = q_limit_low;
+ local->aql_txq_limit_high[ac] = q_limit_high;
+
+ mutex_lock(&local->sta_mtx);
+ list_for_each_entry(sta, &local->sta_list, list) {
+ /* If a sta has customized queue limits, keep it */
+ if (sta->airtime[ac].aql_limit_low == q_limit_low_old &&
+ sta->airtime[ac].aql_limit_high == q_limit_high_old) {
+ sta->airtime[ac].aql_limit_low = q_limit_low;
+ sta->airtime[ac].aql_limit_high = q_limit_high;
+ }
+ }
+ mutex_unlock(&local->sta_mtx);
+ return count;
+}
+
+static const struct file_operations aql_txq_limit_ops = { /* handlers for the aql_txq_limit debugfs file */
+ .write = aql_txq_limit_write,
+ .read = aql_txq_limit_read,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
static ssize_t force_tx_status_read(struct file *file,
char __user *user_buf,
size_t count,
@@ -433,6 +516,7 @@ void debugfs_hw_add(struct ieee80211_local *local)
DEBUGFS_ADD(hwflags);
DEBUGFS_ADD(user_power);
DEBUGFS_ADD(power);
+ DEBUGFS_ADD(hw_conf);
DEBUGFS_ADD_MODE(force_tx_status, 0600);
if (local->ops->wake_tx_queue)
@@ -441,6 +525,10 @@ void debugfs_hw_add(struct ieee80211_local *local)
debugfs_create_u16("airtime_flags", 0600,
phyd, &local->airtime_flags);
+ DEBUGFS_ADD(aql_txq_limit);
+ debugfs_create_u32("aql_threshold", 0600,
+ phyd, &local->aql_threshold);
+
statsd = debugfs_create_dir("statistics", phyd);
/* if the dir failed, don't put all the other things into the root! */
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index c8ad20c28c43..0185e6e5e5d1 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -197,10 +197,12 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf,
{
struct sta_info *sta = file->private_data;
struct ieee80211_local *local = sta->sdata->local;
- size_t bufsz = 200;
+ size_t bufsz = 400;
char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
u64 rx_airtime = 0, tx_airtime = 0;
s64 deficit[IEEE80211_NUM_ACS];
+ u32 q_depth[IEEE80211_NUM_ACS];
+ u32 q_limit_l[IEEE80211_NUM_ACS], q_limit_h[IEEE80211_NUM_ACS];
ssize_t rv;
int ac;
@@ -212,19 +214,22 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf,
rx_airtime += sta->airtime[ac].rx_airtime;
tx_airtime += sta->airtime[ac].tx_airtime;
deficit[ac] = sta->airtime[ac].deficit;
+ q_limit_l[ac] = sta->airtime[ac].aql_limit_low;
+ q_limit_h[ac] = sta->airtime[ac].aql_limit_high;
spin_unlock_bh(&local->active_txq_lock[ac]);
+ q_depth[ac] = atomic_read(&sta->airtime[ac].aql_tx_pending);
}
p += scnprintf(p, bufsz + buf - p,
"RX: %llu us\nTX: %llu us\nWeight: %u\n"
- "Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n",
- rx_airtime,
- tx_airtime,
- sta->airtime_weight,
- deficit[0],
- deficit[1],
- deficit[2],
- deficit[3]);
+ "Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n"
+ "Q depth: VO: %u us VI: %u us BE: %u us BK: %u us\n"
+ "Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n",
+ rx_airtime, tx_airtime, sta->airtime_weight,
+ deficit[0], deficit[1], deficit[2], deficit[3],
+ q_depth[0], q_depth[1], q_depth[2], q_depth[3], /* aql_tx_pending per AC */
+ q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1],
+ q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]);
rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
kfree(buf);
@@ -236,7 +241,25 @@ static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf,
{
struct sta_info *sta = file->private_data;
struct ieee80211_local *local = sta->sdata->local;
- int ac;
+ u32 ac, q_limit_l, q_limit_h;
+ char _buf[100] = {}, *buf = _buf;
+
+ if (count > sizeof(_buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, userbuf, count))
+ return -EFAULT;
+
+ buf[sizeof(_buf) - 1] = '\0';
+ if (sscanf(buf, "queue limit %u %u %u", &ac, &q_limit_l, &q_limit_h)
+ != 3)
+ return -EINVAL;
+
+ if (ac >= IEEE80211_NUM_ACS)
+ return -EINVAL;
+
+ sta->airtime[ac].aql_limit_low = q_limit_l;
+ sta->airtime[ac].aql_limit_high = q_limit_h;
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
spin_lock_bh(&local->active_txq_lock[ac]);
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 05406e9c05b3..ad15b3be8bb3 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1142,6 +1142,10 @@ struct ieee80211_local {
u16 schedule_round[IEEE80211_NUM_ACS];
u16 airtime_flags;
+ u32 aql_txq_limit_low[IEEE80211_NUM_ACS];
+ u32 aql_txq_limit_high[IEEE80211_NUM_ACS];
+ u32 aql_threshold;
+ atomic_t aql_total_pending_airtime;
const struct ieee80211_ops *ops;
@@ -2249,6 +2253,10 @@ const char *ieee80211_get_reason_code_string(u16 reason_code);
extern const struct ethtool_ops ieee80211_ethtool_ops;
+u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *pubsta,
+ int len);
#ifdef CONFIG_MAC80211_NOINLINE
#define debug_noinline noinline
#else
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 2d05c4cfaf6d..6cca0853f183 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -667,8 +667,16 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
for (i = 0; i < IEEE80211_NUM_ACS; i++) {
INIT_LIST_HEAD(&local->active_txqs[i]);
spin_lock_init(&local->active_txq_lock[i]);
+ local->aql_txq_limit_low[i] = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L;
+ local->aql_txq_limit_high[i] =
+ IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H;
}
- local->airtime_flags = AIRTIME_USE_TX | AIRTIME_USE_RX;
+
+ local->airtime_flags = AIRTIME_USE_TX |
+ AIRTIME_USE_RX |
+ AIRTIME_USE_AQL;
+ local->aql_threshold = IEEE80211_AQL_THRESHOLD;
+ atomic_set(&local->aql_total_pending_airtime, 0);
INIT_LIST_HEAD(&local->chanctx_list);
mutex_init(&local->chanctx_mtx);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 8d3a2389b055..8eafd81e97b4 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -210,6 +210,20 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
return NULL;
}
+/* Look up the station hashed under @sta_addr whose interface address is @vif_addr */
+struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local,
+ const u8 *sta_addr, const u8 *vif_addr)
+{
+ struct rhlist_head *pos;
+ struct sta_info *sta;
+
+ for_each_sta_info(local, sta_addr, sta, pos) {
+ /* same station address on a different interface: keep looking */
+ if (!ether_addr_equal(vif_addr, sta->sdata->vif.addr))
+ continue;
+
+ return sta;
+ }
+
+ return NULL;
+}
+
struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
int idx)
{
@@ -396,6 +410,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
skb_queue_head_init(&sta->ps_tx_buf[i]);
skb_queue_head_init(&sta->tx_filtered[i]);
sta->airtime[i].deficit = sta->airtime_weight;
+ atomic_set(&sta->airtime[i].aql_tx_pending, 0);
+ sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i];
+ sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i];
}
for (i = 0; i < IEEE80211_NUM_TIDS; i++)
@@ -1893,6 +1910,41 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid,
}
EXPORT_SYMBOL(ieee80211_sta_register_airtime);
+/*
+ * Account the estimated airtime of frames pending transmission.
+ * With @tx_completed false, @tx_airtime is added to the per-station counter
+ * for @ac (if @sta is set) and to the device-wide counter.  With it true,
+ * the same amount is subtracted again; a counter that drops below zero
+ * triggers a one-time warning and is reset to 0 unless it has changed in
+ * the meantime.
+ */
+void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
+ struct sta_info *sta, u8 ac,
+ u16 tx_airtime, bool tx_completed)
+{
+ int tx_pending;
+
+ if (!tx_completed) {
+ if (sta)
+ atomic_add(tx_airtime,
+ &sta->airtime[ac].aql_tx_pending);
+
+ atomic_add(tx_airtime, &local->aql_total_pending_airtime);
+ return;
+ }
+
+ if (sta) {
+ tx_pending = atomic_sub_return(tx_airtime,
+ &sta->airtime[ac].aql_tx_pending);
+ /* tx_pending is a negative int here, so print it signed */
+ if (WARN_ONCE(tx_pending < 0,
+ "STA %pM AC %d txq pending airtime underflow: %d, %u",
+ sta->addr, ac, tx_pending, tx_airtime))
+ atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending,
+ tx_pending, 0);
+ }
+
+ tx_pending = atomic_sub_return(tx_airtime,
+ &local->aql_total_pending_airtime);
+ if (WARN_ONCE(tx_pending < 0,
+ "Device %s AC %d pending airtime underflow: %d, %u",
+ wiphy_name(local->hw.wiphy), ac, tx_pending,
+ tx_airtime))
+ atomic_cmpxchg(&local->aql_total_pending_airtime,
+ tx_pending, 0);
+}
+
int sta_info_move_state(struct sta_info *sta,
enum ieee80211_sta_state new_state)
{
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 369c2dddce52..ad5d8a4ae56d 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -127,13 +127,21 @@ enum ieee80211_agg_stop_reason {
/* Debugfs flags to enable/disable use of RX/TX airtime in scheduler */
#define AIRTIME_USE_TX BIT(0)
#define AIRTIME_USE_RX BIT(1)
+#define AIRTIME_USE_AQL BIT(2)
struct airtime_info { /* airtime accounting state; struct sta_info keeps one per AC in airtime[] */
u64 rx_airtime; /* accumulated RX airtime, shown in us by the sta debugfs file */
u64 tx_airtime; /* accumulated TX airtime, shown in us by the sta debugfs file */
s64 deficit; /* initialized to the station's airtime_weight in sta_info_alloc() */
+ atomic_t aql_tx_pending; /* Estimated airtime for frames pending */
+ u32 aql_limit_low; /* initialized from local->aql_txq_limit_low[ac]; settable via debugfs */
+ u32 aql_limit_high; /* initialized from local->aql_txq_limit_high[ac]; settable via debugfs */
};
+void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
+ struct sta_info *sta, u8 ac,
+ u16 tx_airtime, bool tx_completed);
+
struct sta_info;
/**
@@ -725,6 +733,10 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
const u8 *addr);
+/* user must hold sta_mtx or be in RCU critical section */
+struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local,
+ const u8 *sta_addr, const u8 *vif_addr);
+
#define for_each_sta_info(local, _addr, _sta, _tmp) \
rhl_for_each_entry_rcu(_sta, _tmp, \
sta_info_hash_lookup(local, _addr), hash_node)
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index ab8ba5835ca0..b720feaf9a74 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -670,12 +670,26 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
struct sk_buff *skb, bool dropped)
{
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ u16 tx_time_est = ieee80211_info_get_tx_time_est(info);
struct ieee80211_hdr *hdr = (void *)skb->data;
bool acked = info->flags & IEEE80211_TX_STAT_ACK;
if (dropped)
acked = false;
+ if (tx_time_est) {
+ struct sta_info *sta;
+
+ rcu_read_lock();
+
+ sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2);
+ ieee80211_sta_update_pending_airtime(local, sta,
+ skb_get_queue_mapping(skb),
+ tx_time_est,
+ true);
+ rcu_read_unlock();
+ }
+
if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) {
struct ieee80211_sub_if_data *sdata;
@@ -877,6 +891,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
struct ieee80211_bar *bar;
int shift = 0;
int tid = IEEE80211_NUM_TIDS;
+ u16 tx_time_est;
rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
@@ -986,6 +1001,17 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
ieee80211_sta_register_airtime(&sta->sta, tid,
info->status.tx_time, 0);
+ if ((tx_time_est = ieee80211_info_get_tx_time_est(info)) > 0) {
+ /* Do this here to avoid the expensive lookup of the sta
+ * in ieee80211_report_used_skb().
+ */
+ ieee80211_sta_update_pending_airtime(local, sta,
+ skb_get_queue_mapping(skb),
+ tx_time_est,
+ true);
+ ieee80211_info_set_tx_time_est(info, 0);
+ }
+
if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
if (info->flags & IEEE80211_TX_STAT_ACK) {
if (sta->status_stats.lost_packets)
@@ -1030,7 +1056,8 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
I802_DEBUG_INC(local->dot11FailedCount);
}
- if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) &&
+ if ((ieee80211_is_nullfunc(fc) || ieee80211_is_qos_nullfunc(fc)) &&
+ ieee80211_has_pm(fc) &&
ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) &&
!(info->flags & IEEE80211_TX_CTL_INJECTED) &&
local->ps_sdata && !(local->scanning)) {
@@ -1073,19 +1100,13 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
.skb = skb,
.info = IEEE80211_SKB_CB(skb),
};
- struct rhlist_head *tmp;
struct sta_info *sta;
rcu_read_lock();
- for_each_sta_info(local, hdr->addr1, sta, tmp) {
- /* skip wrong virtual interface */
- if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr))
- continue;
-
+ sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2);
+ if (sta)
status.sta = &sta->sta;
- break;
- }
__ieee80211_tx_status(hw, &status);
rcu_read_unlock();
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index db38be1b75fa..b696b9136f4c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2270,6 +2270,9 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
* isn't always enough to find the interface to use; for proper
* VLAN/WDS support we will need a different mechanism (which
* likely isn't going to be monitor interfaces).
+ *
+ * This is necessary, for example, for old hostapd versions that
+ * don't use nl80211-based management TX/RX.
*/
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
@@ -3551,6 +3554,9 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
WARN_ON_ONCE(softirq_count() == 0);
+ if (!ieee80211_txq_airtime_check(hw, txq))
+ return NULL;
+
begin:
spin_lock_bh(&fq->lock);
@@ -3661,6 +3667,21 @@ begin:
}
IEEE80211_SKB_CB(skb)->control.vif = vif;
+
+ if (local->airtime_flags & AIRTIME_USE_AQL) {
+ u32 airtime;
+
+ airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta,
+ skb->len);
+ if (airtime) {
+ airtime = ieee80211_info_set_tx_time_est(info, airtime);
+ ieee80211_sta_update_pending_airtime(local, tx.sta,
+ txq->ac,
+ airtime,
+ false);
+ }
+ }
+
return skb;
out:
@@ -3674,7 +3695,8 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac)
{
struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_txq *ret = NULL;
- struct txq_info *txqi = NULL;
+ struct txq_info *txqi = NULL, *head = NULL;
+ bool found_eligible_txq = false;
spin_lock_bh(&local->active_txq_lock[ac]);
@@ -3685,13 +3707,30 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac)
if (!txqi)
goto out;
+ if (txqi == head) {
+ if (!found_eligible_txq)
+ goto out;
+ else
+ found_eligible_txq = false;
+ }
+
+ if (!head)
+ head = txqi;
+
if (txqi->txq.sta) {
struct sta_info *sta = container_of(txqi->txq.sta,
- struct sta_info, sta);
+ struct sta_info, sta);
+ bool aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq);
+ s64 deficit = sta->airtime[txqi->txq.ac].deficit;
+
+ if (aql_check)
+ found_eligible_txq = true;
- if (sta->airtime[txqi->txq.ac].deficit < 0) {
+ if (deficit < 0)
sta->airtime[txqi->txq.ac].deficit +=
sta->airtime_weight;
+
+ if (deficit < 0 || !aql_check) {
list_move_tail(&txqi->schedule_order,
&local->active_txqs[txqi->txq.ac]);
goto begin;
@@ -3745,6 +3784,33 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw,
}
EXPORT_SYMBOL(__ieee80211_schedule_txq);
+/*
+ * Report whether @txq may hand out another frame under the airtime queue
+ * limits (AQL).
+ *
+ * Always true when AIRTIME_USE_AQL is not set in local->airtime_flags or when
+ * the queue has no station. Otherwise true while the station's pending
+ * airtime for this AC is below aql_limit_low, or while it is below
+ * aql_limit_high and the total pending airtime kept in @local is below
+ * aql_threshold.
+ */
+bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw,
+				 struct ieee80211_txq *txq)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct sta_info *sta;
+
+	if (!(local->airtime_flags & AIRTIME_USE_AQL) || !txq->sta)
+		return true;
+
+	sta = container_of(txq->sta, struct sta_info, sta);
+
+	/* Below the low mark the station may always queue more. */
+	if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) <
+	    sta->airtime[txq->ac].aql_limit_low)
+		return true;
+
+	/* Otherwise both the total backlog and the high mark must allow it. */
+	return atomic_read(&local->aql_total_pending_airtime) <
+		       local->aql_threshold &&
+	       atomic_read(&sta->airtime[txq->ac].aql_tx_pending) <
+		       sta->airtime[txq->ac].aql_limit_high;
+}
+EXPORT_SYMBOL(ieee80211_txq_airtime_check);
+
bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw,
struct ieee80211_txq *txq)
{
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 1a04e0929738..be5e95a0d876 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -25,7 +25,8 @@
/* 3 Counters support added */
/* 4 Comments support added */
/* 5 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */
+/* 6 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -57,6 +58,7 @@ struct hash_netiface4_elem {
u8 cidr;
u8 nomatch;
u8 elem;
+ u8 wildcard;
char iface[IFNAMSIZ];
};
@@ -71,7 +73,9 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
ip1->cidr == ip2->cidr &&
(++*multi) &&
ip1->physdev == ip2->physdev &&
- strcmp(ip1->iface, ip2->iface) == 0;
+ (ip1->wildcard ?
+ strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 :
+ strcmp(ip1->iface, ip2->iface) == 0);
}
static int
@@ -103,7 +107,8 @@ static bool
hash_netiface4_data_list(struct sk_buff *skb,
const struct hash_netiface4_elem *data)
{
- u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+ u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) |
+ (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0);
if (data->nomatch)
flags |= IPSET_FLAG_NOMATCH;
@@ -229,6 +234,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
e.physdev = 1;
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
+ if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD)
+ e.wildcard = 1;
}
if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
e.ip = htonl(ip & ip_set_hostmask(e.cidr));
@@ -280,6 +287,7 @@ struct hash_netiface6_elem {
u8 cidr;
u8 nomatch;
u8 elem;
+ u8 wildcard;
char iface[IFNAMSIZ];
};
@@ -294,7 +302,9 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
ip1->cidr == ip2->cidr &&
(++*multi) &&
ip1->physdev == ip2->physdev &&
- strcmp(ip1->iface, ip2->iface) == 0;
+ (ip1->wildcard ?
+ strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 :
+ strcmp(ip1->iface, ip2->iface) == 0);
}
static int
@@ -326,7 +336,8 @@ static bool
hash_netiface6_data_list(struct sk_buff *skb,
const struct hash_netiface6_elem *data)
{
- u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+ u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) |
+ (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0);
if (data->nomatch)
flags |= IPSET_FLAG_NOMATCH;
@@ -440,6 +451,8 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
e.physdev = 1;
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
+ if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD)
+ e.wildcard = 1;
}
ret = adtfn(set, &e, &ext, &ext, flags);
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 8468d2d02284..9889d52eda82 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -18,11 +18,11 @@ static DEFINE_MUTEX(flowtable_lock);
static LIST_HEAD(flowtables);
static void
-flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
+flow_offload_fill_dir(struct flow_offload *flow,
enum flow_offload_tuple_dir dir)
{
struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
- struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
+ struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;
ft->dir = dir;
@@ -57,8 +57,8 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
flow->ct = ct;
- flow_offload_fill_dir(flow, ct, FLOW_OFFLOAD_DIR_ORIGINAL);
- flow_offload_fill_dir(flow, ct, FLOW_OFFLOAD_DIR_REPLY);
+ flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
+ flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);
if (ct->status & IPS_SRC_NAT)
flow->flags |= FLOW_OFFLOAD_SNAT;
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index bfb910b874ce..88bedf1ff1ae 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -21,11 +21,34 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
return NF_ACCEPT;
}
+/*
+ * Action builder for the inet flowtable: dispatch on the tuple's l3proto to
+ * the IPv4 or IPv6 rule builder. Returns that builder's result, or -1 for
+ * any other protocol.
+ */
+static int nf_flow_rule_route_inet(struct net *net,
+				   const struct flow_offload *flow,
+				   enum flow_offload_tuple_dir dir,
+				   struct nf_flow_rule *flow_rule)
+{
+	const struct flow_offload_tuple *tuple = &flow->tuplehash[dir].tuple;
+
+	if (tuple->l3proto == NFPROTO_IPV4)
+		return nf_flow_rule_route_ipv4(net, flow, dir, flow_rule);
+
+	if (tuple->l3proto == NFPROTO_IPV6)
+		return nf_flow_rule_route_ipv6(net, flow, dir, flow_rule);
+
+	return -1;
+}
+
static struct nf_flowtable_type flowtable_inet = {
.family = NFPROTO_INET,
.init = nf_flow_table_init,
.setup = nf_flow_table_offload_setup,
- .action = nf_flow_rule_route,
+ .action = nf_flow_rule_route_inet,
.free = nf_flow_table_free,
.hook = nf_flow_offload_inet_hook,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index 9be61f47303a..c54c9a6cc981 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -112,13 +112,22 @@ static void flow_offload_mangle(struct flow_action_entry *entry,
memcpy(&entry->mangle.val, value, sizeof(u32));
}
+/*
+ * Reserve the next unused slot in the rule's action array: returns a pointer
+ * to entries[num_entries] and then increments num_entries. No bounds check
+ * is performed here.
+ * NOTE(review): presumably the array is allocated large enough for every
+ * action a builder can append - confirm against the allocation site.
+ */
+static inline struct flow_action_entry *
+flow_action_entry_next(struct nf_flow_rule *flow_rule)
+{
+ int i = flow_rule->rule->action.num_entries++;
+
+ return &flow_rule->rule->action.entries[i];
+}
+
static int flow_offload_eth_src(struct net *net,
const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
- struct flow_action_entry *entry0,
- struct flow_action_entry *entry1)
+ struct nf_flow_rule *flow_rule)
{
const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple;
+ struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
+ struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
struct net_device *dev;
u32 mask, val;
u16 val16;
@@ -145,10 +154,11 @@ static int flow_offload_eth_src(struct net *net,
static int flow_offload_eth_dst(struct net *net,
const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
- struct flow_action_entry *entry0,
- struct flow_action_entry *entry1)
+ struct nf_flow_rule *flow_rule)
{
const struct flow_offload_tuple *tuple = &flow->tuplehash[dir].tuple;
+ struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
+ struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
struct neighbour *n;
u32 mask, val;
u16 val16;
@@ -175,8 +185,9 @@ static int flow_offload_eth_dst(struct net *net,
static void flow_offload_ipv4_snat(struct net *net,
const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
- struct flow_action_entry *entry)
+ struct nf_flow_rule *flow_rule)
{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
u32 mask = ~htonl(0xffffffff);
__be32 addr;
u32 offset;
@@ -201,8 +212,9 @@ static void flow_offload_ipv4_snat(struct net *net,
static void flow_offload_ipv4_dnat(struct net *net,
const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
- struct flow_action_entry *entry)
+ struct nf_flow_rule *flow_rule)
{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
u32 mask = ~htonl(0xffffffff);
__be32 addr;
u32 offset;
@@ -224,6 +236,71 @@ static void flow_offload_ipv4_dnat(struct net *net,
(u8 *)&addr, (u8 *)&mask);
}
+/*
+ * Emit the mangle actions that rewrite one IPv6 address.
+ *
+ * @flow_rule: rule whose action array receives the entries
+ * @offset:    byte offset of the address within the IPv6 header
+ * @addr:      the 16 address bytes to write
+ * @mask:      32-bit mask, reused for every word
+ *
+ * flow_offload_mangle() copies one 32-bit value per action, so a 128-bit
+ * address takes four entries.
+ */
+static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule,
+				     unsigned int offset,
+				     u8 *addr, u8 *mask)
+{
+	struct flow_action_entry *entry;
+	int i;
+
+	/*
+	 * i is a byte offset into both the header and @addr, so it must be
+	 * bounded by the address size in bytes. Bounding it by the word
+	 * count (4) while stepping by sizeof(u32) stopped after the first
+	 * word and left the remaining 96 bits unmangled.
+	 */
+	for (i = 0; i < sizeof(struct in6_addr); i += sizeof(u32)) {
+		entry = flow_action_entry_next(flow_rule);
+		flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
+				    offset + i,
+				    &addr[i], mask);
+	}
+}
+
+/*
+ * Source-NAT address rewrite for IPv6. In the original direction the header's
+ * saddr is replaced with the reply tuple's destination address; in the reply
+ * direction the header's daddr is replaced with the original tuple's source
+ * address. Any other direction value emits nothing.
+ */
+static void flow_offload_ipv6_snat(struct net *net,
+				   const struct flow_offload *flow,
+				   enum flow_offload_tuple_dir dir,
+				   struct nf_flow_rule *flow_rule)
+{
+	u32 mask = ~htonl(0xffffffff);
+	const u8 *new_addr;
+	u32 hdr_off;
+
+	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) {
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6.s6_addr;
+		hdr_off = offsetof(struct ipv6hdr, saddr);
+	} else if (dir == FLOW_OFFLOAD_DIR_REPLY) {
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6.s6_addr;
+		hdr_off = offsetof(struct ipv6hdr, daddr);
+	} else {
+		return;
+	}
+
+	flow_offload_ipv6_mangle(flow_rule, hdr_off, (u8 *)new_addr, (u8 *)&mask);
+}
+
+/*
+ * Destination-NAT address rewrite for IPv6. In the original direction the
+ * header's daddr is replaced with the reply tuple's source address; in the
+ * reply direction the header's saddr is replaced with the original tuple's
+ * destination address. Any other direction value emits nothing.
+ */
+static void flow_offload_ipv6_dnat(struct net *net,
+				   const struct flow_offload *flow,
+				   enum flow_offload_tuple_dir dir,
+				   struct nf_flow_rule *flow_rule)
+{
+	u32 mask = ~htonl(0xffffffff);
+	const u8 *new_addr;
+	u32 hdr_off;
+
+	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) {
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6.s6_addr;
+		hdr_off = offsetof(struct ipv6hdr, daddr);
+	} else if (dir == FLOW_OFFLOAD_DIR_REPLY) {
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6.s6_addr;
+		hdr_off = offsetof(struct ipv6hdr, saddr);
+	} else {
+		return;
+	}
+
+	flow_offload_ipv6_mangle(flow_rule, hdr_off, (u8 *)new_addr, (u8 *)&mask);
+}
+
static int flow_offload_l4proto(const struct flow_offload *flow)
{
u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto;
@@ -246,8 +323,9 @@ static int flow_offload_l4proto(const struct flow_offload *flow)
static void flow_offload_port_snat(struct net *net,
const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
- struct flow_action_entry *entry)
+ struct nf_flow_rule *flow_rule)
{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
u32 mask = ~htonl(0xffff0000);
__be16 port;
u32 offset;
@@ -272,8 +350,9 @@ static void flow_offload_port_snat(struct net *net,
static void flow_offload_port_dnat(struct net *net,
const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
- struct flow_action_entry *entry)
+ struct nf_flow_rule *flow_rule)
{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
u32 mask = ~htonl(0xffff);
__be16 port;
u32 offset;
@@ -297,9 +376,10 @@ static void flow_offload_port_dnat(struct net *net,
static void flow_offload_ipv4_checksum(struct net *net,
const struct flow_offload *flow,
- struct flow_action_entry *entry)
+ struct nf_flow_rule *flow_rule)
{
u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto;
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
entry->id = FLOW_ACTION_CSUM;
entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR;
@@ -316,8 +396,9 @@ static void flow_offload_ipv4_checksum(struct net *net,
static void flow_offload_redirect(const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
- struct flow_action_entry *entry)
+ struct nf_flow_rule *flow_rule)
{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
struct rtable *rt;
rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
@@ -326,45 +407,56 @@ static void flow_offload_redirect(const struct flow_offload *flow,
dev_hold(rt->dst.dev);
}
-int nf_flow_rule_route(struct net *net, const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
- int i;
-
- if (flow_offload_eth_src(net, flow, dir,
- &flow_rule->rule->action.entries[0],
- &flow_rule->rule->action.entries[1]) < 0)
+ if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
+ flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
return -1;
- if (flow_offload_eth_dst(net, flow, dir,
- &flow_rule->rule->action.entries[2],
- &flow_rule->rule->action.entries[3]) < 0)
- return -1;
-
- i = 4;
if (flow->flags & FLOW_OFFLOAD_SNAT) {
- flow_offload_ipv4_snat(net, flow, dir,
- &flow_rule->rule->action.entries[i++]);
- flow_offload_port_snat(net, flow, dir,
- &flow_rule->rule->action.entries[i++]);
+ flow_offload_ipv4_snat(net, flow, dir, flow_rule);
+ flow_offload_port_snat(net, flow, dir, flow_rule);
}
if (flow->flags & FLOW_OFFLOAD_DNAT) {
- flow_offload_ipv4_dnat(net, flow, dir,
- &flow_rule->rule->action.entries[i++]);
- flow_offload_port_dnat(net, flow, dir,
- &flow_rule->rule->action.entries[i++]);
+ flow_offload_ipv4_dnat(net, flow, dir, flow_rule);
+ flow_offload_port_dnat(net, flow, dir, flow_rule);
}
if (flow->flags & FLOW_OFFLOAD_SNAT ||
flow->flags & FLOW_OFFLOAD_DNAT)
- flow_offload_ipv4_checksum(net, flow,
- &flow_rule->rule->action.entries[i++]);
+ flow_offload_ipv4_checksum(net, flow, flow_rule);
- flow_offload_redirect(flow, dir, &flow_rule->rule->action.entries[i++]);
+ flow_offload_redirect(flow, dir, flow_rule);
- return i;
+ return 0;
}
-EXPORT_SYMBOL_GPL(nf_flow_rule_route);
+EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4);
+
+/*
+ * Append the actions for offloading an IPv6 flow in direction @dir: the
+ * eth_src and eth_dst helpers first, then address and port mangling for SNAT
+ * and/or DNAT when flagged on the flow, then the redirect.
+ * Returns 0 on success, -1 if either ethernet helper fails.
+ */
+int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
+			    enum flow_offload_tuple_dir dir,
+			    struct nf_flow_rule *flow_rule)
+{
+	if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0)
+		return -1;
+
+	/* Only attempted once the source rewrite succeeded. */
+	if (flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
+		return -1;
+
+	if (flow->flags & FLOW_OFFLOAD_SNAT) {
+		flow_offload_ipv6_snat(net, flow, dir, flow_rule);
+		flow_offload_port_snat(net, flow, dir, flow_rule);
+	}
+
+	if (flow->flags & FLOW_OFFLOAD_DNAT) {
+		flow_offload_ipv6_dnat(net, flow, dir, flow_rule);
+		flow_offload_port_dnat(net, flow, dir, flow_rule);
+	}
+
+	flow_offload_redirect(flow, dir, flow_rule);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6);
+
+#define NF_FLOW_RULE_ACTION_MAX 16
static struct nf_flow_rule *
nf_flow_offload_rule_alloc(struct net *net,
@@ -375,13 +467,13 @@ nf_flow_offload_rule_alloc(struct net *net,
const struct flow_offload *flow = offload->flow;
const struct flow_offload_tuple *tuple;
struct nf_flow_rule *flow_rule;
- int err = -ENOMEM, num_actions;
+ int err = -ENOMEM;
flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL);
if (!flow_rule)
goto err_flow;
- flow_rule->rule = flow_rule_alloc(10);
+ flow_rule->rule = flow_rule_alloc(NF_FLOW_RULE_ACTION_MAX);
if (!flow_rule->rule)
goto err_flow_rule;
@@ -394,12 +486,10 @@ nf_flow_offload_rule_alloc(struct net *net,
if (err < 0)
goto err_flow_match;
- num_actions = flowtable->type->action(net, flow, dir, flow_rule);
- if (num_actions < 0)
+ flow_rule->rule->action.num_entries = 0;
+ if (flowtable->type->action(net, flow, dir, flow_rule) < 0)
goto err_flow_match;
- flow_rule->rule->action.num_entries = num_actions;
-
return flow_rule;
err_flow_match:
@@ -722,6 +812,9 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD))
return 0;
+ if (!dev->netdev_ops->ndo_setup_tc)
+ return -EOPNOTSUPP;
+
bo.net = dev_net(dev);
bo.block = &flowtable->flow_block;
bo.command = cmd;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 2dc636faa322..ff04cdc87f76 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -361,6 +361,7 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
{
+ struct nft_flow_rule *flow;
struct nft_trans *trans;
int err;
@@ -368,6 +369,16 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
if (trans == NULL)
return -ENOMEM;
+ if (ctx->chain->flags & NFT_CHAIN_HW_OFFLOAD) {
+ flow = nft_flow_rule_create(ctx->net, rule);
+ if (IS_ERR(flow)) {
+ nft_trans_destroy(trans);
+ return PTR_ERR(flow);
+ }
+
+ nft_trans_flow_rule(trans) = flow;
+ }
+
err = nf_tables_delrule_deactivate(ctx, rule);
if (err < 0) {
nft_trans_destroy(trans);
@@ -5964,16 +5975,22 @@ nft_flowtable_type_get(struct net *net, u8 family)
return ERR_PTR(-ENOENT);
}
+/*
+ * Tear down a single flowtable hook: unregister its netfilter hook ops, then
+ * invoke the flowtable type's setup callback with FLOW_BLOCK_UNBIND for the
+ * hook's device. The callback's return value is not checked.
+ */
+static void nft_unregister_flowtable_hook(struct net *net,
+ struct nft_flowtable *flowtable,
+ struct nft_hook *hook)
+{
+ nf_unregister_net_hook(net, &hook->ops);
+ flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
+ FLOW_BLOCK_UNBIND);
+}
+
static void nft_unregister_flowtable_net_hooks(struct net *net,
struct nft_flowtable *flowtable)
{
struct nft_hook *hook;
- list_for_each_entry(hook, &flowtable->hook_list, list) {
- nf_unregister_net_hook(net, &hook->ops);
- flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
- FLOW_BLOCK_UNBIND);
- }
+ list_for_each_entry(hook, &flowtable->hook_list, list)
+ nft_unregister_flowtable_hook(net, flowtable, hook);
}
static int nft_register_flowtable_net_hooks(struct net *net,
@@ -5995,12 +6012,20 @@ static int nft_register_flowtable_net_hooks(struct net *net,
}
}
- flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
- FLOW_BLOCK_BIND);
- err = nf_register_net_hook(net, &hook->ops);
+ err = flowtable->data.type->setup(&flowtable->data,
+ hook->ops.dev,
+ FLOW_BLOCK_BIND);
if (err < 0)
goto err_unregister_net_hooks;
+ err = nf_register_net_hook(net, &hook->ops);
+ if (err < 0) {
+ flowtable->data.type->setup(&flowtable->data,
+ hook->ops.dev,
+ FLOW_BLOCK_UNBIND);
+ goto err_unregister_net_hooks;
+ }
+
i++;
}
@@ -6011,9 +6036,7 @@ err_unregister_net_hooks:
if (i-- <= 0)
break;
- nf_unregister_net_hook(net, &hook->ops);
- flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
- FLOW_BLOCK_UNBIND);
+ nft_unregister_flowtable_hook(net, flowtable, hook);
list_del_rcu(&hook->list);
kfree_rcu(hook, rcu);
}
@@ -6120,7 +6143,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
return 0;
err5:
list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
- nf_unregister_net_hook(net, &hook->ops);
+ nft_unregister_flowtable_hook(net, flowtable, hook);
list_del_rcu(&hook->list);
kfree_rcu(hook, rcu);
}
@@ -6465,7 +6488,7 @@ static void nft_flowtable_event(unsigned long event, struct net_device *dev,
if (hook->ops.dev != dev)
continue;
- nf_unregister_net_hook(dev_net(dev), &hook->ops);
+ nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook);
list_del_rcu(&hook->list);
kfree_rcu(hook, rcu);
break;
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index cdea3010c7a0..68f17a6921d8 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -159,9 +159,9 @@ static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow,
const struct nft_base_chain *basechain,
const struct nft_rule *rule,
const struct nft_flow_rule *flow,
+ struct netlink_ext_ack *extack,
enum flow_cls_command command)
{
- struct netlink_ext_ack extack;
__be16 proto = ETH_P_ALL;
memset(cls_flow, 0, sizeof(*cls_flow));
@@ -170,7 +170,7 @@ static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow,
proto = flow->proto;
nft_flow_offload_common_init(&cls_flow->common, proto,
- basechain->ops.priority, &extack);
+ basechain->ops.priority, extack);
cls_flow->command = command;
cls_flow->cookie = (unsigned long) rule;
if (flow)
@@ -182,6 +182,7 @@ static int nft_flow_offload_rule(struct nft_chain *chain,
struct nft_flow_rule *flow,
enum flow_cls_command command)
{
+ struct netlink_ext_ack extack = {};
struct flow_cls_offload cls_flow;
struct nft_base_chain *basechain;
@@ -189,7 +190,8 @@ static int nft_flow_offload_rule(struct nft_chain *chain,
return -EOPNOTSUPP;
basechain = nft_base_chain(chain);
- nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, command);
+ nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, &extack,
+ command);
return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow,
&basechain->flow_block.cb_list);
@@ -207,13 +209,15 @@ static int nft_flow_offload_unbind(struct flow_block_offload *bo,
{
struct flow_block_cb *block_cb, *next;
struct flow_cls_offload cls_flow;
+ struct netlink_ext_ack extack;
struct nft_chain *chain;
struct nft_rule *rule;
chain = &basechain->chain;
list_for_each_entry(rule, &chain->rules, list) {
+ memset(&extack, 0, sizeof(extack));
nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL,
- FLOW_CLS_DESTROY);
+ &extack, FLOW_CLS_DESTROY);
nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list);
}
@@ -385,6 +389,55 @@ static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy,
return nft_flow_block_chain(basechain, NULL, cmd);
}
+/*
+ * Roll back offload operations already issued for this commit.
+ *
+ * Walks net->nft.commit_list backwards, continuing from the entry before
+ * @trans, and issues the inverse request for each netdev-family transaction
+ * on a chain flagged NFT_CHAIN_HW_OFFLOAD:
+ *   NEWCHAIN (not an update) -> FLOW_BLOCK_UNBIND
+ *   DELCHAIN                 -> FLOW_BLOCK_BIND
+ *   NEWRULE                  -> FLOW_CLS_DESTROY
+ *   DELRULE                  -> FLOW_CLS_REPLACE with the transaction's
+ *                               flow rule
+ * The walk stops, with a one-time warning, at the first inverse request that
+ * returns an error.
+ */
+static void nft_flow_rule_offload_abort(struct net *net,
+ struct nft_trans *trans)
+{
+ int err = 0;
+
+ list_for_each_entry_continue_reverse(trans, &net->nft.commit_list, list) {
+ if (trans->ctx.family != NFPROTO_NETDEV)
+ continue;
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWCHAIN:
+ /* Chain updates did not bind a block, so nothing to undo. */
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) ||
+ nft_trans_chain_update(trans))
+ continue;
+
+ err = nft_flow_offload_chain(trans->ctx.chain, NULL,
+ FLOW_BLOCK_UNBIND);
+ break;
+ case NFT_MSG_DELCHAIN:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ err = nft_flow_offload_chain(trans->ctx.chain, NULL,
+ FLOW_BLOCK_BIND);
+ break;
+ case NFT_MSG_NEWRULE:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ err = nft_flow_offload_rule(trans->ctx.chain,
+ nft_trans_rule(trans),
+ NULL, FLOW_CLS_DESTROY);
+ break;
+ case NFT_MSG_DELRULE:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ /* Re-install the rule from the flow rule kept on the transaction. */
+ err = nft_flow_offload_rule(trans->ctx.chain,
+ nft_trans_rule(trans),
+ nft_trans_flow_rule(trans),
+ FLOW_CLS_REPLACE);
+ break;
+ }
+
+ if (WARN_ON_ONCE(err))
+ break;
+ }
+}
+
int nft_flow_rule_offload_commit(struct net *net)
{
struct nft_trans *trans;
@@ -418,14 +471,14 @@ int nft_flow_rule_offload_commit(struct net *net)
continue;
if (trans->ctx.flags & NLM_F_REPLACE ||
- !(trans->ctx.flags & NLM_F_APPEND))
- return -EOPNOTSUPP;
-
+ !(trans->ctx.flags & NLM_F_APPEND)) {
+ err = -EOPNOTSUPP;
+ break;
+ }
err = nft_flow_offload_rule(trans->ctx.chain,
nft_trans_rule(trans),
nft_trans_flow_rule(trans),
FLOW_CLS_REPLACE);
- nft_flow_rule_destroy(nft_trans_flow_rule(trans));
break;
case NFT_MSG_DELRULE:
if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
@@ -433,13 +486,31 @@ int nft_flow_rule_offload_commit(struct net *net)
err = nft_flow_offload_rule(trans->ctx.chain,
nft_trans_rule(trans),
- nft_trans_flow_rule(trans),
- FLOW_CLS_DESTROY);
+ NULL, FLOW_CLS_DESTROY);
break;
}
- if (err)
- return err;
+ if (err) {
+ nft_flow_rule_offload_abort(net, trans);
+ break;
+ }
+ }
+
+ list_for_each_entry(trans, &net->nft.commit_list, list) {
+ if (trans->ctx.family != NFPROTO_NETDEV)
+ continue;
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWRULE:
+ case NFT_MSG_DELRULE:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
+ break;
+ default:
+ break;
+ }
}
return err;
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 0744b2bb46da..b8092069f868 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
+#include <linux/if_arp.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables_offload.h>
@@ -125,6 +126,11 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
flow->match.dissector.used_keys |= BIT(reg->key);
flow->match.dissector.offset[reg->key] = reg->base_offset;
+ if (reg->key == FLOW_DISSECTOR_KEY_META &&
+ reg->offset == offsetof(struct nft_flow_key, meta.ingress_iftype) &&
+ nft_reg_load16(priv->data.data) != ARPHRD_ETHER)
+ return -EOPNOTSUPP;
+
nft_offload_update_dependency(ctx, &priv->data, priv->len);
return 0;
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 317e3a9e8c5b..9740b554fdb3 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -33,19 +33,19 @@
static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
-static u8 nft_meta_weekday(unsigned long secs)
+static u8 nft_meta_weekday(time64_t secs)
{
unsigned int dse;
u8 wday;
secs -= NFT_META_SECS_PER_MINUTE * sys_tz.tz_minuteswest;
- dse = secs / NFT_META_SECS_PER_DAY;
+ dse = div_u64(secs, NFT_META_SECS_PER_DAY);
wday = (4 + dse) % NFT_META_DAYS_PER_WEEK;
return wday;
}
-static u32 nft_meta_hour(unsigned long secs)
+static u32 nft_meta_hour(time64_t secs)
{
struct tm tm;
@@ -250,10 +250,10 @@ void nft_meta_get_eval(const struct nft_expr *expr,
nft_reg_store64(dest, ktime_get_real_ns());
break;
case NFT_META_TIME_DAY:
- nft_reg_store8(dest, nft_meta_weekday(get_seconds()));
+ nft_reg_store8(dest, nft_meta_weekday(ktime_get_real_seconds()));
break;
case NFT_META_TIME_HOUR:
- *dest = nft_meta_hour(get_seconds());
+ *dest = nft_meta_hour(ktime_get_real_seconds());
break;
default:
WARN_ON(1);
@@ -547,6 +547,14 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
sizeof(__u8), reg);
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
break;
+ case NFT_META_IIF:
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
+ ingress_ifindex, sizeof(__u32), reg);
+ break;
+ case NFT_META_IIFTYPE:
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
+ ingress_iftype, sizeof(__u16), reg);
+ break;
default:
return -EOPNOTSUPP;
}
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 5cb2d8908d2a..1993af3a2979 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -23,50 +23,58 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
+/*
+ * Fill @veth with a VLAN ethernet header for @skb: the first ETH_HLEN bytes
+ * are copied from the packet at @mac_off, then the VLAN protocol, TCI and
+ * encapsulated protocol are set from skb->vlan_proto, the skb's VLAN tag and
+ * skb->protocol. Returns false if the copy from the skb fails.
+ */
+static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off,
+ struct vlan_ethhdr *veth)
+{
+ if (skb_copy_bits(skb, mac_off, veth, ETH_HLEN))
+ return false;
+
+ veth->h_vlan_proto = skb->vlan_proto;
+ veth->h_vlan_TCI = htons(skb_vlan_tag_get(skb));
+ veth->h_vlan_encapsulated_proto = skb->protocol;
+
+ return true;
+}
+
/* add vlan header into the user buffer for if tag was removed by offloads */
static bool
nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
{
int mac_off = skb_mac_header(skb) - skb->data;
- u8 vlan_len, *vlanh, *dst_u8 = (u8 *) d;
+ u8 *vlanh, *dst_u8 = (u8 *) d;
struct vlan_ethhdr veth;
+ u8 vlan_hlen = 0;
+
+ if ((skb->protocol == htons(ETH_P_8021AD) ||
+ skb->protocol == htons(ETH_P_8021Q)) &&
+ offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN)
+ vlan_hlen += VLAN_HLEN;
vlanh = (u8 *) &veth;
- if (offset < ETH_HLEN) {
- u8 ethlen = min_t(u8, len, ETH_HLEN - offset);
+ if (offset < VLAN_ETH_HLEN + vlan_hlen) {
+ u8 ethlen = len;
- if (skb_copy_bits(skb, mac_off, &veth, ETH_HLEN))
+ if (vlan_hlen &&
+ skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0)
+ return false;
+ else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth))
return false;
- veth.h_vlan_proto = skb->vlan_proto;
+ if (offset + len > VLAN_ETH_HLEN + vlan_hlen)
+ ethlen -= offset + len - VLAN_ETH_HLEN + vlan_hlen;
- memcpy(dst_u8, vlanh + offset, ethlen);
+ memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen);
len -= ethlen;
if (len == 0)
return true;
dst_u8 += ethlen;
- offset = ETH_HLEN;
- } else if (offset >= VLAN_ETH_HLEN) {
- offset -= VLAN_HLEN;
- goto skip;
+ offset = ETH_HLEN + vlan_hlen;
+ } else {
+ offset -= VLAN_HLEN + vlan_hlen;
}
- veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
- veth.h_vlan_encapsulated_proto = skb->protocol;
-
- vlanh += offset;
-
- vlan_len = min_t(u8, len, VLAN_ETH_HLEN - offset);
- memcpy(dst_u8, vlanh, vlan_len);
-
- len -= vlan_len;
- if (!len)
- return true;
-
- dst_u8 += vlan_len;
- skip:
return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0;
}
@@ -174,6 +182,44 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
dst, ETH_ALEN, reg);
break;
+ case offsetof(struct ethhdr, h_proto):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic,
+ n_proto, sizeof(__be16), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_TCI):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
+ vlan_tci, sizeof(__be16), reg);
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
+ vlan_tpid, sizeof(__be16), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan,
+ vlan_tci, sizeof(__be16), reg);
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) +
+ sizeof(struct vlan_hdr):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan,
+ vlan_tpid, sizeof(__be16), reg);
+ break;
default:
return -EOPNOTSUPP;
}
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 8dbb4d48f2ed..67cb98489415 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -77,12 +77,12 @@ static inline bool is_leap(unsigned int y)
* This is done in three separate functions so that the most expensive
* calculations are done last, in case a "simple match" can be found earlier.
*/
-static inline unsigned int localtime_1(struct xtm *r, time_t time)
+static inline unsigned int localtime_1(struct xtm *r, time64_t time)
{
unsigned int v, w;
/* Each day has 86400s, so finding the hour/minute is actually easy. */
- v = time % SECONDS_PER_DAY;
+ div_u64_rem(time, SECONDS_PER_DAY, &v);
r->second = v % 60;
w = v / 60;
r->minute = w % 60;
@@ -90,13 +90,13 @@ static inline unsigned int localtime_1(struct xtm *r, time_t time)
return v;
}
-static inline void localtime_2(struct xtm *r, time_t time)
+static inline void localtime_2(struct xtm *r, time64_t time)
{
/*
* Here comes the rest (weekday, monthday). First, divide the SSTE
* by seconds-per-day to get the number of _days_ since the epoch.
*/
- r->dse = time / 86400;
+ r->dse = div_u64(time, SECONDS_PER_DAY);
/*
* 1970-01-01 (w=0) was a Thursday (4).
@@ -105,7 +105,7 @@ static inline void localtime_2(struct xtm *r, time_t time)
r->weekday = (4 + r->dse - 1) % 7 + 1;
}
-static void localtime_3(struct xtm *r, time_t time)
+static void localtime_3(struct xtm *r, time64_t time)
{
unsigned int year, i, w = r->dse;
@@ -160,7 +160,7 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_time_info *info = par->matchinfo;
unsigned int packet_time;
struct xtm current_time;
- s64 stamp;
+ time64_t stamp;
/*
* We need real time here, but we can neither use skb->tstamp
@@ -173,14 +173,14 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
* 1. match before 13:00
* 2. match after 13:00
*
- * If you match against processing time (get_seconds) it
+ * If you match against processing time (ktime_get_real_seconds) it
* may happen that the same packet matches both rules if
* it arrived at the right moment before 13:00, so it would be
* better to check skb->tstamp and set it via __net_timestamp()
* if needed. This however breaks outgoing packets tx timestamp,
* and causes them to get delayed forever by fq packet scheduler.
*/
- stamp = get_seconds();
+ stamp = ktime_get_real_seconds();
if (info->flags & XT_TIME_LOCAL_TZ)
/* Adjust for local timezone */
@@ -193,6 +193,9 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
* - 'now' is in the weekday mask
* - 'now' is in the daytime range time_start..time_end
* (and by default, libxt_time will set these so as to match)
+ *
+ * note: info->date_start/stop are unsigned 32-bit values that
+ * can hold values beyond y2038, but not after y2106.
*/
if (stamp < info->date_start || stamp > info->date_stop)
diff --git a/net/nfc/hci/Kconfig b/net/nfc/hci/Kconfig
index 97bd3a2c5c98..4822d6f46947 100644
--- a/net/nfc/hci/Kconfig
+++ b/net/nfc/hci/Kconfig
@@ -1,12 +1,12 @@
# SPDX-License-Identifier: GPL-2.0-only
config NFC_HCI
- depends on NFC
- tristate "NFC HCI implementation"
- default n
- help
- Say Y here if you want to build support for a kernel NFC HCI
- implementation. This is mostly needed for devices that only process
- HCI frames, like for example the NXP pn544.
+ depends on NFC
+ tristate "NFC HCI implementation"
+ default n
+ help
+ Say Y here if you want to build support for a kernel NFC HCI
+ implementation. This is mostly needed for devices that only process
+ HCI frames, like for example the NXP pn544.
config NFC_SHDLC
depends on NFC_HCI
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 2088619c03f0..93d4991ddc1f 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -350,7 +350,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
+ nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */
- + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */
+ + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */
+ + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */
/* OVS_PACKET_ATTR_USERDATA */
if (upcall_info->userdata)
@@ -393,6 +394,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
size_t len;
unsigned int hlen;
int err, dp_ifindex;
+ u64 hash;
dp_ifindex = get_dpifindex(dp);
if (!dp_ifindex)
@@ -485,23 +487,30 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
}
/* Add OVS_PACKET_ATTR_MRU */
- if (upcall_info->mru) {
- if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
- upcall_info->mru)) {
- err = -ENOBUFS;
- goto out;
- }
- pad_packet(dp, user_skb);
+ if (upcall_info->mru &&
+ nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, upcall_info->mru)) {
+ err = -ENOBUFS;
+ goto out;
}
/* Add OVS_PACKET_ATTR_LEN when packet is truncated */
- if (cutlen > 0) {
- if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN,
- skb->len)) {
- err = -ENOBUFS;
- goto out;
- }
- pad_packet(dp, user_skb);
+ if (cutlen > 0 &&
+ nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, skb->len)) {
+ err = -ENOBUFS;
+ goto out;
+ }
+
+ /* Add OVS_PACKET_ATTR_HASH */
+ hash = skb_get_hash_raw(skb);
+ if (skb->sw_hash)
+ hash |= OVS_PACKET_HASH_SW_BIT;
+
+ if (skb->l4_hash)
+ hash |= OVS_PACKET_HASH_L4_BIT;
+
+ if (nla_put(user_skb, OVS_PACKET_ATTR_HASH, sizeof (u64), &hash)) {
+ err = -ENOBUFS;
+ goto out;
}
/* Only reserve room for attribute header, packet data is added
@@ -543,6 +552,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
struct datapath *dp;
struct vport *input_vport;
u16 mru = 0;
+ u64 hash;
int len;
int err;
bool log = !a[OVS_PACKET_ATTR_PROBE];
@@ -568,6 +578,14 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
}
OVS_CB(packet)->mru = mru;
+ if (a[OVS_PACKET_ATTR_HASH]) {
+ hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]);
+
+ __skb_set_hash(packet, hash & 0xFFFFFFFFULL,
+ !!(hash & OVS_PACKET_HASH_SW_BIT),
+ !!(hash & OVS_PACKET_HASH_L4_BIT));
+ }
+
/* Build an sw_flow for sending this packet. */
flow = ovs_flow_alloc();
err = PTR_ERR(flow);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 81e85dde8217..e239a46c2f94 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -139,6 +139,18 @@ struct ovs_net {
bool xt_label;
};
+/**
+ * enum ovs_pkt_hash_types - hash info to include with a packet
+ * to send to userspace.
+ * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack.
+ * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash
+ * over transport ports.
+ */
+enum ovs_pkt_hash_types {
+ OVS_PACKET_HASH_SW_BIT = (1ULL << 32),
+ OVS_PACKET_HASH_L4_BIT = (1ULL << 33),
+};
+
extern unsigned int ovs_net_id;
void ovs_lock(void);
void ovs_unlock(void);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 6b345c858dba..c71f4328d138 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -513,6 +513,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct ib_qp_init_attr attr;
struct ib_cq_init_attr cq_attr = {};
struct rds_ib_device *rds_ibdev;
+ unsigned long max_wrs;
int ret, fr_queue_space;
struct dma_pool *pool;
@@ -533,10 +534,15 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
- if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
- rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
- if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
- rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
+ max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_send_wr + 1 ?
+ rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_send_wr;
+ if (ic->i_send_ring.w_nr != max_wrs)
+ rds_ib_ring_resize(&ic->i_send_ring, max_wrs);
+
+ max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_recv_wr + 1 ?
+ rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_recv_wr;
+ if (ic->i_recv_ring.w_nr != max_wrs)
+ rds_ib_ring_resize(&ic->i_recv_ring, max_wrs);
/* Protection domain and memory range */
ic->i_pd = rds_ibdev->pd;
@@ -1176,8 +1182,9 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
ic->i_flowctl = 0;
atomic_set(&ic->i_credits, 0);
- rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
- rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+ /* Re-init rings, but retain sizes. */
+ rds_ib_ring_init(&ic->i_send_ring, ic->i_send_ring.w_nr);
+ rds_ib_ring_init(&ic->i_recv_ring, ic->i_recv_ring.w_nr);
if (ic->i_ibinc) {
rds_inc_put(&ic->i_ibinc->ii_inc);
@@ -1224,8 +1231,8 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
* rds_ib_conn_shutdown() waits for these to be emptied so they
* must be initialized before it can be called.
*/
- rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
- rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+ rds_ib_ring_init(&ic->i_send_ring, 0);
+ rds_ib_ring_init(&ic->i_recv_ring, 0);
ic->conn = conn;
conn->c_transport_data = ic;
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 68d6af56b243..c13638aeef46 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -474,7 +474,6 @@ drop:
}
static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
- [TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 },
[TCA_CT_ACTION] = { .type = NLA_U16 },
[TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) },
[TCA_CT_ZONE] = { .type = NLA_U16 },
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index 4d8c822b6aca..c7d5e12ee919 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -119,7 +119,6 @@ static int valid_label(const struct nlattr *attr,
}
static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
- [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 },
[TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)),
[TCA_MPLS_PROTO] = { .type = NLA_U16 },
[TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label),
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index d5eff6ac17a9..3ad718576304 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -43,7 +43,7 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
int err = -EINVAL;
int rem;
- if (!nla || !n)
+ if (!nla)
return NULL;
keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
@@ -171,6 +171,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
}
parm = nla_data(pattr);
+ if (!parm->nkeys) {
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
+ return -EINVAL;
+ }
ksize = parm->nkeys * sizeof(struct tc_pedit_key);
if (nla_len(pattr) < sizeof(*parm) + ksize) {
NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid");
@@ -184,12 +188,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
index = parm->index;
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- if (!parm->nkeys) {
- tcf_idr_cleanup(tn, index);
- NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
- ret = -EINVAL;
- goto out_free;
- }
ret = tcf_idr_create(tn, index, est, a,
&act_pedit_ops, bind, false, 0);
if (ret) {
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index cb34e5d57aaa..6379f9568ab8 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -10,6 +10,8 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
@@ -53,7 +55,11 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
static const struct nla_policy
enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC] = {
+ .strict_start_type = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN },
[TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED },
};
static const struct nla_policy
@@ -64,6 +70,19 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = {
.len = 128 },
};
+static const struct nla_policy
+vxlan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+erspan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
+};
+
static int
tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
struct netlink_ext_ack *extack)
@@ -116,10 +135,89 @@ tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
return opt_len;
}
+/*
+ * Parse one nested vxlan option attribute @nla.  The gbp attribute is
+ * mandatory.  When @dst is non-NULL the gbp value is stored there as a
+ * struct vxlan_metadata; when it is NULL the attribute is only validated.
+ *
+ * Returns sizeof(struct vxlan_metadata) on success or a negative errno.
+ */
+static int
+tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1];
+	struct vxlan_metadata *md = dst;
+	const struct nlattr *gbp;
+	int ret;
+
+	ret = nla_parse_nested(attrs, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, nla,
+			       vxlan_opt_policy, extack);
+	if (ret < 0)
+		return ret;
+
+	gbp = attrs[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP];
+	if (!gbp) {
+		NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp");
+		return -EINVAL;
+	}
+
+	if (md)
+		md->gbp = nla_get_u32(gbp);
+
+	return sizeof(*md);
+}
+
+/*
+ * Parse one nested erspan option attribute @nla.  The version attribute is
+ * mandatory; version 1 additionally requires the index attribute and
+ * version 2 requires both dir and hwid.  Any other version is rejected.
+ * When @dst is non-NULL the parsed values are stored there as a
+ * struct erspan_metadata; when it is NULL the attribute is only validated.
+ *
+ * Returns sizeof(struct erspan_metadata) on success or a negative errno.
+ */
+static int
+tunnel_key_copy_erspan_opt(const struct nlattr *nla, void *dst, int dst_len,
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1];
+	struct erspan_metadata *md = dst;
+	u8 version;
+	int ret;
+
+	ret = nla_parse_nested(attrs, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, nla,
+			       erspan_opt_policy, extack);
+	if (ret < 0)
+		return ret;
+
+	if (!attrs[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]) {
+		NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver");
+		return -EINVAL;
+	}
+
+	version = nla_get_u8(attrs[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]);
+	switch (version) {
+	case 1:
+		if (!attrs[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]) {
+			NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index");
+			return -EINVAL;
+		}
+		break;
+	case 2:
+		if (!attrs[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] ||
+		    !attrs[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]) {
+			NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid");
+			return -EINVAL;
+		}
+		break;
+	default:
+		NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect");
+		return -EINVAL;
+	}
+
+	/* Validation-only call: nothing to store. */
+	if (!md)
+		return sizeof(*md);
+
+	md->version = version;
+	if (version == 1) {
+		md->u.index =
+			nla_get_be32(attrs[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]);
+	} else {
+		md->u.md2.dir =
+			nla_get_u8(attrs[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR]);
+		set_hwid(&md->u.md2,
+			 nla_get_u8(attrs[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]));
+	}
+
+	return sizeof(*md);
+}
+
static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
int dst_len, struct netlink_ext_ack *extack)
{
- int err, rem, opt_len, len = nla_len(nla), opts_len = 0;
+ int err, rem, opt_len, len = nla_len(nla), opts_len = 0, type = 0;
const struct nlattr *attr, *head = nla_data(nla);
err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
@@ -130,15 +228,48 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
nla_for_each_attr(attr, head, len, rem) {
switch (nla_type(attr)) {
case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+ if (type && type != TUNNEL_GENEVE_OPT) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
+ return -EINVAL;
+ }
opt_len = tunnel_key_copy_geneve_opt(attr, dst,
dst_len, extack);
if (opt_len < 0)
return opt_len;
opts_len += opt_len;
+ if (opts_len > IP_TUNNEL_OPTS_MAX) {
+ NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size");
+ return -EINVAL;
+ }
if (dst) {
dst_len -= opt_len;
dst += opt_len;
}
+ type = TUNNEL_GENEVE_OPT;
+ break;
+ case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
+ if (type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options");
+ return -EINVAL;
+ }
+ opt_len = tunnel_key_copy_vxlan_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_VXLAN_OPT;
+ break;
+ case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
+ if (type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for erspan options");
+ return -EINVAL;
+ }
+ opt_len = tunnel_key_copy_erspan_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_ERSPAN_OPT;
break;
}
}
@@ -175,6 +306,22 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
#else
return -EAFNOSUPPORT;
#endif
+ case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
+#if IS_ENABLED(CONFIG_INET)
+ info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
+ case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
+#if IS_ENABLED(CONFIG_INET)
+ info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
default:
NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type");
return -EINVAL;
@@ -451,6 +598,56 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
return 0;
}
+/*
+ * Emit the vxlan option that follows @info in memory as a nested
+ * TCA_TUNNEL_KEY_ENC_OPTS_VXLAN attribute carrying the gbp value.
+ *
+ * Returns 0 on success, -EMSGSIZE when the attribute does not fit in @skb
+ * (a partially written nest is cancelled).
+ */
+static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb,
+				      const struct ip_tunnel_info *info)
+{
+	struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1);
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (!nla_put_u32(skb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) {
+		nla_nest_end(skb, nest);
+		return 0;
+	}
+
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/*
+ * Emit the erspan option that follows @info in memory as a nested
+ * TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN attribute.  The version is always written;
+ * version 1 adds the index, version 2 adds dir and hwid.
+ *
+ * Returns 0 on success, -EMSGSIZE when an attribute does not fit in @skb
+ * (a partially written nest is cancelled).
+ */
+static int tunnel_key_erspan_opts_dump(struct sk_buff *skb,
+				       const struct ip_tunnel_info *info)
+{
+	struct erspan_metadata *md = (struct erspan_metadata *)(info + 1);
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, md->version))
+		goto cancel;
+
+	switch (md->version) {
+	case 1:
+		if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX,
+				 md->u.index))
+			goto cancel;
+		break;
+	case 2:
+		if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR,
+			       md->u.md2.dir))
+			goto cancel;
+		if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID,
+			       get_hwid(&md->u.md2)))
+			goto cancel;
+		break;
+	default:
+		/* Other versions carry only the version attribute. */
+		break;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
static int tunnel_key_opts_dump(struct sk_buff *skb,
const struct ip_tunnel_info *info)
{
@@ -468,6 +665,14 @@ static int tunnel_key_opts_dump(struct sk_buff *skb,
err = tunnel_key_geneve_opts_dump(skb, info);
if (err)
goto err_out;
+ } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+ err = tunnel_key_vxlan_opts_dump(skb, info);
+ if (err)
+ goto err_out;
+ } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) {
+ err = tunnel_key_erspan_opts_dump(skb, info);
+ if (err)
+ goto err_out;
} else {
err_out:
nla_nest_cancel(skb, start);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 74221e3351c3..c307ee1d6ca6 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -22,6 +22,8 @@
#include <net/ip.h>
#include <net/flow_dissector.h>
#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
@@ -688,7 +690,11 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
static const struct nla_policy
enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPTS_UNSPEC] = {
+ .strict_start_type = TCA_FLOWER_KEY_ENC_OPTS_VXLAN },
[TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED },
};
static const struct nla_policy
@@ -699,6 +705,19 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = {
.len = 128 },
};
+static const struct nla_policy
+vxlan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
+};
+
static void fl_set_key_val(struct nlattr **tb,
void *val, int val_type,
void *mask, int mask_type, int len)
@@ -928,6 +947,105 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key,
return sizeof(struct geneve_opt) + data_len;
}
+/*
+ * Fill the vxlan option slot at key->enc_opts.data[key->enc_opts.len].
+ * The slot is first set to all-ones.  With @depth == 0 nothing is parsed
+ * and the all-ones slot is kept.  Otherwise @nla must be a
+ * TCA_FLOWER_KEY_ENC_OPTS_VXLAN nest; its gbp attribute is mandatory only
+ * when @option_len is 0 and, if present, overwrites the gbp field.
+ *
+ * Returns sizeof(struct vxlan_metadata) on success or a negative errno.
+ */
+static int fl_set_vxlan_opt(const struct nlattr *nla, struct fl_flow_key *key,
+			    int depth, int option_len,
+			    struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1];
+	struct vxlan_metadata *md;
+	const struct nlattr *gbp;
+	int ret;
+
+	md = (struct vxlan_metadata *)&key->enc_opts.data[key->enc_opts.len];
+	memset(md, 0xff, sizeof(*md));
+
+	if (depth == 0)
+		return sizeof(*md);
+
+	if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_VXLAN) {
+		NL_SET_ERR_MSG(extack, "Non-vxlan option type for mask");
+		return -EINVAL;
+	}
+
+	ret = nla_parse_nested(attrs, TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, nla,
+			       vxlan_opt_policy, extack);
+	if (ret < 0)
+		return ret;
+
+	gbp = attrs[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP];
+	if (!gbp) {
+		if (!option_len) {
+			NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp");
+			return -EINVAL;
+		}
+		return sizeof(*md);
+	}
+
+	md->gbp = nla_get_u32(gbp);
+
+	return sizeof(*md);
+}
+
+/*
+ * Fill the erspan option slot at key->enc_opts.data[key->enc_opts.len].
+ * The slot is first set to all-ones with version forced to 1.  With
+ * @depth == 0 nothing is parsed and that default slot is kept.  Otherwise
+ * @nla must be a TCA_FLOWER_KEY_ENC_OPTS_ERSPAN nest.  Attributes are
+ * mandatory only when @option_len is 0: ver always, index for version 1,
+ * dir and hwid for version 2.  Present attributes overwrite their fields.
+ *
+ * Returns sizeof(struct erspan_metadata) on success or a negative errno.
+ */
+static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key,
+			     int depth, int option_len,
+			     struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1];
+	struct erspan_metadata *md;
+	const struct nlattr *attr;
+	int ret;
+
+	md = (struct erspan_metadata *)&key->enc_opts.data[key->enc_opts.len];
+	memset(md, 0xff, sizeof(*md));
+	md->version = 1;
+
+	if (depth == 0)
+		return sizeof(*md);
+
+	if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_ERSPAN) {
+		NL_SET_ERR_MSG(extack, "Non-erspan option type for mask");
+		return -EINVAL;
+	}
+
+	ret = nla_parse_nested(attrs, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, nla,
+			       erspan_opt_policy, extack);
+	if (ret < 0)
+		return ret;
+
+	attr = attrs[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER];
+	if (!option_len && !attr) {
+		NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver");
+		return -EINVAL;
+	}
+	if (attr)
+		md->version = nla_get_u8(attr);
+
+	switch (md->version) {
+	case 1:
+		attr = attrs[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX];
+		if (!option_len && !attr) {
+			NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index");
+			return -EINVAL;
+		}
+		if (attr)
+			md->u.index = nla_get_be32(attr);
+		break;
+	case 2:
+		if (!option_len &&
+		    (!attrs[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] ||
+		     !attrs[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID])) {
+			NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid");
+			return -EINVAL;
+		}
+		attr = attrs[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR];
+		if (attr)
+			md->u.md2.dir = nla_get_u8(attr);
+		attr = attrs[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID];
+		if (attr)
+			set_hwid(&md->u.md2, nla_get_u8(attr));
+		break;
+	default:
+		NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect");
+		return -EINVAL;
+	}
+
+	return sizeof(*md);
+}
+
static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
struct fl_flow_key *mask,
struct netlink_ext_ack *extack)
@@ -958,6 +1076,11 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) {
switch (nla_type(nla_opt_key)) {
case TCA_FLOWER_KEY_ENC_OPTS_GENEVE:
+ if (key->enc_opts.dst_opt_type &&
+ key->enc_opts.dst_opt_type != TUNNEL_GENEVE_OPT) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
+ return -EINVAL;
+ }
option_len = 0;
key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
option_len = fl_set_geneve_opt(nla_opt_key, key,
@@ -986,6 +1109,72 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
if (msk_depth)
nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
break;
+ case TCA_FLOWER_KEY_ENC_OPTS_VXLAN:
+ if (key->enc_opts.dst_opt_type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options");
+ return -EINVAL;
+ }
+ option_len = 0;
+ key->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT;
+ option_len = fl_set_vxlan_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT;
+ option_len = fl_set_vxlan_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
+
+ if (msk_depth)
+ nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+ break;
+ case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN:
+ if (key->enc_opts.dst_opt_type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for erspan options");
+ return -EINVAL;
+ }
+ option_len = 0;
+ key->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT;
+ option_len = fl_set_erspan_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT;
+ option_len = fl_set_erspan_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
+
+ if (msk_depth)
+ nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+ break;
default:
NL_SET_ERR_MSG(extack, "Unknown tunnel option type");
return -EINVAL;
@@ -2135,6 +2324,61 @@ nla_put_failure:
return -EMSGSIZE;
}
+/*
+ * Dump the vxlan option stored at the start of @enc_opts->data as a nested
+ * TCA_FLOWER_KEY_ENC_OPTS_VXLAN attribute carrying the gbp value.
+ *
+ * Returns 0 on success or -EMSGSIZE; on failure nla_nest_cancel() is
+ * called with whatever nla_nest_start_noflag() returned.
+ */
+static int fl_dump_key_vxlan_opt(struct sk_buff *skb,
+				 struct flow_dissector_key_enc_opts *enc_opts)
+{
+	struct vxlan_metadata *md;
+	struct nlattr *start;
+
+	start = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_VXLAN);
+	if (start) {
+		md = (struct vxlan_metadata *)&enc_opts->data[0];
+		if (!nla_put_u32(skb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP,
+				 md->gbp)) {
+			nla_nest_end(skb, start);
+			return 0;
+		}
+	}
+
+	nla_nest_cancel(skb, start);
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump the erspan option stored at the start of @enc_opts->data as a nested
+ * TCA_FLOWER_KEY_ENC_OPTS_ERSPAN attribute.  The version is always written;
+ * version 1 adds the index, version 2 adds dir and hwid.
+ *
+ * Returns 0 on success or -EMSGSIZE; on failure nla_nest_cancel() is
+ * called with whatever nla_nest_start_noflag() returned.
+ */
+static int fl_dump_key_erspan_opt(struct sk_buff *skb,
+				  struct flow_dissector_key_enc_opts *enc_opts)
+{
+	struct erspan_metadata *md;
+	struct nlattr *start;
+
+	start = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_ERSPAN);
+	if (!start)
+		goto cancel;
+
+	md = (struct erspan_metadata *)&enc_opts->data[0];
+	if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, md->version))
+		goto cancel;
+
+	switch (md->version) {
+	case 1:
+		if (nla_put_be32(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX,
+				 md->u.index))
+			goto cancel;
+		break;
+	case 2:
+		if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR,
+			       md->u.md2.dir))
+			goto cancel;
+		if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID,
+			       get_hwid(&md->u.md2)))
+			goto cancel;
+		break;
+	default:
+		/* Other versions carry only the version attribute. */
+		break;
+	}
+
+	nla_nest_end(skb, start);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, start);
+	return -EMSGSIZE;
+}
+
static int fl_dump_key_ct(struct sk_buff *skb,
struct flow_dissector_key_ct *key,
struct flow_dissector_key_ct *mask)
@@ -2188,6 +2432,16 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
if (err)
goto nla_put_failure;
break;
+ case TUNNEL_VXLAN_OPT:
+ err = fl_dump_key_vxlan_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
+ case TUNNEL_ERSPAN_OPT:
+ err = fl_dump_key_erspan_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
default:
goto nla_put_failure;
}
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index df98a887eb89..b0b0dc46af61 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -22,6 +22,7 @@
#define QUEUE_THRESHOLD 16384
#define DQCOUNT_INVALID -1
+#define DTIME_INVALID 0xffffffffffffffff
#define MAX_PROB 0xffffffffffffffff
#define PIE_SCALE 8
@@ -34,6 +35,7 @@ struct pie_params {
u32 beta; /* and are used for shift relative to 1 */
bool ecn; /* true if ecn is enabled */
bool bytemode; /* to scale drop early prob based on pkt size */
+ u8 dq_rate_estimator; /* to calculate delay using Little's law */
};
/* variables used */
@@ -77,11 +79,34 @@ static void pie_params_init(struct pie_params *params)
params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */
params->ecn = false;
params->bytemode = false;
+ params->dq_rate_estimator = false;
+}
+
+/* private skb vars */
+struct pie_skb_cb {
+ psched_time_t enqueue_time;
+};
+
+/* Return the PIE-private data kept in the qdisc control block of @skb. */
+static struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb)
+{
+	struct pie_skb_cb *cb;
+
+	/* Validate the private cb area against the size of pie_skb_cb. */
+	qdisc_cb_private_validate(skb, sizeof(*cb));
+	cb = (struct pie_skb_cb *)qdisc_skb_cb(skb)->data;
+
+	return cb;
+}
+
+/* Read back the enqueue timestamp recorded in the PIE cb of @skb. */
+static psched_time_t pie_get_enqueue_time(const struct sk_buff *skb)
+{
+	const struct pie_skb_cb *cb = get_pie_cb(skb);
+
+	return cb->enqueue_time;
+}
+
+/* Record the current psched time as the enqueue timestamp of @skb. */
+static void pie_set_enqueue_time(struct sk_buff *skb)
+{
+	struct pie_skb_cb *cb = get_pie_cb(skb);
+
+	cb->enqueue_time = psched_get_time();
}
static void pie_vars_init(struct pie_vars *vars)
{
vars->dq_count = DQCOUNT_INVALID;
+ vars->dq_tstamp = DTIME_INVALID;
vars->accu_prob = 0;
vars->avg_dq_rate = 0;
/* default of 150 ms in pschedtime */
@@ -172,6 +197,10 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* we can enqueue the packet */
if (enqueue) {
+ /* Set enqueue time only when dq_rate_estimator is disabled. */
+ if (!q->params.dq_rate_estimator)
+ pie_set_enqueue_time(skb);
+
q->stats.packets_in++;
if (qdisc_qlen(sch) > q->stats.maxq)
q->stats.maxq = qdisc_qlen(sch);
@@ -194,6 +223,7 @@ static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
[TCA_PIE_BETA] = {.type = NLA_U32},
[TCA_PIE_ECN] = {.type = NLA_U32},
[TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32},
};
static int pie_change(struct Qdisc *sch, struct nlattr *opt,
@@ -247,6 +277,10 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_PIE_BYTEMODE])
q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
+ if (tb[TCA_PIE_DQ_RATE_ESTIMATOR])
+ q->params.dq_rate_estimator =
+ nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]);
+
/* Drop excess packets if new limit is lower */
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
@@ -266,6 +300,28 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
{
struct pie_sched_data *q = qdisc_priv(sch);
int qlen = sch->qstats.backlog; /* current queue size in bytes */
+ psched_time_t now = psched_get_time();
+ u32 dtime = 0;
+
+ /* If dq_rate_estimator is disabled, calculate qdelay using the
+ * packet timestamp.
+ */
+ if (!q->params.dq_rate_estimator) {
+ q->vars.qdelay = now - pie_get_enqueue_time(skb);
+
+ if (q->vars.dq_tstamp != DTIME_INVALID)
+ dtime = now - q->vars.dq_tstamp;
+
+ q->vars.dq_tstamp = now;
+
+ if (qlen == 0)
+ q->vars.qdelay = 0;
+
+ if (dtime == 0)
+ return;
+
+ goto burst_allowance_reduction;
+ }
/* If current queue is about 10 packets or more and dq_count is unset
* we have enough packets to calculate the drain rate. Save
@@ -289,10 +345,10 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
q->vars.dq_count += skb->len;
if (q->vars.dq_count >= QUEUE_THRESHOLD) {
- psched_time_t now = psched_get_time();
- u32 dtime = now - q->vars.dq_tstamp;
u32 count = q->vars.dq_count << PIE_SCALE;
+ dtime = now - q->vars.dq_tstamp;
+
if (dtime == 0)
return;
@@ -317,14 +373,19 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
q->vars.dq_tstamp = psched_get_time();
}
- if (q->vars.burst_time > 0) {
- if (q->vars.burst_time > dtime)
- q->vars.burst_time -= dtime;
- else
- q->vars.burst_time = 0;
- }
+ goto burst_allowance_reduction;
}
}
+
+ return;
+
+burst_allowance_reduction:
+ if (q->vars.burst_time > 0) {
+ if (q->vars.burst_time > dtime)
+ q->vars.burst_time -= dtime;
+ else
+ q->vars.burst_time = 0;
+ }
}
static void calculate_probability(struct Qdisc *sch)
@@ -332,19 +393,25 @@ static void calculate_probability(struct Qdisc *sch)
struct pie_sched_data *q = qdisc_priv(sch);
u32 qlen = sch->qstats.backlog; /* queue size in bytes */
psched_time_t qdelay = 0; /* in pschedtime */
- psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
+ psched_time_t qdelay_old = 0; /* in pschedtime */
s64 delta = 0; /* determines the change in probability */
u64 oldprob;
u64 alpha, beta;
u32 power;
bool update_prob = true;
- q->vars.qdelay_old = q->vars.qdelay;
+ if (q->params.dq_rate_estimator) {
+ qdelay_old = q->vars.qdelay;
+ q->vars.qdelay_old = q->vars.qdelay;
- if (q->vars.avg_dq_rate > 0)
- qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
- else
- qdelay = 0;
+ if (q->vars.avg_dq_rate > 0)
+ qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
+ else
+ qdelay = 0;
+ } else {
+ qdelay = q->vars.qdelay;
+ qdelay_old = q->vars.qdelay_old;
+ }
/* If qdelay is zero and qlen is not, it means qlen is very small, less
* than dequeue_rate, so we do not update probabilty in this round
@@ -430,14 +497,18 @@ static void calculate_probability(struct Qdisc *sch)
/* We restart the measurement cycle if the following conditions are met
* 1. If the delay has been low for 2 consecutive Tupdate periods
* 2. Calculated drop probability is zero
- * 3. We have atleast one estimate for the avg_dq_rate ie.,
- * is a non-zero value
+ * 3. If the dq_rate_estimator is enabled, we have at least one
+ * estimate for the avg_dq_rate, i.e., it is a non-zero value
*/
if ((q->vars.qdelay < q->params.target / 2) &&
(q->vars.qdelay_old < q->params.target / 2) &&
q->vars.prob == 0 &&
- q->vars.avg_dq_rate > 0)
+ (!q->params.dq_rate_estimator || q->vars.avg_dq_rate > 0)) {
pie_vars_init(&q->vars);
+ }
+
+ if (!q->params.dq_rate_estimator)
+ q->vars.qdelay_old = qdelay;
}
static void pie_timer(struct timer_list *t)
@@ -497,7 +568,9 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
- nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode))
+ nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) ||
+ nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR,
+ q->params.dq_rate_estimator))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -514,9 +587,6 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
.prob = q->vars.prob,
.delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
NSEC_PER_USEC,
- /* unscale and return dq_rate in bytes per sec */
- .avg_dq_rate = q->vars.avg_dq_rate *
- (PSCHED_TICKS_PER_SEC) >> PIE_SCALE,
.packets_in = q->stats.packets_in,
.overlimit = q->stats.overlimit,
.maxq = q->stats.maxq,
@@ -524,6 +594,14 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
.ecn_mark = q->stats.ecn_mark,
};
+ /* avg_dq_rate is only valid if dq_rate_estimator is enabled */
+ st.dq_rate_estimating = q->params.dq_rate_estimator;
+
+ /* unscale and return dq_rate in bytes per sec */
+ if (q->params.dq_rate_estimator)
+ st.avg_dq_rate = q->vars.avg_dq_rate *
+ (PSCHED_TICKS_PER_SEC) >> PIE_SCALE;
+
return gnet_stats_copy_app(d, &st, sizeof(st));
}
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 7cd68628c637..c609373c8661 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -922,7 +922,7 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
}
/* Verify priority mapping uses valid tcs */
- for (i = 0; i < TC_BITMASK + 1; i++) {
+ for (i = 0; i <= TC_BITMASK; i++) {
if (qopt->prio_tc_map[i] >= qopt->num_tc) {
NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
return -EINVAL;
@@ -1347,6 +1347,26 @@ out:
return err;
}
+/* Compare the traffic class setup currently held by @dev with @mqprio.
+ *
+ * Returns 0 when the number of traffic classes, the queue count/offset of
+ * every class and the complete priority to traffic class map are equal.
+ * Returns -1 when @mqprio is NULL or any of these differ.
+ */
+static int taprio_mqprio_cmp(const struct net_device *dev,
+			     const struct tc_mqprio_qopt *mqprio)
+{
+	int tc, prio;
+
+	if (!mqprio)
+		return -1;
+
+	if (dev->num_tc != mqprio->num_tc)
+		return -1;
+
+	for (tc = 0; tc < mqprio->num_tc; tc++) {
+		if (dev->tc_to_txq[tc].count != mqprio->count[tc])
+			return -1;
+		if (dev->tc_to_txq[tc].offset != mqprio->offset[tc])
+			return -1;
+	}
+
+	for (prio = 0; prio <= TC_BITMASK; prio++) {
+		if (dev->prio_tc_map[prio] != mqprio->prio_tc_map[prio])
+			return -1;
+	}
+
+	return 0;
+}
+
static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -1398,6 +1418,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
admin = rcu_dereference(q->admin_sched);
rcu_read_unlock();
+ /* no changes - no new mqprio settings */
+ if (!taprio_mqprio_cmp(dev, mqprio))
+ mqprio = NULL;
+
if (mqprio && (oper || admin)) {
NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported");
err = -ENOTSUPP;
@@ -1455,7 +1479,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
mqprio->offset[i]);
/* Always use supplied priority mappings */
- for (i = 0; i < TC_BITMASK + 1; i++)
+ for (i = 0; i <= TC_BITMASK; i++)
netdev_set_prio_tc_map(dev, i,
mqprio->prio_tc_map[i]);
}
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index b7d9fd285c71..b997072c72e5 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -25,6 +25,7 @@
#include <linux/in.h>
#include <linux/sched/signal.h>
#include <linux/if_vlan.h>
+#include <linux/rcupdate_wait.h>
#include <net/sock.h>
#include <net/tcp.h>
@@ -798,6 +799,7 @@ static void smc_connect_work(struct work_struct *work)
smc->sk.sk_err = EPIPE;
else if (signal_pending(current))
smc->sk.sk_err = -sock_intr_errno(timeo);
+ sock_put(&smc->sk); /* passive closing */
goto out;
}
@@ -1735,7 +1737,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
case TCP_FASTOPEN_KEY:
case TCP_FASTOPEN_NO_COOKIE:
/* option not supported by SMC */
- if (sk->sk_state == SMC_INIT) {
+ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
smc_switch_to_fallback(smc);
smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
} else {
@@ -2038,22 +2040,28 @@ static int __init smc_init(void)
if (rc)
goto out_pernet_subsys;
+ rc = smc_core_init();
+ if (rc) {
+ pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
+ goto out_pnet;
+ }
+
rc = smc_llc_init();
if (rc) {
pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
- goto out_pnet;
+ goto out_core;
}
rc = smc_cdc_init();
if (rc) {
pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
- goto out_pnet;
+ goto out_core;
}
rc = proto_register(&smc_proto, 1);
if (rc) {
pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
- goto out_pnet;
+ goto out_core;
}
rc = proto_register(&smc_proto6, 1);
@@ -2085,6 +2093,8 @@ out_proto6:
proto_unregister(&smc_proto6);
out_proto:
proto_unregister(&smc_proto);
+out_core:
+ smc_core_exit();
out_pnet:
smc_pnet_exit();
out_pernet_subsys:
@@ -2095,14 +2105,15 @@ out_pernet_subsys:
static void __exit smc_exit(void)
{
- smc_core_exit();
static_branch_disable(&tcp_have_smc);
- smc_ib_unregister_client();
sock_unregister(PF_SMC);
+ smc_core_exit();
+ smc_ib_unregister_client();
proto_unregister(&smc_proto6);
proto_unregister(&smc_proto);
smc_pnet_exit();
unregister_pernet_subsys(&smc_net_ops);
+ rcu_barrier();
}
module_init(smc_init);
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 7dc07ec2379b..164f1584861b 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -131,6 +131,9 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
{
int rc;
+ if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown))
+ return -EPIPE;
+
if (conn->lgr->is_smcd) {
spin_lock_bh(&conn->send_lock);
rc = smcd_cdc_msg_send(conn);
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 49bcebff6378..0879f7bed967 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -349,7 +349,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
smc->conn.lgr->sync_err = 1;
- smc_lgr_terminate(smc->conn.lgr);
+ smc_lgr_terminate(smc->conn.lgr, true);
}
}
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index d34e5adce2eb..290270c821ca 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -20,8 +20,6 @@
#include "smc_cdc.h"
#include "smc_close.h"
-#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ)
-
/* release the clcsock that is assigned to the smc_sock */
void smc_clcsock_release(struct smc_sock *smc)
{
@@ -110,6 +108,17 @@ int smc_close_abort(struct smc_connection *conn)
return smc_cdc_get_slot_and_msg_send(conn);
}
+/* Synchronously cancel the connection's close_work and tx_work and mark the
+ * socket as SMC_CLOSED. The socket lock is released around the cancellation
+ * and taken again before sk_state is written.
+ */
+static void smc_close_cancel_work(struct smc_sock *smc)
+{
+ struct sock *sk = &smc->sk;
+
+ /* NOTE(review): presumably the work functions take the socket lock
+ * themselves, so waiting for them with the lock held could deadlock;
+ * confirm against the workers
+ */
+ release_sock(sk);
+ cancel_work_sync(&smc->conn.close_work);
+ cancel_delayed_work_sync(&smc->conn.tx_work);
+ lock_sock(sk);
+ sk->sk_state = SMC_CLOSED;
+}
+
/* terminate smc socket abnormally - active abort
* link group is terminated, i.e. RDMA communication no longer possible
*/
@@ -126,23 +135,21 @@ void smc_close_active_abort(struct smc_sock *smc)
switch (sk->sk_state) {
case SMC_ACTIVE:
sk->sk_state = SMC_PEERABORTWAIT;
- release_sock(sk);
- cancel_delayed_work_sync(&smc->conn.tx_work);
- lock_sock(sk);
+ smc_close_cancel_work(smc);
sk->sk_state = SMC_CLOSED;
sock_put(sk); /* passive closing */
break;
case SMC_APPCLOSEWAIT1:
case SMC_APPCLOSEWAIT2:
- release_sock(sk);
- cancel_delayed_work_sync(&smc->conn.tx_work);
- lock_sock(sk);
+ smc_close_cancel_work(smc);
sk->sk_state = SMC_CLOSED;
sock_put(sk); /* postponed passive closing */
break;
case SMC_PEERCLOSEWAIT1:
case SMC_PEERCLOSEWAIT2:
case SMC_PEERFINCLOSEWAIT:
+ sk->sk_state = SMC_PEERABORTWAIT;
+ smc_close_cancel_work(smc);
sk->sk_state = SMC_CLOSED;
smc_conn_free(&smc->conn);
release_clcsock = true;
@@ -150,7 +157,11 @@ void smc_close_active_abort(struct smc_sock *smc)
break;
case SMC_PROCESSABORT:
case SMC_APPFINCLOSEWAIT:
+ sk->sk_state = SMC_PEERABORTWAIT;
+ smc_close_cancel_work(smc);
sk->sk_state = SMC_CLOSED;
+ smc_conn_free(&smc->conn);
+ release_clcsock = true;
break;
case SMC_INIT:
case SMC_PEERABORTWAIT:
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 0d92456729ab..bb92c7c6214c 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -13,6 +13,8 @@
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
+#include <linux/wait.h>
+#include <linux/reboot.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
@@ -39,6 +41,9 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */
.num = 0,
};
+static atomic_t lgr_cnt; /* number of existing link groups */
+static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted);
+
static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
struct smc_buf_desc *buf_desc);
@@ -161,10 +166,10 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
* of the DELETE LINK sequence from server; or as server to
* initiate the delete processing. See smc_llc_rx_delete_link().
*/
-static int smc_link_send_delete(struct smc_link *lnk)
+static int smc_link_send_delete(struct smc_link *lnk, bool orderly)
{
if (lnk->state == SMC_LNK_ACTIVE &&
- !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) {
+ !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) {
smc_llc_link_deleting(lnk);
return 0;
}
@@ -201,7 +206,7 @@ static void smc_lgr_free_work(struct work_struct *work)
if (!lgr->is_smcd && !lgr->terminating) {
/* try to send del link msg, on error free lgr immediately */
if (lnk->state == SMC_LNK_ACTIVE &&
- !smc_link_send_delete(lnk)) {
+ !smc_link_send_delete(lnk, true)) {
/* reschedule in case we never receive a response */
smc_lgr_schedule_free_work(lgr);
spin_unlock_bh(lgr_lock);
@@ -214,7 +219,7 @@ static void smc_lgr_free_work(struct work_struct *work)
if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
smc_llc_link_inactive(lnk);
- if (lgr->is_smcd)
+ if (lgr->is_smcd && !lgr->terminating)
smc_ism_signal_shutdown(lgr);
smc_lgr_free(lgr);
}
@@ -224,7 +229,7 @@ static void smc_lgr_terminate_work(struct work_struct *work)
struct smc_link_group *lgr = container_of(work, struct smc_link_group,
terminate_work);
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(lgr, true);
}
/* create a new SMC link group */
@@ -275,6 +280,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
lgr->smcd = ini->ism_dev;
lgr_list = &ini->ism_dev->lgr_list;
lgr_lock = &lgr->smcd->lgr_lock;
+ lgr->peer_shutdown = 0;
+ atomic_inc(&ini->ism_dev->lgr_cnt);
} else {
/* SMC-R specific settings */
get_device(&ini->ib_dev->ibdev->dev);
@@ -317,6 +324,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
rc = smc_wr_create_link(lnk);
if (rc)
goto destroy_qp;
+ atomic_inc(&lgr_cnt);
+ atomic_inc(&ini->ib_dev->lnk_cnt);
}
smc->conn.lgr = lgr;
spin_lock_bh(lgr_lock);
@@ -380,7 +389,8 @@ void smc_conn_free(struct smc_connection *conn)
if (!lgr)
return;
if (lgr->is_smcd) {
- smc_ism_unset_conn(conn);
+ if (!list_empty(&lgr->list))
+ smc_ism_unset_conn(conn);
tasklet_kill(&conn->rx_tsklet);
} else {
smc_cdc_tx_dismiss_slots(conn);
@@ -403,6 +413,8 @@ static void smc_link_clear(struct smc_link *lnk)
smc_ib_destroy_queue_pair(lnk);
smc_ib_dealloc_protection_domain(lnk);
smc_wr_free_link_mem(lnk);
+ if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt))
+ wake_up(&lnk->smcibdev->lnks_deleted);
}
static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
@@ -480,11 +492,17 @@ static void smc_lgr_free(struct smc_link_group *lgr)
{
smc_lgr_free_bufs(lgr);
if (lgr->is_smcd) {
- smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
- put_device(&lgr->smcd->dev);
+ if (!lgr->terminating) {
+ smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+ put_device(&lgr->smcd->dev);
+ }
+ if (!atomic_dec_return(&lgr->smcd->lgr_cnt))
+ wake_up(&lgr->smcd->lgrs_deleted);
} else {
smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev);
+ if (!atomic_dec_return(&lgr_cnt))
+ wake_up(&lgrs_deleted);
}
kfree(lgr);
}
@@ -502,6 +520,20 @@ void smc_lgr_forget(struct smc_link_group *lgr)
spin_unlock_bh(lgr_lock);
}
+/* Unregister every buffer found on the rmbs lists of an SMC-D link group. */
+static void smcd_unregister_all_dmbs(struct smc_link_group *lgr)
+{
+	struct smc_buf_desc *rmb;
+	int idx;
+
+	for (idx = 0; idx < SMC_RMBE_SIZES; idx++) {
+		list_for_each_entry(rmb, &lgr->rmbs[idx], list) {
+			/* grow the recorded length by the size of the CDC
+			 * message before handing the buffer to unregister
+			 */
+			rmb->len += sizeof(struct smcd_cdc_msg);
+			smc_ism_unregister_dmb(lgr->smcd, rmb);
+		}
+	}
+}
+
static void smc_sk_wake_ups(struct smc_sock *smc)
{
smc->sk.sk_write_space(&smc->sk);
@@ -510,20 +542,50 @@ static void smc_sk_wake_ups(struct smc_sock *smc)
}
/* kill a connection */
-static void smc_conn_kill(struct smc_connection *conn)
+static void smc_conn_kill(struct smc_connection *conn, bool soft)
{
struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
- smc_close_abort(conn);
+ if (conn->lgr->is_smcd && conn->lgr->peer_shutdown)
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ else
+ smc_close_abort(conn);
conn->killed = 1;
+ smc->sk.sk_err = ECONNABORTED;
smc_sk_wake_ups(smc);
+ if (conn->lgr->is_smcd) {
+ smc_ism_unset_conn(conn);
+ if (soft)
+ tasklet_kill(&conn->rx_tsklet);
+ else
+ tasklet_unlock_wait(&conn->rx_tsklet);
+ } else {
+ smc_cdc_tx_dismiss_slots(conn);
+ }
smc_lgr_unregister_conn(conn);
- smc->sk.sk_err = ECONNABORTED;
smc_close_active_abort(smc);
}
+/* Transport specific cleanup of a link group.
+ *
+ * SMC-D: signal shutdown, unregister all DMBs and drop the vlan and device
+ * references.
+ * SMC-R: wake up waiters on wr_reg_wait and, unless the link is already
+ * inactive, send a (not orderly) delete link request and set the link
+ * inactive.
+ */
+static void smc_lgr_cleanup(struct smc_link_group *lgr)
+{
+	struct smc_link *lnk;
+
+	if (lgr->is_smcd) {
+		smc_ism_signal_shutdown(lgr);
+		smcd_unregister_all_dmbs(lgr);
+		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+		put_device(&lgr->smcd->dev);
+		return;
+	}
+
+	lnk = &lgr->lnk[SMC_SINGLE_LINK];
+	wake_up(&lnk->wr_reg_wait);
+	if (lnk->state == SMC_LNK_INACTIVE)
+		return;
+
+	smc_link_send_delete(lnk, false);
+	smc_llc_link_inactive(lnk);
+}
+
/* terminate link group */
-static void __smc_lgr_terminate(struct smc_link_group *lgr)
+static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
{
struct smc_connection *conn;
struct smc_sock *smc;
@@ -531,6 +593,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr)
if (lgr->terminating)
return; /* lgr already terminating */
+ if (!soft)
+ cancel_delayed_work_sync(&lgr->free_work);
lgr->terminating = 1;
if (!lgr->is_smcd)
smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
@@ -544,20 +608,25 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr)
smc = container_of(conn, struct smc_sock, conn);
sock_hold(&smc->sk); /* sock_put below */
lock_sock(&smc->sk);
- smc_conn_kill(conn);
+ smc_conn_kill(conn, soft);
release_sock(&smc->sk);
sock_put(&smc->sk); /* sock_hold above */
read_lock_bh(&lgr->conns_lock);
node = rb_first(&lgr->conns_all);
}
read_unlock_bh(&lgr->conns_lock);
- if (!lgr->is_smcd)
- wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
- smc_lgr_schedule_free_work_fast(lgr);
+ smc_lgr_cleanup(lgr);
+ if (soft)
+ smc_lgr_schedule_free_work_fast(lgr);
+ else
+ smc_lgr_free(lgr);
}
-/* unlink and terminate link group */
-void smc_lgr_terminate(struct smc_link_group *lgr)
+/* unlink and terminate link group
+ * @soft: true if link group shutdown can take its time
+ * false if immediate link group shutdown is required
+ */
+void smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
{
spinlock_t *lgr_lock;
@@ -567,9 +636,11 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
spin_unlock_bh(lgr_lock);
return; /* lgr already terminating */
}
+ if (!soft)
+ lgr->freeing = 1;
list_del_init(&lgr->list);
spin_unlock_bh(lgr_lock);
- __smc_lgr_terminate(lgr);
+ __smc_lgr_terminate(lgr, soft);
}
/* Called when IB port is terminated */
@@ -582,18 +653,20 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
if (!lgr->is_smcd &&
lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
- lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
+ lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) {
list_move(&lgr->list, &lgr_free_list);
+ lgr->freeing = 1;
+ }
}
spin_unlock_bh(&smc_lgr_list.lock);
list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
list_del_init(&lgr->list);
- __smc_lgr_terminate(lgr);
+ __smc_lgr_terminate(lgr, false);
}
}
-/* Called when SMC-D device is terminated or peer is lost */
+/* Called when peer lgr shutdown (regularly or abnormally) is received */
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
{
struct smc_link_group *lgr, *l;
@@ -604,6 +677,8 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) {
if ((!peer_gid || lgr->peer_gid == peer_gid) &&
(vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
+ if (peer_gid) /* peer triggered termination */
+ lgr->peer_shutdown = 1;
list_move(&lgr->list, &lgr_free_list);
}
}
@@ -612,11 +687,67 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
/* cancel the regular free workers and actually free lgrs */
list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
list_del_init(&lgr->list);
- __smc_lgr_terminate(lgr);
- cancel_delayed_work_sync(&lgr->free_work);
- if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */
- smc_ism_signal_shutdown(lgr);
- smc_lgr_free(lgr);
+ schedule_work(&lgr->terminate_work);
+ }
+}
+
+/* Called when an SMCD device is removed or the smc module is unloaded.
+ * Terminates all link groups on the lgr_list of @smcd immediately
+ * (__smc_lgr_terminate() with soft == false) and then waits until the
+ * lgr_cnt of the device has dropped to zero.
+ */
+void smc_smcd_terminate_all(struct smcd_dev *smcd)
+{
+ struct smc_link_group *lgr, *lg;
+ LIST_HEAD(lgr_free_list);
+
+ /* move all link groups to a private list and flag them as freeing
+ * while lgr_lock is held
+ */
+ spin_lock_bh(&smcd->lgr_lock);
+ list_splice_init(&smcd->lgr_list, &lgr_free_list);
+ list_for_each_entry(lgr, &lgr_free_list, list)
+ lgr->freeing = 1;
+ spin_unlock_bh(&smcd->lgr_lock);
+
+ /* terminate the collected link groups without holding lgr_lock */
+ list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ __smc_lgr_terminate(lgr, false);
+ }
+
+ /* wait on lgrs_deleted only if link groups of this device still exist */
+ if (atomic_read(&smcd->lgr_cnt))
+ wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt));
+}
+
+/* Called when an SMCR device is removed or the smc module is unloaded.
+ * If smcibdev is given, all SMCR link groups using this device are terminated.
+ * If smcibdev is NULL, all SMCR link groups are terminated.
+ * Termination is immediate (__smc_lgr_terminate() with soft == false);
+ * afterwards the function waits for lnk_cnt of the given device, or for the
+ * global lgr_cnt if no device was given, to reach zero.
+ */
+void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
+{
+ struct smc_link_group *lgr, *lg;
+ LIST_HEAD(lgr_free_list);
+
+ /* collect the affected link groups on a private list and flag them as
+ * freeing while the global link group list lock is held
+ */
+ spin_lock_bh(&smc_lgr_list.lock);
+ if (!smcibdev) {
+ list_splice_init(&smc_lgr_list.list, &lgr_free_list);
+ list_for_each_entry(lgr, &lgr_free_list, list)
+ lgr->freeing = 1;
+ } else {
+ list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
+ if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) {
+ list_move(&lgr->list, &lgr_free_list);
+ lgr->freeing = 1;
+ }
+ }
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+
+ /* terminate the collected link groups without holding the list lock */
+ list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ __smc_lgr_terminate(lgr, false);
+ }
+
+ /* wait only if the respective counter is still non-zero */
+ if (smcibdev) {
+ if (atomic_read(&smcibdev->lnk_cnt))
+ wait_event(smcibdev->lnks_deleted,
+ !atomic_read(&smcibdev->lnk_cnt));
+ } else {
+ if (atomic_read(&lgr_cnt))
+ wait_event(lgrs_deleted, !atomic_read(&lgr_cnt));
}
}
@@ -1137,37 +1268,42 @@ static void smc_core_going_away(void)
spin_unlock(&smcd_dev_list.lock);
}
-/* Called (from smc_exit) when module is removed */
-void smc_core_exit(void)
+/* Clean up all SMC link groups */
+static void smc_lgrs_shutdown(void)
{
- struct smc_link_group *lgr, *lg;
- LIST_HEAD(lgr_freeing_list);
struct smcd_dev *smcd;
smc_core_going_away();
- spin_lock_bh(&smc_lgr_list.lock);
- list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
- spin_unlock_bh(&smc_lgr_list.lock);
+ smc_smcr_terminate_all(NULL);
spin_lock(&smcd_dev_list.lock);
list_for_each_entry(smcd, &smcd_dev_list.list, list)
- list_splice_init(&smcd->lgr_list, &lgr_freeing_list);
+ smc_smcd_terminate_all(smcd);
spin_unlock(&smcd_dev_list.lock);
+}
- list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
- list_del_init(&lgr->list);
- if (!lgr->is_smcd) {
- struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+static int smc_core_reboot_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ smc_lgrs_shutdown();
- if (lnk->state == SMC_LNK_ACTIVE)
- smc_llc_send_delete_link(lnk, SMC_LLC_REQ,
- false);
- smc_llc_link_inactive(lnk);
- }
- cancel_delayed_work_sync(&lgr->free_work);
- if (lgr->is_smcd)
- smc_ism_signal_shutdown(lgr);
- smc_lgr_free(lgr); /* free link group */
- }
+ return 0;
+}
+
+/* reboot notifier; its callback smc_core_reboot_event() shuts down all
+ * SMC link groups
+ */
+static struct notifier_block smc_reboot_notifier = {
+ .notifier_call = smc_core_reboot_event,
+};
+
+/* Reset the global link group counter and register the SMC reboot notifier.
+ * Returns the result of register_reboot_notifier().
+ */
+int __init smc_core_init(void)
+{
+	int rc;
+
+	atomic_set(&lgr_cnt, 0);
+	rc = register_reboot_notifier(&smc_reboot_notifier);
+
+	return rc;
+}
+
+/* Called (from smc_exit) when module is removed */
+void smc_core_exit(void)
+{
+ /* remove the reboot notifier first, then clean up all link groups */
+ unregister_reboot_notifier(&smc_reboot_notifier);
+ smc_lgrs_shutdown();
}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index e6fd1ed42064..c472e12951d1 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -228,6 +228,8 @@ struct smc_link_group {
/* Peer GID (remote) */
struct smcd_dev *smcd;
/* ISM device for VLAN reg. */
+ u8 peer_shutdown : 1;
+ /* peer triggered shutdown */
};
};
};
@@ -285,7 +287,7 @@ static inline struct smc_connection *smc_lgr_find_conn(
static inline void smc_lgr_terminate_sched(struct smc_link_group *lgr)
{
- if (!lgr->terminating)
+ if (!lgr->terminating && !lgr->freeing)
schedule_work(&lgr->terminate_work);
}
@@ -294,10 +296,12 @@ struct smc_clc_msg_accept_confirm;
struct smc_clc_msg_local;
void smc_lgr_forget(struct smc_link_group *lgr);
-void smc_lgr_terminate(struct smc_link_group *lgr);
+void smc_lgr_terminate(struct smc_link_group *lgr, bool soft);
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
unsigned short vlan);
+void smc_smcd_terminate_all(struct smcd_dev *dev);
+void smc_smcr_terminate_all(struct smc_ib_device *smcibdev);
int smc_buf_create(struct smc_sock *smc, bool is_smcd);
int smc_uncompress_bufsize(u8 compressed);
int smc_rmb_rtoken_handling(struct smc_connection *conn,
@@ -314,6 +318,7 @@ void smc_conn_free(struct smc_connection *conn);
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini);
void smcd_conn_free(struct smc_connection *conn);
void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
+int smc_core_init(void);
void smc_core_exit(void);
static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index af05daeb0538..548632621f4b 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -15,6 +15,7 @@
#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
+#include <linux/wait.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>
@@ -520,9 +521,9 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
if (!smcibdev->initialized)
return;
smcibdev->initialized = 0;
- smc_wr_remove_dev(smcibdev);
ib_destroy_cq(smcibdev->roce_cq_recv);
ib_destroy_cq(smcibdev->roce_cq_send);
+ smc_wr_remove_dev(smcibdev);
}
static struct ib_client smc_ib_client;
@@ -543,7 +544,8 @@ static void smc_ib_add_dev(struct ib_device *ibdev)
smcibdev->ibdev = ibdev;
INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
-
+ atomic_set(&smcibdev->lnk_cnt, 0);
+ init_waitqueue_head(&smcibdev->lnks_deleted);
spin_lock(&smc_ib_devices.lock);
list_add_tail(&smcibdev->list, &smc_ib_devices.list);
spin_unlock(&smc_ib_devices.lock);
@@ -565,7 +567,7 @@ static void smc_ib_add_dev(struct ib_device *ibdev)
schedule_work(&smcibdev->port_event_work);
}
-/* callback function for ib_register_client() */
+/* callback function for ib_unregister_client() */
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
struct smc_ib_device *smcibdev;
@@ -575,6 +577,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
spin_lock(&smc_ib_devices.lock);
list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
spin_unlock(&smc_ib_devices.lock);
+ smc_smcr_terminate_all(smcibdev);
smc_ib_cleanup_per_ibdev(smcibdev);
ib_unregister_event_handler(&smcibdev->event_handler);
kfree(smcibdev);
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index 6a0069db6cae..255db87547d3 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/if_ether.h>
+#include <linux/wait.h>
#include <rdma/ib_verbs.h>
#include <net/smc.h>
@@ -48,6 +49,8 @@ struct smc_ib_device { /* ib-device infos for smc */
struct work_struct port_event_work;
unsigned long port_event_mask;
DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS);
+ atomic_t lnk_cnt; /* number of links on ibdev */
+ wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/
};
struct smc_buf_desc;
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index ee7340898cb4..5c4727d5066e 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -146,6 +146,10 @@ out:
int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
{
struct smcd_dmb dmb;
+ int rc = 0;
+
+ if (!dmb_desc->dma_addr)
+ return rc;
memset(&dmb, 0, sizeof(dmb));
dmb.dmb_tok = dmb_desc->token;
@@ -153,7 +157,13 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
dmb.cpu_addr = dmb_desc->cpu_addr;
dmb.dma_addr = dmb_desc->dma_addr;
dmb.dmb_len = dmb_desc->len;
- return smcd->ops->unregister_dmb(smcd, &dmb);
+ rc = smcd->ops->unregister_dmb(smcd, &dmb);
+ if (!rc || rc == ISM_ERROR) {
+ dmb_desc->cpu_addr = NULL;
+ dmb_desc->dma_addr = 0;
+ }
+
+ return rc;
}
int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
@@ -226,6 +236,9 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr)
int rc;
union smcd_sw_event_info ev_info;
+ if (lgr->peer_shutdown)
+ return 0;
+
memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE);
ev_info.vlan_id = lgr->vlan_id;
ev_info.code = ISM_EVENT_REQUEST;
@@ -289,6 +302,7 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
spin_lock_init(&smcd->lgr_lock);
INIT_LIST_HEAD(&smcd->vlan);
INIT_LIST_HEAD(&smcd->lgr_list);
+ init_waitqueue_head(&smcd->lgrs_deleted);
smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
WQ_MEM_RECLAIM, name);
if (!smcd->event_wq) {
@@ -313,12 +327,12 @@ EXPORT_SYMBOL_GPL(smcd_register_dev);
void smcd_unregister_dev(struct smcd_dev *smcd)
{
spin_lock(&smcd_dev_list.lock);
- list_del(&smcd->list);
+ list_del_init(&smcd->list);
spin_unlock(&smcd_dev_list.lock);
smcd->going_away = 1;
+ smc_smcd_terminate_all(smcd);
flush_workqueue(smcd->event_wq);
destroy_workqueue(smcd->event_wq);
- smc_smcd_terminate(smcd, 0, VLAN_VID_MASK);
device_del(&smcd->dev);
}
@@ -372,7 +386,7 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno)
spin_lock_irqsave(&smcd->lock, flags);
conn = smcd->conn[dmbno];
- if (conn)
+ if (conn && !conn->killed)
tasklet_schedule(&conn->rx_tsklet);
spin_unlock_irqrestore(&smcd->lock, flags);
}
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index e1918ffaf125..a9f6431dd69a 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -614,7 +614,7 @@ static void smc_llc_testlink_work(struct work_struct *work)
rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
SMC_LLC_WAIT_TIME);
if (rc <= 0) {
- smc_lgr_terminate(smc_get_lgr(link));
+ smc_lgr_terminate(smc_get_lgr(link), true);
return;
}
next_interval = link->llc_testlink_time;
@@ -656,6 +656,7 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time)
void smc_llc_link_deleting(struct smc_link *link)
{
link->state = SMC_LNK_DELETING;
+ smc_wr_wakeup_tx_wait(link);
}
/* called in tasklet context */
@@ -663,6 +664,8 @@ void smc_llc_link_inactive(struct smc_link *link)
{
link->state = SMC_LNK_INACTIVE;
cancel_delayed_work(&link->llc_testlink_wrk);
+ smc_wr_wakeup_reg_wait(link);
+ smc_wr_wakeup_tx_wait(link);
}
/* called in worker context */
@@ -695,9 +698,11 @@ int smc_llc_do_confirm_rkey(struct smc_link *link,
int smc_llc_do_delete_rkey(struct smc_link *link,
struct smc_buf_desc *rmb_desc)
{
- int rc;
+ int rc = 0;
mutex_lock(&link->llc_delete_rkey_mutex);
+ if (link->state != SMC_LNK_ACTIVE)
+ goto out;
reinit_completion(&link->llc_delete_rkey);
rc = smc_llc_send_delete_rkey(link, rmb_desc);
if (rc)
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 824f096ee7de..0d42e7716b91 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -284,7 +284,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL);
if (rc)
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(lgr, true);
return rc;
}
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 50743dc56c86..337ee52ad3d3 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -50,6 +50,26 @@ struct smc_wr_tx_pend { /* control data for a pending send request */
/*------------------------------- completion --------------------------------*/
+/* returns true if at least one tx work request is pending on the given link */
+static inline bool smc_wr_is_tx_pend(struct smc_link *link)
+{
+ if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) !=
+ link->wr_tx_cnt) {
+ return true;
+ }
+ return false;
+}
+
+/* wait till all pending tx work requests on the given link are completed */
+static inline int smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
+{
+ if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link),
+ SMC_WR_TX_WAIT_PENDING_TIME))
+ return 0;
+ else /* timeout */
+ return -EPIPE;
+}
+
static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
{
u32 i;
@@ -75,7 +95,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
link->wr_reg_state = FAILED;
else
link->wr_reg_state = CONFIRMED;
- wake_up(&link->wr_reg_wait);
+ smc_wr_wakeup_reg_wait(link);
return;
}
@@ -171,6 +191,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
struct smc_rdma_wr **wr_rdma_buf,
struct smc_wr_tx_pend_priv **wr_pend_priv)
{
+ struct smc_link_group *lgr = smc_get_lgr(link);
struct smc_wr_tx_pend *wr_pend;
u32 idx = link->wr_tx_cnt;
struct ib_send_wr *wr_ib;
@@ -179,19 +200,20 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
*wr_buf = NULL;
*wr_pend_priv = NULL;
- if (in_softirq()) {
+ if (in_softirq() || lgr->terminating) {
rc = smc_wr_tx_get_free_slot_index(link, &idx);
if (rc)
return rc;
} else {
- rc = wait_event_timeout(
+ rc = wait_event_interruptible_timeout(
link->wr_tx_wait,
link->state == SMC_LNK_INACTIVE ||
+ lgr->terminating ||
(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
SMC_WR_TX_WAIT_FREE_SLOT_TIME);
if (!rc) {
/* timeout - terminate connections */
- smc_lgr_terminate_sched(smc_get_lgr(link));
+ smc_lgr_terminate_sched(lgr);
return -EPIPE;
}
if (idx == link->wr_tx_cnt)
@@ -227,6 +249,7 @@ int smc_wr_tx_put_slot(struct smc_link *link,
memset(&link->wr_tx_bufs[idx], 0,
sizeof(link->wr_tx_bufs[idx]));
test_and_clear_bit(idx, link->wr_tx_mask);
+ wake_up(&link->wr_tx_wait);
return 1;
}
@@ -510,8 +533,10 @@ void smc_wr_free_link(struct smc_link *lnk)
{
struct ib_device *ibdev;
- memset(lnk->wr_tx_mask, 0,
- BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+ if (smc_wr_tx_wait_no_pending_sends(lnk))
+ memset(lnk->wr_tx_mask, 0,
+ BITS_TO_LONGS(SMC_WR_BUF_CNT) *
+ sizeof(*lnk->wr_tx_mask));
if (!lnk->smcibdev)
return;
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 09bf32fd3959..3ac99c898418 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -60,6 +60,16 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
atomic_long_set(wr_tx_id, val);
}
+static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk)
+{
+ wake_up_all(&lnk->wr_tx_wait);
+}
+
+static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk)
+{
+ wake_up(&lnk->wr_reg_wait);
+}
+
/* post a new receive work request to fill a completed old work request entry */
static inline int smc_wr_rx_post(struct smc_link *link)
{
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index f41096a759fa..55aeba681cf4 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -87,9 +87,9 @@ int tipc_bcast_get_mtu(struct net *net)
return tipc_link_mss(tipc_bc_sndlink(net));
}
-void tipc_bcast_disable_rcast(struct net *net)
+void tipc_bcast_toggle_rcast(struct net *net, bool supp)
{
- tipc_bc_base(net)->rcast_support = false;
+ tipc_bc_base(net)->rcast_support = supp;
}
static void tipc_bcbase_calc_bc_threshold(struct net *net)
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index dadad953e2be..9e847d9617d3 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -85,7 +85,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl);
void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id);
void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id);
int tipc_bcast_get_mtu(struct net *net);
-void tipc_bcast_disable_rcast(struct net *net);
+void tipc_bcast_toggle_rcast(struct net *net, bool supp);
int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
struct tipc_mc_method *method, struct tipc_nlist *dests,
u16 *cong_link_cnt);
diff --git a/net/tipc/core.c b/net/tipc/core.c
index fc01a13d7462..7532a00ac73d 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -34,8 +34,6 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include "core.h"
#include "name_table.h"
#include "subscr.h"
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 775848a5f27e..631d83c9705f 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -61,6 +61,12 @@
#include <net/genetlink.h>
#include <net/netns/hash.h>
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
struct tipc_node;
struct tipc_bearer;
struct tipc_bc_base;
diff --git a/net/tipc/link.c b/net/tipc/link.c
index fb72031228c9..24d4d10756d3 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -550,7 +550,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
/* Disable replicast if even a single peer doesn't support it */
if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST))
- tipc_bcast_disable_rcast(net);
+ tipc_bcast_toggle_rcast(net, false);
return true;
}
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 66a65c2cdb23..92d04dc2a44b 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -35,6 +35,7 @@
*/
#include <net/sock.h>
+#include <linux/list_sort.h>
#include "core.h"
#include "netlink.h"
#include "name_table.h"
@@ -66,6 +67,7 @@ struct service_range {
/**
* struct tipc_service - container for all published instances of a service type
* @type: 32 bit 'type' value for service
+ * @publ_cnt: increasing counter for publications in this service
* @ranges: rb tree containing all service ranges for this service
* @service_list: links to adjacent name ranges in hash chain
* @subscriptions: list of subscriptions for this service type
@@ -74,6 +76,7 @@ struct service_range {
*/
struct tipc_service {
u32 type;
+ u32 publ_cnt;
struct rb_root ranges;
struct hlist_node service_list;
struct list_head subscriptions;
@@ -109,6 +112,7 @@ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper,
INIT_LIST_HEAD(&publ->binding_node);
INIT_LIST_HEAD(&publ->local_publ);
INIT_LIST_HEAD(&publ->all_publ);
+ INIT_LIST_HEAD(&publ->list);
return publ;
}
@@ -244,6 +248,8 @@ static struct publication *tipc_service_insert_publ(struct net *net,
p = tipc_publ_create(type, lower, upper, scope, node, port, key);
if (!p)
goto err;
+ /* Assume the ids of coexisting publications never differ by >INT_MAX */
+ p->id = sc->publ_cnt++;
if (in_own_node(net, node))
list_add(&p->local_publ, &sr->local_publ);
list_add(&p->all_publ, &sr->all_publ);
@@ -278,6 +284,20 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr,
}
/**
+ * Reuse time_after32() to order publications by their 32-bit id
+ */
+#define publication_after(pa, pb) time_after32((pa)->id, (pb)->id)
+static int tipc_publ_sort(void *priv, struct list_head *a,
+ struct list_head *b)
+{
+ struct publication *pa, *pb;
+
+ pa = container_of(a, struct publication, list);
+ pb = container_of(b, struct publication, list);
+ return publication_after(pa, pb);
+}
+
+/**
* tipc_service_subscribe - attach a subscription, and optionally
* issue the prescribed number of events if there is any service
* range overlapping with the requested range
@@ -286,36 +306,51 @@ static void tipc_service_subscribe(struct tipc_service *service,
struct tipc_subscription *sub)
{
struct tipc_subscr *sb = &sub->evt.s;
+ struct publication *p, *first, *tmp;
+ struct list_head publ_list;
struct service_range *sr;
struct tipc_name_seq ns;
- struct publication *p;
struct rb_node *n;
- bool first;
+ u32 filter;
ns.type = tipc_sub_read(sb, seq.type);
ns.lower = tipc_sub_read(sb, seq.lower);
ns.upper = tipc_sub_read(sb, seq.upper);
+ filter = tipc_sub_read(sb, filter);
tipc_sub_get(sub);
list_add(&sub->service_list, &service->subscriptions);
- if (tipc_sub_read(sb, filter) & TIPC_SUB_NO_STATUS)
+ if (filter & TIPC_SUB_NO_STATUS)
return;
+ INIT_LIST_HEAD(&publ_list);
for (n = rb_first(&service->ranges); n; n = rb_next(n)) {
sr = container_of(n, struct service_range, tree_node);
if (sr->lower > ns.upper)
break;
if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper))
continue;
- first = true;
+ first = NULL;
list_for_each_entry(p, &sr->all_publ, all_publ) {
- tipc_sub_report_overlap(sub, sr->lower, sr->upper,
- TIPC_PUBLISHED, p->port,
- p->node, p->scope, first);
- first = false;
+ if (filter & TIPC_SUB_PORTS)
+ list_add_tail(&p->list, &publ_list);
+ else if (!first || publication_after(first, p))
+ /* Pick this range's *first* publication */
+ first = p;
}
+ if (first)
+ list_add_tail(&first->list, &publ_list);
+ }
+
+ /* Sort the publications before reporting */
+ list_sort(NULL, &publ_list, tipc_publ_sort);
+ list_for_each_entry_safe(p, tmp, &publ_list, list) {
+ tipc_sub_report_overlap(sub, p->lower, p->upper,
+ TIPC_PUBLISHED, p->port, p->node,
+ p->scope, true);
+ list_del_init(&p->list);
}
}
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index f79066334cc8..728bc7016c38 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -58,6 +58,7 @@ struct tipc_group;
* @node: network address of publishing socket's node
* @port: publishing port
* @key: publication key, unique across the cluster
+ * @id: publication id
* @binding_node: all publications from the same node which bound this one
* - Remote publications: in node->publ_list
* Used by node/name distr to withdraw publications when node is lost
@@ -69,6 +70,7 @@ struct tipc_group;
* Used by closest_first and multicast receive lookup algorithms
* @all_publ: all publications identical to this one, whatever node and scope
* Used by round-robin lookup algorithm
+ * @list: to form a list of publications in temporal order
* @rcu: RCU callback head used for deferred freeing
*/
struct publication {
@@ -79,10 +81,12 @@ struct publication {
u32 node;
u32 port;
u32 key;
+ u32 id;
struct list_head binding_node;
struct list_head binding_sock;
struct list_head local_publ;
struct list_head all_publ;
+ struct list_head list;
struct rcu_head rcu;
};
diff --git a/net/tipc/node.c b/net/tipc/node.c
index aaf595613e6e..ab04e00cb95b 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -496,6 +496,9 @@ update:
tn->capabilities &= temp_node->capabilities;
}
+ tipc_bcast_toggle_rcast(net,
+ (tn->capabilities & TIPC_BCAST_RCAST));
+
goto exit;
}
n = kzalloc(sizeof(*n), GFP_ATOMIC);
@@ -557,6 +560,7 @@ update:
list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
tn->capabilities &= temp_node->capabilities;
}
+ tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST));
trace_tipc_node_create(n, true, " ");
exit:
spin_unlock_bh(&tn->node_list_lock);
@@ -740,7 +744,8 @@ static bool tipc_node_cleanup(struct tipc_node *peer)
list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
tn->capabilities &= temp_node->capabilities;
}
-
+ tipc_bcast_toggle_rcast(peer->net,
+ (tn->capabilities & TIPC_BCAST_RCAST));
spin_unlock_bh(&tn->node_list_lock);
return deleted;
}
@@ -2198,6 +2203,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
tn->capabilities &= temp_node->capabilities;
}
+ tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST));
err = 0;
err_out:
tipc_node_put(peer);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 5d7859aac78e..a1c8d722ca20 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2880,7 +2880,7 @@ static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid)
struct tipc_sock *tsk;
rcu_read_lock();
- tsk = rhashtable_lookup_fast(&tn->sk_rht, &portid, tsk_rht_params);
+ tsk = rhashtable_lookup(&tn->sk_rht, &portid, tsk_rht_params);
if (tsk)
sock_hold(&tsk->sk);
rcu_read_unlock();
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 5bb93dd5762b..bdca31ffe6da 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -861,6 +861,7 @@ static int __init tls_register(void)
tls_sw_proto_ops = inet_stream_ops;
tls_sw_proto_ops.splice_read = tls_sw_splice_read;
+ tls_sw_proto_ops.sendpage_locked = tls_sw_sendpage_locked;
tls_device_init();
tcp_register_ulp(&tcp_tls_ulp_ops);
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index 83d9c80a684e..3a5dd1e07233 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -6,6 +6,7 @@
#include <net/snmp.h>
#include <net/tls.h>
+#ifdef CONFIG_PROC_FS
static const struct snmp_mib tls_mib_list[] = {
SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW),
SNMP_MIB_ITEM("TlsCurrRxSw", LINUX_MIB_TLSCURRRXSW),
@@ -32,6 +33,7 @@ static int tls_statistics_seq_show(struct seq_file *seq, void *v)
return 0;
}
+#endif
int __net_init tls_proc_init(struct net *net)
{
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 141da093ff04..da9f9ce51e7b 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1209,6 +1209,17 @@ sendpage_end:
return copied ? copied : ret;
}
+int tls_sw_sendpage_locked(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags)
+{
+ if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
+ MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY |
+ MSG_NO_SHARED_FRAGS))
+ return -ENOTSUPP;
+
+ return tls_sw_do_sendpage(sk, page, offset, size, flags);
+}
+
int tls_sw_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 1f4fde4711b6..74db4cd637a7 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -126,19 +126,18 @@ static struct proto vsock_proto = {
*/
#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
-static const struct vsock_transport *transport;
+#define VSOCK_DEFAULT_BUFFER_SIZE (1024 * 256)
+#define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256)
+#define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128
+
+/* Transport used for host->guest communication */
+static const struct vsock_transport *transport_h2g;
+/* Transport used for guest->host communication */
+static const struct vsock_transport *transport_g2h;
+/* Transport used for DGRAM communication */
+static const struct vsock_transport *transport_dgram;
static DEFINE_MUTEX(vsock_register_mutex);
-/**** EXPORTS ****/
-
-/* Get the ID of the local context. This is transport dependent. */
-
-int vm_sockets_get_local_cid(void)
-{
- return transport->get_local_cid();
-}
-EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid);
-
/**** UTILS ****/
/* Each bound VSocket is stored in the bind hash table and each connected
@@ -188,7 +187,7 @@ static int vsock_auto_bind(struct vsock_sock *vsk)
return __vsock_bind(sk, &local_addr);
}
-static int __init vsock_init_tables(void)
+static void vsock_init_tables(void)
{
int i;
@@ -197,7 +196,6 @@ static int __init vsock_init_tables(void)
for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++)
INIT_LIST_HEAD(&vsock_connected_table[i]);
- return 0;
}
static void __vsock_insert_bound(struct list_head *list,
@@ -230,9 +228,15 @@ static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
{
struct vsock_sock *vsk;
- list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table)
- if (addr->svm_port == vsk->local_addr.svm_port)
+ list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) {
+ if (vsock_addr_equals_addr(addr, &vsk->local_addr))
+ return sk_vsock(vsk);
+
+ if (addr->svm_port == vsk->local_addr.svm_port &&
+ (vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
+ addr->svm_cid == VMADDR_CID_ANY))
return sk_vsock(vsk);
+ }
return NULL;
}
@@ -382,6 +386,88 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
}
EXPORT_SYMBOL_GPL(vsock_enqueue_accept);
+static void vsock_deassign_transport(struct vsock_sock *vsk)
+{
+ if (!vsk->transport)
+ return;
+
+ vsk->transport->destruct(vsk);
+ module_put(vsk->transport->module);
+ vsk->transport = NULL;
+}
+
+/* Assign a transport to a socket and call the .init transport callback.
+ *
+ * Note: for stream socket this must be called when vsk->remote_addr is set
+ * (e.g. during the connect() or when a connection request on a listener
+ * socket is received).
+ * The vsk->remote_addr is used to decide which transport to use:
+ * - remote CID <= VMADDR_CID_HOST will use guest->host transport;
+ * - remote CID == local_cid (guest->host transport) will use guest->host
+ * transport for loopback (host->guest transports don't support loopback);
+ * - remote CID > VMADDR_CID_HOST will use host->guest transport;
+ */
+int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
+{
+ const struct vsock_transport *new_transport;
+ struct sock *sk = sk_vsock(vsk);
+ unsigned int remote_cid = vsk->remote_addr.svm_cid;
+ int ret;
+
+ switch (sk->sk_type) {
+ case SOCK_DGRAM:
+ new_transport = transport_dgram;
+ break;
+ case SOCK_STREAM:
+ if (remote_cid <= VMADDR_CID_HOST ||
+ (transport_g2h &&
+ remote_cid == transport_g2h->get_local_cid()))
+ new_transport = transport_g2h;
+ else
+ new_transport = transport_h2g;
+ break;
+ default:
+ return -ESOCKTNOSUPPORT;
+ }
+
+ if (vsk->transport) {
+ if (vsk->transport == new_transport)
+ return 0;
+
+ vsk->transport->release(vsk);
+ vsock_deassign_transport(vsk);
+ }
+
+ /* We increase the module refcnt to prevent the transport unloading
+ * while there are open sockets assigned to it.
+ */
+ if (!new_transport || !try_module_get(new_transport->module))
+ return -ENODEV;
+
+ ret = new_transport->init(vsk, psk);
+ if (ret) {
+ module_put(new_transport->module);
+ return ret;
+ }
+
+ vsk->transport = new_transport;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vsock_assign_transport);
+
+bool vsock_find_cid(unsigned int cid)
+{
+ if (transport_g2h && cid == transport_g2h->get_local_cid())
+ return true;
+
+ if (transport_h2g && cid == VMADDR_CID_HOST)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(vsock_find_cid);
+
static struct sock *vsock_dequeue_accept(struct sock *listener)
{
struct vsock_sock *vlistener;
@@ -418,7 +504,12 @@ static bool vsock_is_pending(struct sock *sk)
static int vsock_send_shutdown(struct sock *sk, int mode)
{
- return transport->shutdown(vsock_sk(sk), mode);
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ if (!vsk->transport)
+ return -ENODEV;
+
+ return vsk->transport->shutdown(vsk, mode);
}
static void vsock_pending_work(struct work_struct *work)
@@ -528,13 +619,12 @@ static int __vsock_bind_stream(struct vsock_sock *vsk,
static int __vsock_bind_dgram(struct vsock_sock *vsk,
struct sockaddr_vm *addr)
{
- return transport->dgram_bind(vsk, addr);
+ return vsk->transport->dgram_bind(vsk, addr);
}
static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
{
struct vsock_sock *vsk = vsock_sk(sk);
- u32 cid;
int retval;
/* First ensure this socket isn't already bound. */
@@ -544,10 +634,9 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
/* Now bind to the provided address or select appropriate values if
* none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that
* like AF_INET prevents binding to a non-local IP address (in most
- * cases), we only allow binding to the local CID.
+ * cases), we only allow binding to a local CID.
*/
- cid = transport->get_local_cid();
- if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY)
+ if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid))
return -EADDRNOTAVAIL;
switch (sk->sk_socket->type) {
@@ -571,12 +660,12 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
static void vsock_connect_timeout(struct work_struct *work);
-struct sock *__vsock_create(struct net *net,
- struct socket *sock,
- struct sock *parent,
- gfp_t priority,
- unsigned short type,
- int kern)
+static struct sock *__vsock_create(struct net *net,
+ struct socket *sock,
+ struct sock *parent,
+ gfp_t priority,
+ unsigned short type,
+ int kern)
{
struct sock *sk;
struct vsock_sock *psk;
@@ -620,23 +709,20 @@ struct sock *__vsock_create(struct net *net,
vsk->trusted = psk->trusted;
vsk->owner = get_cred(psk->owner);
vsk->connect_timeout = psk->connect_timeout;
+ vsk->buffer_size = psk->buffer_size;
+ vsk->buffer_min_size = psk->buffer_min_size;
+ vsk->buffer_max_size = psk->buffer_max_size;
} else {
vsk->trusted = capable(CAP_NET_ADMIN);
vsk->owner = get_current_cred();
vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
+ vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE;
+ vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE;
+ vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE;
}
- if (transport->init(vsk, psk) < 0) {
- sk_free(sk);
- return NULL;
- }
-
- if (sock)
- vsock_insert_unbound(vsk);
-
return sk;
}
-EXPORT_SYMBOL_GPL(__vsock_create);
static void __vsock_release(struct sock *sk, int level)
{
@@ -650,7 +736,10 @@ static void __vsock_release(struct sock *sk, int level)
/* The release call is supposed to use lock_sock_nested()
* rather than lock_sock(), if a sock lock should be acquired.
*/
- transport->release(vsk);
+ if (vsk->transport)
+ vsk->transport->release(vsk);
+ else if (sk->sk_type == SOCK_STREAM)
+ vsock_remove_sock(vsk);
/* When "level" is SINGLE_DEPTH_NESTING, use the nested
* version to avoid the warning "possible recursive locking
@@ -678,7 +767,7 @@ static void vsock_sk_destruct(struct sock *sk)
{
struct vsock_sock *vsk = vsock_sk(sk);
- transport->destruct(vsk);
+ vsock_deassign_transport(vsk);
/* When clearing these addresses, there's no need to set the family and
* possibly register the address family with the kernel.
@@ -700,15 +789,22 @@ static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return err;
}
+struct sock *vsock_create_connected(struct sock *parent)
+{
+ return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL,
+ parent->sk_type, 0);
+}
+EXPORT_SYMBOL_GPL(vsock_create_connected);
+
s64 vsock_stream_has_data(struct vsock_sock *vsk)
{
- return transport->stream_has_data(vsk);
+ return vsk->transport->stream_has_data(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_data);
s64 vsock_stream_has_space(struct vsock_sock *vsk)
{
- return transport->stream_has_space(vsk);
+ return vsk->transport->stream_has_space(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_space);
@@ -877,6 +973,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
} else if (sock->type == SOCK_STREAM) {
+ const struct vsock_transport *transport = vsk->transport;
lock_sock(sk);
/* Listening sockets that have connections in their accept
@@ -887,7 +984,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
mask |= EPOLLIN | EPOLLRDNORM;
/* If there is something in the queue then we can read. */
- if (transport->stream_is_active(vsk) &&
+ if (transport && transport->stream_is_active(vsk) &&
!(sk->sk_shutdown & RCV_SHUTDOWN)) {
bool data_ready_now = false;
int ret = transport->notify_poll_in(
@@ -952,6 +1049,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
struct sock *sk;
struct vsock_sock *vsk;
struct sockaddr_vm *remote_addr;
+ const struct vsock_transport *transport;
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
@@ -960,6 +1058,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
err = 0;
sk = sock->sk;
vsk = vsock_sk(sk);
+ transport = vsk->transport;
lock_sock(sk);
@@ -1044,8 +1143,8 @@ static int vsock_dgram_connect(struct socket *sock,
if (err)
goto out;
- if (!transport->dgram_allow(remote_addr->svm_cid,
- remote_addr->svm_port)) {
+ if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
+ remote_addr->svm_port)) {
err = -EINVAL;
goto out;
}
@@ -1061,7 +1160,9 @@ out:
static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
size_t len, int flags)
{
- return transport->dgram_dequeue(vsock_sk(sock->sk), msg, len, flags);
+ struct vsock_sock *vsk = vsock_sk(sock->sk);
+
+ return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
}
static const struct proto_ops vsock_dgram_ops = {
@@ -1087,6 +1188,8 @@ static const struct proto_ops vsock_dgram_ops = {
static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
{
+ const struct vsock_transport *transport = vsk->transport;
+
if (!transport->cancel_pkt)
return -EOPNOTSUPP;
@@ -1123,6 +1226,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
int err;
struct sock *sk;
struct vsock_sock *vsk;
+ const struct vsock_transport *transport;
struct sockaddr_vm *remote_addr;
long timeout;
DEFINE_WAIT(wait);
@@ -1157,19 +1261,26 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
goto out;
}
+ /* Set the remote address that we are connecting to. */
+ memcpy(&vsk->remote_addr, remote_addr,
+ sizeof(vsk->remote_addr));
+
+ err = vsock_assign_transport(vsk, NULL);
+ if (err)
+ goto out;
+
+ transport = vsk->transport;
+
/* The hypervisor and well-known contexts do not have socket
* endpoints.
*/
- if (!transport->stream_allow(remote_addr->svm_cid,
+ if (!transport ||
+ !transport->stream_allow(remote_addr->svm_cid,
remote_addr->svm_port)) {
err = -ENETUNREACH;
goto out;
}
- /* Set the remote address that we are connecting to. */
- memcpy(&vsk->remote_addr, remote_addr,
- sizeof(vsk->remote_addr));
-
err = vsock_auto_bind(vsk);
if (err)
goto out;
@@ -1364,6 +1475,23 @@ out:
return err;
}
+static void vsock_update_buffer_size(struct vsock_sock *vsk,
+ const struct vsock_transport *transport,
+ u64 val)
+{
+ if (val > vsk->buffer_max_size)
+ val = vsk->buffer_max_size;
+
+ if (val < vsk->buffer_min_size)
+ val = vsk->buffer_min_size;
+
+ if (val != vsk->buffer_size &&
+ transport && transport->notify_buffer_size)
+ transport->notify_buffer_size(vsk, &val);
+
+ vsk->buffer_size = val;
+}
+
static int vsock_stream_setsockopt(struct socket *sock,
int level,
int optname,
@@ -1373,6 +1501,7 @@ static int vsock_stream_setsockopt(struct socket *sock,
int err;
struct sock *sk;
struct vsock_sock *vsk;
+ const struct vsock_transport *transport;
u64 val;
if (level != AF_VSOCK)
@@ -1393,23 +1522,26 @@ static int vsock_stream_setsockopt(struct socket *sock,
err = 0;
sk = sock->sk;
vsk = vsock_sk(sk);
+ transport = vsk->transport;
lock_sock(sk);
switch (optname) {
case SO_VM_SOCKETS_BUFFER_SIZE:
COPY_IN(val);
- transport->set_buffer_size(vsk, val);
+ vsock_update_buffer_size(vsk, transport, val);
break;
case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
COPY_IN(val);
- transport->set_max_buffer_size(vsk, val);
+ vsk->buffer_max_size = val;
+ vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
break;
case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
COPY_IN(val);
- transport->set_min_buffer_size(vsk, val);
+ vsk->buffer_min_size = val;
+ vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
break;
case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
@@ -1476,17 +1608,17 @@ static int vsock_stream_getsockopt(struct socket *sock,
switch (optname) {
case SO_VM_SOCKETS_BUFFER_SIZE:
- val = transport->get_buffer_size(vsk);
+ val = vsk->buffer_size;
COPY_OUT(val);
break;
case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
- val = transport->get_max_buffer_size(vsk);
+ val = vsk->buffer_max_size;
COPY_OUT(val);
break;
case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
- val = transport->get_min_buffer_size(vsk);
+ val = vsk->buffer_min_size;
COPY_OUT(val);
break;
@@ -1517,6 +1649,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
{
struct sock *sk;
struct vsock_sock *vsk;
+ const struct vsock_transport *transport;
ssize_t total_written;
long timeout;
int err;
@@ -1525,6 +1658,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
sk = sock->sk;
vsk = vsock_sk(sk);
+ transport = vsk->transport;
total_written = 0;
err = 0;
@@ -1546,7 +1680,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
goto out;
}
- if (sk->sk_state != TCP_ESTABLISHED ||
+ if (!transport || sk->sk_state != TCP_ESTABLISHED ||
!vsock_addr_bound(&vsk->local_addr)) {
err = -ENOTCONN;
goto out;
@@ -1656,6 +1790,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
{
struct sock *sk;
struct vsock_sock *vsk;
+ const struct vsock_transport *transport;
int err;
size_t target;
ssize_t copied;
@@ -1666,11 +1801,12 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
sk = sock->sk;
vsk = vsock_sk(sk);
+ transport = vsk->transport;
err = 0;
lock_sock(sk);
- if (sk->sk_state != TCP_ESTABLISHED) {
+ if (!transport || sk->sk_state != TCP_ESTABLISHED) {
/* Recvmsg is supposed to return 0 if a peer performs an
* orderly shutdown. Differentiate between that case and when a
* peer has not connected or a local shutdown occured with the
@@ -1844,6 +1980,10 @@ static const struct proto_ops vsock_stream_ops = {
static int vsock_create(struct net *net, struct socket *sock,
int protocol, int kern)
{
+ struct vsock_sock *vsk;
+ struct sock *sk;
+ int ret;
+
if (!sock)
return -EINVAL;
@@ -1863,7 +2003,23 @@ static int vsock_create(struct net *net, struct socket *sock,
sock->state = SS_UNCONNECTED;
- return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM;
+ sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern);
+ if (!sk)
+ return -ENOMEM;
+
+ vsk = vsock_sk(sk);
+
+ if (sock->type == SOCK_DGRAM) {
+ ret = vsock_assign_transport(vsk, NULL);
+ if (ret < 0) {
+ sock_put(sk);
+ return ret;
+ }
+ }
+
+ vsock_insert_unbound(vsk);
+
+ return 0;
}
static const struct net_proto_family vsock_family_ops = {
@@ -1876,11 +2032,20 @@ static long vsock_dev_do_ioctl(struct file *filp,
unsigned int cmd, void __user *ptr)
{
u32 __user *p = ptr;
+ u32 cid = VMADDR_CID_ANY;
int retval = 0;
switch (cmd) {
case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
- if (put_user(transport->get_local_cid(), p) != 0)
+ /* To be compatible with the VMCI behavior, we prioritize the
+ * guest CID instead of well-known host CID (VMADDR_CID_HOST).
+ */
+ if (transport_g2h)
+ cid = transport_g2h->get_local_cid();
+ else if (transport_h2g)
+ cid = transport_h2g->get_local_cid();
+
+ if (put_user(cid, p) != 0)
retval = -EFAULT;
break;
@@ -1920,24 +2085,13 @@ static struct miscdevice vsock_device = {
.fops = &vsock_device_ops,
};
-int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
+static int __init vsock_init(void)
{
- int err = mutex_lock_interruptible(&vsock_register_mutex);
+ int err = 0;
- if (err)
- return err;
-
- if (transport) {
- err = -EBUSY;
- goto err_busy;
- }
-
- /* Transport must be the owner of the protocol so that it can't
- * unload while there are open sockets.
- */
- vsock_proto.owner = owner;
- transport = t;
+ vsock_init_tables();
+ vsock_proto.owner = THIS_MODULE;
vsock_device.minor = MISC_DYNAMIC_MINOR;
err = misc_register(&vsock_device);
if (err) {
@@ -1958,7 +2112,6 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
goto err_unregister_proto;
}
- mutex_unlock(&vsock_register_mutex);
return 0;
err_unregister_proto:
@@ -1966,44 +2119,86 @@ err_unregister_proto:
err_deregister_misc:
misc_deregister(&vsock_device);
err_reset_transport:
- transport = NULL;
-err_busy:
- mutex_unlock(&vsock_register_mutex);
return err;
}
-EXPORT_SYMBOL_GPL(__vsock_core_init);
-void vsock_core_exit(void)
+/* Module exit handler (passed to module_exit()): deregisters the vsock misc
+ * device, unregisters the AF_VSOCK socket family and unregisters vsock_proto.
+ * No transport pointer is reset here and vsock_register_mutex is not taken.
+ */
+static void __exit vsock_exit(void)
{
- mutex_lock(&vsock_register_mutex);
-
misc_deregister(&vsock_device);
sock_unregister(AF_VSOCK);
proto_unregister(&vsock_proto);
-
- /* We do not want the assignment below re-ordered. */
- mb();
- transport = NULL;
-
- mutex_unlock(&vsock_register_mutex);
}
-EXPORT_SYMBOL_GPL(vsock_core_exit);
-const struct vsock_transport *vsock_core_get_transport(void)
+/* Return the transport stored in @vsk (its ->transport field).  The lookup is
+ * per socket: the value comes from @vsk, not from a global transport pointer.
+ */
+const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk)
{
- /* vsock_register_mutex not taken since only the transport uses this
- * function and only while registered.
- */
- return transport;
+ return vsk->transport;
}
EXPORT_SYMBOL_GPL(vsock_core_get_transport);
-static void __exit vsock_exit(void)
+/* Register @t as the transport for every role selected in @features
+ * (VSOCK_TRANSPORT_F_H2G, VSOCK_TRANSPORT_F_G2H, VSOCK_TRANSPORT_F_DGRAM).
+ *
+ * Registration is all-or-nothing: if any requested role is already taken,
+ * no role is changed and -EBUSY is returned.  Returns 0 on success, or the
+ * error from mutex_lock_interruptible() when vsock_register_mutex could not
+ * be taken.
+ */
+int vsock_core_register(const struct vsock_transport *t, int features)
+{
+	bool want_h2g = features & VSOCK_TRANSPORT_F_H2G;
+	bool want_g2h = features & VSOCK_TRANSPORT_F_G2H;
+	bool want_dgram = features & VSOCK_TRANSPORT_F_DGRAM;
+	int ret;
+
+	ret = mutex_lock_interruptible(&vsock_register_mutex);
+	if (ret)
+		return ret;
+
+	/* Check every requested slot before touching any of them. */
+	if ((want_h2g && transport_h2g) ||
+	    (want_g2h && transport_g2h) ||
+	    (want_dgram && transport_dgram)) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	if (want_h2g)
+		transport_h2g = t;
+	if (want_g2h)
+		transport_g2h = t;
+	if (want_dgram)
+		transport_dgram = t;
+
+unlock:
+	mutex_unlock(&vsock_register_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vsock_core_register);
+
+/* Remove transport @t from every role slot (h2g, g2h, dgram) that currently
+ * points at it.  Slots holding a different transport are left untouched.
+ * All three slots are examined under vsock_register_mutex.
+ */
+void vsock_core_unregister(const struct vsock_transport *t)
{
- /* Do nothing. This function makes this module removable. */
+ mutex_lock(&vsock_register_mutex);
+
+ if (transport_h2g == t)
+ transport_h2g = NULL;
+
+ if (transport_g2h == t)
+ transport_g2h = NULL;
+
+ if (transport_dgram == t)
+ transport_dgram = NULL;
+
+ mutex_unlock(&vsock_register_mutex);
}
+EXPORT_SYMBOL_GPL(vsock_core_unregister);
-module_init(vsock_init_tables);
+module_init(vsock_init);
module_exit(vsock_exit);
MODULE_AUTHOR("VMware, Inc.");
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 7fa09c5e4625..3c7d07a99fc5 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -165,6 +165,8 @@ static const guid_t srv_id_template =
GUID_INIT(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58,
0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3);
+static bool hvs_check_transport(struct vsock_sock *vsk);
+
static bool is_valid_srv_id(const guid_t *id)
{
return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(guid_t) - 4);
@@ -188,7 +190,8 @@ static void hvs_remote_addr_init(struct sockaddr_vm *remote,
static u32 host_ephemeral_port = MIN_HOST_EPHEMERAL_PORT;
struct sock *sk;
- vsock_addr_init(remote, VMADDR_CID_ANY, VMADDR_PORT_ANY);
+ /* Remote peer is always the host */
+ vsock_addr_init(remote, VMADDR_CID_HOST, VMADDR_PORT_ANY);
while (1) {
/* Wrap around ? */
@@ -360,13 +363,24 @@ static void hvs_open_connection(struct vmbus_channel *chan)
if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog)
goto out;
- new = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
- sk->sk_type, 0);
+ new = vsock_create_connected(sk);
if (!new)
goto out;
new->sk_state = TCP_SYN_SENT;
vnew = vsock_sk(new);
+
+ hvs_addr_init(&vnew->local_addr, if_type);
+ hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr);
+
+ ret = vsock_assign_transport(vnew, vsock_sk(sk));
+ /* The transport assigned (based on remote_addr) must be the
+ * same one on which we received the request.
+ */
+ if (ret || !hvs_check_transport(vnew)) {
+ sock_put(new);
+ goto out;
+ }
hvs_new = vnew->trans;
hvs_new->chan = chan;
} else {
@@ -430,9 +444,6 @@ static void hvs_open_connection(struct vmbus_channel *chan)
new->sk_state = TCP_ESTABLISHED;
sk_acceptq_added(sk);
- hvs_addr_init(&vnew->local_addr, if_type);
- hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr);
-
hvs_new->vm_srv_id = *if_type;
hvs_new->host_srv_id = *if_instance;
@@ -845,37 +856,9 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written,
return 0;
}
-static void hvs_set_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- /* Ignored. */
-}
-
-static void hvs_set_min_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- /* Ignored. */
-}
-
-static void hvs_set_max_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- /* Ignored. */
-}
-
-static u64 hvs_get_buffer_size(struct vsock_sock *vsk)
-{
- return -ENOPROTOOPT;
-}
-
-static u64 hvs_get_min_buffer_size(struct vsock_sock *vsk)
-{
- return -ENOPROTOOPT;
-}
-
-static u64 hvs_get_max_buffer_size(struct vsock_sock *vsk)
-{
- return -ENOPROTOOPT;
-}
-
static struct vsock_transport hvs_transport = {
+ .module = THIS_MODULE,
+
.get_local_cid = hvs_get_local_cid,
.init = hvs_sock_init,
@@ -908,14 +891,13 @@ static struct vsock_transport hvs_transport = {
.notify_send_pre_enqueue = hvs_notify_send_pre_enqueue,
.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
- .set_buffer_size = hvs_set_buffer_size,
- .set_min_buffer_size = hvs_set_min_buffer_size,
- .set_max_buffer_size = hvs_set_max_buffer_size,
- .get_buffer_size = hvs_get_buffer_size,
- .get_min_buffer_size = hvs_get_min_buffer_size,
- .get_max_buffer_size = hvs_get_max_buffer_size,
};
+/* Tell whether the transport currently assigned to @vsk is hvs_transport. */
+static bool hvs_check_transport(struct vsock_sock *vsk)
+{
+	const struct vsock_transport *assigned = vsk->transport;
+
+	return assigned == &hvs_transport;
+}
+
static int hvs_probe(struct hv_device *hdev,
const struct hv_vmbus_device_id *dev_id)
{
@@ -964,7 +946,7 @@ static int __init hvs_init(void)
if (ret != 0)
return ret;
- ret = vsock_core_init(&hvs_transport);
+ ret = vsock_core_register(&hvs_transport, VSOCK_TRANSPORT_F_G2H);
if (ret) {
vmbus_driver_unregister(&hvs_drv);
return ret;
@@ -975,7 +957,7 @@ static int __init hvs_init(void)
+/* Exit-time teardown: drop hvs_transport from the vsock core's transport
+ * slots, then unregister the hvs_drv VMBus driver.
+ */
static void __exit hvs_exit(void)
{
- vsock_core_exit();
+ vsock_core_unregister(&hvs_transport);
vmbus_driver_unregister(&hvs_drv);
}
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 082a30936690..1458c5c8b64d 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -86,33 +86,6 @@ out_rcu:
return ret;
}
-static void virtio_transport_loopback_work(struct work_struct *work)
-{
- struct virtio_vsock *vsock =
- container_of(work, struct virtio_vsock, loopback_work);
- LIST_HEAD(pkts);
-
- spin_lock_bh(&vsock->loopback_list_lock);
- list_splice_init(&vsock->loopback_list, &pkts);
- spin_unlock_bh(&vsock->loopback_list_lock);
-
- mutex_lock(&vsock->rx_lock);
-
- if (!vsock->rx_run)
- goto out;
-
- while (!list_empty(&pkts)) {
- struct virtio_vsock_pkt *pkt;
-
- pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list);
- list_del_init(&pkt->list);
-
- virtio_transport_recv_pkt(pkt);
- }
-out:
- mutex_unlock(&vsock->rx_lock);
-}
-
static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
struct virtio_vsock_pkt *pkt)
{
@@ -370,59 +343,6 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
return val < virtqueue_get_vring_size(vq);
}
-static void virtio_transport_rx_work(struct work_struct *work)
-{
- struct virtio_vsock *vsock =
- container_of(work, struct virtio_vsock, rx_work);
- struct virtqueue *vq;
-
- vq = vsock->vqs[VSOCK_VQ_RX];
-
- mutex_lock(&vsock->rx_lock);
-
- if (!vsock->rx_run)
- goto out;
-
- do {
- virtqueue_disable_cb(vq);
- for (;;) {
- struct virtio_vsock_pkt *pkt;
- unsigned int len;
-
- if (!virtio_transport_more_replies(vsock)) {
- /* Stop rx until the device processes already
- * pending replies. Leave rx virtqueue
- * callbacks disabled.
- */
- goto out;
- }
-
- pkt = virtqueue_get_buf(vq, &len);
- if (!pkt) {
- break;
- }
-
- vsock->rx_buf_nr--;
-
- /* Drop short/long packets */
- if (unlikely(len < sizeof(pkt->hdr) ||
- len > sizeof(pkt->hdr) + pkt->len)) {
- virtio_transport_free_pkt(pkt);
- continue;
- }
-
- pkt->len = len - sizeof(pkt->hdr);
- virtio_transport_deliver_tap_pkt(pkt);
- virtio_transport_recv_pkt(pkt);
- }
- } while (!virtqueue_enable_cb(vq));
-
-out:
- if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2)
- virtio_vsock_rx_fill(vsock);
- mutex_unlock(&vsock->rx_lock);
-}
-
/* event_lock must be held */
static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock,
struct virtio_vsock_event *event)
@@ -542,6 +462,8 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
static struct virtio_transport virtio_transport = {
.transport = {
+ .module = THIS_MODULE,
+
.get_local_cid = virtio_transport_get_local_cid,
.init = virtio_transport_do_socket_init,
@@ -574,18 +496,92 @@ static struct virtio_transport virtio_transport = {
.notify_send_pre_block = virtio_transport_notify_send_pre_block,
.notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
-
- .set_buffer_size = virtio_transport_set_buffer_size,
- .set_min_buffer_size = virtio_transport_set_min_buffer_size,
- .set_max_buffer_size = virtio_transport_set_max_buffer_size,
- .get_buffer_size = virtio_transport_get_buffer_size,
- .get_min_buffer_size = virtio_transport_get_min_buffer_size,
- .get_max_buffer_size = virtio_transport_get_max_buffer_size,
+ .notify_buffer_size = virtio_transport_notify_buffer_size,
},
.send_pkt = virtio_transport_send_pkt,
};
+/* Work handler for vsock->loopback_work: takes every packet queued on
+ * vsock->loopback_list and hands each one to virtio_transport_recv_pkt()
+ * on behalf of virtio_transport, with vsock->rx_lock held.
+ *
+ * NOTE(review): when rx_run is false the packets already spliced off the
+ * loopback list are neither delivered nor freed in this function - confirm
+ * they are released elsewhere.
+ */
+static void virtio_transport_loopback_work(struct work_struct *work)
+{
+	struct virtio_vsock *vsock;
+	struct virtio_vsock_pkt *pkt;
+	LIST_HEAD(pending);
+
+	vsock = container_of(work, struct virtio_vsock, loopback_work);
+
+	/* Detach the whole queue in one go so the spinlock is held briefly. */
+	spin_lock_bh(&vsock->loopback_list_lock);
+	list_splice_init(&vsock->loopback_list, &pending);
+	spin_unlock_bh(&vsock->loopback_list_lock);
+
+	mutex_lock(&vsock->rx_lock);
+
+	if (vsock->rx_run) {
+		while (!list_empty(&pending)) {
+			pkt = list_first_entry(&pending,
+					       struct virtio_vsock_pkt, list);
+			list_del_init(&pkt->list);
+
+			virtio_transport_recv_pkt(&virtio_transport, pkt);
+		}
+	}
+
+	mutex_unlock(&vsock->rx_lock);
+}
+
+/* Work handler for vsock->rx_work: drains the RX virtqueue
+ * (vsock->vqs[VSOCK_VQ_RX]) with vsock->rx_lock held.
+ *
+ * Each buffer taken from the queue is length-checked, shown to the tap via
+ * virtio_transport_deliver_tap_pkt() and then passed to
+ * virtio_transport_recv_pkt() on behalf of virtio_transport.  Nothing is
+ * dequeued when rx_run is false.  On every exit path the RX ring is refilled
+ * if fewer than half of rx_buf_max_nr buffers remain posted.
+ */
+static void virtio_transport_rx_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, rx_work);
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_RX];
+
+ mutex_lock(&vsock->rx_lock);
+
+ if (!vsock->rx_run)
+ goto out;
+
+ /* Outer loop: callbacks are disabled while draining and re-enabled
+ * afterwards; the drain is repeated whenever virtqueue_enable_cb()
+ * returns false (presumably because more buffers became available in
+ * the meantime - confirm against the virtio core documentation).
+ */
+ do {
+ virtqueue_disable_cb(vq);
+ for (;;) {
+ struct virtio_vsock_pkt *pkt;
+ unsigned int len;
+
+ if (!virtio_transport_more_replies(vsock)) {
+ /* Stop rx until the device processes already
+ * pending replies. Leave rx virtqueue
+ * callbacks disabled.
+ */
+ goto out;
+ }
+
+ pkt = virtqueue_get_buf(vq, &len);
+ if (!pkt) {
+ break;
+ }
+
+ vsock->rx_buf_nr--;
+
+ /* Drop short/long packets */
+ /* i.e. len must cover at least the header and must not
+ * exceed the header plus the current pkt->len.
+ */
+ if (unlikely(len < sizeof(pkt->hdr) ||
+ len > sizeof(pkt->hdr) + pkt->len)) {
+ virtio_transport_free_pkt(pkt);
+ continue;
+ }
+
+ /* From here on pkt->len is the received length minus the header. */
+ pkt->len = len - sizeof(pkt->hdr);
+ virtio_transport_deliver_tap_pkt(pkt);
+ virtio_transport_recv_pkt(&virtio_transport, pkt);
+ }
+ } while (!virtqueue_enable_cb(vq));
+
+out:
+ /* Top the ring up once it has dropped below half of its maximum. */
+ if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2)
+ virtio_vsock_rx_fill(vsock);
+ mutex_unlock(&vsock->rx_lock);
+}
+
static int virtio_vsock_probe(struct virtio_device *vdev)
{
vq_callback_t *callbacks[] = {
@@ -776,7 +772,8 @@ static int __init virtio_vsock_init(void)
if (!virtio_vsock_workqueue)
return -ENOMEM;
- ret = vsock_core_init(&virtio_transport.transport);
+ ret = vsock_core_register(&virtio_transport.transport,
+ VSOCK_TRANSPORT_F_G2H);
if (ret)
goto out_wq;
@@ -787,7 +784,7 @@ static int __init virtio_vsock_init(void)
return 0;
out_vci:
- vsock_core_exit();
+ vsock_core_unregister(&virtio_transport.transport);
out_wq:
destroy_workqueue(virtio_vsock_workqueue);
return ret;
@@ -796,7 +793,7 @@ out_wq:
+/* Exit-time teardown, in this order: unregister the virtio driver, remove
+ * virtio_transport.transport from the vsock core's transport slots, then
+ * destroy virtio_vsock_workqueue.
+ */
static void __exit virtio_vsock_exit(void)
{
unregister_virtio_driver(&virtio_vsock_driver);
- vsock_core_exit();
+ vsock_core_unregister(&virtio_transport.transport);
destroy_workqueue(virtio_vsock_workqueue);
}
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 828edd88488c..e5ea29c6bca7 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -29,9 +29,10 @@
/* Threshold for detecting small packets to copy */
#define GOOD_COPY_LEN 128
-static const struct virtio_transport *virtio_transport_get_ops(void)
+/* Map the vsock transport assigned to @vsk back to the struct
+ * virtio_transport that embeds it (via its 'transport' member).
+ *
+ * NOTE(review): container_of() is applied unconditionally, so this presumably
+ * expects @vsk to already have a virtio-based transport assigned - confirm
+ * against callers.
+ */
+static const struct virtio_transport *
+virtio_transport_get_ops(struct vsock_sock *vsk)
{
- const struct vsock_transport *t = vsock_core_get_transport();
+ const struct vsock_transport *t = vsock_core_get_transport(vsk);
return container_of(t, struct virtio_transport, transport);
}
@@ -168,7 +169,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
struct virtio_vsock_pkt *pkt;
u32 pkt_len = info->pkt_len;
- src_cid = vm_sockets_get_local_cid();
+ src_cid = virtio_transport_get_ops(vsk)->transport.get_local_cid();
src_port = vsk->local_addr.svm_port;
if (!info->remote_cid) {
dst_cid = vsk->remote_addr.svm_cid;
@@ -201,7 +202,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
virtio_transport_inc_tx_pkt(vvs, pkt);
- return virtio_transport_get_ops()->send_pkt(pkt);
+ return virtio_transport_get_ops(vsk)->send_pkt(pkt);
}
static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
@@ -452,20 +453,16 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk,
vsk->trans = vvs;
vvs->vsk = vsk;
- if (psk) {
+ if (psk && psk->trans) {
struct virtio_vsock_sock *ptrans = psk->trans;
- vvs->buf_size = ptrans->buf_size;
- vvs->buf_size_min = ptrans->buf_size_min;
- vvs->buf_size_max = ptrans->buf_size_max;
vvs->peer_buf_alloc = ptrans->peer_buf_alloc;
- } else {
- vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE;
- vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE;
- vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE;
}
- vvs->buf_alloc = vvs->buf_size;
+ if (vsk->buffer_size > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ vsk->buffer_size = VIRTIO_VSOCK_MAX_BUF_SIZE;
+
+ vvs->buf_alloc = vsk->buffer_size;
spin_lock_init(&vvs->rx_lock);
spin_lock_init(&vvs->tx_lock);
@@ -475,71 +472,20 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk,
}
EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init);
-u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk)
-{
- struct virtio_vsock_sock *vvs = vsk->trans;
-
- return vvs->buf_size;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size);
-
-u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk)
-{
- struct virtio_vsock_sock *vvs = vsk->trans;
-
- return vvs->buf_size_min;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size);
-
-u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk)
+/* Apply a new buffer size to the virtio transport state of @vsk.
+ *
+ * *@val is clamped to VIRTIO_VSOCK_MAX_BUF_SIZE and the clamped value is
+ * written back through @val, so the caller sees the size actually used.  The
+ * result is stored in vvs->buf_alloc and a credit update is sent.
+ *
+ * sk_lock held by the caller
+ */
+void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val)
{
struct virtio_vsock_sock *vvs = vsk->trans;
- return vvs->buf_size_max;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size);
+ if (*val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ *val = VIRTIO_VSOCK_MAX_BUF_SIZE;
-void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- struct virtio_vsock_sock *vvs = vsk->trans;
-
- if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
- val = VIRTIO_VSOCK_MAX_BUF_SIZE;
- if (val < vvs->buf_size_min)
- vvs->buf_size_min = val;
- if (val > vvs->buf_size_max)
- vvs->buf_size_max = val;
- vvs->buf_size = val;
- vvs->buf_alloc = val;
+ vvs->buf_alloc = *val;
virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM,
NULL);
}
-EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size);
-
-void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- struct virtio_vsock_sock *vvs = vsk->trans;
-
- if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
- val = VIRTIO_VSOCK_MAX_BUF_SIZE;
- if (val > vvs->buf_size)
- vvs->buf_size = val;
- vvs->buf_size_min = val;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size);
-
-void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- struct virtio_vsock_sock *vvs = vsk->trans;
-
- if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
- val = VIRTIO_VSOCK_MAX_BUF_SIZE;
- if (val < vvs->buf_size)
- vvs->buf_size = val;
- vvs->buf_size_max = val;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size);
+EXPORT_SYMBOL_GPL(virtio_transport_notify_buffer_size);
int
virtio_transport_notify_poll_in(struct vsock_sock *vsk,
@@ -631,9 +577,7 @@ EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue);
+/* Return @vsk->buffer_size as the stream receive high-water mark.  The value
+ * is read from the vsock socket itself, not from the virtio per-socket state.
+ */
u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk)
{
- struct virtio_vsock_sock *vvs = vsk->trans;
-
- return vvs->buf_size;
+ return vsk->buffer_size;
}
EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat);
@@ -745,9 +689,9 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
/* Normally packets are associated with a socket. There may be no socket if an
* attempt was made to connect to a socket that does not exist.
*/
-static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
+static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
+ struct virtio_vsock_pkt *pkt)
{
- const struct virtio_transport *t;
struct virtio_vsock_pkt *reply;
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_RST,
@@ -767,7 +711,6 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
if (!reply)
return -ENOMEM;
- t = virtio_transport_get_ops();
if (!t) {
virtio_transport_free_pkt(reply);
return -ENOTCONN;
@@ -1043,13 +986,39 @@ virtio_transport_send_response(struct vsock_sock *vsk,
return virtio_transport_send_pkt_info(vsk, &info);
}
+/* Record the peer's credit fields carried in @pkt's header (buf_alloc and
+ * fwd_cnt, which every header includes) and return what
+ * virtio_transport_has_space() reports afterwards.
+ *
+ * A socket without transport state (vsk->trans == NULL) is reported as having
+ * space: listener sockets are not associated with any transport and are only
+ * used to receive requests, so there is no peer state to consult for them.
+ */
+static bool virtio_transport_space_update(struct sock *sk,
+					  struct virtio_vsock_pkt *pkt)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	bool has_space = true;
+
+	if (vvs) {
+		/* The peer counters are updated under tx_lock. */
+		spin_lock_bh(&vvs->tx_lock);
+		vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc);
+		vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt);
+		has_space = virtio_transport_has_space(vsk);
+		spin_unlock_bh(&vvs->tx_lock);
+	}
+
+	return has_space;
+}
+
/* Handle server socket */
static int
-virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
+virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt,
+ struct virtio_transport *t)
{
struct vsock_sock *vsk = vsock_sk(sk);
struct vsock_sock *vchild;
struct sock *child;
+ int ret;
if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) {
virtio_transport_reset(vsk, pkt);
@@ -1061,8 +1030,7 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
return -ENOMEM;
}
- child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
- sk->sk_type, 0);
+ child = vsock_create_connected(sk);
if (!child) {
virtio_transport_reset(vsk, pkt);
return -ENOMEM;
@@ -1080,6 +1048,20 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid),
le32_to_cpu(pkt->hdr.src_port));
+ ret = vsock_assign_transport(vchild, vsk);
+ /* The transport assigned (based on remote_addr) must be the same
+ * one on which we received the request.
+ */
+ if (ret || vchild->transport != &t->transport) {
+ release_sock(child);
+ virtio_transport_reset(vsk, pkt);
+ sock_put(child);
+ return ret;
+ }
+
+ if (virtio_transport_space_update(child, pkt))
+ child->sk_write_space(child);
+
vsock_insert_connected(vchild);
vsock_enqueue_accept(sk, child);
virtio_transport_send_response(vchild, pkt);
@@ -1090,26 +1072,11 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
return 0;
}
-static bool virtio_transport_space_update(struct sock *sk,
- struct virtio_vsock_pkt *pkt)
-{
- struct vsock_sock *vsk = vsock_sk(sk);
- struct virtio_vsock_sock *vvs = vsk->trans;
- bool space_available;
-
- /* buf_alloc and fwd_cnt is always included in the hdr */
- spin_lock_bh(&vvs->tx_lock);
- vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc);
- vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt);
- space_available = virtio_transport_has_space(vsk);
- spin_unlock_bh(&vvs->tx_lock);
- return space_available;
-}
-
/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex
* lock.
*/
-void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
+void virtio_transport_recv_pkt(struct virtio_transport *t,
+ struct virtio_vsock_pkt *pkt)
{
struct sockaddr_vm src, dst;
struct vsock_sock *vsk;
@@ -1131,7 +1098,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
le32_to_cpu(pkt->hdr.fwd_cnt));
if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) {
- (void)virtio_transport_reset_no_sock(pkt);
+ (void)virtio_transport_reset_no_sock(t, pkt);
goto free_pkt;
}
@@ -1142,7 +1109,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
if (!sk) {
sk = vsock_find_bound_socket(&dst);
if (!sk) {
- (void)virtio_transport_reset_no_sock(pkt);
+ (void)virtio_transport_reset_no_sock(t, pkt);
goto free_pkt;
}
}
@@ -1161,7 +1128,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
switch (sk->sk_state) {
case TCP_LISTEN:
- virtio_transport_recv_listen(sk, pkt);
+ virtio_transport_recv_listen(sk, pkt, t);
virtio_transport_free_pkt(pkt);
break;
case TCP_SYN_SENT:
@@ -1179,6 +1146,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
virtio_transport_free_pkt(pkt);
break;
}
+
release_sock(sk);
/* Release refcnt obtained when we fetched this socket out of the
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 6ba98a1efe2e..644d32e43d23 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -57,6 +57,7 @@ static bool vmci_transport_old_proto_override(bool *old_pkt_proto);
static u16 vmci_transport_new_proto_supported_versions(void);
static bool vmci_transport_proto_to_notify_struct(struct sock *sk, u16 *proto,
bool old_pkt_proto);
+static bool vmci_check_transport(struct vsock_sock *vsk);
struct vmci_transport_recv_pkt_info {
struct work_struct work;
@@ -74,15 +75,6 @@ static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
static int PROTOCOL_OVERRIDE = -1;
-#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN 128
-#define VMCI_TRANSPORT_DEFAULT_QP_SIZE 262144
-#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX 262144
-
-/* The default peer timeout indicates how long we will wait for a peer response
- * to a control message.
- */
-#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
-
/* Helper function to convert from a VMCI error code to a VSock error code. */
static s32 vmci_transport_error_to_vsock_error(s32 vmci_error)
@@ -1013,8 +1005,7 @@ static int vmci_transport_recv_listen(struct sock *sk,
return -ECONNREFUSED;
}
- pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
- sk->sk_type, 0);
+ pending = vsock_create_connected(sk);
if (!pending) {
vmci_transport_send_reset(sk, pkt);
return -ENOMEM;
@@ -1027,14 +1018,24 @@ static int vmci_transport_recv_listen(struct sock *sk,
vsock_addr_init(&vpending->remote_addr, pkt->dg.src.context,
pkt->src_port);
+ err = vsock_assign_transport(vpending, vsock_sk(sk));
+ /* The transport assigned (based on remote_addr) must be the same
+ * one on which we received the request.
+ */
+ if (err || !vmci_check_transport(vpending)) {
+ vmci_transport_send_reset(sk, pkt);
+ sock_put(pending);
+ return err;
+ }
+
/* If the proposed size fits within our min/max, accept it. Otherwise
* propose our own size.
*/
- if (pkt->u.size >= vmci_trans(vpending)->queue_pair_min_size &&
- pkt->u.size <= vmci_trans(vpending)->queue_pair_max_size) {
+ if (pkt->u.size >= vpending->buffer_min_size &&
+ pkt->u.size <= vpending->buffer_max_size) {
qp_size = pkt->u.size;
} else {
- qp_size = vmci_trans(vpending)->queue_pair_size;
+ qp_size = vpending->buffer_size;
}
/* Figure out if we are using old or new requests based on the
@@ -1103,7 +1104,7 @@ static int vmci_transport_recv_listen(struct sock *sk,
pending->sk_state = TCP_SYN_SENT;
vmci_trans(vpending)->produce_size =
vmci_trans(vpending)->consume_size = qp_size;
- vmci_trans(vpending)->queue_pair_size = qp_size;
+ vpending->buffer_size = qp_size;
vmci_trans(vpending)->notify_ops->process_request(pending);
@@ -1397,8 +1398,8 @@ static int vmci_transport_recv_connecting_client_negotiate(
vsk->ignore_connecting_rst = false;
/* Verify that we're OK with the proposed queue pair size */
- if (pkt->u.size < vmci_trans(vsk)->queue_pair_min_size ||
- pkt->u.size > vmci_trans(vsk)->queue_pair_max_size) {
+ if (pkt->u.size < vsk->buffer_min_size ||
+ pkt->u.size > vsk->buffer_max_size) {
err = -EINVAL;
goto destroy;
}
@@ -1503,8 +1504,7 @@ vmci_transport_recv_connecting_client_invalid(struct sock *sk,
vsk->sent_request = false;
vsk->ignore_connecting_rst = true;
- err = vmci_transport_send_conn_request(
- sk, vmci_trans(vsk)->queue_pair_size);
+ err = vmci_transport_send_conn_request(sk, vsk->buffer_size);
if (err < 0)
err = vmci_transport_error_to_vsock_error(err);
else
@@ -1588,21 +1588,6 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk,
INIT_LIST_HEAD(&vmci_trans(vsk)->elem);
vmci_trans(vsk)->sk = &vsk->sk;
spin_lock_init(&vmci_trans(vsk)->lock);
- if (psk) {
- vmci_trans(vsk)->queue_pair_size =
- vmci_trans(psk)->queue_pair_size;
- vmci_trans(vsk)->queue_pair_min_size =
- vmci_trans(psk)->queue_pair_min_size;
- vmci_trans(vsk)->queue_pair_max_size =
- vmci_trans(psk)->queue_pair_max_size;
- } else {
- vmci_trans(vsk)->queue_pair_size =
- VMCI_TRANSPORT_DEFAULT_QP_SIZE;
- vmci_trans(vsk)->queue_pair_min_size =
- VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN;
- vmci_trans(vsk)->queue_pair_max_size =
- VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX;
- }
return 0;
}
@@ -1818,8 +1803,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk)
if (vmci_transport_old_proto_override(&old_pkt_proto) &&
old_pkt_proto) {
- err = vmci_transport_send_conn_request(
- sk, vmci_trans(vsk)->queue_pair_size);
+ err = vmci_transport_send_conn_request(sk, vsk->buffer_size);
if (err < 0) {
sk->sk_state = TCP_CLOSE;
return err;
@@ -1827,8 +1811,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk)
} else {
int supported_proto_versions =
vmci_transport_new_proto_supported_versions();
- err = vmci_transport_send_conn_request2(
- sk, vmci_trans(vsk)->queue_pair_size,
+ err = vmci_transport_send_conn_request2(sk, vsk->buffer_size,
supported_proto_versions);
if (err < 0) {
sk->sk_state = TCP_CLOSE;
@@ -1881,46 +1864,6 @@ static bool vmci_transport_stream_is_active(struct vsock_sock *vsk)
return !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle);
}
-static u64 vmci_transport_get_buffer_size(struct vsock_sock *vsk)
-{
- return vmci_trans(vsk)->queue_pair_size;
-}
-
-static u64 vmci_transport_get_min_buffer_size(struct vsock_sock *vsk)
-{
- return vmci_trans(vsk)->queue_pair_min_size;
-}
-
-static u64 vmci_transport_get_max_buffer_size(struct vsock_sock *vsk)
-{
- return vmci_trans(vsk)->queue_pair_max_size;
-}
-
-static void vmci_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- if (val < vmci_trans(vsk)->queue_pair_min_size)
- vmci_trans(vsk)->queue_pair_min_size = val;
- if (val > vmci_trans(vsk)->queue_pair_max_size)
- vmci_trans(vsk)->queue_pair_max_size = val;
- vmci_trans(vsk)->queue_pair_size = val;
-}
-
-static void vmci_transport_set_min_buffer_size(struct vsock_sock *vsk,
- u64 val)
-{
- if (val > vmci_trans(vsk)->queue_pair_size)
- vmci_trans(vsk)->queue_pair_size = val;
- vmci_trans(vsk)->queue_pair_min_size = val;
-}
-
-static void vmci_transport_set_max_buffer_size(struct vsock_sock *vsk,
- u64 val)
-{
- if (val < vmci_trans(vsk)->queue_pair_size)
- vmci_trans(vsk)->queue_pair_size = val;
- vmci_trans(vsk)->queue_pair_max_size = val;
-}
-
static int vmci_transport_notify_poll_in(
struct vsock_sock *vsk,
size_t target,
@@ -2076,7 +2019,8 @@ static u32 vmci_transport_get_local_cid(void)
return vmci_get_context_id();
}
-static const struct vsock_transport vmci_transport = {
+static struct vsock_transport vmci_transport = {
+ .module = THIS_MODULE,
.init = vmci_transport_socket_init,
.destruct = vmci_transport_destruct,
.release = vmci_transport_release,
@@ -2103,15 +2047,26 @@ static const struct vsock_transport vmci_transport = {
.notify_send_pre_enqueue = vmci_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = vmci_transport_notify_send_post_enqueue,
.shutdown = vmci_transport_shutdown,
- .set_buffer_size = vmci_transport_set_buffer_size,
- .set_min_buffer_size = vmci_transport_set_min_buffer_size,
- .set_max_buffer_size = vmci_transport_set_max_buffer_size,
- .get_buffer_size = vmci_transport_get_buffer_size,
- .get_min_buffer_size = vmci_transport_get_min_buffer_size,
- .get_max_buffer_size = vmci_transport_get_max_buffer_size,
.get_local_cid = vmci_transport_get_local_cid,
};
+/* Tell whether the transport currently assigned to @vsk is vmci_transport. */
+static bool vmci_check_transport(struct vsock_sock *vsk)
+{
+	const struct vsock_transport *assigned = vsk->transport;
+
+	return assigned == &vmci_transport;
+}
+
+/* Callback installed with vmci_register_vsock_callback().
+ *
+ * Registers vmci_transport with the vsock core for one role: host-to-guest
+ * (VSOCK_TRANSPORT_F_H2G) when @is_host is true, guest-to-host
+ * (VSOCK_TRANSPORT_F_G2H) otherwise.
+ *
+ * vsock_core_register() can fail (e.g. -EBUSY when the role already has a
+ * transport).  The callback has no way to return that error, so it is logged
+ * rather than silently dropped.
+ */
+void vmci_vsock_transport_cb(bool is_host)
+{
+	int features = is_host ? VSOCK_TRANSPORT_F_H2G : VSOCK_TRANSPORT_F_G2H;
+	int err;
+
+	err = vsock_core_register(&vmci_transport, features);
+	if (err < 0)
+		pr_err("Unable to register VMCI %s transport. (%d)\n",
+		       is_host ? "H2G" : "G2H", err);
+}
+
static int __init vmci_transport_init(void)
{
int err;
@@ -2128,7 +2083,6 @@ static int __init vmci_transport_init(void)
pr_err("Unable to create datagram handle. (%d)\n", err);
return vmci_transport_error_to_vsock_error(err);
}
-
err = vmci_event_subscribe(VMCI_EVENT_QP_RESUMED,
vmci_transport_qp_resumed_cb,
NULL, &vmci_transport_qp_resumed_sub_id);
@@ -2139,12 +2093,21 @@ static int __init vmci_transport_init(void)
goto err_destroy_stream_handle;
}
- err = vsock_core_init(&vmci_transport);
+ /* Register only with dgram feature, other features (H2G, G2H) will be
+ * registered when the first host or guest becomes active.
+ */
+ err = vsock_core_register(&vmci_transport, VSOCK_TRANSPORT_F_DGRAM);
if (err < 0)
goto err_unsubscribe;
+ err = vmci_register_vsock_callback(vmci_vsock_transport_cb);
+ if (err < 0)
+ goto err_unregister;
+
return 0;
+err_unregister:
+ vsock_core_unregister(&vmci_transport);
err_unsubscribe:
vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id);
err_destroy_stream_handle:
@@ -2170,7 +2133,8 @@ static void __exit vmci_transport_exit(void)
vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
}
- vsock_core_exit();
+ vmci_register_vsock_callback(NULL);
+ vsock_core_unregister(&vmci_transport);
}
module_exit(vmci_transport_exit);
diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h
index 1ca1e8640b31..b7b072194282 100644
--- a/net/vmw_vsock/vmci_transport.h
+++ b/net/vmw_vsock/vmci_transport.h
@@ -108,9 +108,6 @@ struct vmci_transport {
struct vmci_qp *qpair;
u64 produce_size;
u64 consume_size;
- u64 queue_pair_size;
- u64 queue_pair_min_size;
- u64 queue_pair_max_size;
u32 detach_sub_id;
union vmci_transport_notify notify;
const struct vmci_transport_notify_ops *notify_ops;
diff --git a/net/vmw_vsock/vmci_transport_notify.h b/net/vmw_vsock/vmci_transport_notify.h
index 7843f08d4290..a1aa5a998c0e 100644
--- a/net/vmw_vsock/vmci_transport_notify.h
+++ b/net/vmw_vsock/vmci_transport_notify.h
@@ -11,7 +11,6 @@
#include <linux/types.h>
#include <linux/vmw_vmci_defs.h>
#include <linux/vmw_vmci_api.h>
-#include <linux/vm_sockets.h>
#include "vmci_transport.h"
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 51bb6018f3bf..17b8a7d4b71b 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -3,13 +3,13 @@
# XFRM configuration
#
config XFRM
- bool
- depends on INET
- select GRO_CELLS
- select SKB_EXTENSIONS
+ bool
+ depends on INET
+ select GRO_CELLS
+ select SKB_EXTENSIONS
config XFRM_OFFLOAD
- bool
+ bool
config XFRM_ALGO
tristate
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 9b599ed66d97..2c86a2fc3915 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -480,6 +480,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
else
XFRM_INC_STATS(net,
LINUX_MIB_XFRMINSTATEINVALID);
+
+ if (encap_type == -1)
+ dev_put(skb->dev);
goto drop;
}
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index 0f5131bc3342..7ac1542feaf8 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -732,30 +732,7 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = {
.get_link_net = xfrmi_get_link_net,
};
-static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn)
-{
- struct xfrm_if *xi;
- LIST_HEAD(list);
-
- xi = rtnl_dereference(xfrmn->xfrmi[0]);
- if (!xi)
- return;
-
- unregister_netdevice_queue(xi->dev, &list);
- unregister_netdevice_many(&list);
-}
-
-static void __net_exit xfrmi_exit_net(struct net *net)
-{
- struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
-
- rtnl_lock();
- xfrmi_destroy_interfaces(xfrmn);
- rtnl_unlock();
-}
-
+/* Per-network-namespace operations for xfrm interfaces.  Only .id
+ * (&xfrmi_net_id) and .size (sizeof(struct xfrmi_net)) are set; no .exit
+ * handler is installed.
+ */
static struct pernet_operations xfrmi_net_ops = {
- .exit = xfrmi_exit_net,
.id = &xfrmi_net_id,
.size = sizeof(struct xfrmi_net),
};
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index c6f3c4a1bd99..f3423562d933 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -495,6 +495,8 @@ static void ___xfrm_state_destroy(struct xfrm_state *x)
x->type->destructor(x);
xfrm_put_type(x->type);
}
+ if (x->xfrag.page)
+ put_page(x->xfrag.page);
xfrm_dev_state_free(x);
security_xfrm_state_free(x);
xfrm_state_free(x);